Actual source code: memory.cxx
1: #include <petsc/private/deviceimpl.h>
3: #include <petsc/private/cpp/register_finalize.hpp>
4: #include <petsc/private/cpp/type_traits.hpp>
6: #include <unordered_map>
7: #include <algorithm> // std::find_if
8: #include <cstring> // std::memset
10: const char *const PetscDeviceCopyModes[] = {"host_to_host", "device_to_host", "host_to_device", "device_to_device", "auto", "PetscDeviceCopyMode", "PETSC_DEVICE_COPY_", nullptr};
11: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOH) == 0, "");
12: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOH) == 1, "");
13: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOD) == 2, "");
14: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOD) == 3, "");
15: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_AUTO) == 4, "");
17: // GCC implementation for std::hash<T*>. LLVM's libc++ is almost 2x slower because they do all
18: // kinds of complicated murmur hashing, so we make sure to enforce GCC's version.
19: struct PointerHash {
20: template <typename T>
21: PETSC_NODISCARD std::size_t operator()(const T *ptr) const noexcept
22: {
23: return reinterpret_cast<std::size_t>(ptr);
24: }
25: };
27: // ==========================================================================================
28: // PointerAttributes
29: // ==========================================================================================
31: struct PointerAttributes {
32: PetscMemType mtype = PETSC_MEMTYPE_HOST; // memtype of allocation
33: PetscObjectId id = 0; // id of allocation
34: std::size_t size = 0; // size of allocation (bytes)
36: // even though this is a POD and can be aggregate initialized, the STL uses () constructors
37: // in unordered_map and so we need to provide a trivial constructor...
38: constexpr PointerAttributes(PetscMemType, PetscObjectId, std::size_t) noexcept;
40: bool operator==(const PointerAttributes &) const noexcept;
42: PETSC_NODISCARD bool contains(const void *, const void *) const noexcept;
43: };
45: // ==========================================================================================
46: // PointerAttributes - Public API
47: // ==========================================================================================
49: inline constexpr PointerAttributes::PointerAttributes(PetscMemType mtype_, PetscObjectId id_, std::size_t size_) noexcept : mtype(mtype_), id(id_), size(size_) { }
51: inline bool PointerAttributes::operator==(const PointerAttributes &other) const noexcept
52: {
53: return (mtype == other.mtype) && (id == other.id) && (size == other.size);
54: }
56: /*
57: PointerAttributes::contains - asks and answers the question, does ptr_begin contain ptr
59: Input Parameters:
60: + ptr_begin - pointer to the start of the range to check
61: - ptr - the pointer to query
63: Notes:
64: Returns true if ptr falls within ptr_begins range, false otherwise.
65: */
66: inline bool PointerAttributes::contains(const void *ptr_begin, const void *ptr) const noexcept
67: {
68: return (ptr >= ptr_begin) && (ptr < (static_cast<const char *>(ptr_begin) + size));
69: }
71: // ==========================================================================================
72: // MemoryMap
73: //
74: // Since the pointers allocated via PetscDeviceAllocate_Private() may be device pointers we
75: // cannot just store meta-data within the pointer itself (as we can't dereference them). So
76: // instead we need to keep an extra map to keep track of them
77: //
78: // Each entry maps pointer -> {
79: // PetscMemType - The memtype of the pointer
80: // PetscObjectId - A unique ID assigned at allocation or registration so auto-dep can
81: // identify the pointer
82: // size - The size (in bytes) of the allocation
83: // }
84: // ==========================================================================================
86: class MemoryMap : public Petsc::RegisterFinalizeable<MemoryMap> {
87: public:
88: using map_type = std::unordered_map<void *, PointerAttributes, PointerHash>;
90: map_type map{};
92: PETSC_NODISCARD map_type::const_iterator search_for(const void *, bool = false) const noexcept;
94: private:
95: friend class Petsc::RegisterFinalizeable<MemoryMap>;
96: PETSC_NODISCARD PetscErrorCode register_finalize_() noexcept;
97: PETSC_NODISCARD PetscErrorCode finalize_() noexcept;
98: };
100: // ==========================================================================================
101: // MemoryMap - Private API
102: // ==========================================================================================
104: PetscErrorCode MemoryMap::register_finalize_() noexcept
105: {
106: // Preallocate, this does give a modest performance bump since unordered_map is so __dog__
107: // slow if it needs to rehash. Experiments show that users tend not to have more than 5 or
108: // so concurrently live pointers lying around. 10 at most.
109: map.reserve(16);
110: return 0;
111: }
113: PetscErrorCode MemoryMap::finalize_() noexcept
114: {
115: PetscInfo(nullptr, "Finalizing memory map\n");
116: map = map_type{};
117: return 0;
118: }
120: // ==========================================================================================
121: // MemoryMap - Public API
122: // ==========================================================================================
124: /*
125: MemoryMap::search_for - retrieve an iterator to the key-value pair for a pointer in the map
127: Input Parameters:
128: + ptr - pointer to search for
129: - must_find - true if an error is raised if the pointer is not found (default: false)
131: Notes:
132: Accounts for sub-regions, i.e. if ptr is contained within another pointer's region, it returns
133: the iterator to the super-pointer's key-value pair.
135: If ptr is not found and must_find is false returns map.end(), otherwise raises an error
136: */
137: MemoryMap::map_type::const_iterator MemoryMap::search_for(const void *ptr, bool must_find) const noexcept
138: {
139: const auto end = map.end();
140: auto it = map.find(const_cast<map_type::key_type>(ptr));
142: // ptr was found, and points to an entire block
143: if (it != end) return it;
144: // wasn't found, but maybe its part of a block. have to search every block for it
145: // clang-format off
146: it = std::find_if(map.begin(), end, [ptr](const map_type::const_iterator::value_type &map_it) {
147: return map_it.second.contains(map_it.first, ptr);
148: });
150: return it;
151: // clang-format on
152: }
154: static MemoryMap memory_map;
156: // ==========================================================================================
157: // Utility functions
158: // ==========================================================================================
160: static PetscErrorCode PetscDeviceCheckCapable_Private(PetscDeviceContext dctx, bool cond, const char descr[])
161: {
163: return 0;
164: }
166: // A helper utility, since register is called from PetscDeviceRegisterMemory() and
167: // PetscDevicAllocate(). The latter also needs the generated id, so instead of making it search
168: // the map again we just return it here
169: static PetscErrorCode PetscDeviceRegisterMemory_Private(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size, PetscObjectId *PETSC_RESTRICT id = nullptr)
170: {
171: auto &map = memory_map.map;
172: const auto it = memory_map.search_for(ptr);
174: if (it == map.cend()) {
175: // pointer was never registered with the map, insert it and bail
176: const auto newid = PetscObjectNewId_Internal();
178: if (PetscDefined(USE_DEBUG)) {
179: const auto tmp = PointerAttributes(mtype, newid, size);
181: for (const auto &entry : map) {
182: // REVIEW ME: maybe this should just be handled...
184: entry.first, PetscMemTypeToString(entry.second.mtype), entry.second.size);
185: }
186: }
187: // clang-format off
188: if (id) *id = newid;
189: PetscCallCXX(map.emplace(
190: std::piecewise_construct,
191: std::forward_as_tuple(const_cast<MemoryMap::map_type::key_type>(ptr)),
192: std::forward_as_tuple(mtype, newid, size)
193: ));
194: // clang-format on
195: return 0;
196: }
197: if (PetscDefined(USE_DEBUG)) {
198: const auto &old = it->second;
201: PetscMemTypeToString(old.mtype), old.size, old.id, PetscMemTypeToString(mtype), size, old.id);
202: }
203: if (id) *id = it->second.id;
204: return 0;
205: }
207: /*@C
208: PetscDeviceRegisterMemory - Register a pointer for use with device-aware memory system
210: Not Collective
212: Input Parameters:
213: + ptr - The pointer to register
214: . mtype - The `PetscMemType` of the pointer
215: - size - The size (in bytes) of the memory region
217: Notes:
218: `ptr` need not point to the beginning of the memory range, however the user should register the
219: full range, since a pointer that lies inside a registered range is resolved to that registration.
221: It's OK to re-register the same `ptr` repeatedly (subsequent registrations do nothing)
222: however the given `mtype` and `size` must match the original registration.
224: `size` may be 0 (in which case this routine does nothing).
226: Level: intermediate
228: .seealso: `PetscDeviceMalloc()`, `PetscDeviceArrayCopy()`, `PetscDeviceFree()`,
229: `PetscDeviceArrayZero()`
230: @*/
231: PetscErrorCode PetscDeviceRegisterMemory(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size)
232: {
234: if (PetscUnlikely(!size)) return 0; // there is no point registering empty range
235: PetscDeviceRegisterMemory_Private(ptr, mtype, size);
236: return 0;
237: }
239: /*
240: PetscDeviceAllocate_Private - Allocate device-aware memory
242: Not Collective, Asynchronous, Auto-dependency aware
244: Input Parameters:
245: + dctx - The `PetscDeviceContext` used to allocate the memory
246: . clear - Whether or not the memory should be zeroed
247: . mtype - The type of memory to allocate
248: . n - The amount (in bytes) to allocate
249: - alignment - The alignment requirement (in bytes) of the allocated pointer
251: Output Parameter:
252: . ptr - The pointer to store the result in
254: Notes:
255: The user should prefer `PetscDeviceMalloc()` over this routine as it automatically computes
256: the size of the allocation and alignment based on the size of the datatype.
258: If the user is unsure about `alignment` -- or unable to compute it -- passing
259: `PETSC_MEMALIGN` will always work, though the user should beware that this may be quite
260: wasteful for very small allocations.
262: Memory allocated with this function must be freed with `PetscDeviceFree()` (or
263: `PetscDeviceDeallocate_Private()`).
265: If `n` is zero, then `ptr` is set to `PETSC_NULLPTR`.
267: This routine falls back to using `PetscMalloc1()` or `PetscCalloc1()` (depending on the value
268: of `clear`) if PETSc was not configured with device support. The user should note that
269: `mtype` and `alignment` are ignored in this case, as these routines allocate only host memory
270: aligned to `PETSC_MEMALIGN`.
272: Note that the result stored in `ptr` is immediately valid and the user may freely inspect or manipulate
273: its value on function return, i.e.\:
275: .vb
276: PetscInt *ptr;
278: PetscDeviceAllocate_Private(dctx, PETSC_FALSE, PETSC_MEMTYPE_DEVICE, 20, alignof(PetscInt), (void**)&ptr);
280: PetscInt *sub_ptr = ptr + 10; // OK, no need to synchronize
282: ptr[0] = 10; // ERROR, directly accessing contents of ptr is undefined until synchronization
283: .ve
285: DAG representation:
286: .vb
287: time ->
289: -> dctx - |= CALL =| -\- dctx -->
290: \- ptr ->
291: .ve
293: Level: intermediate
295: .N ASYNC_API
297: .seealso: `PetscDeviceMalloc()`, `PetscDeviceFree()`, `PetscDeviceDeallocate_Private()`,
298: `PetscDeviceArrayCopy()`, `PetscDeviceArrayZero()`, `PetscMemType`
299: */
300: PetscErrorCode PetscDeviceAllocate_Private(PetscDeviceContext dctx, PetscBool clear, PetscMemType mtype, std::size_t n, std::size_t alignment, void **PETSC_RESTRICT ptr)
301: {
302: PetscObjectId id = 0;
304: if (PetscDefined(USE_DEBUG)) {
305: const auto is_power_of_2 = [](std::size_t num) { return (num & (num - 1)) == 0; };
309: }
311: *ptr = nullptr;
312: if (PetscUnlikely(!n)) return 0;
313: memory_map.register_finalize();
314: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
316: // get our pointer here
317: if (dctx->ops->memalloc) {
318: PetscUseTypeMethod(dctx, memalloc, clear, mtype, n, alignment, ptr);
319: } else {
320: PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "allocating");
321: PetscMallocA(1, clear, __LINE__, PETSC_FUNCTION_NAME, __FILE__, n, ptr);
322: }
323: PetscDeviceRegisterMemory_Private(*ptr, mtype, n, &id);
324: // Note this is a "write" so that the next dctx to try and read from the pointer has to wait
325: // for the allocation to be ready
326: PetscDeviceContextMarkIntentFromID(dctx, id, PETSC_MEMORY_ACCESS_WRITE, "memory allocation");
327: return 0;
328: }
330: /*
331: PetscDeviceDeallocate_Private - Free device-aware memory
333: Not Collective, Asynchronous, Auto-dependency aware
335: Input Parameters:
336: + dctx - The `PetscDeviceContext` used to free the memory
337: - ptr - The pointer to free
339: Notes:
340: `ptr` must have been allocated using any of `PetscDeviceMalloc()`, `PetscDeviceCalloc()` or
341: `PetscDeviceAllocate_Private()`, or registered with the system via `PetscDeviceRegisterMemory()`.
343: The user should prefer `PetscDeviceFree()` over this routine as it automatically sets `ptr`
344: to `PETSC_NULLPTR` on successful deallocation.
346: `ptr` may be `NULL`.
348: This routine falls back to using `PetscFree()` if PETSc was not configured with device
349: support. The user should note that `PetscFree()` frees only host memory.
351: DAG representation:
352: .vb
353: time ->
355: -> dctx -/- |= CALL =| - dctx ->
356: -> ptr -/
357: .ve
359: Level: intermediate
361: .N ASYNC_API
363: .seealso: `PetscDeviceFree()`, `PetscDeviceAllocate_Private()`
364: */
365: PetscErrorCode PetscDeviceDeallocate_Private(PetscDeviceContext dctx, void *PETSC_RESTRICT ptr)
366: {
367: if (ptr) {
368: auto &map = memory_map.map;
369: const auto found_it = map.find(const_cast<MemoryMap::map_type::key_type>(ptr));
371: if (PetscUnlikelyDebug(found_it == map.end())) {
372: // OK this is a bad pointer, now determine why
373: const auto it = memory_map.search_for(ptr);
375: // if it is map.cend() then no allocation owns it, meaning it was not allocated by us!
377: // if we are here then we did allocate it but the user has tried to do something along
378: // the lines of:
379: //
380: // allocate(&ptr, size);
381: // deallocate(ptr+5);
382: //
383: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Attempting to deallocate pointer %p which is a suballocation of %p (memtype %s, id %" PetscInt64_FMT ", size %zu bytes)", ptr, it->first, PetscMemTypeToString(it->second.mtype), it->second.id,
384: it->second.size);
385: }
387: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
388: // mark intent BEFORE we free, note we mark as write so that we are made to wait on any
389: // outstanding reads (don't want to kill the pointer before they are done)
390: PetscDeviceContextMarkIntentFromID(dctx, found_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory deallocation");
391: // do free
392: if (dctx->ops->memfree) {
393: PetscUseTypeMethod(dctx, memfree, found_it->second.mtype, (void **)&ptr);
394: } else {
395: PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(found_it->second.mtype), "freeing");
396: }
397: // if ptr still exists, then the device context could not handle it
398: if (ptr) PetscFree(ptr);
399: map.erase(found_it);
400: }
401: return 0;
402: }
404: /*@C
405: PetscDeviceMemcpy - Copy memory in a device-aware manner
407: Not Collective, Asynchronous, Auto-dependency aware
409: Input Parameters:
410: + dctx - The `PetscDeviceContext` used to copy the memory
411: . dest - The pointer to copy to
412: . src - The pointer to copy from
413: - n - The amount (in bytes) to copy
415: Notes:
416: Both `dest` and `src` must have been allocated by `PetscDeviceMalloc()` or
417: `PetscDeviceCalloc()`.
419: `src` and `dest` cannot overlap.
421: If both `src` and `dest` are on the host this routine is fully synchronous.
423: The user should prefer `PetscDeviceArrayCopy()` over this routine as it automatically
424: computes the number of bytes to copy from the size of the pointer types.
426: DAG representation:
427: .vb
428: time ->
430: -> dctx - |= CALL =| - dctx ->
431: -> dest --------------------->
432: -> src ---------------------->
433: .ve
435: Level: intermediate
437: .N ASYNC_API
439: .seealso: `PetscDeviceArrayCopy()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
440: `PetscDeviceFree()`
441: @*/
442: PetscErrorCode PetscDeviceMemcpy(PetscDeviceContext dctx, void *PETSC_RESTRICT dest, const void *PETSC_RESTRICT src, std::size_t n)
443: {
444: if (!n) return 0;
447: if (dest == src) return 0;
448: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
449: {
450: const auto dest_it = memory_map.search_for(dest, true);
451: const auto src_it = memory_map.search_for(src, true);
452: const auto mode = PetscMemTypeToDeviceCopyMode(dest_it->second.mtype, src_it->second.mtype);
454: PetscDeviceContextMarkIntentFromID(dctx, src_it->second.id, PETSC_MEMORY_ACCESS_READ, "memory copy (src)");
455: PetscDeviceContextMarkIntentFromID(dctx, dest_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory copy (dest)");
456: // perform the copy
457: if (dctx->ops->memcopy) {
458: PetscUseTypeMethod(dctx, memcopy, dest, src, n, mode);
459: if (mode == PETSC_DEVICE_COPY_HTOD) {
460: PetscLogCpuToGpu(n);
461: } else if (mode == PETSC_DEVICE_COPY_DTOH) {
462: PetscLogGpuToCpu(n);
463: }
464: } else {
465: // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
466: // (pinned) but being copied by a host dctx
467: PetscDeviceCheckCapable_Private(dctx, mode == PETSC_DEVICE_COPY_HTOH, "copying");
468: PetscMemcpy(dest, src, n);
469: }
470: }
471: return 0;
472: }
474: /*@C
475: PetscDeviceMemset - Memset device-aware memory
477: Not Collective, Asynchronous, Auto-dependency aware
479: Input Parameters:
480: + dctx - The `PetscDeviceContext` used to memset the memory
481: . ptr - The pointer to the memory
482: . v - The value to set
483: - n - The amount (in bytes) to set
485: Notes:
486: `ptr` must have been allocated by `PetscDeviceMalloc()` or `PetscDeviceCalloc()`.
488: The user should prefer `PetscDeviceArrayZero()` over this routine as it automatically
489: computes the number of bytes to copy from the size of the pointer types, though they should
490: note that it only zeros memory.
492: This routine is analogous to `memset()`. That is, this routine copies the value
493: `static_cast<unsigned char>(v)` into each of the first `n` bytes of the object pointed
494: to by `ptr`.
496: If `ptr` is on device, this routine is asynchronous.
498: DAG representation:
499: .vb
500: time ->
502: -> dctx - |= CALL =| - dctx ->
503: -> dest --------------------->
504: .ve
506: Level: intermediate
508: .N ASYNC_API
510: .seealso: `PetscDeviceArrayZero()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
511: `PetscDeviceFree()`
512: @*/
513: PetscErrorCode PetscDeviceMemset(PetscDeviceContext dctx, void *ptr, PetscInt v, std::size_t n)
514: {
515: if (PetscUnlikely(!n)) return 0;
517: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
518: {
519: const auto ptr_it = memory_map.search_for(ptr, true);
520: const auto mtype = ptr_it->second.mtype;
522: PetscDeviceContextMarkIntentFromID(dctx, ptr_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory set");
523: if (dctx->ops->memset) {
524: PetscUseTypeMethod(dctx, memset, mtype, ptr, v, n);
525: } else {
526: // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
527: // (pinned) but being memset by a host dctx
528: PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "memsetting");
529: std::memset(ptr, static_cast<int>(v), n);
530: }
531: }
532: return 0;
533: }