Actual source code: memory.cxx

  1: #include <petsc/private/deviceimpl.h>

  3: #include <petsc/private/cpp/register_finalize.hpp>
  4: #include <petsc/private/cpp/type_traits.hpp>

  6: #include <unordered_map>
  7: #include <algorithm> // std::find_if
  8: #include <cstring>   // std::memset

 10: const char *const PetscDeviceCopyModes[] = {"host_to_host", "device_to_host", "host_to_device", "device_to_device", "auto", "PetscDeviceCopyMode", "PETSC_DEVICE_COPY_", nullptr};
 11: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOH) == 0, "");
 12: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOH) == 1, "");
 13: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOD) == 2, "");
 14: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOD) == 3, "");
 15: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_AUTO) == 4, "");

 17: // GCC implementation for std::hash<T*>. LLVM's libc++ is almost 2x slower because they do all
 18: // kinds of complicated murmur hashing, so we make sure to enforce GCC's version.
 19: struct PointerHash {
 20:   template <typename T>
 21:   PETSC_NODISCARD std::size_t operator()(const T *ptr) const noexcept
 22:   {
 23:     return reinterpret_cast<std::size_t>(ptr);
 24:   }
 25: };

 27: // ==========================================================================================
 28: // PointerAttributes
 29: // ==========================================================================================

 31: struct PointerAttributes {
 32:   PetscMemType  mtype = PETSC_MEMTYPE_HOST; // memtype of allocation
 33:   PetscObjectId id    = 0;                  // id of allocation
 34:   std::size_t   size  = 0;                  // size of allocation (bytes)

 36:   // even though this is a POD and can be aggregate initialized, the STL uses () constructors
 37:   // in unordered_map and so we need to provide a trivial constructor...
 38:   constexpr PointerAttributes(PetscMemType, PetscObjectId, std::size_t) noexcept;

 40:   bool operator==(const PointerAttributes &) const noexcept;

 42:   PETSC_NODISCARD bool contains(const void *, const void *) const noexcept;
 43: };

 45: // ==========================================================================================
 46: // PointerAttributes - Public API
 47: // ==========================================================================================

 49: inline constexpr PointerAttributes::PointerAttributes(PetscMemType mtype_, PetscObjectId id_, std::size_t size_) noexcept : mtype(mtype_), id(id_), size(size_) { }

 51: inline bool PointerAttributes::operator==(const PointerAttributes &other) const noexcept
 52: {
 53:   return (mtype == other.mtype) && (id == other.id) && (size == other.size);
 54: }

 56: /*
 57:   PointerAttributes::contains - asks and answers the question, does ptr_begin contain ptr

 59:   Input Parameters:
 60: + ptr_begin - pointer to the start of the range to check
 61: - ptr       - the pointer to query

 63:   Notes:
 64:   Returns true if ptr falls within ptr_begins range, false otherwise.
 65: */
 66: inline bool PointerAttributes::contains(const void *ptr_begin, const void *ptr) const noexcept
 67: {
 68:   return (ptr >= ptr_begin) && (ptr < (static_cast<const char *>(ptr_begin) + size));
 69: }

 71: // ==========================================================================================
 72: // MemoryMap
 73: //
 74: // Since the pointers allocated via PetscDeviceAllocate_Private() may be device pointers we
 75: // cannot just store meta-data within the pointer itself (as we can't dereference them). So
 76: // instead we need to keep an extra map to keep track of them
 77: //
 78: // Each entry maps pointer -> {
 79: //   PetscMemType  - The memtype of the pointer
 80: //   PetscObjectId - A unique ID assigned at allocation or registratrion so auto-dep can
 81: //                   identify the pointer
 82: //   size          - The size (in bytes) of the allocation
 83: // }
 84: // ==========================================================================================

 86: class MemoryMap : public Petsc::RegisterFinalizeable<MemoryMap> {
 87: public:
 88:   using map_type = std::unordered_map<void *, PointerAttributes, PointerHash>;

 90:   map_type map{};

 92:   PETSC_NODISCARD map_type::const_iterator search_for(const void *, bool = false) const noexcept;

 94: private:
 95:   friend class Petsc::RegisterFinalizeable<MemoryMap>;
 96:   PETSC_NODISCARD PetscErrorCode register_finalize_() noexcept;
 97:   PETSC_NODISCARD PetscErrorCode finalize_() noexcept;
 98: };

100: // ==========================================================================================
101: // MemoryMap - Private API
102: // ==========================================================================================

104: PetscErrorCode MemoryMap::register_finalize_() noexcept
105: {
106:   // Preallocate, this does give a modest performance bump since unordered_map is so __dog__
107:   // slow if it needs to rehash. Experiments show that users tend not to have more than 5 or
108:   // so concurrently live pointers lying around. 10 at most.
109:   map.reserve(16);
110:   return 0;
111: }

113: PetscErrorCode MemoryMap::finalize_() noexcept
114: {
115:   PetscInfo(nullptr, "Finalizing memory map\n");
116:   map = map_type{};
117:   return 0;
118: }

120: // ==========================================================================================
121: // MemoryMap - Public API
122: // ==========================================================================================

124: /*
125:   MemoryMap::search_for - retrieve an iterator to the key-value pair for a pointer in the map

127:   Input Parameters:
128: + ptr       - pointer to search for
129: - must_find - true if an error is raised if the pointer is not found (default: false)

131:   Notes:
132:   Accounts for sub-regions, i.e. if ptr is contained within another pointers region, it returns
133:   the iterator to the super-pointers key-value pair.

135:   If ptr is not found and must_find is false returns map.end(), otherwise raises an error
136: */
137: MemoryMap::map_type::const_iterator MemoryMap::search_for(const void *ptr, bool must_find) const noexcept
138: {
139:   const auto end = map.end();
140:   auto       it  = map.find(const_cast<map_type::key_type>(ptr));

142:   // ptr was found, and points to an entire block
143:   if (it != end) return it;
144:   // wasn't found, but maybe its part of a block. have to search every block for it
145:   // clang-format off
146:   it = std::find_if(map.begin(), end, [ptr](const map_type::const_iterator::value_type &map_it) {
147:     return map_it.second.contains(map_it.first, ptr);
148:   });
150:   return it;
151:   // clang-format on
152: }

154: static MemoryMap memory_map;

156: // ==========================================================================================
157: // Utility functions
158: // ==========================================================================================

160: static PetscErrorCode PetscDeviceCheckCapable_Private(PetscDeviceContext dctx, bool cond, const char descr[])
161: {
163:   return 0;
164: }

166: // A helper utility, since register is called from PetscDeviceRegisterMemory() and
167: // PetscDevicAllocate(). The latter also needs the generated id, so instead of making it search
168: // the map again we just return it here
169: static PetscErrorCode PetscDeviceRegisterMemory_Private(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size, PetscObjectId *PETSC_RESTRICT id = nullptr)
170: {
171:   auto      &map = memory_map.map;
172:   const auto it  = memory_map.search_for(ptr);

174:   if (it == map.cend()) {
175:     // pointer was never registered with the map, insert it and bail
176:     const auto newid = PetscObjectNewId_Internal();

178:     if (PetscDefined(USE_DEBUG)) {
179:       const auto tmp = PointerAttributes(mtype, newid, size);

181:       for (const auto &entry : map) {
182:         // REVIEW ME: maybe this should just be handled...
184:                    entry.first, PetscMemTypeToString(entry.second.mtype), entry.second.size);
185:       }
186:     }
187:     // clang-format off
188:     if (id) *id = newid;
189:     PetscCallCXX(map.emplace(
190:       std::piecewise_construct,
191:       std::forward_as_tuple(const_cast<MemoryMap::map_type::key_type>(ptr)),
192:       std::forward_as_tuple(mtype, newid, size)
193:     ));
194:     // clang-format on
195:     return 0;
196:   }
197:   if (PetscDefined(USE_DEBUG)) {
198:     const auto &old = it->second;

201:                PetscMemTypeToString(old.mtype), old.size, old.id, PetscMemTypeToString(mtype), size, old.id);
202:   }
203:   if (id) *id = it->second.id;
204:   return 0;
205: }

207: /*@C
208:   PetscDeviceRegisterMemory - Register a pointer for use with device-aware memory system

210:   Not Collective

212:   Input Parameters:
213: + ptr   - The pointer to register
214: . mtype - The `PetscMemType` of the pointer
215: - size  - The size (in bytes) of the memory region

217:   Notes:
218:   `ptr` need not point to the beginning of the memory range, however the user should register
219:   the

221:   It's OK to re-register the same `ptr` repeatedly (subsequent registrations do nothing)
222:   however the given `mtype` and `size` must match the original registration.

224:   `size` may be 0 (in which case this routine does nothing).

226:   Level: intermediate

228: .seealso: `PetscDeviceMalloc()`, `PetscDeviceArrayCopy()`, `PetscDeviceFree()`,
229: `PetscDeviceArrayZero()`
230: @*/
231: PetscErrorCode PetscDeviceRegisterMemory(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size)
232: {
234:   if (PetscUnlikely(!size)) return 0; // there is no point registering empty range
235:   PetscDeviceRegisterMemory_Private(ptr, mtype, size);
236:   return 0;
237: }

239: /*
240:   PetscDeviceAllocate_Private - Allocate device-aware memory

242:   Not Collective, Asynchronous, Auto-dependency aware

244:   Input Parameters:
245: + dctx      - The `PetscDeviceContext` used to allocate the memory
246: . clear     - Whether or not the memory should be zeroed
247: . mtype     - The type of memory to allocate
248: . n         - The amount (in bytes) to allocate
249: - alignment - The alignment requirement (in bytes) of the allocated pointer

251:   Output Parameter:
252: . ptr - The pointer to store the result in

254:   Notes:
255:   The user should prefer `PetscDeviceMalloc()` over this routine as it automatically computes
256:   the size of the allocation and alignment based on the size of the datatype.

258:   If the user is unsure about `alignment` -- or unable to compute it -- passing
259:   `PETSC_MEMALIGN` will always work, though the user should beware that this may be quite
260:   wasteful for very small allocations.

262:   Memory allocated with this function must be freed with `PetscDeviceFree()` (or
263:   `PetscDeviceDeallocate_Private()`).

265:   If `n` is zero, then `ptr` is set to `PETSC_NULLPTR`.

267:   This routine falls back to using `PetscMalloc1()` or `PetscCalloc1()` (depending on the value
268:   of `clear`) if PETSc was not configured with device support. The user should note that
269:   `mtype` and `alignment` are ignored in this case, as these routines allocate only host memory
270:   aligned to `PETSC_MEMALIGN`.

272:   Note result stored `ptr` is immediately valid and the user may freely inspect or manipulate
273:   its value on function return, i.e.\:

275: .vb
276:   PetscInt *ptr;

278:   PetscDeviceAllocate_Private(dctx, PETSC_FALSE, PETSC_MEMTYPE_DEVICE, 20, alignof(PetscInt), (void**)&ptr);

280:   PetscInt *sub_ptr = ptr + 10; // OK, no need to synchronize

282:   ptr[0] = 10; // ERROR, directly accessing contents of ptr is undefined until synchronization
283: .ve

285:   DAG representation:
286: .vb
287:   time ->

289:   -> dctx - |= CALL =| -\- dctx -->
290:                          \- ptr ->
291: .ve

293:   Level: intermediate

295: .N ASYNC_API

297: .seealso: `PetscDeviceMalloc()`, `PetscDeviceFree()`, `PetscDeviceDeallocate_Private()`,
298: `PetscDeviceArrayCopy()`, `PetscDeviceArrayZero()`, `PetscMemType`
299: */
300: PetscErrorCode PetscDeviceAllocate_Private(PetscDeviceContext dctx, PetscBool clear, PetscMemType mtype, std::size_t n, std::size_t alignment, void **PETSC_RESTRICT ptr)
301: {
302:   PetscObjectId id = 0;

304:   if (PetscDefined(USE_DEBUG)) {
305:     const auto is_power_of_2 = [](std::size_t num) { return (num & (num - 1)) == 0; };

309:   }
311:   *ptr = nullptr;
312:   if (PetscUnlikely(!n)) return 0;
313:   memory_map.register_finalize();
314:   PetscDeviceContextGetOptionalNullContext_Internal(&dctx);

316:   // get our pointer here
317:   if (dctx->ops->memalloc) {
318:     PetscUseTypeMethod(dctx, memalloc, clear, mtype, n, alignment, ptr);
319:   } else {
320:     PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "allocating");
321:     PetscMallocA(1, clear, __LINE__, PETSC_FUNCTION_NAME, __FILE__, n, ptr);
322:   }
323:   PetscDeviceRegisterMemory_Private(*ptr, mtype, n, &id);
324:   // Note this is a "write" so that the next dctx to try and read from the pointer has to wait
325:   // for the allocation to be ready
326:   PetscDeviceContextMarkIntentFromID(dctx, id, PETSC_MEMORY_ACCESS_WRITE, "memory allocation");
327:   return 0;
328: }

330: /*
331:   PetscDeviceDeallocate_Private - Free device-aware memory

333:   Not Collective, Asynchronous, Auto-dependency aware

335:   Input Parameters:
336: + dctx  - The `PetscDeviceContext` used to free the memory
337: - ptr   - The pointer to free

339:   Notes:
340:   `ptr` must have been allocated using any of `PetscDeviceMalloc()`, `PetscDeviceCalloc()` or
341:   `PetscDeviceAllocate_Private()`, or registered with the system via `PetscDeviceRegisterMemory()`.

343:   The user should prefer `PetscDeviceFree()` over this routine as it automatically sets `ptr`
344:   to `PETSC_NULLPTR` on successful deallocation.

346:   `ptr` may be `NULL`.

348:   This routine falls back to using `PetscFree()` if PETSc was not configured with device
349:   support. The user should note that `PetscFree()` frees only host memory.

351:   DAG representation:
352: .vb
353:   time ->

355:   -> dctx -/- |= CALL =| - dctx ->
356:   -> ptr -/
357: .ve

359:   Level: intermediate

361: .N ASYNC_API

363: .seealso: `PetscDeviceFree()`, `PetscDeviceAllocate_Private()`
364: */
365: PetscErrorCode PetscDeviceDeallocate_Private(PetscDeviceContext dctx, void *PETSC_RESTRICT ptr)
366: {
367:   if (ptr) {
368:     auto      &map      = memory_map.map;
369:     const auto found_it = map.find(const_cast<MemoryMap::map_type::key_type>(ptr));

371:     if (PetscUnlikelyDebug(found_it == map.end())) {
372:       // OK this is a bad pointer, now determine why
373:       const auto it = memory_map.search_for(ptr);

375:       // if it is map.cend() then no allocation owns it, meaning it was not allocated by us!
377:       // if we are here then we did allocate it but the user has tried to do something along
378:       // the lines of:
379:       //
380:       // allocate(&ptr, size);
381:       // deallocate(ptr+5);
382:       //
383:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Attempting to deallocate pointer %p which is a suballocation of %p (memtype %s, id %" PetscInt64_FMT ", size %zu bytes)", ptr, it->first, PetscMemTypeToString(it->second.mtype), it->second.id,
384:               it->second.size);
385:     }

387:     PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
388:     // mark intent BEFORE we free, note we mark as write so that we are made to wait on any
389:     // outstanding reads (don't want to kill the pointer before they are done)
390:     PetscDeviceContextMarkIntentFromID(dctx, found_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory deallocation");
391:     // do free
392:     if (dctx->ops->memfree) {
393:       PetscUseTypeMethod(dctx, memfree, found_it->second.mtype, (void **)&ptr);
394:     } else {
395:       PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(found_it->second.mtype), "freeing");
396:     }
397:     // if ptr still exists, then the device context could not handle it
398:     if (ptr) PetscFree(ptr);
399:     map.erase(found_it);
400:   }
401:   return 0;
402: }

404: /*@C
405:   PetscDeviceMemcpy - Copy memory in a device-aware manner

407:   Not Collective, Asynchronous, Auto-dependency aware

409:   Input Parameters:
410: + dctx - The `PetscDeviceContext` used to copy the memory
411: . dest - The pointer to copy to
412: . src  - The pointer to copy from
413: - n    - The amount (in bytes) to copy

415:   Notes:
416:   Both `dest` and `src` must have been allocated by `PetscDeviceMalloc()` or
417:   `PetscDeviceCalloc()`.

419:   `src` and `dest` cannot overlap.

421:   If both `src` and `dest` are on the host this routine is fully synchronous.

423:   The user should prefer `PetscDeviceArrayCopy()` over this routine as it automatically
424:   computes the number of bytes to copy from the size of the pointer types.

426:   DAG representation:
427: .vb
428:   time ->

430:   -> dctx - |= CALL =| - dctx ->
431:   -> dest --------------------->
432:   -> src ---------------------->
433: .ve

435:   Level: intermediate

437: .N ASYNC_API

439: .seealso: `PetscDeviceArrayCopy()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
440: `PetscDeviceFree()`
441: @*/
442: PetscErrorCode PetscDeviceMemcpy(PetscDeviceContext dctx, void *PETSC_RESTRICT dest, const void *PETSC_RESTRICT src, std::size_t n)
443: {
444:   if (!n) return 0;
447:   if (dest == src) return 0;
448:   PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
449:   {
450:     const auto dest_it = memory_map.search_for(dest, true);
451:     const auto src_it  = memory_map.search_for(src, true);
452:     const auto mode    = PetscMemTypeToDeviceCopyMode(dest_it->second.mtype, src_it->second.mtype);

454:     PetscDeviceContextMarkIntentFromID(dctx, src_it->second.id, PETSC_MEMORY_ACCESS_READ, "memory copy (src)");
455:     PetscDeviceContextMarkIntentFromID(dctx, dest_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory copy (dest)");
456:     // perform the copy
457:     if (dctx->ops->memcopy) {
458:       PetscUseTypeMethod(dctx, memcopy, dest, src, n, mode);
459:       if (mode == PETSC_DEVICE_COPY_HTOD) {
460:         PetscLogCpuToGpu(n);
461:       } else if (mode == PETSC_DEVICE_COPY_DTOH) {
462:         PetscLogGpuToCpu(n);
463:       }
464:     } else {
465:       // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
466:       // (pinned) but being copied by a host dctx
467:       PetscDeviceCheckCapable_Private(dctx, mode == PETSC_DEVICE_COPY_HTOH, "copying");
468:       PetscMemcpy(dest, src, n);
469:     }
470:   }
471:   return 0;
472: }

474: /*@C
475:   PetscDeviceMemset - Memset device-aware memory

477:   Not Collective, Asynchronous, Auto-dependency aware

479:   Input Parameters:
480: + dctx  - The `PetscDeviceContext` used to memset the memory
481: . ptr   - The pointer to the memory
482: . v     - The value to set
483: - n     - The amount (in bytes) to set

485:   Notes:
486:   `ptr` must have been allocated by `PetscDeviceMalloc()` or `PetscDeviceCalloc()`.

488:   The user should prefer `PetscDeviceArrayZero()` over this routine as it automatically
489:   computes the number of bytes to copy from the size of the pointer types, though they should
490:   note that it only zeros memory.

492:   This routine is analogous to `memset()`. That is, this routine copies the value
493:   `static_cast<unsigned char>(v)` into each of the first count characters of the object pointed
494:   to by `dest`.

496:   If `dest` is on device, this routine is asynchronous.

498:   DAG representation:
499: .vb
500:   time ->

502:   -> dctx - |= CALL =| - dctx ->
503:   -> dest --------------------->
504: .ve

506:   Level: intermediate

508: .N ASYNC_API

510: .seealso: `PetscDeviceArrayZero()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
511: `PetscDeviceFree()`
512: @*/
513: PetscErrorCode PetscDeviceMemset(PetscDeviceContext dctx, void *ptr, PetscInt v, std::size_t n)
514: {
515:   if (PetscUnlikely(!n)) return 0;
517:   PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
518:   {
519:     const auto ptr_it = memory_map.search_for(ptr, true);
520:     const auto mtype  = ptr_it->second.mtype;

522:     PetscDeviceContextMarkIntentFromID(dctx, ptr_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory set");
523:     if (dctx->ops->memset) {
524:       PetscUseTypeMethod(dctx, memset, mtype, ptr, v, n);
525:     } else {
526:       // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
527:       // (pinned) but being memset by a host dctx
528:       PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "memsetting");
529:       std::memset(ptr, static_cast<int>(v), n);
530:     }
531:   }
532:   return 0;
533: }