Actual source code: cupminterface.hpp

  1: #ifndef PETSCCUPMINTERFACE_HPP
  2: #define PETSCCUPMINTERFACE_HPP

  4: #if defined(__cplusplus)
  5: #include <petsc/private/cpputil.hpp>
  6: #include <petsc/private/petscadvancedmacros.h>
  7: #include <petscdevice_cupm.h>

  9:   #include <array>

 11: namespace Petsc
 12: {

 14: namespace device
 15: {

 17: namespace cupm
 18: {

 20: // enum describing available cupm devices, this is used as the template parameter to any
 21: // class subclassing the Interface or using it as a member variable
 22: enum class DeviceType : int {
 23:   CUDA,
 24:   HIP
 25: };

 27: // clang-format off
 28: static constexpr std::array<const char *const, 5> DeviceTypes = {
 29:   "cuda",
 30:   "hip",
 31:   "Petsc::Device::CUPM::DeviceType",
 32:   "Petsc::Device::CUPM::DeviceType::",
 33:   nullptr
 34: };
 35: // clang-format on

 37: namespace impl
 38: {

 40:   // A backend agnostic PetscCallCUPM() function, this will only work inside the member
 41:   // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
 42:   // functions can also be wrapped inline:
 43:   //
 44:   // foo<int,char,bool>();
 45:   #define PetscCallCUPM(...) \
 46:     do { \
 47:       const cupmError_t cerr_p_ = __VA_ARGS__; \
 49:     } while (0)

 51:   #define PetscCallCUPMAbort(comm_, ...) \
 52:     do { \
 53:       const cupmError_t cerr_abort_p_ = __VA_ARGS__; \
 55:     } while (0)

 57:   // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
 58:   //
 59:   // input params:
 60:   // our_name   - the name of the alias
 61:   // their_name - the name of the function being aliased
 62:   //
 63:   // notes:
 64:   // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
 65:   //
 66:   // example usage:
 67:   // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
 68:   // template <typename... T>
 69:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
 70:   // {
 71:   //   return cudaMalloc(std::forward<T>(args)...);
 72:   // }
 73:   //
 74:   // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
 75:   // template <typename... T>
 76:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
 77:   // {
 78:   //   return hipMalloc(std::forward<T>(args)...);
 79:   // }
 80:   #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name)

 82:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but
 83:   // discard the last N arguments
 84:   //
 85:   // input params:
 86:   // our_name   - the name of the alias
 87:   // their_name - the name of the function being aliased
 88:   // N          - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
 89:   //
 90:   // notes:
 91:   // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
 92:   //
 93:   // example use:
 94:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) ->
 95:   // template <typename... T, typename Tend>
 96:   // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
 97:   // return type deduction*
 98:   // {
 99:   //   (void)argend;
100:   //   return cudaMalloc(std::forward<T>(args)...);
101:   // }
102:   #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N)

104: // Base class that holds functions and variables that don't require CUDA or HIP to be present
105: // on the system
106: template <DeviceType T>
107: struct InterfaceBase {
108:   static const DeviceType type = T;

110:   PETSC_NODISCARD static constexpr const char *cupmName() noexcept
111:   {
112:     static_assert(util::integral_value(DeviceType::CUDA) == 0, "");
113:     static_assert(util::integral_value(DeviceType::HIP) == 1, "");
114:     return std::get<util::integral_value(T)>(DeviceTypes);
115:   }

117:   PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }

119:   PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
120: };

122: // declare the base class static member variables
123: template <DeviceType T>
124: const DeviceType InterfaceBase<T>::type;

126:   #define PETSC_CUPM_BASE_CLASS_HEADER(base_name, DEVICE_TYPE) \
127:     using base_name = ::Petsc::device::cupm::impl::InterfaceBase<DEVICE_TYPE>; \
128:     using base_name::type; \
129:     using base_name::cupmName; \
130:     using base_name::PETSC_DEVICE_CUPM; \
131:     using base_name::PETSC_MEMTYPE_CUPM

133: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
134: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
135: // import them into classes.
136: template <DeviceType>
137: struct InterfaceImpl;

139:   #if PetscDefined(HAVE_CUDA)
140: template <>
141: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
142:   PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::CUDA);

144:   // typedefs
145:   using cupmError_t             = cudaError_t;
146:   using cupmEvent_t             = cudaEvent_t;
147:   using cupmStream_t            = cudaStream_t;
148:   using cupmDeviceProp_t        = cudaDeviceProp;
149:   using cupmMemcpyKind_t        = cudaMemcpyKind;
150:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
151:   using cupmPointerAttributes_t = cudaPointerAttributes;
152:   using cupmMemoryType_t        = enum cudaMemoryType;
153:   using cupmDim3                = dim3;
154:   using cupmHostFn_t            = cudaHostFn_t;
155:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
156:   using cupmMemPool_t   = cudaMemPool_t;
157:   using cupmMemPoolAttr = cudaMemPoolAttr;
158:     #else
159:   using cupmMemPool_t   = void *;
160:   using cupmMemPoolAttr = unsigned int;
161:     #endif

163:   // values
164:   static const auto cupmSuccess                 = cudaSuccess;
165:   static const auto cupmErrorNotReady           = cudaErrorNotReady;
166:   static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
167:   static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
168:   static const auto cupmErrorStubLibrary =
169:     #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
170:     cudaErrorStubLibrary;
171:     #else
172:     cudaErrorInsufficientDriver;
173:     #endif

175:   static const auto cupmErrorNoDevice          = cudaErrorNoDevice;
176:   static const auto cupmStreamDefault          = cudaStreamDefault;
177:   static const auto cupmStreamNonBlocking      = cudaStreamNonBlocking;
178:   static const auto cupmDeviceMapHost          = cudaDeviceMapHost;
179:   static const auto cupmMemcpyHostToDevice     = cudaMemcpyHostToDevice;
180:   static const auto cupmMemcpyDeviceToHost     = cudaMemcpyDeviceToHost;
181:   static const auto cupmMemcpyDeviceToDevice   = cudaMemcpyDeviceToDevice;
182:   static const auto cupmMemcpyHostToHost       = cudaMemcpyHostToHost;
183:   static const auto cupmMemcpyDefault          = cudaMemcpyDefault;
184:   static const auto cupmMemoryTypeHost         = cudaMemoryTypeHost;
185:   static const auto cupmMemoryTypeDevice       = cudaMemoryTypeDevice;
186:   static const auto cupmMemoryTypeManaged      = cudaMemoryTypeManaged;
187:   static const auto cupmEventDisableTiming     = cudaEventDisableTiming;
188:   static const auto cupmHostAllocDefault       = cudaHostAllocDefault;
189:   static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
190:   static const auto cupmMemPoolAttrReleaseThreshold =
191:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
192:     cudaMemPoolAttrReleaseThreshold;
193:     #else
194:     cupmMemPoolAttr{0};
195:     #endif

197:   // error functions
198:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
199:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
200:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)

202:   // device management
203:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
204:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
205:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
206:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
207:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
208:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
209:   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
210:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
211:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
212:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
213:     #else
214:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
215:   {
216:     *pool = nullptr;
217:     return cupmSuccess;
218:   }

220:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
221:     #endif
222:   // CUDA has no cudaInit() to match hipInit()
223:   PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }

225:   // stream management
226:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
227:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
228:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
229:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
230:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
231:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
232:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
233:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
234:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
235:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
236:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
237:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
238:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
239:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
240:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
241:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)

243:   // memory management
244:   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
245:   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
246:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
247:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
248:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
249:     #else
250:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
251:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
252:     #endif
253:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
254:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
255:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
256:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
257:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
258:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
259:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
260:     #else
261:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
262:     #endif

264:   // launch control
265:   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
266:   template <typename FunctionT, typename... KernelArgsT>
267:   PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
268:   {
269:     void *args[] = {(void *)&kernelArgs...};
270:     return cudaLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
271:   }
272: };
273:   #endif // PetscDefined(HAVE_CUDA)

275:   #if PetscDefined(HAVE_HIP)
276: template <>
277: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
278:   PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::HIP);

280:   // typedefs
281:   using cupmError_t             = hipError_t;
282:   using cupmEvent_t             = hipEvent_t;
283:   using cupmStream_t            = hipStream_t;
284:   using cupmDeviceProp_t        = hipDeviceProp_t;
285:   using cupmMemcpyKind_t        = hipMemcpyKind;
286:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
287:   using cupmPointerAttributes_t = hipPointerAttribute_t;
288:   using cupmMemoryType_t        = enum hipMemoryType;
289:   using cupmDim3                = dim3;
290:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
291:   using cupmHostFn_t    = hipHostFn_t;
292:   using cupmMemPool_t   = hipMemPool_t;
293:   using cupmMemPoolAttr = hipMemPoolAttr;
294:     #else
295:   using cupmHostFn_t    = void (*)(void *);
296:   using cupmMemPool_t   = void *;
297:   using cupmMemPoolAttr = unsigned int;
298:     #endif

300:   // values
301:   static const auto cupmSuccess       = hipSuccess;
302:   static const auto cupmErrorNotReady = hipErrorNotReady;
303:   // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
304:   static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
305:   static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
306:   // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
307:   static const auto cupmErrorStubLibrary     = hipErrorInsufficientDriver;
308:   static const auto cupmErrorNoDevice        = hipErrorNoDevice;
309:   static const auto cupmStreamDefault        = hipStreamDefault;
310:   static const auto cupmStreamNonBlocking    = hipStreamNonBlocking;
311:   static const auto cupmDeviceMapHost        = hipDeviceMapHost;
312:   static const auto cupmMemcpyHostToDevice   = hipMemcpyHostToDevice;
313:   static const auto cupmMemcpyDeviceToHost   = hipMemcpyDeviceToHost;
314:   static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
315:   static const auto cupmMemcpyHostToHost     = hipMemcpyHostToHost;
316:   static const auto cupmMemcpyDefault        = hipMemcpyDefault;
317:   static const auto cupmMemoryTypeHost       = hipMemoryTypeHost;
318:   static const auto cupmMemoryTypeDevice     = hipMemoryTypeDevice;
319:   // see
320:   // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
321:   static const auto cupmMemoryTypeManaged      = hipMemoryTypeUnified;
322:   static const auto cupmEventDisableTiming     = hipEventDisableTiming;
323:   static const auto cupmHostAllocDefault       = hipHostMallocDefault;
324:   static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
325:   static const auto cupmMemPoolAttrReleaseThreshold =
326:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
327:     hipMemPoolAttrReleaseThreshold;
328:     #else
329:     cupmMemPoolAttr{0};
330:     #endif

332:   // error functions
333:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
334:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
335:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)

337:   // device management
338:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
339:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
340:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
341:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
342:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
343:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
344:   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
345:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
346:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
347:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
348:     #else
349:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
350:   {
351:     *pool = nullptr;
352:     return cupmSuccess;
353:   }

355:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
356:     #endif
357:   PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)

359:   // stream management
360:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
361:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
362:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
363:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
364:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
365:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
366:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
367:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
368:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
369:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
370:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
371:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
372:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
373:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
374:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
375:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)

377:   // memory management
378:   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
379:   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
380:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
381:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
382:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
383:     #else
384:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
385:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
386:     #endif
387:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
388:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
389:   // hipMallocHost is deprecated
390:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
391:   // hipFreeHost is deprecated
392:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
393:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
394:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)

396:       // launch control
397:       // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
398:       // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
399:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
400:   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
401:     #else
402:   PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
403:   {
404:     // the only correct way to spoof this function is to do it synchronously...
405:     auto herr = hipStreamSynchronize(stream);
406:     if (PetscUnlikely(herr != hipSuccess)) return herr;
407:     fn(ctx);
408:     return herr;
409:   }
410:     #endif

412:   template <typename FunctionT, typename... KernelArgsT>
413:   PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
414:   {
415:     void *args[] = {(void *)&kernelArgs...};
416:     return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
417:   }
418: };
419:   #endif // PetscDefined(HAVE_HIP)

421:   // shorthand for bringing all of the typedefs from the base Interface class into your own,
422:   // it's annoying that c++ doesn't have a way to do this automatically
423:   #define PETSC_CUPM_IMPL_CLASS_HEADER(base_name, T) \
424:     PETSC_CUPM_BASE_CLASS_HEADER(PetscConcat(base_, base_name), T); \
425:     using base_name = ::Petsc::device::cupm::impl::InterfaceImpl<T>; \
426:     /* types */ \
427:     using cupmComplex_t           = typename base_name::cupmComplex_t; \
428:     using cupmError_t             = typename base_name::cupmError_t; \
429:     using cupmEvent_t             = typename base_name::cupmEvent_t; \
430:     using cupmStream_t            = typename base_name::cupmStream_t; \
431:     using cupmDeviceProp_t        = typename base_name::cupmDeviceProp_t; \
432:     using cupmMemcpyKind_t        = typename base_name::cupmMemcpyKind_t; \
433:     using cupmPointerAttributes_t = typename base_name::cupmPointerAttributes_t; \
434:     using cupmMemoryType_t        = typename base_name::cupmMemoryType_t; \
435:     using cupmDim3                = typename base_name::cupmDim3; \
436:     using cupmMemPool_t           = typename base_name::cupmMemPool_t; \
437:     using cupmMemPoolAttr         = typename base_name::cupmMemPoolAttr; \
438:     /* variables */ \
439:     using base_name::cupmSuccess; \
440:     using base_name::cupmErrorNotReady; \
441:     using base_name::cupmErrorDeviceAlreadyInUse; \
442:     using base_name::cupmErrorSetOnActiveProcess; \
443:     using base_name::cupmErrorStubLibrary; \
444:     using base_name::cupmErrorNoDevice; \
445:     using base_name::cupmStreamDefault; \
446:     using base_name::cupmStreamNonBlocking; \
447:     using base_name::cupmDeviceMapHost; \
448:     using base_name::cupmMemcpyHostToDevice; \
449:     using base_name::cupmMemcpyDeviceToHost; \
450:     using base_name::cupmMemcpyDeviceToDevice; \
451:     using base_name::cupmMemcpyHostToHost; \
452:     using base_name::cupmMemcpyDefault; \
453:     using base_name::cupmMemoryTypeHost; \
454:     using base_name::cupmMemoryTypeDevice; \
455:     using base_name::cupmMemoryTypeManaged; \
456:     using base_name::cupmEventDisableTiming; \
457:     using base_name::cupmHostAllocDefault; \
458:     using base_name::cupmHostAllocWriteCombined; \
459:     using base_name::cupmMemPoolAttrReleaseThreshold; \
460:     /* functions */ \
461:     using base_name::cupmGetErrorName; \
462:     using base_name::cupmGetErrorString; \
463:     using base_name::cupmGetLastError; \
464:     using base_name::cupmGetDeviceCount; \
465:     using base_name::cupmGetDeviceProperties; \
466:     using base_name::cupmGetDevice; \
467:     using base_name::cupmSetDevice; \
468:     using base_name::cupmGetDeviceFlags; \
469:     using base_name::cupmSetDeviceFlags; \
470:     using base_name::cupmPointerGetAttributes; \
471:     using base_name::cupmDeviceGetMemPool; \
472:     using base_name::cupmMemPoolSetAttribute; \
473:     using base_name::cupmInit; \
474:     using base_name::cupmEventCreate; \
475:     using base_name::cupmEventCreateWithFlags; \
476:     using base_name::cupmEventDestroy; \
477:     using base_name::cupmEventRecord; \
478:     using base_name::cupmEventSynchronize; \
479:     using base_name::cupmEventElapsedTime; \
480:     using base_name::cupmEventQuery; \
481:     using base_name::cupmStreamCreate; \
482:     using base_name::cupmStreamCreateWithFlags; \
483:     using base_name::cupmStreamGetFlags; \
484:     using base_name::cupmStreamDestroy; \
485:     using base_name::cupmStreamWaitEvent; \
486:     using base_name::cupmStreamQuery; \
487:     using base_name::cupmStreamSynchronize; \
488:     using base_name::cupmDeviceSynchronize; \
489:     using base_name::cupmGetSymbolAddress; \
490:     using base_name::cupmMalloc; \
491:     using base_name::cupmMallocAsync; \
492:     using base_name::cupmMemcpy; \
493:     using base_name::cupmMemcpyAsync; \
494:     using base_name::cupmMallocHost; \
495:     using base_name::cupmMemset; \
496:     using base_name::cupmMemsetAsync; \
497:     using base_name::cupmLaunchHostFunc

499: template <DeviceType>
500: struct Interface;

502: // The actual interface class
503: template <DeviceType T>
504: struct Interface : InterfaceImpl<T> {
505:   PETSC_CUPM_IMPL_CLASS_HEADER(interface_type, T);

507:   using cupmReal_t   = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
508:   using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;

510:   // REVIEW ME: this needs to be cleaned up, it is unreadable
511:   PETSC_NODISCARD static constexpr cupmScalar_t makeCupmScalar(PetscScalar s) noexcept
512:   {
513:   #if PetscDefined(USE_COMPLEX)
514:     return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
515:   #else
516:     return static_cast<cupmReal_t>(s);
517:   #endif
518:   }

520:   PETSC_NODISCARD static constexpr const cupmScalar_t *cupmScalarCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }

522:   PETSC_NODISCARD static constexpr cupmScalar_t *cupmScalarCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }

524:   PETSC_NODISCARD static constexpr const cupmReal_t *cupmRealCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }

526:   PETSC_NODISCARD static constexpr cupmReal_t *cupmRealCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }

528:   #if !defined(PETSC_PKG_CUDA_VERSION_GE)
529:     #define PETSC_PKG_CUDA_VERSION_GE(...) 0
530:     #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
531:   #endif
532:   PETSC_NODISCARD static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
533:   {
534:     cupmPointerAttributes_t attr;
535:     cupmError_t             cerr;

538:     if (registered) {
540:       *registered = PETSC_FALSE;
541:     }
542:     if (managed) {
544:       *managed = PETSC_FALSE;
545:     }
546:     // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
547:     // a host pointer returns cudaErrorInvalidValue
548:     cerr = cupmPointerGetAttributes(&attr, data);
549:     cerr = cupmGetLastError();
550:       // HIP seems to always have used memoryType though
551:   #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
552:     const auto mtype = attr.memoryType;
553:     if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
554:   #else
555:     if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) cerr;
556:     const auto mtype = attr.type;
557:     if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
558:   #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
559:     if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
560:     if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
561:     return 0;
562:   }
563:   #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
564:     #undef PETSC_PKG_CUDA_VERSION_GE
565:   #endif

567:   PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
568:   {
569:     switch (mode) {
570:     case PETSC_DEVICE_COPY_HTOH:
571:       return cupmMemcpyHostToHost;
572:     case PETSC_DEVICE_COPY_HTOD:
573:       return cupmMemcpyHostToDevice;
574:     case PETSC_DEVICE_COPY_DTOD:
575:       return cupmMemcpyDeviceToDevice;
576:     case PETSC_DEVICE_COPY_DTOH:
577:       return cupmMemcpyDeviceToHost;
578:     case PETSC_DEVICE_COPY_AUTO:
579:       return cupmMemcpyDefault;
580:     }
581:     PetscUnreachable();
582:     return cupmMemcpyDefault;
583:   }

585:   // these change what the arguments mean, so need to namespace these
586:   template <typename M>
587:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
588:   {
589:     static_assert(!std::is_void<M>::value, "");

592:     if (PetscLikely(n)) {
593:       cupmMallocAsync(reinterpret_cast<void **>(ptr), n * sizeof(M), stream);
594:     } else {
595:       *ptr = nullptr;
596:     }
597:     return 0;
598:   }

600:   template <typename M>
601:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
602:   {
603:     PetscCUPMMallocAsync(ptr, n);
604:     return 0;
605:   }

607:   template <typename M>
608:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
609:   {
610:     static_assert(!std::is_void<M>::value, "");

613:     *ptr = nullptr;
614:     cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags);
615:     return 0;
616:   }

618:   template <typename D, typename S = D>
619:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
620:   {
621:     static_assert(sizeof(D) == sizeof(S), "");
622:     static_assert(!std::is_void<D>::value && !std::is_void<S>::value, "");
623:     const auto size = n * sizeof(D);

625:     if (PetscUnlikely(!n)) return 0;
629:     // do early return after nullptr check since we need to check that they are not both nullptrs
630:     if (PetscUnlikely(dest == src)) return 0;
631:     if (kind == cupmMemcpyHostToHost) {
632:       // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
633:       // than it is to just call the vendor functions. This assumes of course that the stream
634:       // accounts for both memory regions being "idle"
635:       if (cupmStreamQuery(stream) == cupmSuccess) {
636:         PetscMemcpy(dest, src, size);
637:         return 0;
638:       }
639:       // need to clear the potential cupmErrorNotReady generated by query above...
640:       auto cerr = cupmGetLastError();

642:       if (PetscUnlikely(cerr != cupmErrorNotReady)) cerr;
643:     }
644:     if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
645:       cupmMemcpyAsync(dest, src, size, kind, stream);
646:     } else {
647:       cupmMemcpy(dest, src, size, kind);
648:     }

650:     // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
651:     // (yet) or don't know the direction
652:     if (kind == cupmMemcpyDeviceToHost) {
653:       PetscLogGpuToCpu(size);
654:     } else if (kind == cupmMemcpyHostToDevice) {
655:       PetscLogCpuToGpu(size);
656:     }
657:     return 0;
658:   }

660:   template <typename D, typename S = D>
661:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemcpy(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
662:   {
663:     PetscCUPMMemcpyAsync(dest, src, n, kind);
664:     return 0;
665:   }

667:   template <typename M>
668:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
669:   {
670:     static_assert(!std::is_void<M>::value, "");

672:     if (PetscLikely(n)) {
673:       const auto bytes = n * sizeof(M);

676:       if (stream || use_async) {
677:         cupmMemsetAsync(ptr, value, bytes, stream);
678:       } else {
679:         cupmMemset(ptr, value, bytes);
680:       }
681:     }
682:     return 0;
683:   }

685:   template <typename M>
686:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
687:   {
688:     PetscCUPMMemsetAsync(ptr, value, n);
689:     return 0;
690:   }

692:   // these we can transparently wrap, no need to namespace it to Petsc
693:   template <typename M>
694:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &&ptr, cupmStream_t stream = nullptr) noexcept
695:   {
696:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");

698:     if (ptr) {
699:       auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);

701:       ptr = nullptr;
702:       if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
703:     }
704:     return cupmSuccess;
705:   }

707:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) noexcept { return interface_type::cupmFreeAsync(ptr, stream); }

709:   template <typename M>
710:   PETSC_NODISCARD static cupmError_t cupmFree(M &&ptr) noexcept
711:   {
712:     return cupmFreeAsync(std::forward<M>(ptr));
713:   }

715:   PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) noexcept { return cupmFreeAsync(ptr); }

717:   template <typename M>
718:   PETSC_NODISCARD static cupmError_t cupmFreeHost(M &&ptr) noexcept
719:   {
720:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
721:     const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
722:     ptr             = nullptr;
723:     return cerr;
724:   }

726:   PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) noexcept { return interface_type::cupmFreeHost(ptr); }

728:   // specific wrapper for device launch function, as the real function is a C routine and
729:   // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
730:   // boils down to the fact that ultimately we pass a
731:   //
732:   // void *args[] = {(void*)&kernel_args...};
733:   //
734:   // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
735:   // happen to the kernel arguments so we must do it ourselves here. This function does this in
736:   // 3 stages:
737:   // 1. Enumerate the kernel arguments (cupmLaunchKernel)
738:   // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
739:   //    expected by func() using the enumeration above (deduceKernelCall)
740:   // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
741:   //    it. (interface_type::cupmLaunchKernel)
742:   template <typename F, typename... Args>
743:   PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
744:   {
745:     return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
746:   }

748:   template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
749:   PETSC_NODISCARD static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
750:   {
751:     static_assert(block_size > 0, "");
752:     static_assert(warp_size > 0, "");
753:     // want block_size to be a multiple of the warp_size
754:     static_assert(block_size % warp_size == 0, "");
755:     const auto nthread = std::min(n, block_size);
756:     const auto nblock  = (n + block_size - 1) / block_size;

758:     // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
759:     // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
760:     PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
761:     cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...);
762:     return 0;
763:   }

765: private:
766:   template <typename S, typename D, typename = void>
767:   struct is_static_castable : std::false_type { };

769:   template <typename S, typename D>
770:   struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };

772:   template <typename D, typename S>
773:   static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
774:   {
775:     return static_cast<D>(std::forward<S>(src));
776:   }

778:   template <typename D, typename S>
779:   static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
780:   {
781:     return const_cast<D>(std::forward<S>(src));
782:   }

784:   template <typename F, typename... Args, std::size_t... Idx>
785:   PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
786:   {
787:     // clang-format off
788:     return interface_type::template cupmLaunchKernel(
789:       std::forward<F>(func),
790:       std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
791:       // can't static_cast() here since the function argument type may be cv-qualified, in
792:       // which case we would need to const_cast(). But you can only const_cast()
793:       // indirect types (pointers, references) and I don't want to add a
794:       // static_cast_that_becomes_a_const_cast() SFINAE monster to this template mess. C-style
795:       // casts luckily work here since it tries the following and uses the first one that
796:       // succeeds:
797:       // 1. const_cast()
798:       // 2. static_cast()
799:       // 3. static_cast() then const_cast()
800:       // 4. reinterpret_cast()...
801:       // hopefully we never get to reinterpret_cast() land
802:       //(typename util::func_traits<F>::template arg<Idx>::type)(kernelArgs)...
803:       cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
804:     );
805:     // clang-format on
806:   }
807: };

809:   #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(base_name, T) \
810:     PETSC_CUPM_IMPL_CLASS_HEADER(PetscConcat(base_name, _impl), T); \
811:     using base_name    = ::Petsc::device::cupm::impl::Interface<T>; \
812:     using cupmReal_t   = typename base_name::cupmReal_t; \
813:     using cupmScalar_t = typename base_name::cupmScalar_t; \
814:     using base_name::makeCupmScalar; \
815:     using base_name::cupmScalarCast; \
816:     using base_name::cupmRealCast; \
817:     using base_name::PetscCUPMGetMemType; \
818:     using base_name::PetscCUPMMemset; \
819:     using base_name::PetscCUPMMemsetAsync; \
820:     using base_name::PetscCUPMMalloc; \
821:     using base_name::PetscCUPMMallocAsync; \
822:     using base_name::PetscCUPMMallocHost; \
823:     using base_name::PetscCUPMMemcpy; \
824:     using base_name::PetscCUPMMemcpyAsync; \
825:     using base_name::cupmFree; \
826:     using base_name::cupmFreeAsync; \
827:     using base_name::cupmFreeHost; \
828:     using base_name::cupmLaunchKernel; \
829:     using base_name::PetscCUPMLaunchKernel1D; \
830:     using base_name::PetscDeviceCopyModeToCUPMMemcpyKind

832: } // namespace impl

834: } // namespace cupm

836: } // namespace device

838: } // namespace Petsc

840: #endif /* __cplusplus */

842: #endif /* PETSCCUPMINTERFACE_HPP */