Actual source code: sfhip.hip.cpp
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <petscpkg_version.h>

/* compilation issues on SPOCK */
#undef PETSC_HAVE_COMPLEX

/* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
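/* Layout of opt[] as inferred from the reads below (n = number of subdomains); see PetscSFPackOpt for the authoritative layout:
     opt[0]            = n
     opt[1 .. n+1]     = offset[]: prefix sums of subdomain sizes; tid belongs to subdomain r with offset[r] <= tid < offset[r+1]
     opt[n+2 .. 2n+1]  = start[]:  first index of each subdomain in root/leaf space
     opt[2n+2 .. 3n+1] = dx[]:     subdomain extent in x
     opt[3n+2 .. 4n+1] = dy[]:     subdomain extent in y
     opt[5n+2 .. 6n+1] = X[]:      x-extent of the enclosing block
     opt[6n+2 .. 7n+1] = Y[]:      y-extent of the enclosing block
   The entries at opt[4n+2 .. 5n+1] are not read here (presumably dz). */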
__device__ static inline PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid)
{
  PetscInt        i, j, k, m, n, r;
  const PetscInt *offset, *start, *dx, *dy, *X, *Y;

  n      = opt[0];
  offset = opt + 1;
  start  = opt + n + 2;
  dx     = opt + 2 * n + 2;
  dy     = opt + 3 * n + 2;
  X      = opt + 5 * n + 2;
  Y      = opt + 6 * n + 2;
  for (r = 0; r < n; r++) {
    if (tid < offset[r + 1]) break;
  }
  m = (tid - offset[r]);
  k = m / (dx[r] * dy[r]);
  j = (m - k * dx[r] * dy[r]) / dx[r];
  i = m - k * dx[r] * dy[r] - j * dx[r];

  return (start[r] + k * X[r] * Y[r] + j * X[r] + i);
}
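
/* Example: for a single subdomain (n = 1) of extent dx*dy*dz embedded in an X*Y*Z block,
   m = tid - offset[0] has box coordinates i = m % dx, j = (m/dx) % dy, k = m/(dx*dy),
   and maps to start[0] + k*X*Y + j*X + i, which is exactly the arithmetic above. */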

/*====================================================================================*/
/* Templated HIP kernels for pack/unpack. The Op can be regular or atomic */
/*====================================================================================*/

/* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
   <Type> is PetscReal, which is the primitive type we operate on.
   <bs>   is 16, which says <unit> contains 16 primitive types.
   <BS>   is 8, which is the maximal SIMD width with which we will try to vectorize operations on <unit>.
   <EQ>   is 0, which is (bs == BS ? 1 : 0)

   If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below will be totally unrolled.
*/
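
/* For instance, with bs=16 and BS=8 (so EQ=0), each thread handles one <unit> as M=2 batches of 8 primitives,
   i.e., MBS=16 iterations of the inner loops below; with bs=BS the trip count is a compile-time constant. */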
template <class Type, PetscInt BS, PetscInt EQ>
__global__ static void d_Pack(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, const Type *data, Type *buf)
{
  PetscInt       i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables the compiler's constant propagation */
  const PetscInt MBS = M * BS;           /* MBS=bs. We turn MBS into a compile-time constant when EQ=1. */

  for (; tid < count; tid += grid_size) {
    /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
       opt == NULL && idx == NULL ==> the indices are contiguous;
     */
    t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
    s = tid * MBS;
    for (i = 0; i < MBS; i++) buf[s + i] = data[t + i];
  }
}

template <class Type, class Op, PetscInt BS, PetscInt EQ>
__global__ static void d_UnpackAndOp(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, Type *data, const Type *buf)
{
  PetscInt       i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
  Op             op;

  for (; tid < count; tid += grid_size) {
    t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
    s = tid * MBS;
    for (i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
  }
}

template <class Type, class Op, PetscInt BS, PetscInt EQ>
__global__ static void d_FetchAndOp(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, Type *leafbuf)
{
  PetscInt       i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
  Op             op;

  for (; tid < count; tid += grid_size) {
    r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
    l = tid * MBS;
    for (i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
  }
}

template <class Type, class Op, PetscInt BS, PetscInt EQ>
__global__ static void d_ScatterAndOp(PetscInt bs, PetscInt count, PetscInt srcx, PetscInt srcy, PetscInt srcX, PetscInt srcY, PetscInt srcStart, const PetscInt *srcIdx, const Type *src, PetscInt dstx, PetscInt dsty, PetscInt dstX, PetscInt dstY, PetscInt dstStart, const PetscInt *dstIdx, Type *dst)
{
  PetscInt       i, j, k, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
  Op             op;

  for (; tid < count; tid += grid_size) {
    if (!srcIdx) { /* src is either contiguous or 3D */
      k = tid / (srcx * srcy);
      j = (tid - k * srcx * srcy) / srcx;
      i = tid - k * srcx * srcy - j * srcx;
      s = srcStart + k * srcX * srcY + j * srcX + i;
    } else {
      s = srcIdx[tid];
    }

    if (!dstIdx) { /* dst is either contiguous or 3D */
      k = tid / (dstx * dsty);
      j = (tid - k * dstx * dsty) / dstx;
      i = tid - k * dstx * dsty - j * dstx;
      t = dstStart + k * dstX * dstY + j * dstX + i;
    } else {
      t = dstIdx[tid];
    }

    s *= MBS;
    t *= MBS;
    for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
  }
}
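
/* d_FetchAndOpLocal below relies on op() returning the old value: leafupdate receives the old root
   values while rootdata is combined with leafdata, all in one pass. */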
template <class Type, class Op, PetscInt BS, PetscInt EQ>
__global__ static void d_FetchAndOpLocal(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, PetscInt leafstart, const PetscInt *leafopt, const PetscInt *leafidx, const Type *leafdata, Type *leafupdate)
{
  PetscInt       i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt grid_size = gridDim.x * blockDim.x;
  const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
  Op             op;

  for (; tid < count; tid += grid_size) {
    r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
    l = (leafopt ? MapTidToIndex(leafopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
    for (i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
  }
}

/*====================================================================================*/
/* Regular operations on device */
/*====================================================================================*/
template <typename Type>
struct Insert {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = y;
    return old;
  }
};
template <typename Type>
struct Add {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x += y;
    return old;
  }
};
template <typename Type>
struct Mult {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x *= y;
    return old;
  }
};
template <typename Type>
struct Min {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = PetscMin(x, y);
    return old;
  }
};
template <typename Type>
struct Max {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = PetscMax(x, y);
    return old;
  }
};
template <typename Type>
struct LAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = x && y;
    return old;
  }
};
template <typename Type>
struct LOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = x || y;
    return old;
  }
};
template <typename Type>
struct LXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = !x != !y;
    return old;
  }
};
template <typename Type>
struct BAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = x & y;
    return old;
  }
};
template <typename Type>
struct BOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = x | y;
    return old;
  }
};
template <typename Type>
struct BXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    x = x ^ y;
    return old;
  }
};
template <typename Type>
struct Minloc {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    if (y.a < x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b, y.b);
    return old;
  }
};
template <typename Type>
struct Maxloc {
  __device__ Type operator()(Type &x, Type y) const
  {
    Type old = x;
    if (y.a > x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b, y.b); /* See MPI MAXLOC */
    return old;
  }
};
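
/* Note all functors above return the old value of x: d_UnpackAndOp discards it, while
   d_FetchAndOp/d_FetchAndOpLocal store it, so the same functors serve both kinds of kernels. */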

/*====================================================================================*/
/* Atomic operations on device */
/*====================================================================================*/

/*
  Atomic Insert (exchange) operations

  See the CUDA version.
*/
#if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}
#endif

__device__ static llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}

template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};

#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex old, *z = &old;
    double *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    double *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    return op(xp[0], yp[0]);
  }
};
#endif
#endif

/*
  Atomic add operations

*/
__device__ static llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};

template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
    /* The CUDA version does more checks that may be needed here */
    return atomicAdd(&x, y);
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
    /* The CUDA version does more checks that may be needed here */
    return atomicAdd(&x, y);
  }
};

#if defined(PETSC_HAVE_COMPLEX)
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex old, *z = &old;
    PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
#endif

/*
  Atomic Mult operations:

  HIP has no atomicMult at all, so we build our own with atomicCAS
*/
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

__device__ static int atomicMult(int *address, int val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val * assumed);
  } while (assumed != old);
  return (int)old;
}

__device__ static llint atomicMult(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
};

/*
  Atomic Min/Max operations

  See the CUDA version for comments.
*/
#if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif
#endif

/* As of ROCm 3.10, llint atomicMin/Max(llint*, llint) is not supported */
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};

/*
  Atomic bitwise operations
  As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint*, llint) is not supported
*/

__device__ static llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};

/*
  Atomic logical operations:

  Like CUDA, HIP has no atomic logical operations at all. We support them on integer types.
*/

/* A template without definition makes any instantiation not using the given specializations erroneous at compile time,
   which is what we want, since we only support 32-bit and 64-bit integers.
*/
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;
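
/* For example, AtomicLAND<int> below instantiates AtomicLogical<int, land<int>, 4>, matching the
   4-byte specialization; a Type whose size is neither 4 nor 8 fails to compile, as intended. */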

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int old = *address_as_int, assumed;
    Op op;
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint old = *address_as_ull, assumed;
    Op op;
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note land/lor/lxor below are different from LAND etc. above. Here we pass arguments by value and return the result of the op (not the old value) */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return (!x != !y); }
};

template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};

/*====================================================================================*/
/* Wrapper functions of the HIP kernels. Function pointers are stored in 'link' */
/*====================================================================================*/
template <typename Type, PetscInt BS, PetscInt EQ>
static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data, void *buf)
{
  PetscInt        nthreads = 256;
  PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
  const PetscInt *iarray   = opt ? opt->array : NULL;

  if (!count) return 0;
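  /* Cap the grid: launch at most maxResidentThreadsPerGPU threads; the grid-stride loop in the kernel covers any remaining elements */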
  nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_Pack<Type, BS, EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs, count, start, iarray, idx, (const Type *)data, (Type *)buf);
  hipGetLastError();
  return 0;
}

template <typename Type, class Op, PetscInt BS, PetscInt EQ>
static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
{
  PetscInt        nthreads = 256;
  PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
  const PetscInt *iarray   = opt ? opt->array : NULL;

  if (!count) return 0;
  nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_UnpackAndOp<Type, Op, BS, EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
  hipGetLastError();
  return 0;
}

template <typename Type, class Op, PetscInt BS, PetscInt EQ>
static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf)
{
  PetscInt        nthreads = 256;
  PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
  const PetscInt *iarray   = opt ? opt->array : NULL;

  if (!count) return 0;
  nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOp<Type, Op, BS, EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs, count, start, iarray, idx, (Type *)data, (Type *)buf);
  hipGetLastError();
  return 0;
}

template <typename Type, class Op, PetscInt BS, PetscInt EQ>
static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
{
  PetscInt nthreads = 256;
  PetscInt nblocks  = (count + nthreads - 1) / nthreads;
  PetscInt srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;

  if (!count) return 0;
  nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);

  /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a 3D grid and block */
  if (srcOpt) {
    srcx     = srcOpt->dx[0];
    srcy     = srcOpt->dy[0];
    srcX     = srcOpt->X[0];
    srcY     = srcOpt->Y[0];
    srcStart = srcOpt->start[0];
    srcIdx   = NULL;
  } else if (!srcIdx) {
    srcx = srcX = count;
    srcy = srcY = 1;
  }

  if (dstOpt) {
    dstx     = dstOpt->dx[0];
    dsty     = dstOpt->dy[0];
    dstX     = dstOpt->X[0];
    dstY     = dstOpt->Y[0];
    dstStart = dstOpt->start[0];
    dstIdx   = NULL;
  } else if (!dstIdx) {
    dstx = dstX = count;
    dsty = dstY = 1;
  }

  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_ScatterAndOp<Type, Op, BS, EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs, count, srcx, srcy, srcX, srcY, srcStart, srcIdx, (const Type *)src, dstx, dsty, dstX, dstY, dstStart, dstIdx, (Type *)dst);
  hipGetLastError();
  return 0;
}

/* Specialization for Insert since we may use hipMemcpyAsync */
template <typename Type, PetscInt BS, PetscInt EQ>
static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
{
  if (!count) return 0;
  /* src and dst are contiguous */
  if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
    hipMemcpyAsync((Type *)dst + dstStart * link->bs, (const Type *)src + srcStart * link->bs, count * link->unitbytes, hipMemcpyDeviceToDevice, link->stream);
  } else {
    ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst);
  }
  return 0;
}

template <typename Type, class Op, PetscInt BS, PetscInt EQ>
static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate)
{
  PetscInt        nthreads = 256;
  PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
  const PetscInt *rarray   = rootopt ? rootopt->array : NULL;
  const PetscInt *larray   = leafopt ? leafopt->array : NULL;

  if (!count) return 0;
  nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOpLocal<Type, Op, BS, EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs, count, rootstart, rarray, rootidx, (Type *)rootdata, leafstart, larray, leafidx, (const Type *)leafdata, (Type *)leafupdate);
  hipGetLastError();
  return 0;
}

/*====================================================================================*/
/* Init various types and instantiate pack/unpack function pointers */
/*====================================================================================*/
template <typename Type, PetscInt BS, PetscInt EQ>
static void PackInit_RealType(PetscSFLink link)
{
  /* Pack/unpack for remote communication */
  link->d_Pack            = Pack<Type, BS, EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
  link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
  link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
  link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;

  /* Scatter for local communication */
  link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
  link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
  link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
  link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;

  /* Atomic versions when there are data-race possibilities */
  link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
  link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
  link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
  link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
  link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
  link->da_FetchAndAdd     = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;

  link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
  link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
  link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
  link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
  link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
}

/* We use a templated class here so we can specialize it for char-sized integers */
template <typename Type, PetscInt BS, PetscInt EQ, PetscInt size /*sizeof(Type)*/>
struct PackInit_IntegerType_Atomic {
  static void Init(PetscSFLink link)
  {
    link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
    link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
    link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
    link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
    link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
    link->da_UnpackAndLAND   = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
    link->da_UnpackAndLOR    = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
    link->da_UnpackAndLXOR   = UnpackAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
    link->da_UnpackAndBAND   = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
    link->da_UnpackAndBOR    = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
    link->da_UnpackAndBXOR   = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
    link->da_FetchAndAdd     = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;

    link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
    link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
    link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
    link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
    link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
    link->da_ScatterAndLAND   = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
    link->da_ScatterAndLOR    = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
    link->da_ScatterAndLXOR   = ScatterAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
    link->da_ScatterAndBAND   = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
    link->da_ScatterAndBOR    = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
    link->da_ScatterAndBXOR   = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
    link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
  }
};

/* See the CUDA version */
template <typename Type, PetscInt BS, PetscInt EQ>
struct PackInit_IntegerType_Atomic<Type, BS, EQ, 1> {
  static void Init(PetscSFLink link)
  { /* Nothing to do; leave the function pointers NULL */
  }
};

template <typename Type, PetscInt BS, PetscInt EQ>
static void PackInit_IntegerType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type, BS, EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
  link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
  link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
  link->d_UnpackAndLAND   = UnpackAndOp<Type, LAND<Type>, BS, EQ>;
  link->d_UnpackAndLOR    = UnpackAndOp<Type, LOR<Type>, BS, EQ>;
  link->d_UnpackAndLXOR   = UnpackAndOp<Type, LXOR<Type>, BS, EQ>;
  link->d_UnpackAndBAND   = UnpackAndOp<Type, BAND<Type>, BS, EQ>;
  link->d_UnpackAndBOR    = UnpackAndOp<Type, BOR<Type>, BS, EQ>;
  link->d_UnpackAndBXOR   = UnpackAndOp<Type, BXOR<Type>, BS, EQ>;
  link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
  link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
  link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
  link->d_ScatterAndLAND   = ScatterAndOp<Type, LAND<Type>, BS, EQ>;
  link->d_ScatterAndLOR    = ScatterAndOp<Type, LOR<Type>, BS, EQ>;
  link->d_ScatterAndLXOR   = ScatterAndOp<Type, LXOR<Type>, BS, EQ>;
  link->d_ScatterAndBAND   = ScatterAndOp<Type, BAND<Type>, BS, EQ>;
  link->d_ScatterAndBOR    = ScatterAndOp<Type, BOR<Type>, BS, EQ>;
  link->d_ScatterAndBXOR   = ScatterAndOp<Type, BXOR<Type>, BS, EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
  PackInit_IntegerType_Atomic<Type, BS, EQ, sizeof(Type)>::Init(link);
}

#if defined(PETSC_HAVE_COMPLEX)
template <typename Type, PetscInt BS, PetscInt EQ>
static void PackInit_ComplexType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type, BS, EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
  link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;

  link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
  link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
  link->da_UnpackAndMult   = NULL; /* Not implemented yet */
  link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

  link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
}
#endif

typedef signed char   SignedChar;
typedef unsigned char UnsignedChar;
typedef struct {
  int a;
  int b;
} PairInt;
typedef struct {
  PetscInt a;
  PetscInt b;
} PairPetscInt;
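
/* These pair layouts are meant to match MPI_2INT and MPIU_2INT (value a, location b), consumed by the Minloc/Maxloc ops above */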

template <typename Type>
static void PackInit_PairType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type, 1, 1>;
  link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, 1, 1>;
  link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>;
  link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>;

  link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>;
  link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>;
  link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>;
  /* Atomics for pair types are not implemented yet */
}

template <typename Type, PetscInt BS, PetscInt EQ>
static void PackInit_DumbType(PetscSFLink link)
{
  link->d_Pack             = Pack<Type, BS, EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
  link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
  /* Atomics for dumb types are not implemented yet */
}

/* Some device-specific utilities */
static PetscErrorCode PetscSFLinkSyncDevice_HIP(PetscSFLink link)
{
  hipDeviceSynchronize();
  return 0;
}

static PetscErrorCode PetscSFLinkSyncStream_HIP(PetscSFLink link)
{
  hipStreamSynchronize(link->stream);
  return 0;
}

static PetscErrorCode PetscSFLinkMemcpy_HIP(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
{
  enum hipMemcpyKind kinds[2][2] = {
    {hipMemcpyHostToHost,   hipMemcpyHostToDevice  },
    {hipMemcpyDeviceToHost, hipMemcpyDeviceToDevice}
  };

  if (n) {
    if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Handle HostToHost separately so that pure CPU code won't call the HIP runtime */
      PetscMemcpy(dst, src, n);
    } else {
      int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
      int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
      hipMemcpyAsync(dst, src, n, kinds[stype][dtype], link->stream);
    }
  }
  return 0;
}

PetscErrorCode PetscSFMalloc_HIP(PetscMemType mtype, size_t size, void **ptr)
{
  if (PetscMemTypeHost(mtype)) PetscMalloc(size, ptr);
  else if (PetscMemTypeDevice(mtype)) {
    PetscDeviceInitialize(PETSC_DEVICE_HIP);
    hipMalloc(ptr, size);
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
  return 0;
}

PetscErrorCode PetscSFFree_HIP(PetscMemType mtype, void *ptr)
{
  if (PetscMemTypeHost(mtype)) PetscFree(ptr);
  else if (PetscMemTypeDevice(mtype)) hipFree(ptr);
  else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
  return 0;
}

/* Destructor when the link uses MPI for communication on a HIP device */
static PetscErrorCode PetscSFLinkDestroy_MPI_HIP(PetscSF sf, PetscSFLink link)
{
  for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    hipFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
    hipFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
  }
  return 0;
}

/*====================================================================================*/
/* Main driver to init MPI datatype on device */
/*====================================================================================*/

/* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
{
  PetscInt  nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
  PetscBool is2Int, is2PetscInt;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt nPetscComplex = 0;
#endif

  if (link->deviceinited) return 0;
  MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar);
  MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt);
  MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt);
  MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal);
#if defined(PETSC_HAVE_COMPLEX)
  MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex);
#endif
  MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int);
  MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt);
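
  /* Dispatch: pick the largest SIMD width BS in {8,4,2,1} dividing bs, with EQ=1 when bs==BS so that
     MBS becomes a compile-time constant (see the comment above the kernels) */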
  if (is2Int) {
    PackInit_PairType<PairInt>(link);
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType<PairPetscInt>(link);
  } else if (nPetscReal) {
#if !defined(PETSC_HAVE_DEVICE)
    if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
    else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
    else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
    else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
    else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
    else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
    else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
    else if (nPetscReal % 1 == 0)
#endif
      PackInit_RealType<PetscReal, 1, 0>(link);
  } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
#if !defined(PETSC_HAVE_DEVICE)
    if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
    else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
    else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
    else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
    else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
    else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
    else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
    else if (nPetscInt % 1 == 0)
#endif
      PackInit_IntegerType<llint, 1, 0>(link);
  } else if (nInt) {
#if !defined(PETSC_HAVE_DEVICE)
    if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
    else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
    else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
    else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
    else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
    else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
    else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
    else if (nInt % 1 == 0)
#endif
      PackInit_IntegerType<int, 1, 0>(link);
  } else if (nSignedChar) {
#if !defined(PETSC_HAVE_DEVICE)
    if (nSignedChar == 8) PackInit_IntegerType<SignedChar, 8, 1>(link);
    else if (nSignedChar % 8 == 0) PackInit_IntegerType<SignedChar, 8, 0>(link);
    else if (nSignedChar == 4) PackInit_IntegerType<SignedChar, 4, 1>(link);
    else if (nSignedChar % 4 == 0) PackInit_IntegerType<SignedChar, 4, 0>(link);
    else if (nSignedChar == 2) PackInit_IntegerType<SignedChar, 2, 1>(link);
    else if (nSignedChar % 2 == 0) PackInit_IntegerType<SignedChar, 2, 0>(link);
    else if (nSignedChar == 1) PackInit_IntegerType<SignedChar, 1, 1>(link);
    else if (nSignedChar % 1 == 0)
#endif
      PackInit_IntegerType<SignedChar, 1, 0>(link);
  } else if (nUnsignedChar) {
#if !defined(PETSC_HAVE_DEVICE)
    if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar, 8, 1>(link);
    else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<UnsignedChar, 8, 0>(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar, 4, 1>(link);
    else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<UnsignedChar, 4, 0>(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar, 2, 1>(link);
    else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<UnsignedChar, 2, 0>(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar, 1, 1>(link);
    else if (nUnsignedChar % 1 == 0)
#endif
      PackInit_IntegerType<UnsignedChar, 1, 0>(link);
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
#if !defined(PETSC_HAVE_DEVICE)
    if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex, 8, 1>(link);
    else if (nPetscComplex % 8 == 0) PackInit_ComplexType<PetscComplex, 8, 0>(link);
    else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex, 4, 1>(link);
    else if (nPetscComplex % 4 == 0) PackInit_ComplexType<PetscComplex, 4, 0>(link);
    else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex, 2, 1>(link);
    else if (nPetscComplex % 2 == 0) PackInit_ComplexType<PetscComplex, 2, 0>(link);
    else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex, 1, 1>(link);
    else if (nPetscComplex % 1 == 0)
#endif
      PackInit_ComplexType<PetscComplex, 1, 0>(link);
#endif
  } else {
    MPI_Aint lb, nbyte;
    MPI_Type_get_extent(unit, &lb, &nbyte);

    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
#if !defined(PETSC_HAVE_DEVICE)
      if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
      else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
      else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
      else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
      else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
      else if (nbyte % 1 == 0)
#endif
        PackInit_DumbType<char, 1, 0>(link);
    } else {
      nInt = nbyte / sizeof(int);
#if !defined(PETSC_HAVE_DEVICE)
      if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
      else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
      else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
      else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
      else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
      else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
      else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
      else if (nInt % 1 == 0)
#endif
        PackInit_DumbType<int, 1, 0>(link);
    }
  }

  if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
    int                    device;
    struct hipDeviceProp_t props;
    hipGetDevice(&device);
    hipGetDeviceProperties(&props, device);
    sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor * props.multiProcessorCount;
  }
  link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

  link->stream       = PetscDefaultHipStream;
  link->Destroy      = PetscSFLinkDestroy_MPI_HIP;
  link->SyncDevice   = PetscSFLinkSyncDevice_HIP;
  link->SyncStream   = PetscSFLinkSyncStream_HIP;
  link->Memcpy       = PetscSFLinkMemcpy_HIP;
  link->deviceinited = PETSC_TRUE;
  return 0;
}