Actual source code: sfcuda.cu
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
4: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid)
5: {
6: PetscInt i, j, k, m, n, r;
7: const PetscInt *offset, *start, *dx, *dy, *X, *Y;
9: n = opt[0];
10: offset = opt + 1;
11: start = opt + n + 2;
12: dx = opt + 2 * n + 2;
13: dy = opt + 3 * n + 2;
14: X = opt + 5 * n + 2;
15: Y = opt + 6 * n + 2;
16: for (r = 0; r < n; r++) {
17: if (tid < offset[r + 1]) break;
18: }
19: m = (tid - offset[r]);
20: k = m / (dx[r] * dy[r]);
21: j = (m - k * dx[r] * dy[r]) / dx[r];
22: i = m - k * dx[r] * dy[r] - j * dx[r];
24: return (start[r] + k * X[r] * Y[r] + j * X[r] + i);
25: }
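/* For illustration, the layout of opt[] implied by the pointer arithmetic above (n subdomains) is:
     opt[0]             n
     opt[1 .. n+1]      offset[]  running counts used to find which subdomain tid falls in
     opt[n+2 .. 2n+1]   start[]   linear start index of each subdomain in the enclosing array
     opt[2n+2 .. 3n+1]  dx[]      x-extent of each subdomain
     opt[3n+2 .. 4n+1]  dy[]      y-extent of each subdomain
     opt[5n+2 .. 6n+1]  X[]       x-extent of the enclosing array
     opt[6n+2 .. 7n+1]  Y[]       y-extent of the enclosing array
   The slice opt[4n+2 .. 5n+1] is not dereferenced here; see PetscSFPackOpt in sfpack.h for the authoritative layout.
   Worked example with n=1, dx=2, dy=3, X=4, Y=5, start=0: tid=7 gives m=7, k=7/(2*3)=1, j=(7-6)/2=0, i=1,
   so the returned index is 0 + 1*4*5 + 0*4 + 1 = 21.
*/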
27: /*====================================================================================*/
28: /* Templated CUDA kernels for pack/unpack. The Op can be regular or atomic */
29: /*====================================================================================*/
31: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals. Then
32: <Type> is PetscReal, which is the primitive type we operate on;
33: <bs> is 16, which says <unit> contains 16 primitive types;
34: <BS> is 8, which is the maximal SIMD width with which we try to vectorize operations on <unit>;
35: <EQ> is 0, which is (bs == BS ? 1 : 0).
37: If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
38: For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
39: */
40: template <class Type, PetscInt BS, PetscInt EQ>
41: __global__ static void d_Pack(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, const Type *data, Type *buf)
42: {
43: PetscInt i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
44: const PetscInt grid_size = gridDim.x * blockDim.x;
45: const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */
46: const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
48: for (; tid < count; tid += grid_size) {
49: /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
50: opt == NULL && idx == NULL ==> the indices are contiguous; otherwise idx[] lists arbitrary indices;
51: */
52: t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
53: s = tid * MBS;
54: for (i = 0; i < MBS; i++) buf[s + i] = data[t + i];
55: }
56: }
58: template <class Type, class Op, PetscInt BS, PetscInt EQ>
59: __global__ static void d_UnpackAndOp(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, Type *data, const Type *buf)
60: {
61: PetscInt i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
62: const PetscInt grid_size = gridDim.x * blockDim.x;
63: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
64: Op op;
66: for (; tid < count; tid += grid_size) {
67: t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
68: s = tid * MBS;
69: for (i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
70: }
71: }
73: template <class Type, class Op, PetscInt BS, PetscInt EQ>
74: __global__ static void d_FetchAndOp(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, Type *leafbuf)
75: {
76: PetscInt i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
77: const PetscInt grid_size = gridDim.x * blockDim.x;
78: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
79: Op op;
81: for (; tid < count; tid += grid_size) {
82: r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
83: l = tid * MBS;
84: for (i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
85: }
86: }
88: template <class Type, class Op, PetscInt BS, PetscInt EQ>
89: __global__ static void d_ScatterAndOp(PetscInt bs, PetscInt count, PetscInt srcx, PetscInt srcy, PetscInt srcX, PetscInt srcY, PetscInt srcStart, const PetscInt *srcIdx, const Type *src, PetscInt dstx, PetscInt dsty, PetscInt dstX, PetscInt dstY, PetscInt dstStart, const PetscInt *dstIdx, Type *dst)
90: {
91: PetscInt i, j, k, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
92: const PetscInt grid_size = gridDim.x * blockDim.x;
93: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
94: Op op;
96: for (; tid < count; tid += grid_size) {
97: if (!srcIdx) { /* src is either contiguous or 3D */
98: k = tid / (srcx * srcy);
99: j = (tid - k * srcx * srcy) / srcx;
100: i = tid - k * srcx * srcy - j * srcx;
101: s = srcStart + k * srcX * srcY + j * srcX + i;
102: } else {
103: s = srcIdx[tid];
104: }
106: if (!dstIdx) { /* dst is either contiguous or 3D */
107: k = tid / (dstx * dsty);
108: j = (tid - k * dstx * dsty) / dstx;
109: i = tid - k * dstx * dsty - j * dstx;
110: t = dstStart + k * dstX * dstY + j * dstX + i;
111: } else {
112: t = dstIdx[tid];
113: }
115: s *= MBS;
116: t *= MBS;
117: for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
118: }
119: }
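/* The i/j/k decomposition above is the single-subdomain form of MapTidToIndex(), with the subdomain shape
   passed as scalars. In the contiguous case the ScatterAndOp() wrapper below sets srcx = srcX = count and
   srcy = srcY = 1, so k = j = 0 and s reduces to srcStart + tid; likewise for dst. */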
121: template <class Type, class Op, PetscInt BS, PetscInt EQ>
122: __global__ static void d_FetchAndOpLocal(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, PetscInt leafstart, const PetscInt *leafopt, const PetscInt *leafidx, const Type *leafdata, Type *leafupdate)
123: {
124: PetscInt i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
125: const PetscInt grid_size = gridDim.x * blockDim.x;
126: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
127: Op op;
129: for (; tid < count; tid += grid_size) {
130: r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
131: l = (leafopt ? MapTidToIndex(leafopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
132: for (i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
133: }
134: }
136: /*====================================================================================*/
137: /* Regular operations on device */
138: /*====================================================================================*/
139: template <typename Type>
140: struct Insert {
141: __device__ Type operator()(Type &x, Type y) const
142: {
143: Type old = x;
144: x = y;
145: return old;
146: }
147: };
148: template <typename Type>
149: struct Add {
150: __device__ Type operator()(Type &x, Type y) const
151: {
152: Type old = x;
153: x += y;
154: return old;
155: }
156: };
157: template <typename Type>
158: struct Mult {
159: __device__ Type operator()(Type &x, Type y) const
160: {
161: Type old = x;
162: x *= y;
163: return old;
164: }
165: };
166: template <typename Type>
167: struct Min {
168: __device__ Type operator()(Type &x, Type y) const
169: {
170: Type old = x;
171: x = PetscMin(x, y);
172: return old;
173: }
174: };
175: template <typename Type>
176: struct Max {
177: __device__ Type operator()(Type &x, Type y) const
178: {
179: Type old = x;
180: x = PetscMax(x, y);
181: return old;
182: }
183: };
184: template <typename Type>
185: struct LAND {
186: __device__ Type operator()(Type &x, Type y) const
187: {
188: Type old = x;
189: x = x && y;
190: return old;
191: }
192: };
193: template <typename Type>
194: struct LOR {
195: __device__ Type operator()(Type &x, Type y) const
196: {
197: Type old = x;
198: x = x || y;
199: return old;
200: }
201: };
202: template <typename Type>
203: struct LXOR {
204: __device__ Type operator()(Type &x, Type y) const
205: {
206: Type old = x;
207: x = !x != !y;
208: return old;
209: }
210: };
211: template <typename Type>
212: struct BAND {
213: __device__ Type operator()(Type &x, Type y) const
214: {
215: Type old = x;
216: x = x & y;
217: return old;
218: }
219: };
220: template <typename Type>
221: struct BOR {
222: __device__ Type operator()(Type &x, Type y) const
223: {
224: Type old = x;
225: x = x | y;
226: return old;
227: }
228: };
229: template <typename Type>
230: struct BXOR {
231: __device__ Type operator()(Type &x, Type y) const
232: {
233: Type old = x;
234: x = x ^ y;
235: return old;
236: }
237: };
238: template <typename Type>
239: struct Minloc {
240: __device__ Type operator()(Type &x, Type y) const
241: {
242: Type old = x;
243: if (y.a < x.a) x = y;
244: else if (y.a == x.a) x.b = min(x.b, y.b);
245: return old;
246: }
247: };
248: template <typename Type>
249: struct Maxloc {
250: __device__ Type operator()(Type &x, Type y) const
251: {
252: Type old = x;
253: if (y.a > x.a) x = y;
254: else if (y.a == x.a) x.b = min(x.b, y.b); /* See MPI MAXLOC */
255: return old;
256: }
257: };
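/* Per the MPI standard, MAXLOC/MINLOC return the max/min of the value field and, on ties of the value, the
   smaller index; hence both functors above take min(x.b, y.b) when y.a == x.a. For example, Maxloc combining
   x = {a=5, b=3} with y = {a=5, b=1} leaves x = {a=5, b=1}. */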
259: /*====================================================================================*/
260: /* Atomic operations on device */
261: /*====================================================================================*/
263: /*
264: Atomic Insert (exchange) operations
266: CUDA C Programming Guide V10.1 Chapter B.12.1.3:
268: int atomicExch(int* address, int val);
269: unsigned int atomicExch(unsigned int* address, unsigned int val);
270: unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
271: float atomicExch(float* address, float val);
273: reads the 32-bit or 64-bit word old located at the address address in global or shared
274: memory and stores val back to memory at the same address. These two operations are
275: performed in one atomic transaction. The function returns old.
277: PETSc notes:
279: It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.
281: VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
282: atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
283: be atomic itself.
285: With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
286: insertion is atomic. We hope no user code relies on that.
287: */
288: __device__ static double atomicExch(double *address, double val)
289: {
290: return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
291: }
293: __device__ static llint atomicExch(llint *address, llint val)
294: {
295: return (llint)(atomicExch((ullint *)address, (ullint)val));
296: }
298: template <typename Type>
299: struct AtomicInsert {
300: __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
301: };
303: #if defined(PETSC_HAVE_COMPLEX)
304: #if defined(PETSC_USE_REAL_DOUBLE)
305: /* CUDA does not support 128-bit atomics. Users should not concurrently insert different 128-bit PetscComplex values into the same location */
306: template <>
307: struct AtomicInsert<PetscComplex> {
308: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
309: {
310: PetscComplex old, *z = &old;
311: double *xp = (double *)&x, *yp = (double *)&y;
312: AtomicInsert<double> op;
313: z[0] = op(xp[0], yp[0]);
314: z[1] = op(xp[1], yp[1]);
315: return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
316: }
317: };
318: #elif defined(PETSC_USE_REAL_SINGLE)
319: template <>
320: struct AtomicInsert<PetscComplex> {
321: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
322: {
323: double *xp = (double *)&x, *yp = (double *)&y;
324: AtomicInsert<double> op;
325: return op(xp[0], yp[0]);
326: }
327: };
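/* Unlike the double-precision case above, a single-precision PetscComplex occupies only 64 bits, so exchanging
   it through a single AtomicInsert<double> as done here is fully atomic. */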
328: #endif
329: #endif
331: /*
332: Atomic add operations
334: CUDA C Programming Guide V10.1 Chapter B.12.1.1:
336: int atomicAdd(int* address, int val);
337: unsigned int atomicAdd(unsigned int* address,unsigned int val);
338: unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
339: float atomicAdd(float* address, float val);
340: double atomicAdd(double* address, double val);
341: __half2 atomicAdd(__half2 *address, __half2 val);
342: __half atomicAdd(__half *address, __half val);
344: reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
345: and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
346: function returns old.
348: The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
349: The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
350: The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
351: higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
352: the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
353: The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
354: */
355: __device__ static llint atomicAdd(llint *address, llint val)
356: {
357: return (llint)atomicAdd((ullint *)address, (ullint)val);
358: }
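/* The signed wrapper above relies on two's-complement arithmetic: the bit pattern produced by a signed addition
   equals that of the corresponding unsigned addition, so casting through ullint does not change the result. */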
360: template <typename Type>
361: struct AtomicAdd {
362: __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
363: };
365: template <>
366: struct AtomicAdd<double> {
367: __device__ double operator()(double &x, double y) const
368: {
369: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
370: return atomicAdd(&x, y);
371: #else
372: double *address = &x, val = y;
373: ullint *address_as_ull = (ullint *)address;
374: ullint old = *address_as_ull, assumed;
375: do {
376: assumed = old;
377: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
378: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
379: } while (assumed != old);
380: return __longlong_as_double(old);
381: #endif
382: }
383: };
385: template <>
386: struct AtomicAdd<float> {
387: __device__ float operator()(float &x, float y) const
388: {
389: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
390: return atomicAdd(&x, y);
391: #else
392: float *address = &x, val = y;
393: int *address_as_int = (int *)address;
394: int old = *address_as_int, assumed;
395: do {
396: assumed = old;
397: old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
398: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
399: } while (assumed != old);
400: return __int_as_float(old);
401: #endif
402: }
403: };
405: #if defined(PETSC_HAVE_COMPLEX)
406: template <>
407: struct AtomicAdd<PetscComplex> {
408: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
409: {
410: PetscComplex old, *z = &old;
411: PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
412: AtomicAdd<PetscReal> op;
413: z[0] = op(xp[0], yp[0]);
414: z[1] = op(xp[1], yp[1]);
415: return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
416: }
417: };
418: #endif
420: /*
421: Atomic Mult operations:
423: CUDA has no atomicMult at all, so we build our own with atomicCAS
424: */
425: #if defined(PETSC_USE_REAL_DOUBLE)
426: __device__ static double atomicMult(double *address, double val)
427: {
428: ullint *address_as_ull = (ullint *)(address);
429: ullint old = *address_as_ull, assumed;
430: do {
431: assumed = old;
432: /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
433: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
434: } while (assumed != old);
435: return __longlong_as_double(old);
436: }
437: #elif defined(PETSC_USE_REAL_SINGLE)
438: __device__ static float atomicMult(float *address, float val)
439: {
440: int *address_as_int = (int *)(address);
441: int old = *address_as_int, assumed;
442: do {
443: assumed = old;
444: old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
445: } while (assumed != old);
446: return __int_as_float(old);
447: }
448: #endif
450: __device__ static int atomicMult(int *address, int val)
451: {
452: int *address_as_int = (int *)(address);
453: int old = *address_as_int, assumed;
454: do {
455: assumed = old;
456: old = atomicCAS(address_as_int, assumed, val * assumed);
457: } while (assumed != old);
458: return (int)old;
459: }
461: __device__ static llint atomicMult(llint *address, llint val)
462: {
463: ullint *address_as_ull = (ullint *)(address);
464: ullint old = *address_as_ull, assumed;
465: do {
466: assumed = old;
467: old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
468: } while (assumed != old);
469: return (llint)old;
470: }
472: template <typename Type>
473: struct AtomicMult {
474: __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
475: };
477: /*
478: Atomic Min/Max operations
480: CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:
482: int atomicMin(int* address, int val);
483: unsigned int atomicMin(unsigned int* address,unsigned int val);
484: unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
486: reads the 32-bit or 64-bit word old located at the address address in global or shared
487: memory, computes the minimum of old and val, and stores the result back to memory
488: at the same address. These three operations are performed in one atomic transaction.
489: The function returns old.
490: The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
492: atomicMax() is similar.
493: */
495: #if defined(PETSC_USE_REAL_DOUBLE)
496: __device__ static double atomicMin(double *address, double val)
497: {
498: ullint *address_as_ull = (ullint *)(address);
499: ullint old = *address_as_ull, assumed;
500: do {
501: assumed = old;
502: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
503: } while (assumed != old);
504: return __longlong_as_double(old);
505: }
507: __device__ static double atomicMax(double *address, double val)
508: {
509: ullint *address_as_ull = (ullint *)(address);
510: ullint old = *address_as_ull, assumed;
511: do {
512: assumed = old;
513: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
514: } while (assumed != old);
515: return __longlong_as_double(old);
516: }
517: #elif defined(PETSC_USE_REAL_SINGLE)
518: __device__ static float atomicMin(float *address, float val)
519: {
520: int *address_as_int = (int *)(address);
521: int old = *address_as_int, assumed;
522: do {
523: assumed = old;
524: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
525: } while (assumed != old);
526: return __int_as_float(old);
527: }
529: __device__ static float atomicMax(float *address, float val)
530: {
531: int *address_as_int = (int *)(address);
532: int old = *address_as_int, assumed;
533: do {
534: assumed = old;
535: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
536: } while (assumed != old);
537: return __int_as_float(old);
538: }
539: #endif
541: /*
542: atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
543: atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
544: This causes compilation errors with PGI compilers and 64-bit indices:
545: error: function "atomicMin(long long *, long long)" has already been defined
547: So we add the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
548: */
549: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
550: __device__ static llint atomicMin(llint *address, llint val)
551: {
552: ullint *address_as_ull = (ullint *)(address);
553: ullint old = *address_as_ull, assumed;
554: do {
555: assumed = old;
556: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
557: } while (assumed != old);
558: return (llint)old;
559: }
561: __device__ static llint atomicMax(llint *address, llint val)
562: {
563: ullint *address_as_ull = (ullint *)(address);
564: ullint old = *address_as_ull, assumed;
565: do {
566: assumed = old;
567: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
568: } while (assumed != old);
569: return (llint)old;
570: }
571: #endif
573: template <typename Type>
574: struct AtomicMin {
575: __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
576: };
577: template <typename Type>
578: struct AtomicMax {
579: __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
580: };
582: /*
583: Atomic bitwise operations
585: CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:
587: int atomicAnd(int* address, int val);
588: unsigned int atomicAnd(unsigned int* address,unsigned int val);
589: unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
591: reads the 32-bit or 64-bit word old located at the address address in global or shared
592: memory, computes (old & val), and stores the result back to memory at the same
593: address. These three operations are performed in one atomic transaction.
594: The function returns old.
596: The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
598: atomicOr() and atomicXor are similar.
599: */
601: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
602: __device__ static llint atomicAnd(llint *address, llint val)
603: {
604: ullint *address_as_ull = (ullint *)(address);
605: ullint old = *address_as_ull, assumed;
606: do {
607: assumed = old;
608: old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
609: } while (assumed != old);
610: return (llint)old;
611: }
612: __device__ static llint atomicOr(llint *address, llint val)
613: {
614: ullint *address_as_ull = (ullint *)(address);
615: ullint old = *address_as_ull, assumed;
616: do {
617: assumed = old;
618: old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
619: } while (assumed != old);
620: return (llint)old;
621: }
623: __device__ static llint atomicXor(llint *address, llint val)
624: {
625: ullint *address_as_ull = (ullint *)(address);
626: ullint old = *address_as_ull, assumed;
627: do {
628: assumed = old;
629: old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
630: } while (assumed != old);
631: return (llint)old;
632: }
633: #endif
635: template <typename Type>
636: struct AtomicBAND {
637: __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
638: };
639: template <typename Type>
640: struct AtomicBOR {
641: __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
642: };
643: template <typename Type>
644: struct AtomicBXOR {
645: __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
646: };
648: /*
649: Atomic logical operations:
651: CUDA has no atomic logical operations at all. We support them on integer types.
652: */
654: /* A template without a definition makes any instantiation that does not match one of the given specializations erroneous
655: at compile time, which is what we want since we only support 32-bit and 64-bit integers.
656: */
657: template <typename Type, class Op, int size /* sizeof(Type) */>
658: struct AtomicLogical;
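/* For example, AtomicLogical<short, land<short>, 2> (a hypothetical instantiation) has no definition, so using it
   fails to compile, which is the intended behavior since only the 4-byte and 8-byte specializations below are provided. */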
660: template <typename Type, class Op>
661: struct AtomicLogical<Type, Op, 4> {
662: __device__ Type operator()(Type &x, Type y) const
663: {
664: int *address_as_int = (int *)(&x);
665: int old = *address_as_int, assumed;
666: Op op;
667: do {
668: assumed = old;
669: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
670: } while (assumed != old);
671: return (Type)old;
672: }
673: };
675: template <typename Type, class Op>
676: struct AtomicLogical<Type, Op, 8> {
677: __device__ Type operator()(Type &x, Type y) const
678: {
679: ullint *address_as_ull = (ullint *)(&x);
680: ullint old = *address_as_ull, assumed;
681: Op op;
682: do {
683: assumed = old;
684: old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
685: } while (assumed != old);
686: return (Type)old;
687: }
688: };
690: /* Note that land/lor/lxor below are different from LAND etc. above. Here we pass arguments by value and return the result of the op (not the old value) */
691: template <typename Type>
692: struct land {
693: __device__ Type operator()(Type x, Type y) { return x && y; }
694: };
695: template <typename Type>
696: struct lor {
697: __device__ Type operator()(Type x, Type y) { return x || y; }
698: };
699: template <typename Type>
700: struct lxor {
701: __device__ Type operator()(Type x, Type y) { return (!x != !y); }
702: };
704: template <typename Type>
705: struct AtomicLAND {
706: __device__ Type operator()(Type &x, Type y) const
707: {
708: AtomicLogical<Type, land<Type>, sizeof(Type)> op;
709: return op(x, y);
710: }
711: };
712: template <typename Type>
713: struct AtomicLOR {
714: __device__ Type operator()(Type &x, Type y) const
715: {
716: AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
717: return op(x, y);
718: }
719: };
720: template <typename Type>
721: struct AtomicLXOR {
722: __device__ Type operator()(Type &x, Type y) const
723: {
724: AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
725: return op(x, y);
726: }
727: };
729: /*====================================================================================*/
730: /* Wrapper functions of CUDA kernels. Function pointers are stored in 'link' */
731: /*====================================================================================*/
732: template <typename Type, PetscInt BS, PetscInt EQ>
733: static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data, void *buf)
734: {
735: PetscInt nthreads = 256;
736: PetscInt nblocks = (count + nthreads - 1) / nthreads;
737: const PetscInt *iarray = opt ? opt->array : NULL;
739: if (!count) return 0;
740: if (!opt && !idx) { /* It is a 'CUDA data to nvshmem buf' memory copy */
741: cudaMemcpyAsync(buf, (char *)data + start * link->unitbytes, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream);
742: } else {
743: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
744: d_Pack<Type, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (const Type *)data, (Type *)buf);
745: cudaGetLastError();
746: }
747: return 0;
748: }
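/* Note on the launch configuration: nblocks is capped at link->maxResidentThreadsPerGPU / nthreads, which is safe
   because the kernels above use grid-stride loops (tid += grid_size) and thus still cover all 'count' entries when
   count > nblocks * nthreads. The wrappers below use the same scheme. */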
750: /* To specialize UnpackAndOp for the cudaMemcpyAsync() below. Usually, if this is a contiguous memcpy, we use root/leafdirect and do
751: not need UnpackAndOp. Only with nvshmem do we need this 'nvshmem buf to CUDA data' memory copy.
752: */
753: template <typename Type, PetscInt BS, PetscInt EQ>
754: static PetscErrorCode Unpack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
755: {
756: PetscInt nthreads = 256;
757: PetscInt nblocks = (count + nthreads - 1) / nthreads;
758: const PetscInt *iarray = opt ? opt->array : NULL;
760: if (!count) return 0;
761: if (!opt && !idx) { /* It is a 'nvshmem buf to CUDA data' memory copy */
762: cudaMemcpyAsync((char *)data + start * link->unitbytes, buf, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream);
763: } else {
764: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
765: d_UnpackAndOp<Type, Insert<Type>, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
766: cudaGetLastError();
767: }
768: return 0;
769: }
771: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
772: static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
773: {
774: PetscInt nthreads = 256;
775: PetscInt nblocks = (count + nthreads - 1) / nthreads;
776: const PetscInt *iarray = opt ? opt->array : NULL;
778: if (!count) return 0;
779: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
780: d_UnpackAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
781: cudaGetLastError();
782: return 0;
783: }
785: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
786: static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf)
787: {
788: PetscInt nthreads = 256;
789: PetscInt nblocks = (count + nthreads - 1) / nthreads;
790: const PetscInt *iarray = opt ? opt->array : NULL;
792: if (!count) return 0;
793: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
794: d_FetchAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (Type *)buf);
795: cudaGetLastError();
796: return 0;
797: }
799: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
800: static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
801: {
802: PetscInt nthreads = 256;
803: PetscInt nblocks = (count + nthreads - 1) / nthreads;
804: PetscInt srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;
806: if (!count) return 0;
807: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
809: /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
810: if (srcOpt) {
811: srcx = srcOpt->dx[0];
812: srcy = srcOpt->dy[0];
813: srcX = srcOpt->X[0];
814: srcY = srcOpt->Y[0];
815: srcStart = srcOpt->start[0];
816: srcIdx = NULL;
817: } else if (!srcIdx) {
818: srcx = srcX = count;
819: srcy = srcY = 1;
820: }
822: if (dstOpt) {
823: dstx = dstOpt->dx[0];
824: dsty = dstOpt->dy[0];
825: dstX = dstOpt->X[0];
826: dstY = dstOpt->Y[0];
827: dstStart = dstOpt->start[0];
828: dstIdx = NULL;
829: } else if (!dstIdx) {
830: dstx = dstX = count;
831: dsty = dstY = 1;
832: }
834: d_ScatterAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, srcx, srcy, srcX, srcY, srcStart, srcIdx, (const Type *)src, dstx, dsty, dstX, dstY, dstStart, dstIdx, (Type *)dst);
835: cudaGetLastError();
836: return 0;
837: }
839: /* Specialization for Insert since we may use cudaMemcpyAsync */
840: template <typename Type, PetscInt BS, PetscInt EQ>
841: static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
842: {
843: if (!count) return 0;
844: /* src and dst are contiguous */
845: if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
846: cudaMemcpyAsync((Type *)dst + dstStart * link->bs, (const Type *)src + srcStart * link->bs, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream);
847: } else {
848: ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst);
849: }
850: return 0;
851: }
853: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
854: static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate)
855: {
856: PetscInt nthreads = 256;
857: PetscInt nblocks = (count + nthreads - 1) / nthreads;
858: const PetscInt *rarray = rootopt ? rootopt->array : NULL;
859: const PetscInt *larray = leafopt ? leafopt->array : NULL;
861: if (!count) return 0;
862: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
863: d_FetchAndOpLocal<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, rootstart, rarray, rootidx, (Type *)rootdata, leafstart, larray, leafidx, (const Type *)leafdata, (Type *)leafupdate);
864: cudaGetLastError();
865: return 0;
866: }
868: /*====================================================================================*/
869: /* Init various types and instantiate pack/unpack function pointers */
870: /*====================================================================================*/
871: template <typename Type, PetscInt BS, PetscInt EQ>
872: static void PackInit_RealType(PetscSFLink link)
873: {
874: /* Pack/unpack for remote communication */
875: link->d_Pack = Pack<Type, BS, EQ>;
876: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
877: link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>;
878: link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
879: link->d_UnpackAndMin = UnpackAndOp<Type, Min<Type>, BS, EQ>;
880: link->d_UnpackAndMax = UnpackAndOp<Type, Max<Type>, BS, EQ>;
881: link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>;
883: /* Scatter for local communication */
884: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
885: link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>;
886: link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
887: link->d_ScatterAndMin = ScatterAndOp<Type, Min<Type>, BS, EQ>;
888: link->d_ScatterAndMax = ScatterAndOp<Type, Max<Type>, BS, EQ>;
889: link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
891: /* Atomic versions when there are data-race possibilities */
892: link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
893: link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
894: link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
895: link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
896: link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
897: link->da_FetchAndAdd = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;
899: link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
900: link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
901: link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
902: link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
903: link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
904: link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
905: }
907: /* We use this templated class so that it can be specialized for char integers */
908: template <typename Type, PetscInt BS, PetscInt EQ, PetscInt size /*sizeof(Type)*/>
909: struct PackInit_IntegerType_Atomic {
910: static void Init(PetscSFLink link)
911: {
912: link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
913: link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
914: link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
915: link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
916: link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
917: link->da_UnpackAndLAND = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
918: link->da_UnpackAndLOR = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
919: link->da_UnpackAndLXOR = UnpackAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
920: link->da_UnpackAndBAND = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
921: link->da_UnpackAndBOR = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
922: link->da_UnpackAndBXOR = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
923: link->da_FetchAndAdd = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;
925: link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
926: link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
927: link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
928: link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
929: link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
930: link->da_ScatterAndLAND = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
931: link->da_ScatterAndLOR = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
932: link->da_ScatterAndLXOR = ScatterAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
933: link->da_ScatterAndBAND = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
934: link->da_ScatterAndBOR = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
935: link->da_ScatterAndBXOR = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
936: link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
937: }
938: };
940: /* CUDA does not support atomics on chars. Support for them in PETSc is TBD. */
941: template <typename Type, PetscInt BS, PetscInt EQ>
942: struct PackInit_IntegerType_Atomic<Type, BS, EQ, 1> {
943: static void Init(PetscSFLink link)
944: { /* Nothing to do; leave the function pointers NULL */
945: }
946: };
948: template <typename Type, PetscInt BS, PetscInt EQ>
949: static void PackInit_IntegerType(PetscSFLink link)
950: {
951: link->d_Pack = Pack<Type, BS, EQ>;
952: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
953: link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>;
954: link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
955: link->d_UnpackAndMin = UnpackAndOp<Type, Min<Type>, BS, EQ>;
956: link->d_UnpackAndMax = UnpackAndOp<Type, Max<Type>, BS, EQ>;
957: link->d_UnpackAndLAND = UnpackAndOp<Type, LAND<Type>, BS, EQ>;
958: link->d_UnpackAndLOR = UnpackAndOp<Type, LOR<Type>, BS, EQ>;
959: link->d_UnpackAndLXOR = UnpackAndOp<Type, LXOR<Type>, BS, EQ>;
960: link->d_UnpackAndBAND = UnpackAndOp<Type, BAND<Type>, BS, EQ>;
961: link->d_UnpackAndBOR = UnpackAndOp<Type, BOR<Type>, BS, EQ>;
962: link->d_UnpackAndBXOR = UnpackAndOp<Type, BXOR<Type>, BS, EQ>;
963: link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>;
965: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
966: link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>;
967: link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
968: link->d_ScatterAndMin = ScatterAndOp<Type, Min<Type>, BS, EQ>;
969: link->d_ScatterAndMax = ScatterAndOp<Type, Max<Type>, BS, EQ>;
970: link->d_ScatterAndLAND = ScatterAndOp<Type, LAND<Type>, BS, EQ>;
971: link->d_ScatterAndLOR = ScatterAndOp<Type, LOR<Type>, BS, EQ>;
972: link->d_ScatterAndLXOR = ScatterAndOp<Type, LXOR<Type>, BS, EQ>;
973: link->d_ScatterAndBAND = ScatterAndOp<Type, BAND<Type>, BS, EQ>;
974: link->d_ScatterAndBOR = ScatterAndOp<Type, BOR<Type>, BS, EQ>;
975: link->d_ScatterAndBXOR = ScatterAndOp<Type, BXOR<Type>, BS, EQ>;
976: link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
977: PackInit_IntegerType_Atomic<Type, BS, EQ, sizeof(Type)>::Init(link);
978: }
980: #if defined(PETSC_HAVE_COMPLEX)
981: template <typename Type, PetscInt BS, PetscInt EQ>
982: static void PackInit_ComplexType(PetscSFLink link)
983: {
984: link->d_Pack = Pack<Type, BS, EQ>;
985: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
986: link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>;
987: link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
988: link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>;
990: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
991: link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>;
992: link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
993: link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
995: link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
996: link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
997: link->da_UnpackAndMult = NULL; /* Not implemented yet */
998: link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
1000: link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
1001: link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
1002: }
1003: #endif
1005: typedef signed char SignedChar;
1006: typedef unsigned char UnsignedChar;
1007: typedef struct {
1008: int a;
1009: int b;
1010: } PairInt;
1011: typedef struct {
1012: PetscInt a;
1013: PetscInt b;
1014: } PairPetscInt;
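/* PairInt and PairPetscInt mirror the (value, index) layout of MPI_2INT and MPIU_2INT, which MPI uses for
   MINLOC/MAXLOC reductions; PetscSFLinkSetUp_CUDA() below selects them when <unit> compares equal to those types. */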
1016: template <typename Type>
1017: static void PackInit_PairType(PetscSFLink link)
1018: {
1019: link->d_Pack = Pack<Type, 1, 1>;
1020: link->d_UnpackAndInsert = Unpack<Type, 1, 1>;
1021: link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>;
1022: link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>;
1024: link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>;
1025: link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>;
1026: link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>;
1027: /* Atomics for pair types are not implemented yet */
1028: }
1030: template <typename Type, PetscInt BS, PetscInt EQ>
1031: static void PackInit_DumbType(PetscSFLink link)
1032: {
1033: link->d_Pack = Pack<Type, BS, EQ>;
1034: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
1035: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
1036: /* Atomics for dumb types are not implemented yet */
1037: }
1039: /* Some device-specific utilities */
1040: static PetscErrorCode PetscSFLinkSyncDevice_CUDA(PetscSFLink link)
1041: {
1042: cudaDeviceSynchronize();
1043: return 0;
1044: }
1046: static PetscErrorCode PetscSFLinkSyncStream_CUDA(PetscSFLink link)
1047: {
1048: cudaStreamSynchronize(link->stream);
1049: return 0;
1050: }
1052: static PetscErrorCode PetscSFLinkMemcpy_CUDA(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
1053: {
1054: enum cudaMemcpyKind kinds[2][2] = {
1055: {cudaMemcpyHostToHost, cudaMemcpyHostToDevice },
1056: {cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice}
1057: };
1059: if (n) {
1060: if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-CPU code won't call the CUDA runtime */
1061: PetscMemcpy(dst, src, n);
1062: } else {
1063: int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
1064: int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
1065: cudaMemcpyAsync(dst, src, n, kinds[stype][dtype], link->stream);
1066: }
1067: }
1068: return 0;
1069: }
1071: PetscErrorCode PetscSFMalloc_CUDA(PetscMemType mtype, size_t size, void **ptr)
1072: {
1073: if (PetscMemTypeHost(mtype)) PetscMalloc(size, ptr);
1074: else if (PetscMemTypeDevice(mtype)) {
1075: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1076: cudaMalloc(ptr, size);
1077: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
1078: return 0;
1079: }
1081: PetscErrorCode PetscSFFree_CUDA(PetscMemType mtype, void *ptr)
1082: {
1083: if (PetscMemTypeHost(mtype)) PetscFree(ptr);
1084: else if (PetscMemTypeDevice(mtype)) cudaFree(ptr);
1085: else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
1086: return 0;
1087: }
1089: /* Destructor when the link uses MPI for communication on a CUDA device */
1090: static PetscErrorCode PetscSFLinkDestroy_MPI_CUDA(PetscSF sf, PetscSFLink link)
1091: {
1092: for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1093: cudaFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
1094: cudaFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
1095: }
1096: return 0;
1097: }
1099: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
1100: PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
1101: {
1102: PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
1103: PetscBool is2Int, is2PetscInt;
1104: #if defined(PETSC_HAVE_COMPLEX)
1105: PetscInt nPetscComplex = 0;
1106: #endif
1108: if (link->deviceinited) return 0;
1109: MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar);
1110: MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar);
1111: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
1112: MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt);
1113: MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt);
1114: MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal);
1115: #if defined(PETSC_HAVE_COMPLEX)
1116: MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex);
1117: #endif
1118: MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int);
1119: MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt);
1121: if (is2Int) {
1122: PackInit_PairType<PairInt>(link);
1123: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
1124: PackInit_PairType<PairPetscInt>(link);
1125: } else if (nPetscReal) {
1126: #if !defined(PETSC_HAVE_DEVICE)
1127: if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
1128: else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
1129: else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
1130: else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
1131: else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
1132: else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
1133: else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
1134: else if (nPetscReal % 1 == 0)
1135: #endif
1136: PackInit_RealType<PetscReal, 1, 0>(link);
1137: } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
1138: #if !defined(PETSC_HAVE_DEVICE)
1139: if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
1140: else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
1141: else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
1142: else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
1143: else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
1144: else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
1145: else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
1146: else if (nPetscInt % 1 == 0)
1147: #endif
1148: PackInit_IntegerType<llint, 1, 0>(link);
1149: } else if (nInt) {
1150: #if !defined(PETSC_HAVE_DEVICE)
1151: if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
1152: else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
1153: else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
1154: else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
1155: else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
1156: else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
1157: else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
1158: else if (nInt % 1 == 0)
1159: #endif
1160: PackInit_IntegerType<int, 1, 0>(link);
1161: } else if (nSignedChar) {
1162: #if !defined(PETSC_HAVE_DEVICE)
1163: if (nSignedChar == 8) PackInit_IntegerType<SignedChar, 8, 1>(link);
1164: else if (nSignedChar % 8 == 0) PackInit_IntegerType<SignedChar, 8, 0>(link);
1165: else if (nSignedChar == 4) PackInit_IntegerType<SignedChar, 4, 1>(link);
1166: else if (nSignedChar % 4 == 0) PackInit_IntegerType<SignedChar, 4, 0>(link);
1167: else if (nSignedChar == 2) PackInit_IntegerType<SignedChar, 2, 1>(link);
1168: else if (nSignedChar % 2 == 0) PackInit_IntegerType<SignedChar, 2, 0>(link);
1169: else if (nSignedChar == 1) PackInit_IntegerType<SignedChar, 1, 1>(link);
1170: else if (nSignedChar % 1 == 0)
1171: #endif
1172: PackInit_IntegerType<SignedChar, 1, 0>(link);
1173: } else if (nUnsignedChar) {
1174: #if !defined(PETSC_HAVE_DEVICE)
1175: if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar, 8, 1>(link);
1176: else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<UnsignedChar, 8, 0>(link);
1177: else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar, 4, 1>(link);
1178: else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<UnsignedChar, 4, 0>(link);
1179: else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar, 2, 1>(link);
1180: else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<UnsignedChar, 2, 0>(link);
1181: else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar, 1, 1>(link);
1182: else if (nUnsignedChar % 1 == 0)
1183: #endif
1184: PackInit_IntegerType<UnsignedChar, 1, 0>(link);
1185: #if defined(PETSC_HAVE_COMPLEX)
1186: } else if (nPetscComplex) {
1187: #if !defined(PETSC_HAVE_DEVICE)
1188: if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex, 8, 1>(link);
1189: else if (nPetscComplex % 8 == 0) PackInit_ComplexType<PetscComplex, 8, 0>(link);
1190: else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex, 4, 1>(link);
1191: else if (nPetscComplex % 4 == 0) PackInit_ComplexType<PetscComplex, 4, 0>(link);
1192: else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex, 2, 1>(link);
1193: else if (nPetscComplex % 2 == 0) PackInit_ComplexType<PetscComplex, 2, 0>(link);
1194: else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex, 1, 1>(link);
1195: else if (nPetscComplex % 1 == 0)
1196: #endif
1197: PackInit_ComplexType<PetscComplex, 1, 0>(link);
1198: #endif
1199: } else {
1200: MPI_Aint lb, nbyte;
1201: MPI_Type_get_extent(unit, &lb, &nbyte);
1203: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
1204: #if !defined(PETSC_HAVE_DEVICE)
1205: if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
1206: else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
1207: else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
1208: else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
1209: else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
1210: else if (nbyte % 1 == 0)
1211: #endif
1212: PackInit_DumbType<char, 1, 0>(link);
1213: } else {
1214: nInt = nbyte / sizeof(int);
1215: #if !defined(PETSC_HAVE_DEVICE)
1216: if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
1217: else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
1218: else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
1219: else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
1220: else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
1221: else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
1222: else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
1223: else if (nInt % 1 == 0)
1224: #endif
1225: PackInit_DumbType<int, 1, 0>(link);
1226: }
1227: }
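/* A sketch of how the dispatch above plays out, assuming link->bs has been set elsewhere (e.g., by
   PetscSFPackSetUp_Host) to the number of primitive types in <unit>: a unit of 3 MPIU_REALs gives nPetscReal = 3;
   in builds where PETSC_HAVE_DEVICE is defined (the usual case when this file is compiled), only the generic
   PackInit_RealType<PetscReal, 1, 0> instantiation is compiled and selected, i.e., BS = 1, EQ = 0, with bs (= 3)
   supplied at run time through link->bs. */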
1229: if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
1230: int device;
1231: struct cudaDeviceProp props;
1232: cudaGetDevice(&device);
1233: cudaGetDeviceProperties(&props, device);
1234: sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor * props.multiProcessorCount;
1235: }
1236: link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
1238: link->stream = PetscDefaultCudaStream;
1239: link->Destroy = PetscSFLinkDestroy_MPI_CUDA;
1240: link->SyncDevice = PetscSFLinkSyncDevice_CUDA;
1241: link->SyncStream = PetscSFLinkSyncStream_CUDA;
1242: link->Memcpy = PetscSFLinkMemcpy_CUDA;
1243: link->deviceinited = PETSC_TRUE;
1244: return 0;
1245: }