Actual source code: sfcuda.cu

  1: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  3: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
  4: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid)
  5: {
  6:   PetscInt        i, j, k, m, n, r;
  7:   const PetscInt *offset, *start, *dx, *dy, *X, *Y;

  9:   n      = opt[0];
 10:   offset = opt + 1;
 11:   start  = opt + n + 2;
 12:   dx     = opt + 2 * n + 2;
 13:   dy     = opt + 3 * n + 2;
 14:   X      = opt + 5 * n + 2;
 15:   Y      = opt + 6 * n + 2;
 16:   for (r = 0; r < n; r++) {
 17:     if (tid < offset[r + 1]) break;
 18:   }
 19:   m = (tid - offset[r]);
 20:   k = m / (dx[r] * dy[r]);
 21:   j = (m - k * dx[r] * dy[r]) / dx[r];
 22:   i = m - k * dx[r] * dy[r] - j * dx[r];

 24:   return (start[r] + k * X[r] * Y[r] + j * X[r] + i);
 25: }
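/* Illustration (added, not part of the original file): the opt[] layout that MapTidToIndex()
   expects, sketched for a single hypothetical subdomain -- a 2 x 3 x 4 box starting at linear
   index 5 inside an enclosing domain with X = 10 and Y = 10. The dz block at opt + 4*n + 2 is
   part of the layout but is not read by this function. */
#if 0 /* example only, not compiled */
__device__ static void MapTidToIndex_Example(void)
{
  /*                      n  offset[0..n]  start  dx dy dz  X   Y                          */
  const PetscInt opt[] = {1, 0, 24,        5,     2, 3, 4,  10, 10}; /* 24 = 2*3*4 indices */
  /* tid = 7: m = 7, k = 7/(2*3) = 1, j = (7-6)/2 = 0, i = 7-6-0 = 1,
     so the mapped index is 5 + 1*10*10 + 0*10 + 1 = 106 */
  PetscInt idx = MapTidToIndex(opt, 7);
  (void)idx; /* idx == 106 */
}
#endif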

 27: /*====================================================================================*/
 28: /*  Templated CUDA kernels for pack/unpack. The Op can be regular or atomic           */
 29: /*====================================================================================*/

 31: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
 32:    <Type> is PetscReal, which is the primitive type we operate on.
 33:    <bs>   is 16, which says <unit> contains 16 primitive types.
 34:    <BS>   is 8, which is the maximal SIMD width with which we will try to vectorize operations on <unit>.
 35:    <EQ>   is 0, which is (bs == BS ? 1 : 0).

 37:   If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
 38:   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below will be fully unrolled.
 39: */
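/* Illustration (added, not part of the original file): the two cases above correspond to the
   kernel instantiations sketched below; nblocks, nthreads, stream, count, start, opt, idx, data
   and buf are placeholders here, set up as in the Pack()/UnpackAndOp() wrappers further below. */
#if 0 /* example only, not compiled */
  /* bs = 16, BS = 8, EQ = 0: MBS = (bs/BS)*BS = 16 is computed at run time */
  d_Pack<PetscReal, 8, 0><<<nblocks, nthreads, 0, stream>>>(16, count, start, opt, idx, data, buf);
  /* bs = 8, BS = 8, EQ = 1: the compiler sees MBS = 1*8 = 8 as a constant and can fully unroll the inner loop */
  d_Pack<PetscReal, 8, 1><<<nblocks, nthreads, 0, stream>>>(8, count, start, opt, idx, data, buf);
#endif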
 40: template <class Type, PetscInt BS, PetscInt EQ>
 41: __global__ static void d_Pack(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, const Type *data, Type *buf)
 42: {
 43:   PetscInt       i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
 44:   const PetscInt grid_size = gridDim.x * blockDim.x;
 45:   const PetscInt M         = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */
 46:   const PetscInt MBS       = M * BS;             /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

 48:   for (; tid < count; tid += grid_size) {
 49:     /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
 50:        opt == NULL && idx == NULL ==> the indices are contiguous;
 51:      */
 52:     t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
 53:     s = tid * MBS;
 54:     for (i = 0; i < MBS; i++) buf[s + i] = data[t + i];
 55:   }
 56: }

 58: template <class Type, class Op, PetscInt BS, PetscInt EQ>
 59: __global__ static void d_UnpackAndOp(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, Type *data, const Type *buf)
 60: {
 61:   PetscInt       i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
 62:   const PetscInt grid_size = gridDim.x * blockDim.x;
 63:   const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
 64:   Op             op;

 66:   for (; tid < count; tid += grid_size) {
 67:     t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
 68:     s = tid * MBS;
 69:     for (i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
 70:   }
 71: }

 73: template <class Type, class Op, PetscInt BS, PetscInt EQ>
 74: __global__ static void d_FetchAndOp(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, Type *leafbuf)
 75: {
 76:   PetscInt       i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
 77:   const PetscInt grid_size = gridDim.x * blockDim.x;
 78:   const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
 79:   Op             op;

 81:   for (; tid < count; tid += grid_size) {
 82:     r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
 83:     l = tid * MBS;
 84:     for (i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
 85:   }
 86: }

 88: template <class Type, class Op, PetscInt BS, PetscInt EQ>
 89: __global__ static void d_ScatterAndOp(PetscInt bs, PetscInt count, PetscInt srcx, PetscInt srcy, PetscInt srcX, PetscInt srcY, PetscInt srcStart, const PetscInt *srcIdx, const Type *src, PetscInt dstx, PetscInt dsty, PetscInt dstX, PetscInt dstY, PetscInt dstStart, const PetscInt *dstIdx, Type *dst)
 90: {
 91:   PetscInt       i, j, k, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
 92:   const PetscInt grid_size = gridDim.x * blockDim.x;
 93:   const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
 94:   Op             op;

 96:   for (; tid < count; tid += grid_size) {
 97:     if (!srcIdx) { /* src is either contiguous or 3D */
 98:       k = tid / (srcx * srcy);
 99:       j = (tid - k * srcx * srcy) / srcx;
100:       i = tid - k * srcx * srcy - j * srcx;
101:       s = srcStart + k * srcX * srcY + j * srcX + i;
102:     } else {
103:       s = srcIdx[tid];
104:     }

106:     if (!dstIdx) { /* dst is either contiguous or 3D */
107:       k = tid / (dstx * dsty);
108:       j = (tid - k * dstx * dsty) / dstx;
109:       i = tid - k * dstx * dsty - j * dstx;
110:       t = dstStart + k * dstX * dstY + j * dstX + i;
111:     } else {
112:       t = dstIdx[tid];
113:     }

115:     s *= MBS;
116:     t *= MBS;
117:     for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
118:   }
119: }

121: template <class Type, class Op, PetscInt BS, PetscInt EQ>
122: __global__ static void d_FetchAndOpLocal(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, PetscInt leafstart, const PetscInt *leafopt, const PetscInt *leafidx, const Type *leafdata, Type *leafupdate)
123: {
124:   PetscInt       i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
125:   const PetscInt grid_size = gridDim.x * blockDim.x;
126:   const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
127:   Op             op;

129:   for (; tid < count; tid += grid_size) {
130:     r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
131:     l = (leafopt ? MapTidToIndex(leafopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
132:     for (i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
133:   }
134: }

136: /*====================================================================================*/
137: /*                             Regular operations on device                           */
138: /*====================================================================================*/
139: template <typename Type>
140: struct Insert {
141:   __device__ Type operator()(Type &x, Type y) const
142:   {
143:     Type old = x;
144:     x        = y;
145:     return old;
146:   }
147: };
148: template <typename Type>
149: struct Add {
150:   __device__ Type operator()(Type &x, Type y) const
151:   {
152:     Type old = x;
153:     x += y;
154:     return old;
155:   }
156: };
157: template <typename Type>
158: struct Mult {
159:   __device__ Type operator()(Type &x, Type y) const
160:   {
161:     Type old = x;
162:     x *= y;
163:     return old;
164:   }
165: };
166: template <typename Type>
167: struct Min {
168:   __device__ Type operator()(Type &x, Type y) const
169:   {
170:     Type old = x;
171:     x        = PetscMin(x, y);
172:     return old;
173:   }
174: };
175: template <typename Type>
176: struct Max {
177:   __device__ Type operator()(Type &x, Type y) const
178:   {
179:     Type old = x;
180:     x        = PetscMax(x, y);
181:     return old;
182:   }
183: };
184: template <typename Type>
185: struct LAND {
186:   __device__ Type operator()(Type &x, Type y) const
187:   {
188:     Type old = x;
189:     x        = x && y;
190:     return old;
191:   }
192: };
193: template <typename Type>
194: struct LOR {
195:   __device__ Type operator()(Type &x, Type y) const
196:   {
197:     Type old = x;
198:     x        = x || y;
199:     return old;
200:   }
201: };
202: template <typename Type>
203: struct LXOR {
204:   __device__ Type operator()(Type &x, Type y) const
205:   {
206:     Type old = x;
207:     x        = !x != !y;
208:     return old;
209:   }
210: };
211: template <typename Type>
212: struct BAND {
213:   __device__ Type operator()(Type &x, Type y) const
214:   {
215:     Type old = x;
216:     x        = x & y;
217:     return old;
218:   }
219: };
220: template <typename Type>
221: struct BOR {
222:   __device__ Type operator()(Type &x, Type y) const
223:   {
224:     Type old = x;
225:     x        = x | y;
226:     return old;
227:   }
228: };
229: template <typename Type>
230: struct BXOR {
231:   __device__ Type operator()(Type &x, Type y) const
232:   {
233:     Type old = x;
234:     x        = x ^ y;
235:     return old;
236:   }
237: };
238: template <typename Type>
239: struct Minloc {
240:   __device__ Type operator()(Type &x, Type y) const
241:   {
242:     Type old = x;
243:     if (y.a < x.a) x = y;
244:     else if (y.a == x.a) x.b = min(x.b, y.b);
245:     return old;
246:   }
247: };
248: template <typename Type>
249: struct Maxloc {
250:   __device__ Type operator()(Type &x, Type y) const
251:   {
252:     Type old = x;
253:     if (y.a > x.a) x = y;
254:     else if (y.a == x.a) x.b = min(x.b, y.b); /* See MPI MAXLOC */
255:     return old;
256:   }
257: };
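/* Illustration (added, not part of the original file): how d_UnpackAndOp et al. use these functors.
   Each functor updates its first argument in place and returns the old value, which most callers
   discard. The pair type PairInt used below is defined later in this file. */
#if 0 /* example only, not compiled */
  Add<double> add;
  double x = 1.0, old = add(x, 2.0); /* x becomes 3.0, old == 1.0; d_UnpackAndOp discards old */

  Maxloc<PairInt> maxloc;
  PairInt u = {5, 3}, v = {5, 1};
  maxloc(u, v); /* equal .a, so the smaller .b wins: u becomes {5, 1}, matching MPI_MAXLOC */
#endif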

259: /*====================================================================================*/
260: /*                             Atomic operations on device                            */
261: /*====================================================================================*/

263: /*
264:   Atomic Insert (exchange) operations

266:   CUDA C Programming Guide V10.1 Chapter B.12.1.3:

268:   int atomicExch(int* address, int val);
269:   unsigned int atomicExch(unsigned int* address, unsigned int val);
270:   unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
271:   float atomicExch(float* address, float val);

273:   reads the 32-bit or 64-bit word old located at the address address in global or shared
274:   memory and stores val back to memory at the same address. These two operations are
275:   performed in one atomic transaction. The function returns old.

277:   PETSc notes:

279:   It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.

281:   VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
282:   atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
283:   be atomic itself.

 285:   With bs>1 and a unit > 64 bits, the current element-wise atomic approach cannot guarantee the whole
 286:   insertion is atomic. We hope no user code relies on that.
287: */
288: __device__ static double atomicExch(double *address, double val)
289: {
290:   return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
291: }

293: __device__ static llint atomicExch(llint *address, llint val)
294: {
295:   return (llint)(atomicExch((ullint *)address, (ullint)val));
296: }

298: template <typename Type>
299: struct AtomicInsert {
300:   __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
301: };

303: #if defined(PETSC_HAVE_COMPLEX)
304:   #if defined(PETSC_USE_REAL_DOUBLE)
 305: /* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values into the same location */
306: template <>
307: struct AtomicInsert<PetscComplex> {
308:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
309:   {
310:     PetscComplex         old, *z = &old;
311:     double              *xp = (double *)&x, *yp = (double *)&y;
312:     AtomicInsert<double> op;
313:     z[0] = op(xp[0], yp[0]);
314:     z[1] = op(xp[1], yp[1]);
 315:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
316:   }
317: };
318:   #elif defined(PETSC_USE_REAL_SINGLE)
319: template <>
320: struct AtomicInsert<PetscComplex> {
321:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
322:   {
323:     double              *xp = (double *)&x, *yp = (double *)&y;
324:     AtomicInsert<double> op;
325:     return op(xp[0], yp[0]);
326:   }
327: };
328:   #endif
329: #endif

331: /*
332:   Atomic add operations

334:   CUDA C Programming Guide V10.1 Chapter B.12.1.1:

336:   int atomicAdd(int* address, int val);
337:   unsigned int atomicAdd(unsigned int* address,unsigned int val);
338:   unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
339:   float atomicAdd(float* address, float val);
340:   double atomicAdd(double* address, double val);
341:   __half2 atomicAdd(__half2 *address, __half2 val);
342:   __half atomicAdd(__half *address, __half val);

344:   reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
345:   and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
346:   function returns old.

348:   The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
349:   The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
350:   The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
351:   higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
352:   the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
353:   The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
354: */
355: __device__ static llint atomicAdd(llint *address, llint val)
356: {
357:   return (llint)atomicAdd((ullint *)address, (ullint)val);
358: }

360: template <typename Type>
361: struct AtomicAdd {
362:   __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
363: };

365: template <>
366: struct AtomicAdd<double> {
367:   __device__ double operator()(double &x, double y) const
368:   {
369: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
370:     return atomicAdd(&x, y);
371: #else
372:     double *address = &x, val = y;
373:     ullint *address_as_ull = (ullint *)address;
374:     ullint  old            = *address_as_ull, assumed;
375:     do {
376:       assumed = old;
377:       old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
378:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
379:     } while (assumed != old);
380:     return __longlong_as_double(old);
381: #endif
382:   }
383: };

385: template <>
386: struct AtomicAdd<float> {
387:   __device__ float operator()(float &x, float y) const
388:   {
389: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
390:     return atomicAdd(&x, y);
391: #else
392:     float *address = &x, val = y;
393:     int   *address_as_int = (int *)address;
394:     int    old            = *address_as_int, assumed;
395:     do {
396:       assumed = old;
397:       old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
398:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
399:     } while (assumed != old);
400:     return __int_as_float(old);
401: #endif
402:   }
403: };

405: #if defined(PETSC_HAVE_COMPLEX)
406: template <>
407: struct AtomicAdd<PetscComplex> {
408:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
409:   {
410:     PetscComplex         old, *z = &old;
411:     PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
412:     AtomicAdd<PetscReal> op;
413:     z[0] = op(xp[0], yp[0]);
414:     z[1] = op(xp[1], yp[1]);
 415:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
416:   }
417: };
418: #endif

420: /*
421:   Atomic Mult operations:

423:   CUDA has no atomicMult at all, so we build our own with atomicCAS
424:  */
425: #if defined(PETSC_USE_REAL_DOUBLE)
426: __device__ static double atomicMult(double *address, double val)
427: {
428:   ullint *address_as_ull = (ullint *)(address);
429:   ullint  old            = *address_as_ull, assumed;
430:   do {
431:     assumed = old;
432:     /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
433:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
434:   } while (assumed != old);
435:   return __longlong_as_double(old);
436: }
437: #elif defined(PETSC_USE_REAL_SINGLE)
438: __device__ static float atomicMult(float *address, float val)
439: {
440:   int *address_as_int = (int *)(address);
441:   int old = *address_as_int, assumed;
442:   do {
443:     assumed = old;
444:     old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
445:   } while (assumed != old);
446:   return __int_as_float(old);
447: }
448: #endif

450: __device__ static int atomicMult(int *address, int val)
451: {
452:   int *address_as_int = (int *)(address);
453:   int  old            = *address_as_int, assumed;
454:   do {
455:     assumed = old;
456:     old     = atomicCAS(address_as_int, assumed, val * assumed);
457:   } while (assumed != old);
458:   return (int)old;
459: }

461: __device__ static llint atomicMult(llint *address, llint val)
462: {
463:   ullint *address_as_ull = (ullint *)(address);
464:   ullint  old            = *address_as_ull, assumed;
465:   do {
466:     assumed = old;
467:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
468:   } while (assumed != old);
469:   return (llint)old;
470: }

472: template <typename Type>
473: struct AtomicMult {
474:   __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
475: };

477: /*
478:   Atomic Min/Max operations

480:   CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

482:   int atomicMin(int* address, int val);
483:   unsigned int atomicMin(unsigned int* address,unsigned int val);
484:   unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

486:   reads the 32-bit or 64-bit word old located at the address address in global or shared
487:   memory, computes the minimum of old and val, and stores the result back to memory
488:   at the same address. These three operations are performed in one atomic transaction.
489:   The function returns old.
490:   The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

492:   atomicMax() is similar.
493:  */

495: #if defined(PETSC_USE_REAL_DOUBLE)
496: __device__ static double atomicMin(double *address, double val)
497: {
498:   ullint *address_as_ull = (ullint *)(address);
499:   ullint  old            = *address_as_ull, assumed;
500:   do {
501:     assumed = old;
502:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
503:   } while (assumed != old);
504:   return __longlong_as_double(old);
505: }

507: __device__ static double atomicMax(double *address, double val)
508: {
509:   ullint *address_as_ull = (ullint *)(address);
510:   ullint  old            = *address_as_ull, assumed;
511:   do {
512:     assumed = old;
513:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
514:   } while (assumed != old);
515:   return __longlong_as_double(old);
516: }
517: #elif defined(PETSC_USE_REAL_SINGLE)
518: __device__ static float atomicMin(float *address, float val)
519: {
520:   int *address_as_int = (int *)(address);
521:   int old = *address_as_int, assumed;
522:   do {
523:     assumed = old;
524:     old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
525:   } while (assumed != old);
526:   return __int_as_float(old);
527: }

529: __device__ static float atomicMax(float *address, float val)
530: {
531:   int *address_as_int = (int *)(address);
532:   int old = *address_as_int, assumed;
533:   do {
534:     assumed = old;
535:     old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
536:   } while (assumed != old);
537:   return __int_as_float(old);
538: }
539: #endif

541: /*
 542:   atomicMin/Max(long long *, long long) are not in NVIDIA's documentation. But on OLCF Summit we found
 543:   atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
 544:   This causes compilation errors with PGI compilers and 64-bit indices:
 545:       error: function "atomicMin(long long *, long long)" has already been defined

 547:   So we add the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320).
 548: */
549: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
550: __device__ static llint atomicMin(llint *address, llint val)
551: {
552:   ullint *address_as_ull = (ullint *)(address);
553:   ullint  old            = *address_as_ull, assumed;
554:   do {
555:     assumed = old;
556:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
557:   } while (assumed != old);
558:   return (llint)old;
559: }

561: __device__ static llint atomicMax(llint *address, llint val)
562: {
563:   ullint *address_as_ull = (ullint *)(address);
564:   ullint  old            = *address_as_ull, assumed;
565:   do {
566:     assumed = old;
567:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
568:   } while (assumed != old);
569:   return (llint)old;
570: }
571: #endif

573: template <typename Type>
574: struct AtomicMin {
575:   __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
576: };
577: template <typename Type>
578: struct AtomicMax {
579:   __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
580: };

582: /*
583:   Atomic bitwise operations

585:   CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

587:   int atomicAnd(int* address, int val);
588:   unsigned int atomicAnd(unsigned int* address,unsigned int val);
589:   unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

591:   reads the 32-bit or 64-bit word old located at the address address in global or shared
592:   memory, computes (old & val), and stores the result back to memory at the same
593:   address. These three operations are performed in one atomic transaction.
594:   The function returns old.

596:   The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

598:   atomicOr() and atomicXor are similar.
599: */

601: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
602: __device__ static llint atomicAnd(llint *address, llint val)
603: {
604:   ullint *address_as_ull = (ullint *)(address);
605:   ullint  old            = *address_as_ull, assumed;
606:   do {
607:     assumed = old;
608:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
609:   } while (assumed != old);
610:   return (llint)old;
611: }
612: __device__ static llint atomicOr(llint *address, llint val)
613: {
614:   ullint *address_as_ull = (ullint *)(address);
615:   ullint  old            = *address_as_ull, assumed;
616:   do {
617:     assumed = old;
618:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
619:   } while (assumed != old);
620:   return (llint)old;
621: }

623: __device__ static llint atomicXor(llint *address, llint val)
624: {
625:   ullint *address_as_ull = (ullint *)(address);
626:   ullint  old            = *address_as_ull, assumed;
627:   do {
628:     assumed = old;
629:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
630:   } while (assumed != old);
631:   return (llint)old;
632: }
633: #endif

635: template <typename Type>
636: struct AtomicBAND {
637:   __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
638: };
639: template <typename Type>
640: struct AtomicBOR {
641:   __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
642: };
643: template <typename Type>
644: struct AtomicBXOR {
645:   __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
646: };

648: /*
649:   Atomic logical operations:

651:   CUDA has no atomic logical operations at all. We support them on integer types.
652: */

 654: /* A template that is declared but not defined makes any instantiation that does not match one of the given
 655:    specializations a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
 656:  */
657: template <typename Type, class Op, int size /* sizeof(Type) */>
658: struct AtomicLogical;
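/* Illustration (added, not part of the original file): since only the 4-byte and 8-byte
   specializations below are defined, an unsupported width fails at compile time. The land
   functor referenced here is defined further below. */
#if 0 /* example only: intentionally does not compile */
  AtomicLogical<char, land<char>, 1> op; /* error: incomplete type -- char-sized atomics are not supported */
#endif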

660: template <typename Type, class Op>
661: struct AtomicLogical<Type, Op, 4> {
662:   __device__ Type operator()(Type &x, Type y) const
663:   {
664:     int *address_as_int = (int *)(&x);
665:     int  old            = *address_as_int, assumed;
666:     Op   op;
667:     do {
668:       assumed = old;
669:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
670:     } while (assumed != old);
671:     return (Type)old;
672:   }
673: };

675: template <typename Type, class Op>
676: struct AtomicLogical<Type, Op, 8> {
677:   __device__ Type operator()(Type &x, Type y) const
678:   {
679:     ullint *address_as_ull = (ullint *)(&x);
680:     ullint  old            = *address_as_ull, assumed;
681:     Op      op;
682:     do {
683:       assumed = old;
684:       old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
685:     } while (assumed != old);
686:     return (Type)old;
687:   }
688: };

 690: /* Note that land/lor/lxor below are different from LAND etc. above: here we pass arguments by value and return the result of the op (not the old value) */
691: template <typename Type>
692: struct land {
693:   __device__ Type operator()(Type x, Type y) { return x && y; }
694: };
695: template <typename Type>
696: struct lor {
697:   __device__ Type operator()(Type x, Type y) { return x || y; }
698: };
699: template <typename Type>
700: struct lxor {
701:   __device__ Type operator()(Type x, Type y) { return (!x != !y); }
702: };

704: template <typename Type>
705: struct AtomicLAND {
706:   __device__ Type operator()(Type &x, Type y) const
707:   {
708:     AtomicLogical<Type, land<Type>, sizeof(Type)> op;
709:     return op(x, y);
710:   }
711: };
712: template <typename Type>
713: struct AtomicLOR {
714:   __device__ Type operator()(Type &x, Type y) const
715:   {
716:     AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
717:     return op(x, y);
718:   }
719: };
720: template <typename Type>
721: struct AtomicLXOR {
722:   __device__ Type operator()(Type &x, Type y) const
723:   {
724:     AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
725:     return op(x, y);
726:   }
727: };

729: /*====================================================================================*/
730: /*  Wrapper functions of cuda kernels. Function pointers are stored in 'link'         */
731: /*====================================================================================*/
732: template <typename Type, PetscInt BS, PetscInt EQ>
733: static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data, void *buf)
734: {
735:   PetscInt        nthreads = 256;
736:   PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
737:   const PetscInt *iarray   = opt ? opt->array : NULL;

739:   if (!count) return 0;
740:   if (!opt && !idx) { /* It is a 'CUDA data to nvshmem buf' memory copy */
741:     cudaMemcpyAsync(buf, (char *)data + start * link->unitbytes, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream);
742:   } else {
743:     nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
744:     d_Pack<Type, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (const Type *)data, (Type *)buf);
745:     cudaGetLastError();
746:   }
747:   return 0;
748: }

 750: /* This specializes UnpackAndOp so that we can use the cudaMemcpyAsync() below. Usually, if this is a contiguous memcpy, we use root/leafdirect and do
 751:    not need UnpackAndOp. Only with nvshmem do we need this 'nvshmem buf to CUDA data' memory copy.
 752: */
753: template <typename Type, PetscInt BS, PetscInt EQ>
754: static PetscErrorCode Unpack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
755: {
756:   PetscInt        nthreads = 256;
757:   PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
758:   const PetscInt *iarray   = opt ? opt->array : NULL;

760:   if (!count) return 0;
761:   if (!opt && !idx) { /* It is a 'nvshmem buf to CUDA data' memory copy */
762:     cudaMemcpyAsync((char *)data + start * link->unitbytes, buf, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream);
763:   } else {
764:     nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
765:     d_UnpackAndOp<Type, Insert<Type>, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
766:     cudaGetLastError();
767:   }
768:   return 0;
769: }

771: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
772: static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
773: {
774:   PetscInt        nthreads = 256;
775:   PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
776:   const PetscInt *iarray   = opt ? opt->array : NULL;

778:   if (!count) return 0;
779:   nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
780:   d_UnpackAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
781:   cudaGetLastError();
782:   return 0;
783: }

785: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
786: static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf)
787: {
788:   PetscInt        nthreads = 256;
789:   PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
790:   const PetscInt *iarray   = opt ? opt->array : NULL;

792:   if (!count) return 0;
793:   nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
794:   d_FetchAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (Type *)buf);
795:   cudaGetLastError();
796:   return 0;
797: }

799: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
800: static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
801: {
802:   PetscInt nthreads = 256;
803:   PetscInt nblocks  = (count + nthreads - 1) / nthreads;
804:   PetscInt srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;

806:   if (!count) return 0;
807:   nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);

 809:   /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
810:   if (srcOpt) {
811:     srcx     = srcOpt->dx[0];
812:     srcy     = srcOpt->dy[0];
813:     srcX     = srcOpt->X[0];
814:     srcY     = srcOpt->Y[0];
815:     srcStart = srcOpt->start[0];
816:     srcIdx   = NULL;
817:   } else if (!srcIdx) {
818:     srcx = srcX = count;
819:     srcy = srcY = 1;
820:   }
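  /* e.g. with neither srcOpt nor srcIdx (contiguous src), srcx = srcX = count and srcy = srcY = 1,
     so in d_ScatterAndOp k = j = 0 and s reduces to srcStart + tid */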

822:   if (dstOpt) {
823:     dstx     = dstOpt->dx[0];
824:     dsty     = dstOpt->dy[0];
825:     dstX     = dstOpt->X[0];
826:     dstY     = dstOpt->Y[0];
827:     dstStart = dstOpt->start[0];
828:     dstIdx   = NULL;
829:   } else if (!dstIdx) {
830:     dstx = dstX = count;
831:     dsty = dstY = 1;
832:   }

834:   d_ScatterAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, srcx, srcy, srcX, srcY, srcStart, srcIdx, (const Type *)src, dstx, dsty, dstX, dstY, dstStart, dstIdx, (Type *)dst);
835:   cudaGetLastError();
836:   return 0;
837: }

839: /* Specialization for Insert since we may use cudaMemcpyAsync */
840: template <typename Type, PetscInt BS, PetscInt EQ>
841: static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
842: {
843:   if (!count) return 0;
 844:   /* src and dst are contiguous */
845:   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
846:     cudaMemcpyAsync((Type *)dst + dstStart * link->bs, (const Type *)src + srcStart * link->bs, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream);
847:   } else {
848:     ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst);
849:   }
850:   return 0;
851: }

853: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
854: static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate)
855: {
856:   PetscInt        nthreads = 256;
857:   PetscInt        nblocks  = (count + nthreads - 1) / nthreads;
858:   const PetscInt *rarray   = rootopt ? rootopt->array : NULL;
859:   const PetscInt *larray   = leafopt ? leafopt->array : NULL;

861:   if (!count) return 0;
862:   nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
863:   d_FetchAndOpLocal<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, rootstart, rarray, rootidx, (Type *)rootdata, leafstart, larray, leafidx, (const Type *)leafdata, (Type *)leafupdate);
864:   cudaGetLastError();
865:   return 0;
866: }

868: /*====================================================================================*/
869: /*  Init various types and instantiate pack/unpack function pointers                  */
870: /*====================================================================================*/
871: template <typename Type, PetscInt BS, PetscInt EQ>
872: static void PackInit_RealType(PetscSFLink link)
873: {
874:   /* Pack/unpack for remote communication */
875:   link->d_Pack            = Pack<Type, BS, EQ>;
876:   link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
877:   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
878:   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
879:   link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
880:   link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
881:   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;

883:   /* Scatter for local communication */
884:   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
885:   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
886:   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
887:   link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
888:   link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
889:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;

891:   /* Atomic versions when there are data-race possibilities */
892:   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
893:   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
894:   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
895:   link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
896:   link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
897:   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;

899:   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
900:   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
901:   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
902:   link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
903:   link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
904:   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
905: }

 907: /* We use this templated class so that it can be partially specialized for char-sized integers */
908: template <typename Type, PetscInt BS, PetscInt EQ, PetscInt size /*sizeof(Type)*/>
909: struct PackInit_IntegerType_Atomic {
910:   static void Init(PetscSFLink link)
911:   {
912:     link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
913:     link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
914:     link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
915:     link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
916:     link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
917:     link->da_UnpackAndLAND   = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
918:     link->da_UnpackAndLOR    = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
919:     link->da_UnpackAndLXOR   = UnpackAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
920:     link->da_UnpackAndBAND   = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
921:     link->da_UnpackAndBOR    = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
922:     link->da_UnpackAndBXOR   = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
923:     link->da_FetchAndAdd     = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;

925:     link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
926:     link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
927:     link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
928:     link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
929:     link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
930:     link->da_ScatterAndLAND   = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
931:     link->da_ScatterAndLOR    = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
932:     link->da_ScatterAndLXOR   = ScatterAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
933:     link->da_ScatterAndBAND   = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
934:     link->da_ScatterAndBOR    = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
935:     link->da_ScatterAndBXOR   = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
936:     link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
937:   }
938: };

 940: /* CUDA does not support atomics on chars. Support in PETSc is TBD. */
941: template <typename Type, PetscInt BS, PetscInt EQ>
942: struct PackInit_IntegerType_Atomic<Type, BS, EQ, 1> {
943:   static void Init(PetscSFLink link)
 944:   { /* Nothing to do; leave the function pointers NULL */
945:   }
946: };

948: template <typename Type, PetscInt BS, PetscInt EQ>
949: static void PackInit_IntegerType(PetscSFLink link)
950: {
951:   link->d_Pack            = Pack<Type, BS, EQ>;
952:   link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
953:   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
954:   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
955:   link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
956:   link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
957:   link->d_UnpackAndLAND   = UnpackAndOp<Type, LAND<Type>, BS, EQ>;
958:   link->d_UnpackAndLOR    = UnpackAndOp<Type, LOR<Type>, BS, EQ>;
959:   link->d_UnpackAndLXOR   = UnpackAndOp<Type, LXOR<Type>, BS, EQ>;
960:   link->d_UnpackAndBAND   = UnpackAndOp<Type, BAND<Type>, BS, EQ>;
961:   link->d_UnpackAndBOR    = UnpackAndOp<Type, BOR<Type>, BS, EQ>;
962:   link->d_UnpackAndBXOR   = UnpackAndOp<Type, BXOR<Type>, BS, EQ>;
963:   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;

965:   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
966:   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
967:   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
968:   link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
969:   link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
970:   link->d_ScatterAndLAND   = ScatterAndOp<Type, LAND<Type>, BS, EQ>;
971:   link->d_ScatterAndLOR    = ScatterAndOp<Type, LOR<Type>, BS, EQ>;
972:   link->d_ScatterAndLXOR   = ScatterAndOp<Type, LXOR<Type>, BS, EQ>;
973:   link->d_ScatterAndBAND   = ScatterAndOp<Type, BAND<Type>, BS, EQ>;
974:   link->d_ScatterAndBOR    = ScatterAndOp<Type, BOR<Type>, BS, EQ>;
975:   link->d_ScatterAndBXOR   = ScatterAndOp<Type, BXOR<Type>, BS, EQ>;
976:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
977:   PackInit_IntegerType_Atomic<Type, BS, EQ, sizeof(Type)>::Init(link);
978: }

980: #if defined(PETSC_HAVE_COMPLEX)
981: template <typename Type, PetscInt BS, PetscInt EQ>
982: static void PackInit_ComplexType(PetscSFLink link)
983: {
984:   link->d_Pack            = Pack<Type, BS, EQ>;
985:   link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
986:   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
987:   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
988:   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;

990:   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
991:   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
992:   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
993:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;

995:   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
996:   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
997:   link->da_UnpackAndMult   = NULL; /* Not implemented yet */
998:   link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

1000:   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
1001:   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
1002: }
1003: #endif

1005: typedef signed char   SignedChar;
1006: typedef unsigned char UnsignedChar;
1007: typedef struct {
1008:   int a;
1009:   int b;
1010: } PairInt;
1011: typedef struct {
1012:   PetscInt a;
1013:   PetscInt b;
1014: } PairPetscInt;
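/* PairInt corresponds to MPI_2INT and PairPetscInt to MPIU_2INT; see the dispatch in PetscSFLinkSetUp_CUDA() below */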

1016: template <typename Type>
1017: static void PackInit_PairType(PetscSFLink link)
1018: {
1019:   link->d_Pack            = Pack<Type, 1, 1>;
1020:   link->d_UnpackAndInsert = Unpack<Type, 1, 1>;
1021:   link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>;
1022:   link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>;

1024:   link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>;
1025:   link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>;
1026:   link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>;
1027:   /* Atomics for pair types are not implemented yet */
1028: }

1030: template <typename Type, PetscInt BS, PetscInt EQ>
1031: static void PackInit_DumbType(PetscSFLink link)
1032: {
1033:   link->d_Pack             = Pack<Type, BS, EQ>;
1034:   link->d_UnpackAndInsert  = Unpack<Type, BS, EQ>;
1035:   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
1036:   /* Atomics for dumb types are not implemented yet */
1037: }

1039: /* Some device-specific utilities */
1040: static PetscErrorCode PetscSFLinkSyncDevice_CUDA(PetscSFLink link)
1041: {
1042:   cudaDeviceSynchronize();
1043:   return 0;
1044: }

1046: static PetscErrorCode PetscSFLinkSyncStream_CUDA(PetscSFLink link)
1047: {
1048:   cudaStreamSynchronize(link->stream);
1049:   return 0;
1050: }

1052: static PetscErrorCode PetscSFLinkMemcpy_CUDA(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
1053: {
1054:   enum cudaMemcpyKind kinds[2][2] = {
1055:     {cudaMemcpyHostToHost,   cudaMemcpyHostToDevice  },
1056:     {cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice}
1057:   };
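  /* kinds is indexed as kinds[src][dst] with 0 = host and 1 = device, e.g. kinds[1][0] is cudaMemcpyDeviceToHost */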

1059:   if (n) {
1060:     if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-cpu code won't call cuda runtime */
1061:       PetscMemcpy(dst, src, n);
1062:     } else {
1063:       int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
1064:       int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
1065:       cudaMemcpyAsync(dst, src, n, kinds[stype][dtype], link->stream);
1066:     }
1067:   }
1068:   return 0;
1069: }

1071: PetscErrorCode PetscSFMalloc_CUDA(PetscMemType mtype, size_t size, void **ptr)
1072: {
1073:   if (PetscMemTypeHost(mtype)) PetscMalloc(size, ptr);
1074:   else if (PetscMemTypeDevice(mtype)) {
1075:     PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1076:     cudaMalloc(ptr, size);
1077:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
1078:   return 0;
1079: }

1081: PetscErrorCode PetscSFFree_CUDA(PetscMemType mtype, void *ptr)
1082: {
1083:   if (PetscMemTypeHost(mtype)) PetscFree(ptr);
1084:   else if (PetscMemTypeDevice(mtype)) cudaFree(ptr);
1085:   else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
1086:   return 0;
1087: }

1089: /* Destructor when the link uses MPI for communication on CUDA device */
1090: static PetscErrorCode PetscSFLinkDestroy_MPI_CUDA(PetscSF sf, PetscSFLink link)
1091: {
1092:   for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1093:     cudaFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
1094:     cudaFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
1095:   }
1096:   return 0;
1097: }

1099: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
1100: PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
1101: {
1102:   PetscInt  nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
1103:   PetscBool is2Int, is2PetscInt;
1104: #if defined(PETSC_HAVE_COMPLEX)
1105:   PetscInt nPetscComplex = 0;
1106: #endif

1108:   if (link->deviceinited) return 0;
1109:   MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar);
1110:   MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar);
1111:   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
1112:   MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt);
1113:   MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt);
1114:   MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal);
1115: #if defined(PETSC_HAVE_COMPLEX)
1116:   MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex);
1117: #endif
1118:   MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int);
1119:   MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt);

1121:   if (is2Int) {
1122:     PackInit_PairType<PairInt>(link);
1123:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
1124:     PackInit_PairType<PairPetscInt>(link);
1125:   } else if (nPetscReal) {
1126: #if !defined(PETSC_HAVE_DEVICE)
1127:     if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
1128:     else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
1129:     else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
1130:     else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
1131:     else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
1132:     else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
1133:     else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
1134:     else if (nPetscReal % 1 == 0)
1135: #endif
1136:       PackInit_RealType<PetscReal, 1, 0>(link);
1137:   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
1138: #if !defined(PETSC_HAVE_DEVICE)
1139:     if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
1140:     else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
1141:     else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
1142:     else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
1143:     else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
1144:     else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
1145:     else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
1146:     else if (nPetscInt % 1 == 0)
1147: #endif
1148:       PackInit_IntegerType<llint, 1, 0>(link);
1149:   } else if (nInt) {
1150: #if !defined(PETSC_HAVE_DEVICE)
1151:     if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
1152:     else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
1153:     else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
1154:     else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
1155:     else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
1156:     else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
1157:     else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
1158:     else if (nInt % 1 == 0)
1159: #endif
1160:       PackInit_IntegerType<int, 1, 0>(link);
1161:   } else if (nSignedChar) {
1162: #if !defined(PETSC_HAVE_DEVICE)
1163:     if (nSignedChar == 8) PackInit_IntegerType<SignedChar, 8, 1>(link);
1164:     else if (nSignedChar % 8 == 0) PackInit_IntegerType<SignedChar, 8, 0>(link);
1165:     else if (nSignedChar == 4) PackInit_IntegerType<SignedChar, 4, 1>(link);
1166:     else if (nSignedChar % 4 == 0) PackInit_IntegerType<SignedChar, 4, 0>(link);
1167:     else if (nSignedChar == 2) PackInit_IntegerType<SignedChar, 2, 1>(link);
1168:     else if (nSignedChar % 2 == 0) PackInit_IntegerType<SignedChar, 2, 0>(link);
1169:     else if (nSignedChar == 1) PackInit_IntegerType<SignedChar, 1, 1>(link);
1170:     else if (nSignedChar % 1 == 0)
1171: #endif
1172:       PackInit_IntegerType<SignedChar, 1, 0>(link);
1173:   } else if (nUnsignedChar) {
1174: #if !defined(PETSC_HAVE_DEVICE)
1175:     if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar, 8, 1>(link);
1176:     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<UnsignedChar, 8, 0>(link);
1177:     else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar, 4, 1>(link);
1178:     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<UnsignedChar, 4, 0>(link);
1179:     else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar, 2, 1>(link);
1180:     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<UnsignedChar, 2, 0>(link);
1181:     else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar, 1, 1>(link);
1182:     else if (nUnsignedChar % 1 == 0)
1183: #endif
1184:       PackInit_IntegerType<UnsignedChar, 1, 0>(link);
1185: #if defined(PETSC_HAVE_COMPLEX)
1186:   } else if (nPetscComplex) {
1187:   #if !defined(PETSC_HAVE_DEVICE)
1188:     if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex, 8, 1>(link);
1189:     else if (nPetscComplex % 8 == 0) PackInit_ComplexType<PetscComplex, 8, 0>(link);
1190:     else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex, 4, 1>(link);
1191:     else if (nPetscComplex % 4 == 0) PackInit_ComplexType<PetscComplex, 4, 0>(link);
1192:     else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex, 2, 1>(link);
1193:     else if (nPetscComplex % 2 == 0) PackInit_ComplexType<PetscComplex, 2, 0>(link);
1194:     else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex, 1, 1>(link);
1195:     else if (nPetscComplex % 1 == 0)
1196:   #endif
1197:       PackInit_ComplexType<PetscComplex, 1, 0>(link);
1198: #endif
1199:   } else {
1200:     MPI_Aint lb, nbyte;
1201:     MPI_Type_get_extent(unit, &lb, &nbyte);
1203:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
1204: #if !defined(PETSC_HAVE_DEVICE)
1205:       if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
1206:       else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
1207:       else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
1208:       else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
1209:       else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
1210:       else if (nbyte % 1 == 0)
1211: #endif
1212:         PackInit_DumbType<char, 1, 0>(link);
1213:     } else {
1214:       nInt = nbyte / sizeof(int);
1215: #if !defined(PETSC_HAVE_DEVICE)
1216:       if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
1217:       else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
1218:       else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
1219:       else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
1220:       else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
1221:       else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
1222:       else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
1223:       else if (nInt % 1 == 0)
1224: #endif
1225:         PackInit_DumbType<int, 1, 0>(link);
1226:     }
1227:   }

1229:   if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
1230:     int                   device;
1231:     struct cudaDeviceProp props;
1232:     cudaGetDevice(&device);
1233:     cudaGetDeviceProperties(&props, device);
1234:     sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor * props.multiProcessorCount;
1235:   }
1236:   link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

1238:   link->stream       = PetscDefaultCudaStream;
1239:   link->Destroy      = PetscSFLinkDestroy_MPI_CUDA;
1240:   link->SyncDevice   = PetscSFLinkSyncDevice_CUDA;
1241:   link->SyncStream   = PetscSFLinkSyncStream_CUDA;
1242:   link->Memcpy       = PetscSFLinkMemcpy_CUDA;
1243:   link->deviceinited = PETSC_TRUE;
1244:   return 0;
1245: }