Actual source code: sfpack.c

  1: #include "petsc/private/sfimpl.h"
  2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>

  5: /* This C file contains packing facilities, with dispatches to the device if enabled. */

  7: /*
  8:  * MPI_Reduce_local is of little use here because it cannot handle sparse data and it vectorizes "in the wrong direction";
  9:  * therefore we pack data types manually. This file defines packing routines for the standard data types.
 10:  */

 12: #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
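/* For example, CPPJoin4(Pack, PetscInt, 8, 1) expands to the identifier Pack_PetscInt_8_1 */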

 14: /* Operations working like s += t */
 15: #define OP_BINARY(op, s, t) \
 16:   do { \
 17:     (s) = (s)op(t); \
 18:   } while (0) /* binary ops in the middle such as +, *, && etc. */
 19: #define OP_FUNCTION(op, s, t) \
 20:   do { \
 21:     (s) = op((s), (t)); \
 22:   } while (0) /* ops like a function, such as PetscMax, PetscMin */
 23: #define OP_LXOR(op, s, t) \
 24:   do { \
 25:     (s) = (!(s)) != (!(t)); \
 26:   } while (0) /* logical exclusive OR */
 27: #define OP_ASSIGN(op, s, t) \
 28:   do { \
 29:     (s) = (t); \
 30:   } while (0)
 31: /* Implements MPI_MAXLOC/MPI_MINLOC semantics; op is > for MAXLOC and < for MINLOC */
 32: #define OP_XLOC(op, s, t) \
 33:   do { \
 34:     if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
 35:     else if (!((s).u op(t).u)) s = t; \
 36:   } while (0)
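
/* A minimal sketch (not in the original source) of how OP_XLOC reproduces MPI_MAXLOC when op is >
   (and MPI_MINLOC when op is <): the pair with the larger (resp. smaller) .u wins, and ties keep the
   smaller index .i. Using the PairType(int, int) struct defined later in this file:

     PairType(int, int) s = {5, 3}, t = {5, 1};
     OP_XLOC(>, s, t);  // s.u == t.u, so s.i = PetscMin(3, 1), giving s = {5, 1}
     t.u = 7;
     OP_XLOC(>, s, t);  // !(s.u > t.u), so s = t, giving s = {7, 1}
*/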

 38: /* DEF_PackFunc - macro defining a Pack routine

 40:    Arguments of the macro:
 41:    +Type      Type of the basic data in an entry, e.g., int, PetscInt, PetscReal. It is not the type of an entry itself.
 42:    .BS        Block size for vectorization. It is a factor of bs.
 43:    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

 45:    Arguments of the Pack routine:
 46:    +count     Number of indices in idx[].
 47:    .start     When opt and idx are NULL, the indices are contiguous and start is the first index; otherwise it is not used.
 48:    .opt       Per-pack optimization plan. NULL means no such plan.
 49:    .idx       Indices of the entries to be packed.
 50:    .link      Context for the current call, e.g., link->bs is the number of basic types in an entry. For example, if unit is MPI_2INT, then bs=2 and the basic type is int.
 51:    .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
 52:    -packed    Address of the packed data.
 53:  */
 54: #define DEF_PackFunc(Type, BS, EQ) \
 55:   static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
 56:   { \
 57:     const Type    *u = (const Type *)unpacked, *u2; \
 58:     Type          *p = (Type *)packed, *p2; \
 59:     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
 60:     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
 61:     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
 62:     if (!idx) PetscArraycpy(p, u + start * MBS, MBS * count); /* idx[] are contiguous */ \
 63:     else if (opt) { /* has optimizations available */ p2 = p; \
 64:       for (r = 0; r < opt->n; r++) { \
 65:         u2 = u + opt->start[r] * MBS; \
 66:         X  = opt->X[r]; \
 67:         Y  = opt->Y[r]; \
 68:         for (k = 0; k < opt->dz[r]; k++) \
 69:           for (j = 0; j < opt->dy[r]; j++) { \
 70:             PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS); \
 71:             p2 += opt->dx[r] * MBS; \
 72:           } \
 73:       } \
 74:     } else { \
 75:       for (i = 0; i < count; i++) \
 76:         for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
 77:           for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
 78:             p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
 79:     } \
 80:     return 0; \
 81:   }
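
/* An illustrative sketch (not part of the source) of an instantiation: DEF_PackFunc(PetscReal, 2, 1)
   defines

     static PetscErrorCode Pack_PetscReal_2_1(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed)

   in which M = 1 and MBS = 2 are compile-time constants, so the innermost k-loop has a constant trip
   count of 2 and can be unrolled or vectorized. For bs = 6 one would instead instantiate BS = 2 with
   EQ = 0, giving the run-time value M = bs/BS = 3. */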

 83: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
 84:                 and inserts it into a sparse array.

 86:    Arguments:
 87:   .Type       Type of the data
 88:   .BS         Block size for vectorization
 89:   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

 91:   Notes:
 92:    This macro is not combined with DEF_UnpackAndOp because here we want to use PetscArraycpy() (i.e., memcpy) for the plain insert.
 93:  */
 94: #define DEF_UnpackFunc(Type, BS, EQ) \
 95:   static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
 96:   { \
 97:     Type          *u = (Type *)unpacked, *u2; \
 98:     const Type    *p = (const Type *)packed; \
 99:     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
100:     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
101:     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
102:     if (!idx) { \
103:       u += start * MBS; \
104:       if (u != p) PetscArraycpy(u, p, count *MBS); \
105:     } else if (opt) { /* has optimizations available */ \
106:       for (r = 0; r < opt->n; r++) { \
107:         u2 = u + opt->start[r] * MBS; \
108:         X  = opt->X[r]; \
109:         Y  = opt->Y[r]; \
110:         for (k = 0; k < opt->dz[r]; k++) \
111:           for (j = 0; j < opt->dy[r]; j++) { \
112:             PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS); \
113:             p += opt->dx[r] * MBS; \
114:           } \
115:       } \
116:     } else { \
117:       for (i = 0; i < count; i++) \
118:         for (j = 0; j < M; j++) \
119:           for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
120:     } \
121:     return 0; \
122:   }
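
/* A hedged reading of the per-pack optimization plan used above (fields opt->start/X/Y/dx/dy/dz), as
   suggested by the loop structure: if idx[] enumerates a dx x dy x dz sub-block of an X x Y x Z array,
   say a 2x2x2 block of a 4x4x4 array, then opt->n = 1, opt->X[0] = opt->Y[0] = 4 and
   opt->dx[0] = opt->dy[0] = opt->dz[0] = 2, so the code performs dy*dz = 4 contiguous PetscArraycpy
   calls of dx*MBS entries each instead of count = 8 strided single-entry copies. */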

124: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

126:    Arguments:
127:   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
128:   .Type       Type of the data
129:   .BS         Block size for vectorization
130:   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
131:   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
132:   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
133:  */
134: #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
135:   static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
136:   { \
137:     Type          *u = (Type *)unpacked, *u2; \
138:     const Type    *p = (const Type *)packed; \
139:     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
140:     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
141:     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
142:     if (!idx) { \
143:       u += start * MBS; \
144:       for (i = 0; i < count; i++) \
145:         for (j = 0; j < M; j++) \
146:           for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
147:     } else if (opt) { /* idx[] has patterns */ \
148:       for (r = 0; r < opt->n; r++) { \
149:         u2 = u + opt->start[r] * MBS; \
150:         X  = opt->X[r]; \
151:         Y  = opt->Y[r]; \
152:         for (k = 0; k < opt->dz[r]; k++) \
153:           for (j = 0; j < opt->dy[r]; j++) { \
154:             for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
155:             p += opt->dx[r] * MBS; \
156:           } \
157:       } \
158:     } else { \
159:       for (i = 0; i < count; i++) \
160:         for (j = 0; j < M; j++) \
161:           for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
162:     } \
163:     return 0; \
164:   }

166: #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
167:   static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
168:   { \
169:     Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
170:     PetscInt       i, j, k, r, l, bs = link->bs; \
171:     const PetscInt M   = (EQ) ? 1 : bs / BS; \
172:     const PetscInt MBS = M * BS; \
173:     for (i = 0; i < count; i++) { \
174:       r = (!idx ? start + i : idx[i]) * MBS; \
175:       l = i * MBS; \
176:       for (j = 0; j < M; j++) \
177:         for (k = 0; k < BS; k++) { \
178:           tmp = u[r + j * BS + k]; \
179:           OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
180:           p[l + j * BS + k] = tmp; \
181:         } \
182:     } \
183:     return 0; \
184:   }
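
/* FetchAndOp semantics in scalar form (a sketch, not additional generated code): for each entry the
   packed buffer receives the old value of the unpacked array while the array is updated with Op.
   With Op being + :

     tmp  = u[r];          // old root value
     u[r] = u[r] + p[l];   // root is updated
     p[l] = tmp;           // caller reads back the pre-update value

   This is the primitive that PetscSFFetchAndOpBegin/End build on. */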

186: #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
187:   static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
188:   { \
189:     const Type    *u = (const Type *)src; \
190:     Type          *v = (Type *)dst; \
191:     PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
192:     const PetscInt M   = (EQ) ? 1 : bs / BS; \
193:     const PetscInt MBS = M * BS; \
194:     if (!srcIdx) { /* src is contiguous */ \
195:       u += srcStart * MBS; \
196:       CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u); \
197:     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
198:       u += srcOpt->start[0] * MBS; \
199:       v += dstStart * MBS; \
200:       X = srcOpt->X[0]; \
201:       Y = srcOpt->Y[0]; \
202:       for (k = 0; k < srcOpt->dz[0]; k++) \
203:         for (j = 0; j < srcOpt->dy[0]; j++) { \
204:           for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
205:           v += srcOpt->dx[0] * MBS; \
206:         } \
207:     } else { /* all other cases */ \
208:       for (i = 0; i < count; i++) { \
209:         s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
210:         t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
211:         for (j = 0; j < M; j++) \
212:           for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
213:       } \
214:     } \
215:     return 0; \
216:   }
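
/* ScatterAndOp applies Op while moving data directly from a source array to a destination array (the
   local, rank-to-self path), skipping the pack-to-buffer step. A minimal sketch of the index-free case
   with Op = + : for i in [0, count), each of the MBS basic units of entry srcStart+i is added
   element-wise into the corresponding unit of entry dstStart+i. */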

218: #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
219:   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
220:   { \
221:     Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
222:     const Type    *ldata = (const Type *)leafdata; \
223:     PetscInt       i, j, k, r, l, bs = link->bs; \
224:     const PetscInt M   = (EQ) ? 1 : bs / BS; \
225:     const PetscInt MBS = M * BS; \
226:     for (i = 0; i < count; i++) { \
227:       r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
228:       l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
229:       for (j = 0; j < M; j++) \
230:         for (k = 0; k < BS; k++) { \
231:           lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
232:           OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
233:         } \
234:     } \
235:     return 0; \
236:   }

238: /* Pack, Unpack/Fetch ops */
239: #define DEF_Pack(Type, BS, EQ) \
240:   DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
241:   { \
242:     link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
243:     link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
244:     link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
245:   }

247: /* Add, Mult ops */
248: #define DEF_Add(Type, BS, EQ) \
249:   DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
250:   { \
251:     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
252:     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
253:     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
254:     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
255:     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
256:     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
257:   }

259: /* Max, Min ops */
260: #define DEF_Cmp(Type, BS, EQ) \
261:   DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
262:   { \
263:     link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
264:     link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
265:     link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
266:     link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
267:   }

269: /* Logical ops.
270:   The operator passed to OP_LXOR is not used; ideally that argument would be empty, but we pass || to avoid
271:   the compiler warning "empty macro arguments are undefined in ISO C90".
272:  */
273: #define DEF_Log(Type, BS, EQ) \
274:   DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
275:   { \
276:     link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
277:     link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
278:     link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
279:     link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
280:     link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
281:     link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
282:   }

284: /* Bitwise ops */
285: #define DEF_Bit(Type, BS, EQ) \
286:   DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
287:   { \
288:     link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
289:     link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
290:     link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
291:     link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
292:     link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
293:     link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
294:   }

296: /* Maxloc, Minloc ops */
297: #define DEF_Xloc(Type, BS, EQ) \
298:   DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
299:   { \
300:     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
301:     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
302:     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
303:     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
304:   }

306: #define DEF_IntegerType(Type, BS, EQ) \
307:   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
308:   { \
309:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
310:     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
311:     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
312:     CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
313:     CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
314:   }

316: #define DEF_RealType(Type, BS, EQ) \
317:   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
318:   { \
319:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
320:     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
321:     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
322:   }

324: #if defined(PETSC_HAVE_COMPLEX)
325:   #define DEF_ComplexType(Type, BS, EQ) \
326:     DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
327:     { \
328:       CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
329:       CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
330:     }
331: #endif

333: #define DEF_DumbType(Type, BS, EQ) \
334:   DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
335:   { \
336:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
337:   }

339: /* Maxloc, Minloc */
340: #define DEF_PairType(Type, BS, EQ) \
341:   DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
342:   { \
343:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
344:     CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
345:   }

347: DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
348:   DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
349:   DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
350:   DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
351:   DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
352:   DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
353:   DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
354:   DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

356: #if defined(PETSC_USE_64BIT_INDICES) /* No need (though it would be harmless) to generate redundant functions if PetscInt is int */
357:   DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
358: #endif

360:   /* The typedefs provide type names without spaces, which CPPJoin4 can handle */
361:   typedef signed char SignedChar;
362: DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)

364:   typedef unsigned char UnsignedChar;
365: DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)

367:   DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
368: #if defined(PETSC_HAVE_COMPLEX)
369:     DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
370: #endif

372: #define PairType(Type1, Type2) Type1##_##Type2
373:       typedef struct {
374:   int u;
375:   int i;
376: } PairType(int, int);
377: typedef struct {
378:   PetscInt u;
379:   PetscInt i;
380: } PairType(PetscInt, PetscInt);
381: DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)

383:   /* If we don't know the basic type, we treat it as a stream of chars or ints */
384:   DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)

386:     typedef int DumbInt; /* A name distinct from the 'int' used above; the name is used to build routine names. */
387: DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)

389:   PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
390: {
391:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
392:   PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;

394:   /* Destroy device-specific fields */
395:   if (link->deviceinited) (*link->Destroy)(sf, link);

397:   /* Destroy host related fields */
398:   if (!link->isbuiltin) MPI_Type_free(&link->unit);
399:   if (!link->use_nvshmem) {
400:     for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
401:       if (link->reqs[i] != MPI_REQUEST_NULL) MPI_Request_free(&link->reqs[i]);
402:     }
403:     PetscFree(link->reqs);
404:     for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
405:       PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
406:       PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
407:     }
408:   }
409:   PetscFree(link);
410:   return 0;
411: }

413: PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
414: {
415:   PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata);
416: #if defined(PETSC_HAVE_NVSHMEM)
417:   {
418:     PetscBool use_nvshmem;
419:     PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem);
420:     if (use_nvshmem) {
421:       PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink);
422:       return 0;
423:     }
424:   }
425: #endif
426:   PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink);
427:   return 0;
428: }

430: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
431:    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
432: */
433: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
434: {
435:   PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
436:   PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
437:   const PetscInt    *rootoffset, *leafoffset;
438:   MPI_Aint           disp;
439:   MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
440:   MPI_Datatype       unit          = link->unit;
441:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
442:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

444:   /* Initialize persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI */
445:   if (sf->persistent) {
446:     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
447:       PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL);
448:       if (direction == PETSCSF_LEAF2ROOT) {
449:         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
450:           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
451:           cnt  = rootoffset[i + 1] - rootoffset[i];
452:           MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j);
453:         }
454:       } else { /* PETSCSF_ROOT2LEAF */
455:         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
456:           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
457:           cnt  = rootoffset[i + 1] - rootoffset[i];
458:           MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j);
459:         }
460:       }
461:       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
462:     }

464:     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
465:       PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL);
466:       if (direction == PETSCSF_LEAF2ROOT) {
467:         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
468:           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
469:           cnt  = leafoffset[i + 1] - leafoffset[i];
470:           MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j);
471:         }
472:       } else { /* PETSCSF_ROOT2LEAF */
473:         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
474:           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
475:           cnt  = leafoffset[i + 1] - leafoffset[i];
476:           MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j);
477:         }
478:       }
479:       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
480:     }
481:   }
482:   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
483:   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
484:   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
485:   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
486:   return 0;
487: }
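
/* A hypothetical caller sketch (the real SFBasic callers may differ in detail): a communication-start
   routine packs the source data, fetches the buffers/requests for the direction, and starts the
   persistent requests. The counts nrootreqs/nleafreqs below stand in for the per-direction numbers of
   requests. */
#if 0 /* illustrative only, not compiled */
  void        *rootbuf, *leafbuf;
  MPI_Request *rootreqs, *leafreqs;
  PetscSFLinkGetMPIBuffersAndRequests(sf, link, PETSCSF_ROOT2LEAF, &rootbuf, &leafbuf, &rootreqs, &leafreqs);
  MPI_Startall(nleafreqs, leafreqs); /* post the receives on leaves first */
  MPI_Startall(nrootreqs, rootreqs); /* then start the sends from roots */
#endif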

489: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
490: {
491:   PetscSFLink    link, *p;
492:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

494:   /* Look for a link in the in-use list that matches the given unit, rootdata, and leafdata */
495:   for (p = &bas->inuse; (link = *p); p = &link->next) {
496:     PetscBool match;
497:     MPIPetsc_Type_compare(unit, link->unit, &match);
498:     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
499:       switch (cmode) {
500:       case PETSC_OWN_POINTER:
501:         *p = link->next;
502:         break; /* Remove from inuse list */
503:       case PETSC_USE_POINTER:
504:         break;
505:       default:
506:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
507:       }
508:       *mylink = link;
509:       return 0;
510:     }
511:   }
512:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
513: }

515: PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
516: {
517:   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
518:   PetscSFLink    link = *mylink;

520:   link->rootdata = NULL;
521:   link->leafdata = NULL;
522:   link->next     = bas->avail;
523:   bas->avail     = link;
524:   *mylink        = NULL;
525:   return 0;
526: }
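
/* Link life cycle, as implied by the routines above: a link is created (or taken from bas->avail) by
   PetscSFLinkCreate, sits on bas->inuse while a communication is pending, is located again by
   PetscSFLinkGetInUse at the End stage, is returned to bas->avail by PetscSFLinkReclaim, and is freed
   for good by PetscSFLinkDestroy. */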

528: /* Error out on unsupported overlapped communications */
529: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
530: {
531:   PetscSFLink    link, *p;
532:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
533:   PetscBool      match;

535:   if (PetscDefined(USE_DEBUG)) {
536:     /* Look up the links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
537:        the potential overlap since this process does not participate in the communication, making the overlap harmless.
538:     */
539:     if (rootdata || leafdata) {
540:       for (p = &bas->inuse; (link = *p); p = &link->next) {
541:         MPIPetsc_Type_compare(unit, link->unit, &match);
542:         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
543:       }
544:     }
545:   }
546:   return 0;
547: }

549: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
550: {
551:   if (n) PetscMemcpy(dst, src, n);
552:   return 0;
553: }

555: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
556: {
557:   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
558:   PetscBool   is2Int, is2PetscInt;
559:   PetscMPIInt ni, na, nd, combiner;
560: #if defined(PETSC_HAVE_COMPLEX)
561:   PetscInt nPetscComplex = 0;
562: #endif

564:   MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar);
565:   MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar);
566:   /* MPI_CHAR is treated below as a dumb type that, according to the MPI standard, does not support reductions */
567:   MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt);
568:   MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt);
569:   MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal);
570: #if defined(PETSC_HAVE_COMPLEX)
571:   MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex);
572: #endif
573:   MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int);
574:   MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt);
575:   /* TODO: shall we also handle Fortran MPI_2REAL? */
576:   MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner);
577:   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
578:   link->bs        = 1;                                                           /* default */

580:   if (is2Int) {
581:     PackInit_PairType_int_int_1_1(link);
582:     link->bs        = 1;
583:     link->unitbytes = 2 * sizeof(int);
584:     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
585:     link->basicunit = MPI_2INT;
586:     link->unit      = MPI_2INT;
587:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
588:     PackInit_PairType_PetscInt_PetscInt_1_1(link);
589:     link->bs        = 1;
590:     link->unitbytes = 2 * sizeof(PetscInt);
591:     link->basicunit = MPIU_2INT;
592:     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
593:     link->unit      = MPIU_2INT;
594:   } else if (nPetscReal) {
595:     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
596:     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
597:     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
598:     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
599:     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
600:     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
601:     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
602:     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
603:     link->bs        = nPetscReal;
604:     link->unitbytes = nPetscReal * sizeof(PetscReal);
605:     link->basicunit = MPIU_REAL;
606:     if (link->bs == 1) {
607:       link->isbuiltin = PETSC_TRUE;
608:       link->unit      = MPIU_REAL;
609:     }
610:   } else if (nPetscInt) {
611:     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
612:     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
613:     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
614:     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
615:     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
616:     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
617:     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
618:     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
619:     link->bs        = nPetscInt;
620:     link->unitbytes = nPetscInt * sizeof(PetscInt);
621:     link->basicunit = MPIU_INT;
622:     if (link->bs == 1) {
623:       link->isbuiltin = PETSC_TRUE;
624:       link->unit      = MPIU_INT;
625:     }
626: #if defined(PETSC_USE_64BIT_INDICES)
627:   } else if (nInt) {
628:     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
629:     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
630:     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
631:     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
632:     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
633:     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
634:     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
635:     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
636:     link->bs        = nInt;
637:     link->unitbytes = nInt * sizeof(int);
638:     link->basicunit = MPI_INT;
639:     if (link->bs == 1) {
640:       link->isbuiltin = PETSC_TRUE;
641:       link->unit      = MPI_INT;
642:     }
643: #endif
644:   } else if (nSignedChar) {
645:     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
646:     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
647:     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
648:     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
649:     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
650:     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
651:     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
652:     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
653:     link->bs        = nSignedChar;
654:     link->unitbytes = nSignedChar * sizeof(SignedChar);
655:     link->basicunit = MPI_SIGNED_CHAR;
656:     if (link->bs == 1) {
657:       link->isbuiltin = PETSC_TRUE;
658:       link->unit      = MPI_SIGNED_CHAR;
659:     }
660:   } else if (nUnsignedChar) {
661:     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
662:     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
663:     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
664:     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
665:     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
666:     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
667:     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
668:     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
669:     link->bs        = nUnsignedChar;
670:     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
671:     link->basicunit = MPI_UNSIGNED_CHAR;
672:     if (link->bs == 1) {
673:       link->isbuiltin = PETSC_TRUE;
674:       link->unit      = MPI_UNSIGNED_CHAR;
675:     }
676: #if defined(PETSC_HAVE_COMPLEX)
677:   } else if (nPetscComplex) {
678:     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
679:     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
680:     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
681:     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
682:     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
683:     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
684:     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
685:     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
686:     link->bs        = nPetscComplex;
687:     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
688:     link->basicunit = MPIU_COMPLEX;
689:     if (link->bs == 1) {
690:       link->isbuiltin = PETSC_TRUE;
691:       link->unit      = MPIU_COMPLEX;
692:     }
693: #endif
694:   } else {
695:     MPI_Aint lb, nbyte;
696:     MPI_Type_get_extent(unit, &lb, &nbyte);
698:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
699:       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
700:       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
701:       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
702:       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
703:       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
704:       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
705:       link->bs        = nbyte;
706:       link->unitbytes = nbyte;
707:       link->basicunit = MPI_BYTE;
708:     } else {
709:       nInt = nbyte / sizeof(int);
710:       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
711:       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
712:       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
713:       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
714:       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
715:       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
716:       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
717:       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
718:       link->bs        = nInt;
719:       link->unitbytes = nbyte;
720:       link->basicunit = MPI_INT;
721:     }
722:     if (link->isbuiltin) link->unit = unit;
723:   }

725:   if (!link->isbuiltin) MPI_Type_dup(unit, &link->unit);

727:   link->Memcpy = PetscSFLinkMemcpy_Host;
728:   return 0;
729: }
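
/* A worked example of the dispatch above (a sketch, no additional functionality): if unit is a
   contiguous type of 6 MPIU_REALs, then nPetscReal = 6 and the chain 6 != 8, 6 % 8 != 0, 6 != 4,
   6 % 4 != 0, 6 != 2, 6 % 2 == 0 selects PackInit_RealType_PetscReal_2_0, i.e., BS = 2 with EQ = 0,
   so the pack kernels loop M = bs/BS = 3 times over vectorizable blocks of 2 reals. Larger BS is tried
   first to maximize the vector width; here link->bs = 6 and link->unitbytes = 6*sizeof(PetscReal). */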

731: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
732: {
733:   *UnpackAndOp = NULL;
734:   if (PetscMemTypeHost(mtype)) {
735:     if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
736:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
737:     else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
738:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
739:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
740:     else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
741:     else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
742:     else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
743:     else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
744:     else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
745:     else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
746:     else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
747:     else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
748:   }
749: #if defined(PETSC_HAVE_DEVICE)
750:   else if (PetscMemTypeDevice(mtype) && !atomic) {
751:     if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
752:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
753:     else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
754:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
755:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
756:     else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
757:     else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
758:     else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
759:     else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
760:     else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
761:     else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
762:     else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
763:     else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
764:   } else if (PetscMemTypeDevice(mtype) && atomic) {
765:     if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
766:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
767:     else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
768:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
769:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
770:     else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
771:     else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
772:     else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
773:     else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
774:     else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
775:     else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
776:     else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
777:     else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
778:   }
779: #endif
780:   return 0;
781: }

783: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
784: {
785:   *ScatterAndOp = NULL;
786:   if (PetscMemTypeHost(mtype)) {
787:     if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
788:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
789:     else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
790:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
791:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
792:     else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
793:     else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
794:     else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
795:     else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
796:     else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
797:     else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
798:     else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
799:     else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
800:   }
801: #if defined(PETSC_HAVE_DEVICE)
802:   else if (PetscMemTypeDevice(mtype) && !atomic) {
803:     if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
804:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
805:     else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
806:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
807:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
808:     else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
809:     else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
810:     else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
811:     else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
812:     else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
813:     else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
814:     else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
815:     else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
816:   } else if (PetscMemTypeDevice(mtype) && atomic) {
817:     if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
818:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
819:     else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
820:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
821:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
822:     else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
823:     else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
824:     else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
825:     else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
826:     else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
827:     else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
828:     else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
829:     else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
830:   }
831: #endif
832:   return 0;
833: }

835: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
836: {
837:   *FetchAndOp = NULL;
838:   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No FetchAndOp for this MPI_Op");
839:   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
840: #if defined(PETSC_HAVE_DEVICE)
841:   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
842:   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
843: #endif
844:   return 0;
845: }

847: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
848: {
849:   *FetchAndOpLocal = NULL;
850:   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No FetchAndOpLocal for this MPI_Op");
851:   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
852: #if defined(PETSC_HAVE_DEVICE)
853:   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
854:   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
855: #endif
856:   return 0;
857: }

859: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
860: {
861:   PetscLogDouble flops;
862:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

864:   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
865:     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
866: #if defined(PETSC_HAVE_DEVICE)
867:     if (PetscMemTypeDevice(link->rootmtype)) PetscLogGpuFlops(flops);
868:     else
869: #endif
870:       PetscLogFlops(flops);
871:   }
872:   return 0;
873: }

875: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
876: {
877:   PetscLogDouble flops;

879:   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
880:     flops = sf->leafbuflen[scope] * link->bs;                /* # of leaves in buffer x # of scalars in unit */
881: #if defined(PETSC_HAVE_DEVICE)
882:     if (PetscMemTypeDevice(link->leafmtype)) PetscLogGpuFlops(flops);
883:     else
884: #endif
885:       PetscLogFlops(flops);
886:   }
887:   return 0;
888: }
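
/* Example of the flop accounting above: unpacking 1000 roots with MPI_SUM, where unit is a contiguous
   type of bs = 3 PetscScalars, logs 1000*3 = 3000 flops, attributed to the GPU when the data lives in
   device memory. */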

890: /* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.
891:   Input Parameters:
892:   +sf      - The StarForest
893:   .link    - The link
894:   .count   - Number of entries to unpack
895:   .start   - The first index, significant when indices=NULL
896:   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
897:   .buf     - A contiguous buffer to unpack from
898:   -op      - Operation after unpack

900:   Output Parameters:
901:   .data    - The data to unpack to
902: */
903: static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
904: {
905: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
906:   {
907:     PetscInt i;
908:     if (indices) {
909:       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
910:          element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
911:       */
912:       for (i = 0; i < count; i++) MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op);
913:     } else {
914:       MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op);
915:     }
916:   }
917:   return 0;
918: #else
919:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
920: #endif
921: }
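
/* When does this fallback trigger? A hedged example: a user-defined MPI_Op has no matching
   h_UnpackAndOp entry, so each received element is reduced into the destination with MPI_Reduce_local
   on the whole link->unit. */
#if 0 /* illustrative only, not compiled */
static void MyMax(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
  double *x = (double *)in, *y = (double *)inout;
  for (int i = 0; i < *len; i++) y[i] = x[i] > y[i] ? x[i] : y[i];
}
/* MPI_Op userop;
   MPI_Op_create(MyMax, 1, &userop);
   A subsequent PetscSFReduceBegin/End(sf, unit, leafdata, rootdata, userop) would take this path. */
#endif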

923: static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
924: {
925: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
926:   {
927:     PetscInt i, disp;
928:     if (!srcIdx) {
929:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op);
930:     } else {
931:       for (i = 0; i < count; i++) {
932:         disp = dstIdx ? dstIdx[i] : dstStart + i;
933:         MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op);
934:       }
935:     }
936:   }
937:   return 0;
938: #else
939:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
940: #endif
941: }

943: /*=============================================================================
944:               Pack/Unpack/Fetch/Scatter routines
945:  ============================================================================*/

947: /* Pack rootdata to rootbuf
948:   Input Parameters:
949:   + sf       - The SF this packing works on.
950:   . link     - It gives the memtype of the roots and also provides root buffer.
951:   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
952:   - rootdata - Where to read the roots.

954:   Notes:
955:   When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
956:   in a place where the underlying MPI is ready to access it (whether use_gpu_aware_mpi is set or not).
957:  */
958: PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
959: {
960:   const PetscInt *rootindices = NULL;
961:   PetscInt        count, start;
962:   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
963:   PetscMemType   rootmtype                                                                                        = link->rootmtype;
964:   PetscSFPackOpt opt                                                                                              = NULL;

966:   PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
967:   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
968:     PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices);
969:     PetscSFLinkGetPack(link, rootmtype, &Pack);
970:     (*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]);
971:   }
972:   PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
973:   return 0;
974: }

976: /* Pack leafdata to leafbuf */
977: PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
978: {
979:   const PetscInt *leafindices = NULL;
980:   PetscInt        count, start;
981:   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
982:   PetscMemType   leafmtype                                                                                        = link->leafmtype;
983:   PetscSFPackOpt opt                                                                                              = NULL;

985:   PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
986:   if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
987:     PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices);
988:     PetscSFLinkGetPack(link, leafmtype, &Pack);
989:     (*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]);
990:   }
991:   PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
992:   return 0;
993: }

995: /* Pack rootdata to rootbuf, which are in the same memory space */
996: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
997: {
998:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

1000:   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on PETSc's default stream */
1001:     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1002:     if (link->PrePack) (*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF); /* Used by SF nvshmem */
1003:   }
1004:   PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
1005:   if (bas->rootbuflen[scope]) PetscSFLinkPackRootData_Private(sf, link, scope, rootdata);
1006:   PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
1007:   return 0;
1008: }
1009: /* Pack leafdata to leafbuf, which are in the same memory space */
1010: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1011: {
1012:   if (scope == PETSCSF_REMOTE) {
1013:     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1014:     if (link->PrePack) (*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT); /* Used by SF nvshmem */
1015:   }
1016:   PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
1017:   if (sf->leafbuflen[scope]) PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata);
1018:   PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
1019:   return 0;
1020: }

1022: PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1023: {
1024:   const PetscInt *rootindices = NULL;
1025:   PetscInt        count, start;
1026:   PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
1027:   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1028:   PetscMemType   rootmtype                                                                                               = link->rootmtype;
1029:   PetscSFPackOpt opt                                                                                                     = NULL;

1031:   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1032:     PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp);
1033:     if (UnpackAndOp) {
1034:       PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices);
1035:       (*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]);
1036:     } else {
1037:       PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices);
1038:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op);
1039:     }
1040:   }
1041:   PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op);
1042:   return 0;
1043: }

1045: PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1046: {
1047:   const PetscInt *leafindices = NULL;
1048:   PetscInt        count, start;
1049:   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1050:   PetscMemType   leafmtype                                                                                               = link->leafmtype;
1051:   PetscSFPackOpt opt                                                                                                     = NULL;

1053:   if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1054:     PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp);
1055:     if (UnpackAndOp) {
1056:       PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices);
1057:       (*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]);
1058:     } else {
1059:       PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices);
1060:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op);
1061:     }
1062:   }
1063:   PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op);
1064:   return 0;
1065: }
1066: /* Unpack rootbuf to rootdata, which are in the same memory space */
1067: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1068: {
1069:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

1071:   PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0);
1072:   if (bas->rootbuflen[scope]) PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op);
1073:   PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0);
1074:   if (scope == PETSCSF_REMOTE) {
1075:     if (link->PostUnpack) (*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT); /* Used by SF nvshmem */
1076:     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1077:   }
1078:   return 0;
1079: }

1081: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1082: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1083: {
1084:   PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0);
1085:   if (sf->leafbuflen[scope]) PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op);
1086:   PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0);
1087:   if (scope == PETSCSF_REMOTE) {
1088:     if (link->PostUnpack) (*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF); /* Used by SF nvshmem */
1089:     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1090:   }
1091:   return 0;
1092: }
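/* Note on the fallback branches above: PetscSFLinkGetUnpackAndOp() may return NULL,
   presumably when no native kernel exists for the (type, op) combination, e.g., a
   user-defined MPI_Op. The fallback then works on host indices and reduces entry by
   entry through MPI, roughly (a conceptual view of PetscSFLinkUnpackDataWithMPIReduceLocal()):

     for (i = 0; i < count; i++)
       MPI_Reduce_local((char *)buf + i * link->unitbytes, (char *)data + idx[i] * link->unitbytes, 1, link->unit, op);
*/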

1094: /* FetchAndOp rootdata with rootbuf; it is a kind of unpack on rootdata, except that it also updates rootbuf with the pre-op root values */
1095: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
1096: {
1097:   const PetscInt *rootindices = NULL;
1098:   PetscInt        count, start;
1099:   PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
1100:   PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1101:   PetscMemType   rootmtype                                                                                        = link->rootmtype;
1102:   PetscSFPackOpt opt                                                                                              = NULL;

1104:   PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0);
1105:   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1106:     /* Do FetchAndOp on rootdata with rootbuf */
1107:     PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp);
1108:     PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices);
1109:     (*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]);
1110:   }
1111:   PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op);
1112:   PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0);
1113:   return 0;
1114: }
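/* Semantics sketch, assuming op = MPI_SUM: for each remote root entry r paired with a
   packed value v in rootbuf, the FetchAndOp kernel conceptually does

     old = rootdata[r];  rootdata[r] = old + v;  v_in_rootbuf = old;

   so the buffer later sent back to the leaves carries the pre-update root values. */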

1116: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1117: {
1118:   const PetscInt *rootindices = NULL, *leafindices = NULL;
1119:   PetscInt        count, rootstart, leafstart;
1120:   PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
1121:   PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
1122:   PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1123:   PetscSFPackOpt leafopt = NULL, rootopt = NULL;
1124:   PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
1125:   char          *srcbuf = NULL, *dstbuf = NULL;
1126:   PetscBool      dstdups;

1128:   if (!buflen) return 0;
1129:   if (rootmtype != leafmtype) { /* The cross-memory-space local scatter is done by pack, copy and unpack */
1130:     if (direction == PETSCSF_ROOT2LEAF) {
1131:       PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata);
1132:       srcmtype = rootmtype;
1133:       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1134:       dstmtype = leafmtype;
1135:       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1136:     } else {
1137:       PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata);
1138:       srcmtype = leafmtype;
1139:       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1140:       dstmtype = rootmtype;
1141:       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1142:     }
1143:     (*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes);
1144:     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1145:     if (PetscMemTypeHost(dstmtype)) (*link->SyncStream)(link);
1146:     if (direction == PETSCSF_ROOT2LEAF) {
1147:       PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op);
1148:     } else {
1149:       PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op);
1150:     }
1151:   } else {
1152:     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1153:     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1154:     PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp);
1155:     if (ScatterAndOp) {
1156:       PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices);
1157:       PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices);
1158:       if (direction == PETSCSF_ROOT2LEAF) {
1159:         (*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata);
1160:       } else {
1161:         (*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata);
1162:       }
1163:     } else {
1164:       PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices);
1165:       PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices);
1166:       if (direction == PETSCSF_ROOT2LEAF) {
1167:         PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op);
1168:       } else {
1169:         PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op);
1170:       }
1171:     }
1172:   }
1173:   return 0;
1174: }
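/* Worked trace of the cross-memory-space branch above, assuming rootmtype is
   PETSC_MEMTYPE_DEVICE, leafmtype is PETSC_MEMTYPE_HOST and direction is
   PETSCSF_ROOT2LEAF:
     1) PetscSFLinkPackRootData():   a device kernel packs rootdata into rootbuf;
     2) link->Memcpy():              device-to-host copy of rootbuf into leafbuf;
     3) link->SyncStream():          required because the destination is host memory;
     4) PetscSFLinkUnpackLeafData(): host code applies op from leafbuf into leafdata. */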

1176: /* Fetch local root values into leafupdate, then update rootdata with leafdata using op, all locally */
1177: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1178: {
1179:   const PetscInt *rootindices = NULL, *leafindices = NULL;
1180:   PetscInt        count, rootstart, leafstart;
1181:   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1182:   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1183:   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1184:   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;

1186:   if (!bas->rootbuflen[PETSCSF_LOCAL]) return 0;
1187:   if (rootmtype != leafmtype) {
1188:     /* The local communication has to go through pack and unpack */
1189:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1190:   } else {
1191:     PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices);
1192:     PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices);
1193:     PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal);
1194:     (*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate);
1195:   }
1196:   return 0;
1197: }
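/* For the local variant the fetched values land directly in leafupdate rather than
   in a buffer, i.e., per local edge (root r, leaf l) the fused kernel conceptually
   performs

     leafupdate[l] = rootdata[r];  rootdata[r] = op(rootdata[r], leafdata[l]);
*/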

1199: /*
1200:   Create per-rank pack/unpack optimizations based on index patterns

1202:    Input Parameters:
1203:   +  n       - Number of destination ranks
1204:   +  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1205:   -  idx     - [*]   Array storing indices

1207:    Output Parameter:
1208:   .  out     - Pack optimizations, or NULL if the indices do not match an optimizable pattern
1209: */
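/* Example of a pattern the analysis below recognizes: for one rank (n = 1) with
   offset = {0, 12} and idx = {4,5,6, 14,15,16, 24,25,26, 34,35,36}, it finds
   dx = 3 (length of a consecutive run), X = 10 (stride between runs), dy = 4,
   dz = 1 and Y = dy, i.e., a 3x4 sub-block of a 10-column 2D array starting at
   entry 4, which Pack/Unpack can then traverse with strided copies instead of
   gathering through idx[]. */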
1210: PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1211: {
1212:   PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1213:   PetscBool      optimizable = PETSC_TRUE;
1214:   PetscSFPackOpt opt;

1216:   PetscMalloc1(1, &opt);
1217:   PetscMalloc1(7 * n + 2, &opt->array);
1218:   opt->n = opt->array[0] = n;
1219:   opt->offset            = opt->array + 1;
1220:   opt->start             = opt->array + n + 2;
1221:   opt->dx                = opt->array + 2 * n + 2;
1222:   opt->dy                = opt->array + 3 * n + 2;
1223:   opt->dz                = opt->array + 4 * n + 2;
1224:   opt->X                 = opt->array + 5 * n + 2;
1225:   opt->Y                 = opt->array + 6 * n + 2;

1227:   for (r = 0; r < n; r++) {            /* For each destination rank */
1228:     m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1229:     p     = offset[r];
1230:     start = idx[p]; /* First index for this rank */
1231:     p++;

1233:     /* Search in X dimension */
1234:     for (dx = 1; dx < m; dx++, p++) {
1235:       if (start + dx != idx[p]) break;
1236:     }

1238:     dydz = m / dx;
1239:     X    = dydz > 1 ? (idx[p] - start) : dx;
1240:     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1241:     if (m % dx || X <= 0) {
1242:       optimizable = PETSC_FALSE;
1243:       goto finish;
1244:     }
1245:     for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1246:       for (i = 0; i < dx; i++, p++) {
1247:         if (start + X * dy + i != idx[p]) {
1248:           if (i) {
1249:             optimizable = PETSC_FALSE;
1250:             goto finish;
1251:           } /* The pattern is violated in the middle of an x-walk */
1252:           else
1253:             goto Z_dimension;
1254:         }
1255:       }
1256:     }

1258:   Z_dimension:
1259:     dz = m / (dx * dy);
1260:     Y  = dz > 1 ? (idx[p] - start) / X : dy;
1261:     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1262:     if (m % (dx * dy) || Y <= 0) {
1263:       optimizable = PETSC_FALSE;
1264:       goto finish;
1265:     }
1266:     for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1267:       for (j = 0; j < dy; j++) {
1268:         for (i = 0; i < dx; i++, p++) {
1269:           if (start + X * Y * k + X * j + i != idx[p]) {
1270:             optimizable = PETSC_FALSE;
1271:             goto finish;
1272:           }
1273:         }
1274:       }
1275:     }
1276:     opt->start[r] = start;
1277:     opt->dx[r]    = dx;
1278:     opt->dy[r]    = dy;
1279:     opt->dz[r]    = dz;
1280:     opt->X[r]     = X;
1281:     opt->Y[r]     = Y;
1282:   }

1284: finish:
1285:   /* If there is nothing to pack (n == 0) or the indices are not optimizable, free the arrays to save memory */
1286:   if (!n || !optimizable) {
1287:     PetscFree(opt->array);
1288:     PetscFree(opt);
1289:     *out = NULL;
1290:   } else {
1291:     opt->offset[0] = 0;
1292:     for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1293:     *out = opt;
1294:   }
1295:   return 0;
1296: }

1298: static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1299: {
1300:   PetscSFPackOpt opt = *out;

1302:   if (opt) {
1303:     PetscSFFree(sf, mtype, opt->array);
1304:     PetscFree(opt);
1305:     *out = NULL;
1306:   }
1307:   return 0;
1308: }

1310: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1311: {
1312:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1313:   PetscInt       i, j;

1315:   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1316:   for (i = 0; i < 2; i++) { /* Set defaults */
1317:     sf->leafstart[i]   = 0;
1318:     sf->leafcontig[i]  = PETSC_TRUE;
1319:     sf->leafdups[i]    = PETSC_FALSE;
1320:     bas->rootstart[i]  = 0;
1321:     bas->rootcontig[i] = PETSC_TRUE;
1322:     bas->rootdups[i]   = PETSC_FALSE;
1323:   }

1325:   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1326:   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

1328:   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1329:   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

1331:   /* Are leaf indices for self and remote contiguous? If so, packing/unpacking reduces to a single contiguous copy */
1332:   for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
1333:     if (sf->rmine[i] != sf->leafstart[0] + i) {
1334:       sf->leafcontig[0] = PETSC_FALSE;
1335:       break;
1336:     }
1337:   }
1338:   for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
1339:     if (sf->rmine[i] != sf->leafstart[1] + j) {
1340:       sf->leafcontig[1] = PETSC_FALSE;
1341:       break;
1342:     }
1343:   }

1345:   /* If not, see if we can have per-rank optimizations by doing index analysis */
1346:   if (!sf->leafcontig[0]) PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);
1347:   if (!sf->leafcontig[1]) PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]);

1349:   /* Are root indices for self and remote contiguous? */
1350:   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1351:   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

1353:   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1354:   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

1356:   for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
1357:     if (bas->irootloc[i] != bas->rootstart[0] + i) {
1358:       bas->rootcontig[0] = PETSC_FALSE;
1359:       break;
1360:     }
1361:   }
1362:   for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
1363:     if (bas->irootloc[i] != bas->rootstart[1] + j) {
1364:       bas->rootcontig[1] = PETSC_FALSE;
1365:       break;
1366:     }
1367:   }

1369:   if (!bas->rootcontig[0]) PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);
1370:   if (!bas->rootcontig[1]) PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);

1372:   /* Check for dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1373:   if (PetscDefined(HAVE_DEVICE)) {
1374:     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1375:     if (!sf->leafcontig[0] && !ismulti) PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);
1376:     if (!sf->leafcontig[1] && !ismulti) PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]);
1377:     if (!bas->rootcontig[0] && !ismulti) PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);
1378:     if (!bas->rootcontig[1] && !ismulti) PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]);
1379:   }
1380:   return 0;
1381: }
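/* Example of the contiguity test above: local leaf indices {3,4,5,6} keep
   leafcontig[0] = PETSC_TRUE with leafstart[0] = 3, so packing degenerates to one
   PetscArraycpy; indices {3,4,10,11} flip leafcontig[0] to PETSC_FALSE, and
   PetscSFCreatePackOpt() then looks for a strided pattern (here a 2x2 block with
   row stride X = 7). */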

1383: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1384: {
1385:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1386:   PetscInt       i;

1388:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1389:     PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]);
1390:     PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]);
1391: #if defined(PETSC_HAVE_DEVICE)
1392:     PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]);
1393:     PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]);
1394: #endif
1395:   }
1396:   return 0;
1397: }