Actual source code: sfpack.c
#include "petsc/private/sfimpl.h"
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* This is a C file that contains packing facilities, with dispatches to device if enabled. */

/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

#define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
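
/* For example, CPPJoin4(Pack, PetscInt, 2, 1) pastes its arguments into the identifier
   Pack_PetscInt_2_1; the DEF_* macros below use this to generate one routine per
   (type, block size, EQ) triple, plus the matching PackInit_* initializers. */
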
/* Operations working like s += t */
#define OP_BINARY(op, s, t) \
  do { \
    (s) = (s)op(t); \
  } while (0) /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op, s, t) \
  do { \
    (s) = op((s), (t)); \
  } while (0) /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op, s, t) \
  do { \
    (s) = (!(s)) != (!(t)); \
  } while (0) /* logical exclusive OR */
#define OP_ASSIGN(op, s, t) \
  do { \
    (s) = (t); \
  } while (0)
/* Ref MPI MAXLOC */
#define OP_XLOC(op, s, t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
    else if (!((s).u op(t).u)) s = t; \
  } while (0)
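
/* Worked example of OP_XLOC with op '>', i.e. MPI_MAXLOC semantics:
   s = {.u = 5, .i = 3}, t = {.u = 5, .i = 1}  ->  equal values, keep the smaller index: s = {5, 1};
   s = {.u = 2, .i = 0}, t = {.u = 7, .i = 4}  ->  !(2 > 7) is true, so s = t = {7, 4}. */
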
/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, indices are contiguous and start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. E.g., if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
*/
#define DEF_PackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
  { \
    const Type    *u = (const Type *)unpacked, *u2; \
    Type          *p = (Type *)packed, *p2; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    if (!idx) PetscArraycpy(p, u + start * MBS, MBS * count); /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ \
      p2 = p; \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS); \
            p2 += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
    } \
    return 0; \
  }
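
/* A sketch (hypothetical, not compiled) of what DEF_PackFunc(PetscReal, 2, 1) generates for the
   plain indexed path: with EQ=1, M is the compile-time constant 1 and MBS == BS == 2, so the j
   loop vanishes and the k loop vectorizes over the 2 reals of each entry. The contiguous and
   opt-plan branches are omitted here for brevity. */
#if 0
static PetscErrorCode Pack_PetscReal_2_1(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed)
{
  const PetscReal *u = (const PetscReal *)unpacked;
  PetscReal       *p = (PetscReal *)packed;
  for (PetscInt i = 0; i < count; i++)
    for (PetscInt k = 0; k < 2; k++) p[i * 2 + k] = u[idx[i] * 2 + k]; /* gather entry idx[i] into slot i */
  return 0;
}
#endif
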
/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
   and inserts it into a sparse array.

   Arguments:
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
*/
#define DEF_UnpackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
  { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    if (!idx) { \
      u += start * MBS; \
      if (u != p) PetscArraycpy(u, p, count * MBS); \
    } else if (opt) { /* has optimizations available */ \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
    } \
    return 0; \
  }

/* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op is not Insert

   Arguments:
   +Opname    Name of the Op, such as Add, Mult, LAND, etc.
   .Type      Type of the data
   .BS        Block size for vectorization
   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
   .Op        Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
   .OpApply   Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
*/
#define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
  { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    if (!idx) { \
      u += start * MBS; \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } \
    return 0; \
  }
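
/* For instance, the UnpackAndAdd routine generated from this macro computes, for each i in
   [0,count), u[idx[i]] += p[i] entry-wise (each entry holding bs basic values), i.e. a sparse
   scatter-add from the contiguous buffer p into the array u. */
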
#define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
  { \
    Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    for (i = 0; i < count; i++) { \
      r = (!idx ? start + i : idx[i]) * MBS; \
      l = i * MBS; \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          tmp = u[r + j * BS + k]; \
          OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
          p[l + j * BS + k] = tmp; \
        } \
    } \
    return 0; \
  }
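
/* FetchAndOp semantics: the buffer entry receives the old value of the array entry while the
   array entry is updated with the op. E.g. with FetchAndAdd, if u[r] = 10 and p[l] = 3
   beforehand, then afterwards u[r] = 13 and p[l] = 10 (the fetched old value). */
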
#define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
  { \
    const Type    *u = (const Type *)src; \
    Type          *v = (Type *)dst; \
    PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart * MBS; \
      CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0] * MBS; \
      v += dstStart * MBS; \
      X = srcOpt->X[0]; \
      Y = srcOpt->Y[0]; \
      for (k = 0; k < srcOpt->dz[0]; k++) \
        for (j = 0; j < srcOpt->dy[0]; j++) { \
          for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
          v += srcOpt->dx[0] * MBS; \
        } \
    } else { /* all other cases */ \
      for (i = 0; i < count; i++) { \
        s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
        t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
      } \
    } \
    return 0; \
  }

#define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
  { \
    Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
    const Type    *ldata = (const Type *)leafdata; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    for (i = 0; i < count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
      l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
          OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
        } \
    } \
    return 0; \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type, BS, EQ) \
  DEF_PackFunc(Type, BS, EQ) \
  DEF_UnpackFunc(Type, BS, EQ) \
  DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) \
  DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) \
  DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) \
  DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) \
  static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) \
  DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) \
  DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) \
  DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }

/* Logical ops.
   The op argument passed to OP_LXOR is unused and should be empty, but we pass || to avoid
   the compilation warning "empty macro arguments are undefined in ISO C90".
*/
#define DEF_Log(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) \
  DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) \
  DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) \
  DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) \
  DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) \
  DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) \
  DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) \
  DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) \
  DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) \
  DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }

#define DEF_IntegerType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) \
  DEF_Add(Type, BS, EQ) \
  DEF_Cmp(Type, BS, EQ) \
  DEF_Log(Type, BS, EQ) \
  DEF_Bit(Type, BS, EQ) \
  static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
  }

#define DEF_RealType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) \
  DEF_Add(Type, BS, EQ) \
  DEF_Cmp(Type, BS, EQ) \
  static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
#define DEF_ComplexType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) \
  DEF_Add(Type, BS, EQ) \
  static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
  }
#endif

#define DEF_DumbType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) \
  static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) \
  DEF_Xloc(Type, BS, EQ) \
  static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
  }

DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* No need (though it is OK) to generate redundant functions when PetscInt is int */
DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
#endif

/* The typedefs are used to get type names without spaces, which CPPJoin4 can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)

DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
#endif

#define PairType(Type1, Type2) Type1##_##Type2
typedef struct {
  int u;
  int i;
} PairType(int, int);
typedef struct {
  PetscInt u;
  PetscInt i;
} PairType(PetscInt, PetscInt);
DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)
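
/* Note the PairType(int, int) struct above matches the layout MPI expects for MPI_2INT: the
   value field u followed by the index field i, i.e. the (value, location) pairs that MPI_MAXLOC
   and MPI_MINLOC reduce over. */
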
/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)

PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;

  /* Destroy device-specific fields */
  if (link->deviceinited) (*link->Destroy)(sf, link);

  /* Destroy host related fields */
  if (!link->isbuiltin) MPI_Type_free(&link->unit);
  if (!link->use_nvshmem) {
    for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) MPI_Request_free(&link->reqs[i]);
    }
    PetscFree(link->reqs);
    for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
      PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
      PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
    }
  }
  PetscFree(link);
  return 0;
}

PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata);
#if defined(PETSC_HAVE_NVSHMEM)
  {
    PetscBool use_nvshmem;
    PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem);
    if (use_nvshmem) {
      PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink);
      return 0;
    }
  }
#endif
  PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink);
  return 0;
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  /* Initialize persistent MPI requests if they have not been initialized yet. Currently only SFBasic uses persistent MPI requests. */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
          cnt  = rootoffset[i + 1] - rootoffset[i];
          MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
          cnt  = rootoffset[i + 1] - rootoffset[i];
          MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  return 0;
}
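
/* Hypothetical caller-side sketch (assumed, not code from this file): with the persistent
   requests initialized above, each later communication round only needs to start and complete
   them, so the per-message setup cost of MPIU_Send_init/MPIU_Recv_init is paid once per link. */
#if 0
MPI_Startall(nreqs, reqs); /* kick off the persistent sends/recvs attached to the link */
/* ... overlap packing or local computation here ... */
MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE); /* complete them; reqs remain reusable */
#endif
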
PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
{
  PetscSFLink    link, *p;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  /* Look for types in cache */
  for (p = &bas->inuse; (link = *p); p = &link->next) {
    PetscBool match;
    MPIPetsc_Type_compare(unit, link->unit, &match);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER:
        *p = link->next;
        break; /* Remove from inuse list */
      case PETSC_USE_POINTER:
        break;
      default:
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
      }
      *mylink = link;
      return 0;
    }
  }
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
}

PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
{
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscSFLink    link = *mylink;

  link->rootdata = NULL;
  link->leafdata = NULL;
  link->next     = bas->avail;
  bas->avail     = link;
  *mylink        = NULL;
  return 0;
}

/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSFLink    link, *p;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscBool      match;

  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
       the potential overlapping since this process does not participate in communication. Overlapping is harmless.
    */
    if (rootdata || leafdata) {
      for (p = &bas->inuse; (link = *p); p = &link->next) {
        MPIPetsc_Type_compare(unit, link->unit, &match);
        if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata, leafdata and data type is not supported");
      }
    }
  }
  return 0;
}

static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
{
  if (n) PetscMemcpy(dst, src, n);
  return 0;
}

PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
{
  PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
  PetscBool   is2Int, is2PetscInt;
  PetscMPIInt ni, na, nd, combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt nPetscComplex = 0;
#endif

  MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar);
  MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt);
  MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt);
  MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal);
#if defined(PETSC_HAVE_COMPLEX)
  MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex);
#endif
  MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int);
  MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is an MPI builtin */
  link->bs        = 1;                                                           /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2 * sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is an MPI builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2 * sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is a PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
    else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
    else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
    else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
    else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal * sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPIU_REAL;
    }
  } else if (nPetscInt) {
    if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
    else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
    else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
    else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
    else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt * sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPIU_INT;
    }
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if (nInt == 8) PackInit_IntegerType_int_8_1(link);
    else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
    else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
    else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
    else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt * sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPI_INT;
    }
#endif
  } else if (nSignedChar) {
    if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
    else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
    else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
    else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
    else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar * sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPI_SIGNED_CHAR;
    }
  } else if (nUnsignedChar) {
    if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
    else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
    else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
    else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
    else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPI_UNSIGNED_CHAR;
    }
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
    else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
    else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
    else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
    else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex * sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {
      link->isbuiltin = PETSC_TRUE;
      link->unit      = MPIU_COMPLEX;
    }
#endif
  } else {
    MPI_Aint lb, nbyte;
    MPI_Type_get_extent(unit, &lb, &nbyte);

    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if (nbyte == 4) PackInit_DumbType_char_4_1(link);
      else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
      else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
      else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
      else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
      else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
      else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
      else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  if (!link->isbuiltin) MPI_Type_dup(unit, &link->unit);

  link->Memcpy = PetscSFLinkMemcpy_Host;
  return 0;
}
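
/* Worked example of the dispatch above: if unit is a contiguous datatype of 6 MPIU_REALs, then
   nPetscReal = 6 and the chain picks PackInit_RealType_PetscReal_2_0 (6 is neither 8 nor 4 and
   not divisible by either, but 6 % 2 == 0), giving BS=2, EQ=0. The link ends up with bs = 6,
   unitbytes = 6*sizeof(PetscReal) and basicunit = MPIU_REAL, and since bs != 1 the unit is
   duplicated with MPI_Type_dup. */
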
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
{
  *UnpackAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) {
    if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  return 0;
}

PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
{
  *ScatterAndOp = NULL;
  if (PetscMemTypeHost(mtype)) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) {
    if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  return 0;
}

PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
{
  *FetchAndOp = NULL;

  if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
#endif
  return 0;
}

PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
{
  *FetchAndOpLocal = NULL;

  if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  return 0;
}

static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
{
  PetscLogDouble flops;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (PetscMemTypeDevice(link->rootmtype)) PetscLogGpuFlops(flops);
    else
#endif
      PetscLogFlops(flops);
  }
  return 0;
}

static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
{
  PetscLogDouble flops;

  if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope] * link->bs;                /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (PetscMemTypeDevice(link->leafmtype)) PetscLogGpuFlops(flops);
    else
#endif
      PetscLogFlops(flops);
  }
  return 0;
}

/* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.

   Input Parameters:
   +sf      - The StarForest
   .link    - The link
   .count   - Number of entries to unpack
   .start   - The first index, significant when indices=NULL
   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
   .buf     - A contiguous buffer to unpack from
   -op      - Operation after unpack

   Output Parameters:
   .data    - The data to unpack to
*/
static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
{
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscInt i;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i = 0; i < count; i++) MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op);
    } else {
      MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op);
    }
  }
  return 0;
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
#endif
}
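
/* For reference, MPI_Reduce_local(inbuf, inoutbuf, count, datatype, op) combines the two buffers
   element-wise as inoutbuf[i] = inbuf[i] op inoutbuf[i]; e.g. with op = MPI_SUM, inbuf = {1,2}
   and inoutbuf = {10,20} leave inoutbuf = {11,22}. This is the same unpack-with-op effect the
   generated routines above provide for known types. */
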
static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
{
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscInt i, disp;
    if (!srcIdx) {
      PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op);
    } else {
      for (i = 0; i < count; i++) {
        disp = dstIdx ? dstIdx[i] : dstStart + i;
        MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op);
      }
    }
  }
  return 0;
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
#endif
}

/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf
   Input Parameters:
   + sf       - The SF this packing works on.
   . link     - It gives the memtype of the roots and also provides the root buffer.
   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
   - rootdata - Where to read the roots.

   Notes:
   When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
   in a place where the underlying MPI can access it (whether use_gpu_aware_mpi is set or not).
*/
PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
{
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt       = NULL;

  PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices);
    PetscSFLinkGetPack(link, rootmtype, &Pack);
    (*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]);
  }
  PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
  return 0;
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
{
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt       = NULL;

  PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices);
    PetscSFLinkGetPack(link, leafmtype, &Pack);
    (*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]);
  }
  PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
  return 0;
}

/* Pack rootdata to rootbuf, which are in the same memory space */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
    if (link->PrePack) (*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF); /* Used by SF nvshmem */
  }
  PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
  if (bas->rootbuflen[scope]) PetscSFLinkPackRootData_Private(sf, link, scope, rootdata);
  PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
  return 0;
}
/* Pack leafdata to leafbuf, which are in the same memory space */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
{
  if (scope == PETSCSF_REMOTE) {
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
    if (link->PrePack) (*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT); /* Used by SF nvshmem */
  }
  PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0);
  if (sf->leafbuflen[scope]) PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata);
  PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0);
  return 0;
}
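
/* A hedged sketch (assumed composition; the exact call sequence in the SFBasic implementation
   may differ, and nrootreqs/nleafreqs/op here are placeholders) of how the helpers in this file
   combine for one remote leaf-to-root reduction round: */
#if 0
PetscSFLinkPackLeafData(sf, link, PETSCSF_REMOTE, leafdata); /* leafdata -> leafbuf */
PetscSFLinkGetMPIBuffersAndRequests(sf, link, PETSCSF_LEAF2ROOT, &rootbuf, &leafbuf, &rootreqs, &leafreqs);
MPI_Startall(nrootreqs, rootreqs); /* start persistent receives into rootbuf */
MPI_Startall(nleafreqs, leafreqs); /* start persistent sends from leafbuf */
MPI_Waitall(nrootreqs, rootreqs, MPI_STATUSES_IGNORE);
PetscSFLinkUnpackRootData(sf, link, PETSCSF_REMOTE, rootdata, op); /* rootbuf -op-> rootdata */
#endif
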
PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
{
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt       = NULL;

  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp);
    if (UnpackAndOp) {
      PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices);
      (*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]);
    } else {
      PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices);
      PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op);
    }
  }
  PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op);
  return 0;
}

PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
{
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt       = NULL;

  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp);
    if (UnpackAndOp) {
      PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices);
      (*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]);
    } else {
      PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices);
      PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op);
    }
  }
  PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op);
  return 0;
}
/* Unpack rootbuf to rootdata, which are in the same memory space */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0);
  if (bas->rootbuflen[scope]) PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op);
  PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0);
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) (*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT); /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
  }
  return 0;
}

/* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
{
  PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0);
  if (sf->leafbuflen[scope]) PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op);
  PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0);
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) (*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF); /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
  }
  return 0;
}

/* FetchAndOp rootdata with rootbuf; it is a kind of Unpack on rootdata, except it also updates rootbuf */
PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
{
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt       = NULL;

  PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0);
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp);
    PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices);
    (*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]);
  }
  PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op);
  PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0);
  return 0;
}

1116: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1117: {
1118: const PetscInt *rootindices = NULL, *leafindices = NULL;
1119: PetscInt count, rootstart, leafstart;
1120: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1121: PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
1122: PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1123: PetscSFPackOpt leafopt = NULL, rootopt = NULL;
1124: PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL];
1125: char *srcbuf = NULL, *dstbuf = NULL;
1126: PetscBool dstdups;
1128: if (!buflen) return 0;
1129: if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1130: if (direction == PETSCSF_ROOT2LEAF) {
1131: PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata);
1132: srcmtype = rootmtype;
1133: srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1134: dstmtype = leafmtype;
1135: dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1136: } else {
1137: PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata);
1138: srcmtype = leafmtype;
1139: srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1140: dstmtype = rootmtype;
1141: dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1142: }
1143: (*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes);
1144: /* If the above is a device-to-host copy, we have to sync the stream before accessing the buffer on the host */
1145: if (PetscMemTypeHost(dstmtype)) (*link->SyncStream)(link);
1146: if (direction == PETSCSF_ROOT2LEAF) {
1147: PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op);
1148: } else {
1149: PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op);
1150: }
1151: } else {
1152: dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1153: dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1154: PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp);
1155: if (ScatterAndOp) {
1156: PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices);
1157: PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices);
1158: if (direction == PETSCSF_ROOT2LEAF) {
1159: (*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata);
1160: } else {
1161: (*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata);
1162: }
1163: } else {
1164: PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices);
1165: PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices);
1166: if (direction == PETSCSF_ROOT2LEAF) {
1167: PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op);
1168: } else {
1169: PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op);
1170: }
1171: }
1172: }
1173: return 0;
1174: }
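/* A sketch of the direct ScatterAndOp path above for the same-memtype case, with op = MPI_SUM
   and one double per entry (hypothetical name, for illustration only): entries flow straight
   from srcdata to dstdata through the two index lists, with no intermediate buffer. The
   cross-memtype branch instead packs into srcbuf, Memcpy's it to dstbuf, and unpacks. */
static void ExampleScatterAndAdd(PetscInt count, const PetscInt *srcidx, const double *srcdata, const PetscInt *dstidx, double *dstdata)
{
  PetscInt i;

  for (i = 0; i < count; i++) dstdata[dstidx[i]] += srcdata[srcidx[i]];
}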
1176: /* Fetch rootdata into leafupdate and apply leafdata to rootdata, all locally */
1177: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1178: {
1179: const PetscInt *rootindices = NULL, *leafindices = NULL;
1180: PetscInt count, rootstart, leafstart;
1181: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1182: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1183: const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1184: PetscSFPackOpt leafopt = NULL, rootopt = NULL;
1186: if (!bas->rootbuflen[PETSCSF_LOCAL]) return 0;
1187: if (rootmtype != leafmtype) {
1188: /* The local communication has to go through pack and unpack */
1189: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1190: } else {
1191: PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices);
1192: PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices);
1193: PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal);
1194: (*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate);
1195: }
1196: return 0;
1197: }
1199: /*
1200: Create per-rank pack/unpack optimizations based on index patterns
1202: Input Parameters:
1203: + n - Number of destination ranks
1204: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1205: - idx - [*] Array storing indices
1207: Output Parameters:
1208: + out - Pack optimizations. NULL if no optimization is possible.
1209: */
1210: PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1211: {
1212: PetscInt r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1213: PetscBool optimizable = PETSC_TRUE;
1214: PetscSFPackOpt opt;
1216: PetscMalloc1(1, &opt);
1217: PetscMalloc1(7 * n + 2, &opt->array);
1218: opt->n = opt->array[0] = n;
1219: opt->offset = opt->array + 1;
1220: opt->start = opt->array + n + 2;
1221: opt->dx = opt->array + 2 * n + 2;
1222: opt->dy = opt->array + 3 * n + 2;
1223: opt->dz = opt->array + 4 * n + 2;
1224: opt->X = opt->array + 5 * n + 2;
1225: opt->Y = opt->array + 6 * n + 2;
1227: for (r = 0; r < n; r++) { /* For each destination rank */
1228: m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1229: p = offset[r];
1230: start = idx[p]; /* First index for this rank */
1231: p++;
1233: /* Search in X dimension */
1234: for (dx = 1; dx < m; dx++, p++) {
1235: if (start + dx != idx[p]) break;
1236: }
1238: dydz = m / dx;
1239: X = dydz > 1 ? (idx[p] - start) : dx;
1240: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1241: if (m % dx || X <= 0) {
1242: optimizable = PETSC_FALSE;
1243: goto finish;
1244: }
1245: for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1246: for (i = 0; i < dx; i++, p++) {
1247: if (start + X * dy + i != idx[p]) {
1248: if (i) {
1249: optimizable = PETSC_FALSE;
1250: goto finish;
1251: } /* The pattern is violated in the middle of an x-walk */
1252: else
1253: goto Z_dimension;
1254: }
1255: }
1256: }
1258: Z_dimension:
1259: dz = m / (dx * dy);
1260: Y = dz > 1 ? (idx[p] - start) / X : dy;
1261: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1262: if (m % (dx * dy) || Y <= 0) {
1263: optimizable = PETSC_FALSE;
1264: goto finish;
1265: }
1266: for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1267: for (j = 0; j < dy; j++) {
1268: for (i = 0; i < dx; i++, p++) {
1269: if (start + X * Y * k + X * j + i != idx[p]) {
1270: optimizable = PETSC_FALSE;
1271: goto finish;
1272: }
1273: }
1274: }
1275: }
1276: opt->start[r] = start;
1277: opt->dx[r] = dx;
1278: opt->dy[r] = dy;
1279: opt->dz[r] = dz;
1280: opt->X[r] = X;
1281: opt->Y[r] = Y;
1282: }
1284: finish:
1285: /* If there are no ranks or the pattern is not optimizable, free the arrays to save memory */
1286: if (!n || !optimizable) {
1287: PetscFree(opt->array);
1288: PetscFree(opt);
1289: *out = NULL;
1290: } else {
1291: opt->offset[0] = 0;
1292: for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1293: *out = opt;
1294: }
1295: return 0;
1296: }
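/* A usage sketch of the analysis above (hypothetical function, for illustration only): the
   nine indices below form three runs of length 3 spaced 8 apart, so the search finds dx = 3
   (the run breaks at idx[3] = 9), X = idx[3] - start = 8, then dy = 3, dz = 1, Y = 3. The
   indices are thus recognized as a 3x3 patch of a row-major array with row length 8 starting
   at index 1, letting Pack/Unpack copy a whole row per PetscArraycpy instead of one entry at
   a time. */
static PetscErrorCode ExamplePackOptUsage(void)
{
  const PetscInt offset[2] = {0, 9};
  const PetscInt idx[9]    = {1, 2, 3, 9, 10, 11, 17, 18, 19};
  PetscSFPackOpt opt;

  PetscSFCreatePackOpt(1, offset, idx, &opt);
  /* Here opt->start[0] = 1, dx[0] = 3, dy[0] = 3, dz[0] = 1, X[0] = 8, Y[0] = 3 */
  PetscFree(opt->array);
  PetscFree(opt);
  return 0;
}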
1298: static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1299: {
1300: PetscSFPackOpt opt = *out;
1302: if (opt) {
1303: PetscSFFree(sf, mtype, opt->array);
1304: PetscFree(opt);
1305: *out = NULL;
1306: }
1307: return 0;
1308: }
1310: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1311: {
1312: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1313: PetscInt i, j;
1315: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1316: for (i = 0; i < 2; i++) { /* Set defaults */
1317: sf->leafstart[i] = 0;
1318: sf->leafcontig[i] = PETSC_TRUE;
1319: sf->leafdups[i] = PETSC_FALSE;
1320: bas->rootstart[i] = 0;
1321: bas->rootcontig[i] = PETSC_TRUE;
1322: bas->rootdups[i] = PETSC_FALSE;
1323: }
1325: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1326: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1328: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1329: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1331: /* Are the leaf indices for self and remote contiguous? If so, pack/unpack reduces to a single copy */
1332: for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
1333: if (sf->rmine[i] != sf->leafstart[0] + i) {
1334: sf->leafcontig[0] = PETSC_FALSE;
1335: break;
1336: }
1337: }
1338: for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
1339: if (sf->rmine[i] != sf->leafstart[1] + j) {
1340: sf->leafcontig[1] = PETSC_FALSE;
1341: break;
1342: }
1343: }
1345: /* If not, see if we can have per-rank optimizations by doing index analysis */
1346: if (!sf->leafcontig[0]) PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);
1347: if (!sf->leafcontig[1]) PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]);
1349: /* Are root indices for self and remote contiguous? */
1350: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1351: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1353: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1354: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1356: for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
1357: if (bas->irootloc[i] != bas->rootstart[0] + i) {
1358: bas->rootcontig[0] = PETSC_FALSE;
1359: break;
1360: }
1361: }
1362: for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
1363: if (bas->irootloc[i] != bas->rootstart[1] + j) {
1364: bas->rootcontig[1] = PETSC_FALSE;
1365: break;
1366: }
1367: }
1369: if (!bas->rootcontig[0]) PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);
1370: if (!bas->rootcontig[1]) PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);
1372: /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1373: if (PetscDefined(HAVE_DEVICE)) {
1374: PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1375: if (!sf->leafcontig[0] && !ismulti) PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);
1376: if (!sf->leafcontig[1] && !ismulti) PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]);
1377: if (!bas->rootcontig[0] && !ismulti) PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);
1378: if (!bas->rootcontig[1] && !ismulti) PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]);
1379: }
1380: return 0;
1381: }
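/* A sketch of the contiguity test applied above to both the leaf and root index blocks
   (hypothetical helper, for illustration only): a block is contiguous iff every index equals
   the first one plus its position, in which case Pack/Unpack degenerate to one PetscArraycpy
   and no pack optimization plan is needed. */
static PetscBool ExampleIsContiguous(PetscInt n, const PetscInt *idx)
{
  PetscInt i;

  for (i = 0; i < n; i++) {
    if (idx[i] != idx[0] + i) return PETSC_FALSE;
  }
  return PETSC_TRUE;
}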
1383: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1384: {
1385: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1386: PetscInt i;
1388: for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1389: PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]);
1390: PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]);
1391: #if defined(PETSC_HAVE_DEVICE)
1392: PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]);
1393: PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]);
1394: #endif
1395: }
1396: return 0;
1397: }