Actual source code: sfpack.h
1: #ifndef __SFPACK_H
4: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: #if defined(PETSC_HAVE_CUDA)
6: #include <petscdevice_cuda.h>
7: typedef cudaStream_t cupmStream_t;
8: typedef cudaEvent_t cupmEvent_t;
9: #endif
11: #if defined(PETSC_HAVE_HIP)
12: #include <petscdevice_hip.h>
13: typedef hipStream_t cupmStream_t;
14: typedef hipEvent_t cupmEvent_t;
15: #endif
17: /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
18: We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
19: */
20: typedef long long int llint;
21: typedef unsigned long long int ullint;
23: /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
24: typedef enum {
25: PETSCSF_LOCAL = 0,
26: PETSCSF_REMOTE
27: } PetscSFScope;
29: /* Optimizations in packing & unpacking for destination ranks.
31: Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
32: p[i] = u[idx[i]], for i in [0,m)
34: Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
35: We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
36: a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
38: E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
39: is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
41: Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
42: indices in whole are contiguous, and therefore much more useful than this one when true.
43: */
44: struct _n_PetscSFPackOpt {
45: PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
46: PetscInt n; /* Number of destination ranks */
47: PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
48: PetscInt *start; /* [n] First index */
49: PetscInt *dx, *dy, *dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
50: PetscInt *X, *Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
51: };
53: /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
54: */
55: struct _n_PetscSFLink {
56: PetscErrorCode (*Memcpy)(PetscSFLink, PetscMemType, void *, PetscMemType, const void *, size_t); /* Async device memcopy might use stream in the link */
57: PetscErrorCode (*PrePack)(PetscSF, PetscSFLink, PetscSFDirection);
58: PetscErrorCode (*PostUnpack)(PetscSF, PetscSFLink, PetscSFDirection);
59: PetscErrorCode (*StartCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
60: PetscErrorCode (*FinishCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
61: PetscErrorCode (*SyncDevice)(PetscSFLink);
62: PetscErrorCode (*SyncStream)(PetscSFLink);
63: PetscErrorCode (*Destroy)(PetscSF, PetscSFLink);
65: PetscErrorCode (*BuildDependenceBegin)(PetscSF, PetscSFLink, PetscSFDirection);
66: PetscErrorCode (*BuildDependenceEnd)(PetscSF, PetscSFLink, PetscSFDirection);
68: PetscErrorCode (*h_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
69: PetscErrorCode (*h_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
70: PetscErrorCode (*h_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
71: PetscErrorCode (*h_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
72: PetscErrorCode (*h_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
73: PetscErrorCode (*h_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
74: PetscErrorCode (*h_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
75: PetscErrorCode (*h_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
76: PetscErrorCode (*h_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
77: PetscErrorCode (*h_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
78: PetscErrorCode (*h_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
79: PetscErrorCode (*h_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
80: PetscErrorCode (*h_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
81: PetscErrorCode (*h_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
82: PetscErrorCode (*h_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
84: PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
85: PetscErrorCode (*h_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
86: PetscErrorCode (*h_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
87: PetscErrorCode (*h_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
88: PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
89: PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
90: PetscErrorCode (*h_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
91: PetscErrorCode (*h_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
92: PetscErrorCode (*h_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
93: PetscErrorCode (*h_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
94: PetscErrorCode (*h_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
95: PetscErrorCode (*h_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
96: PetscErrorCode (*h_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
98: PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
100: PetscBool deviceinited; /* Are device related fields initialized? */
101: #if defined(PETSC_HAVE_DEVICE)
102: /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
103: will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
104: */
105: PetscErrorCode (*d_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
106: PetscErrorCode (*d_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
107: PetscErrorCode (*d_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
108: PetscErrorCode (*d_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
109: PetscErrorCode (*d_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
110: PetscErrorCode (*d_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
111: PetscErrorCode (*d_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
112: PetscErrorCode (*d_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
113: PetscErrorCode (*d_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
114: PetscErrorCode (*d_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
115: PetscErrorCode (*d_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
116: PetscErrorCode (*d_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
117: PetscErrorCode (*d_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
118: PetscErrorCode (*d_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
119: PetscErrorCode (*d_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
121: PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
122: PetscErrorCode (*d_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
123: PetscErrorCode (*d_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
124: PetscErrorCode (*d_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
125: PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
126: PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
127: PetscErrorCode (*d_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
128: PetscErrorCode (*d_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
129: PetscErrorCode (*d_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
130: PetscErrorCode (*d_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
131: PetscErrorCode (*d_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
132: PetscErrorCode (*d_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
133: PetscErrorCode (*d_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
134: PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
136: /* Packing routines using atomics when there are data race chances */
137: PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
138: PetscErrorCode (*da_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
139: PetscErrorCode (*da_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
140: PetscErrorCode (*da_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
141: PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
142: PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
143: PetscErrorCode (*da_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
144: PetscErrorCode (*da_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
145: PetscErrorCode (*da_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
146: PetscErrorCode (*da_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
147: PetscErrorCode (*da_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
148: PetscErrorCode (*da_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
149: PetscErrorCode (*da_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
150: PetscErrorCode (*da_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
152: PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
153: PetscErrorCode (*da_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
154: PetscErrorCode (*da_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
155: PetscErrorCode (*da_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
156: PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
157: PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
158: PetscErrorCode (*da_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
159: PetscErrorCode (*da_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
160: PetscErrorCode (*da_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
161: PetscErrorCode (*da_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
162: PetscErrorCode (*da_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
163: PetscErrorCode (*da_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
164: PetscErrorCode (*da_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
165: PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
166: #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
167: PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */
168: cupmStream_t stream; /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */
169: #endif
170: #endif
171: PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */
172: MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */
173: MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */
174: PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
175: size_t unitbytes; /* Number of bytes in a unit */
176: PetscInt bs; /* Number of basic units in a unit */
177: const void *rootdata, *leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
178: PetscMemType rootmtype, leafmtype; /* root/leafdata's memory type */
180: /* For local and remote communication */
181: PetscMemType rootmtype_mpi, leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
182: PetscBool rootdirect[2], leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
183: PetscInt rootdirect_mpi, leafdirect_mpi; /* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
184: const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
185: const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */
186: char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */
187: /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */
188: char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
189: char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
190: char *leafbuf_alloc[2][2];
191: MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
192: MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
193: PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
194: PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
195: MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
196: PetscSFLink next;
198: PetscBool use_nvshmem; /* Does this link use nvshem (vs. MPI) for communication? */
199: #if defined(PETSC_HAVE_NVSHMEM)
200: cupmEvent_t dataReady; /* Events to mark readiness of root/leafdata */
201: cupmEvent_t endRemoteComm; /* Events to mark end of local/remote communication */
202: cupmStream_t remoteCommStream; /* Streams for remote (i.e., inter-rank) communication */
204: /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */
205: uint64_t *rootSendSig, *rootRecvSig; /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */
206: uint64_t *leafSendSig, *leafRecvSig; /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */
207: #endif
208: };
210: PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF, MPI_Datatype, const void *, const void *);
212: /* Create/setup/retrieve/destroy a link */
213: PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
214: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF, PetscSFLink, MPI_Datatype);
215: PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF, MPI_Datatype, const void *, const void *, PetscCopyMode, PetscSFLink *);
216: PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF, PetscSFLink *);
217: PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF, PetscSFLink);
219: /* Get pack/unpack function pointers from a link */
220: static inline PetscErrorCode PetscSFLinkGetPack(PetscSFLink link, PetscMemType mtype, PetscErrorCode (**Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
221: {
222: if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack;
223: #if defined(PETSC_HAVE_DEVICE)
224: else *Pack = link->d_Pack;
225: #endif
226: return 0;
227: }
229: PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *));
230: PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *));
231: PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *));
232: PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *));
233: PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF, PetscSFLink, PetscSFDirection, void **, void **, MPI_Request **, MPI_Request **);
235: /* Do Pack/Unpack/Fetch/Scatter with the link */
236: PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData(PetscSF, PetscSFLink, PetscSFScope, const void *);
237: PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData(PetscSF, PetscSFLink, PetscSFScope, const void *);
238: PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
239: PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
240: PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF, PetscSFLink, void *, MPI_Op);
242: PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF, PetscSFLink, PetscSFDirection, void *, void *, MPI_Op);
243: PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF, PetscSFLink, void *, const void *, void *, MPI_Op);
245: PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
246: PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
247: PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
249: #if defined(PETSC_HAVE_CUDA)
250: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF, PetscSFLink, MPI_Datatype);
251: #endif
253: #if defined(PETSC_HAVE_HIP)
254: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF, PetscSFLink, MPI_Datatype);
255: #endif
257: #if defined(PETSC_HAVE_KOKKOS)
258: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF, PetscSFLink, MPI_Datatype);
259: #endif
261: #if defined(PETSC_HAVE_NVSHMEM)
262: PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
263: PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF, PetscMemType, const void *, PetscMemType, const void *, PetscBool *);
264: #endif
266: static inline PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
267: {
268: if (link->StartCommunication) (*link->StartCommunication)(sf, link, direction);
269: return 0;
270: }
272: static inline PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
273: {
274: if (link->FinishCommunication) (*link->FinishCommunication)(sf, link, direction);
275: return 0;
276: }
278: /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
279: #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_SYCL)
280: /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
281: to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
282: */
283: static inline PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
284: {
285: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
287: /* rootdata is on device but we use regular MPI for communication */
288: if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) {
289: void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
290: void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
291: size_t count = bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes;
292: if (device2host) {
293: (*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count);
294: PetscLogGpuToCpu(count);
295: } else {
296: (*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count);
297: PetscLogCpuToGpu(count);
298: }
299: }
300: return 0;
301: }
303: static inline PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
304: {
305: if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) {
306: void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
307: void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
308: size_t count = sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes;
309: if (device2host) {
310: (*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count);
311: PetscLogGpuToCpu(count);
312: } else {
313: (*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count);
314: PetscLogCpuToGpu(count);
315: }
316: }
317: return 0;
318: }
320: /* Make sure root/leafbuf for the remote is ready for MPI */
321: static inline PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
322: {
323: PetscSF_Basic *bas;
324: PetscInt buflen;
325: PetscMemType mtype;
327: if (direction == PETSCSF_../../../../../..2LEAF) {
328: bas = (PetscSF_Basic *)sf->data;
329: mtype = link->rootmtype;
330: buflen = bas->rootbuflen[PETSCSF_REMOTE];
331: } else {
332: mtype = link->leafmtype;
333: buflen = sf->leafbuflen[PETSCSF_REMOTE];
334: }
336: if (PetscMemTypeDevice(mtype) && buflen) (*link->SyncStream)(link);
337: return 0;
338: }
339: #else /* Host only */
340: #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a, b, c) 0
341: #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a, b, c) 0
342: #define PetscSFLinkSyncStreamBeforeCallMPI(a, b, c) 0
343: #endif
345: /* Get root indices used for pack/unpack
347: Input arguments:
348: +sf - StarForest
349: .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
350: .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
351: -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
353: Output arguments:
354: +count - Count of indices
355: .start - The first index (only useful when indices is NULL)
356: .opt - Packing optimizations
357: -indices - Indices of roots for pack/unpack. NULL means indices are contiguous
358: */
359: static inline PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
360: {
361: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
362: PetscInt offset;
364: *count = bas->rootbuflen[scope];
365: *start = bas->rootstart[scope];
366: *opt = NULL;
367: *indices = NULL;
369: /* We have these rules:
370: 1) opt == NULL && indices == NULL ==> indices are contiguous.
371: 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
372: want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
373: */
374: if (!bas->rootcontig[scope]) {
375: offset = (scope == PETSCSF_LOCAL) ? 0 : bas->ioffset[bas->ndiranks];
376: if (PetscMemTypeHost(mtype)) {
377: *opt = bas->rootpackopt[scope];
378: *indices = bas->irootloc + offset;
379: } else {
380: size_t size;
381: if (bas->rootpackopt[scope]) {
382: if (!bas->rootpackopt_d[scope]) {
383: PetscMalloc1(1, &bas->rootpackopt_d[scope]);
384: PetscArraycpy(bas->rootpackopt_d[scope], bas->rootpackopt[scope], 1); /* Make pointers in bas->rootpackopt_d[] still work on host */
385: size = (bas->rootpackopt[scope]->n * 7 + 2) * sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
386: PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->rootpackopt_d[scope]->array);
387: (*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->rootpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, bas->rootpackopt[scope]->array, size);
388: }
389: *opt = bas->rootpackopt_d[scope];
390: } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
391: if (!bas->irootloc_d[scope]) {
392: size = bas->rootbuflen[scope] * sizeof(PetscInt);
393: PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->irootloc_d[scope]);
394: (*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[scope], PETSC_MEMTYPE_HOST, bas->irootloc + offset, size);
395: }
396: *indices = bas->irootloc_d[scope];
397: }
398: }
399: }
400: return 0;
401: }
403: /* Get leaf indices used for pack/unpack
405: See also PetscSFLinkGetRootPackOptAndIndices()
406: */
407: static inline PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
408: {
409: PetscInt offset;
411: *count = sf->leafbuflen[scope];
412: *start = sf->leafstart[scope];
413: *opt = NULL;
414: *indices = NULL;
415: if (!sf->leafcontig[scope]) {
416: offset = (scope == PETSCSF_LOCAL) ? 0 : sf->roffset[sf->ndranks];
417: if (PetscMemTypeHost(mtype)) {
418: *opt = sf->leafpackopt[scope];
419: *indices = sf->rmine + offset;
420: } else {
421: size_t size;
422: if (sf->leafpackopt[scope]) {
423: if (!sf->leafpackopt_d[scope]) {
424: PetscMalloc1(1, &sf->leafpackopt_d[scope]);
425: PetscArraycpy(sf->leafpackopt_d[scope], sf->leafpackopt[scope], 1);
426: size = (sf->leafpackopt[scope]->n * 7 + 2) * sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
427: PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->leafpackopt_d[scope]->array); /* Change ->array to a device pointer */
428: (*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->leafpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, sf->leafpackopt[scope]->array, size);
429: }
430: *opt = sf->leafpackopt_d[scope];
431: } else {
432: if (!sf->rmine_d[scope]) {
433: size = sf->leafbuflen[scope] * sizeof(PetscInt);
434: PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->rmine_d[scope]);
435: (*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->rmine_d[scope], PETSC_MEMTYPE_HOST, sf->rmine + offset, size);
436: }
437: *indices = sf->rmine_d[scope];
438: }
439: }
440: }
441: return 0;
442: }
443: #endif