Actual source code: sfmpi.c

  1: /* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
  2:    we can also do abstractions like Prepare/StartCommunication.
  3: */

  5: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  7: /* Start MPI requests. If using non-GPU-aware MPI, we may need to copy data from the device buffer to the host buffer */
  8: static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
  9: {
 10:   PetscMPIInt    nreqs;
 11:   MPI_Request   *reqs = NULL;
 12:   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
 13:   PetscInt       buflen;

 15:   buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
 16:   if (buflen) {
 17:     if (direction == PETSCSF_ROOT2LEAF) {
 18:       nreqs = sf->nleafreqs;
 19:       PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs);
 20:     } else { /* leaf to root */
 21:       nreqs = bas->nrootreqs;
 22:       PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL);
 23:     }
 24:     MPI_Startall_irecv(buflen, link->unit, nreqs, reqs);
 25:   }

 27:   buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
 28:   if (buflen) {
 29:     if (direction == PETSCSF_ROOT2LEAF) {
 30:       nreqs = bas->nrootreqs;
 31:       PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */);
 32:       PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL);
 33:     } else { /* leaf to root */
 34:       nreqs = sf->nleafreqs;
 35:       PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE);
 36:       PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs);
 37:     }
 38:     PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction);
 39:     MPI_Startall_isend(buflen, link->unit, nreqs, reqs);
 40:   }
 41:   return 0;
 42: }
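/* Note on the function above: for each direction it first posts the receives for the incoming
   buffer (leafbuf for root-to-leaf, rootbuf for leaf-to-root), then stages the outgoing device
   buffer on the host if MPI is not GPU-aware, synchronizes the stream, and starts the sends.
   MPI_Startall_irecv()/MPI_Startall_isend() are PETSc macros that start the persistent requests
   with MPI_Startall() and record the message traffic for PETSc's logging. */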

 44: static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
 45: {
 46:   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
 47:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
 48:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

 50:   MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE);
 51:   MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE);
 52:   if (direction == PETSCSF_ROOT2LEAF) {
 53:     PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */);
 54:   } else {
 55:     PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE);
 56:   }
 57:   return 0;
 58: }

 60: /*
 61:    This routine creates a communication link for the given operation. It first looks up its link cache. If
 62:    there is a free and suitable one, it uses it. Otherwise it creates a new one.

 64:    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
 65:    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
 66:    can be directly passed to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
 67:    those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

 69:    The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

 71:    In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

 73:    The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
 74:    need to pack/unpack data.
 75: */
 76: PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
 77: {
 78:   PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
 79:   PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
 80:   PetscSFLink     *p, link;
 81:   PetscSFDirection direction;
 82:   MPI_Request     *reqs = NULL;
 83:   PetscBool        match, rootdirect[2], leafdirect[2];
 84:   PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
 85:   PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
 86:   PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
 87:   PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/


 90:   /* Can we directly use root/leafdata with the given sf, sfop and op? */
 91:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
 92:     if (sfop == PETSCSF_BCAST) {
 93:       rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
 94:       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
 95:     } else if (sfop == PETSCSF_REDUCE) {
 96:       leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
 97:       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
 98:     } else {                                                                                /* PETSCSF_FETCH */
 99:       rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
100:       leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
101:     }
102:   }
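/* Example of the logic above: for a PETSCSF_BCAST with op = MPI_SUM, contiguous leaves are not
   enough to unpack in place, because incoming values must be combined (summed) with the existing
   leafdata; hence leafdirect[i] is FALSE and a separate leafbuf is used. With op = MPI_REPLACE and
   contiguous leaves, leafdata itself can serve as the MPI receive buffer. */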

104:   if (sf->use_gpu_aware_mpi) {
105:     rootmtype_mpi = rootmtype;
106:     leafmtype_mpi = leafmtype;
107:   } else {
108:     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
109:   }
110:   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
111:   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
112:   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
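/* Example: device-resident rootdata without GPU-aware MPI gives rootmtype = PETSC_MEMTYPE_DEVICE
   but rootmtype_mpi = PETSC_MEMTYPE_HOST, so rootdirect_mpi = 0 and the data is staged through a
   host buffer (allocated further below) before being handed to MPI. */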

114:   direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
115:   nrootreqs = bas->nrootreqs;
116:   nleafreqs = sf->nleafreqs;

118:   /* Look for free links in cache */
119:   for (p = &bas->avail; (link = *p); p = &link->next) {
120:     if (!link->use_nvshmem) { /* Only check with MPI links */
121:       MPIPetsc_Type_compare(unit, link->unit, &match);
122:       if (match) {
123:         /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
124:            If not, free the old requests. New requests will be lazily initialized when one calls PetscSFLinkGetMPIBuffersAndRequests().
125:         */
126:         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
127:           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
128:           for (i = 0; i < nrootreqs; i++) {
129:             if (reqs[i] != MPI_REQUEST_NULL) MPI_Request_free(&reqs[i]);
130:           }
131:           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
132:         }
133:         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
134:           reqs = link->leafreqs[direction][leafmtype][1];
135:           for (i = 0; i < nleafreqs; i++) {
136:             if (reqs[i] != MPI_REQUEST_NULL) MPI_Request_free(&reqs[i]);
137:           }
138:           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
139:         }
140:         *p = link->next; /* Remove from available list */
141:         goto found;
142:       }
143:     }
144:   }
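/* Why the requests are freed above: persistent MPI requests (MPI_Send_init/MPI_Recv_init) are
   bound to a fixed buffer address. If the cached link's requests were initialized directly on a
   previous rootdata/leafdata pointer and the caller now passes a different pointer, those requests
   would reference stale memory, so they must be freed and re-initialized lazily for the new data. */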

146:   PetscNew(&link);
147:   PetscSFLinkSetUp_Host(sf, link, unit);
148:   PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag); /* One tag per link */

150:   nreqs = (nrootreqs + nleafreqs) * 8;
151:   PetscMalloc1(nreqs, &link->reqs);
152:   for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

154:   for (i = 0; i < 2; i++) {     /* Two communication directions */
155:     for (j = 0; j < 2; j++) {   /* Two memory types */
156:       for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
157:         link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
158:         link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
159:       }
160:     }
161:   }
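/* Layout of link->reqs illustrated: the first nrootreqs*8 slots hold root requests and the next
   nleafreqs*8 slots hold leaf requests, each group indexed by (direction i, memtype j, direct k)
   with offset = 4*i + 2*j + k. For instance, with nrootreqs = 3 and nleafreqs = 2, the root
   requests for (i=1, j=0, k=1) start at slot 3*(4*1 + 2*0 + 1) = 15, and the corresponding leaf
   requests start at slot 3*8 + 2*5 = 34. */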
162:   link->StartCommunication  = PetscSFLinkStartRequests_MPI;
163:   link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

165: found:

167: #if defined(PETSC_HAVE_DEVICE)
168:   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
169:   #if defined(PETSC_HAVE_CUDA)
170:     if (sf->backend == PETSCSF_BACKEND_CUDA) PetscSFLinkSetUp_CUDA(sf, link, unit); /* Setup streams etc */
171:   #endif
172:   #if defined(PETSC_HAVE_HIP)
173:     if (sf->backend == PETSCSF_BACKEND_HIP) PetscSFLinkSetUp_HIP(sf, link, unit); /* Setup streams etc */
174:   #endif
175:   #if defined(PETSC_HAVE_KOKKOS)
176:     if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscSFLinkSetUp_Kokkos(sf, link, unit);
177:   #endif
178:   }
179: #endif

181:   /* Allocate buffers for root/leafdata */
182:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
183:     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
184:     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
185:     if (bas->rootbuflen[i]) {
186:       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
187:         link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
188:       } else { /* Have to have a separate rootbuf */
189:         if (!link->rootbuf_alloc[i][rootmtype]) PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]);
190:         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
191:       }
192:     }

194:     if (sf->leafbuflen[i]) {
195:       if (leafdirect[i]) {
196:         link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
197:       } else {
198:         if (!link->leafbuf_alloc[i][leafmtype]) PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]);
199:         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
200:       }
201:     }
202:   }
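/* Example: for unit = MPI_DOUBLE (unitbytes = 8) and bas->rootbuflen[PETSCSF_REMOTE] = 100, a
   non-direct remote rootbuf costs a one-time allocation of 800 bytes in the memory type of
   rootdata; the buffer is cached in the link and reused by later operations with a matching unit. */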

204: #if defined(PETSC_HAVE_DEVICE)
205:   /* Allocate buffers on the host for buffering device data when use_gpu_aware_mpi is not set */
206:   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
207:     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
208:     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
209:   }
210:   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
211:     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
212:     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
213:   }
214: #endif
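/* Data path when rootdata lives on the device and MPI is not GPU-aware (root-to-leaf direction):
   rootdata is packed into the device rootbuf, copied to rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]
   (PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI with device2host = PETSC_TRUE in StartRequests
   above), and the host buffer is handed to MPI; on the receiving side the host leafbuf is copied
   back to the device in WaitRequests before unpacking. */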

216:   /* Set the `current` state of the link. It may change between different SF invocations with the same link */
217:   if (sf->persistent) { /* If the data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
218:     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
219:     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
220:   }

222:   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
223:   link->leafdata = leafdata;
224:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
225:     link->rootdirect[i] = rootdirect[i];
226:     link->leafdirect[i] = leafdirect[i];
227:   }
228:   link->rootdirect_mpi = rootdirect_mpi;
229:   link->leafdirect_mpi = leafdirect_mpi;
230:   link->rootmtype      = rootmtype;
231:   link->leafmtype      = leafmtype;
232:   link->rootmtype_mpi  = rootmtype_mpi;
233:   link->leafmtype_mpi  = leafmtype_mpi;

235:   link->next = bas->inuse;
236:   bas->inuse = link;
237:   *mylink    = link;
238:   return 0;
239: }
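/* Illustrative usage (a sketch, not part of sfmpi.c): the routines in this file are driven by the
   generic PetscSF interface. Assuming an SF of type SFBASIC whose graph has been set with
   PetscSFSetGraph(), a broadcast that exercises PetscSFLinkCreate_MPI() and the Start/Finish
   callbacks above looks roughly like

     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     // ... overlap local computation here ...
     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));

   Begin creates or reuses a link, packs rootdata if needed, and calls link->StartCommunication();
   End calls link->FinishCommunication() and unpacks into leafdata. */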