Actual source code: mpits.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>

  4: PetscLogEvent PETSC_BuildTwoSided;
  5: PetscLogEvent PETSC_BuildTwoSidedF;

  7: const char *const PetscBuildTwoSidedTypes[] = {"ALLREDUCE", "IBARRIER", "REDSCATTER", "PetscBuildTwoSidedType", "PETSC_BUILDTWOSIDED_", NULL};

  9: static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

 11: /*@
 12:    PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

 14:    Logically Collective

 16:    Input Parameters:
 17: +  comm - communicator; the setting is currently global, so `PETSC_COMM_WORLD` is typically passed
 18: -  twosided - algorithm to use in subsequent calls to `PetscCommBuildTwoSided()`

 20:    Level: developer

 22:    Note:
 23:    This option is currently global, but could be made per-communicator.

 25: .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`, `PetscBuildTwoSidedType`
 26: @*/
 27: PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm, PetscBuildTwoSidedType twosided)
 28: {
 29:   if (PetscDefined(USE_DEBUG)) { /* in debug mode, verify that every rank passed the same value */
 30:     PetscMPIInt b1[2], b2[2];
 31:     b1[0] = -(PetscMPIInt)twosided;
 32:     b1[1] = (PetscMPIInt)twosided;
 33:     MPIU_Allreduce(b1, b2, 2, MPI_INT, MPI_MAX, comm);
 34:     PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Enum value must be same on all processes");
 35:   }
 36:   _twosided_type = twosided;
 37:   return 0;
 38: }

 40: /*@
 41:    PetscCommBuildTwoSidedGetType - get algorithm used when building two-sided communication

 43:    Logically Collective

 45:    Input Parameter:
 46: .  comm - communicator on which to query the algorithm

 47:    Output Parameter:
 48: .  twosided - algorithm to use for `PetscCommBuildTwoSided()`

 49:    Level: developer

 51: .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`, `PetscBuildTwoSidedType`
 52: @*/
 53: PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm, PetscBuildTwoSidedType *twosided)
 54: {
 55:   PetscMPIInt size;

 57:   *twosided = PETSC_BUILDTWOSIDED_NOTSET;
 58:   if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
 59:     MPI_Comm_size(comm, &size);
 60:     _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
 61: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
 62:     if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
 63: #endif
 64:     PetscOptionsGetEnum(NULL, NULL, "-build_twosided", PetscBuildTwoSidedTypes, (PetscEnum *)&_twosided_type, NULL);
 65:   }
 66:   *twosided = _twosided_type;
 67:   return 0;
 68: }
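/* A minimal usage sketch (not part of mpits.c): select the algorithm programmatically,
 * equivalent to the runtime option -build_twosided ibarrier.  The function name
 * ExampleSelectTwoSided is illustrative; call it after PetscInitialize(). */
#include <petscsys.h>

static PetscErrorCode ExampleSelectTwoSided(void)
{
  PetscBuildTwoSidedType type;

  PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD, PETSC_BUILDTWOSIDED_IBARRIER);
  PetscCommBuildTwoSidedGetType(PETSC_COMM_WORLD, &type); /* type is now PETSC_BUILDTWOSIDED_IBARRIER */
  return 0;
}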

 70: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
 71: static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
 72: {
 73:   PetscMPIInt    nrecvs, tag, done, i;
 74:   MPI_Aint       lb, unitbytes;
 75:   char          *tdata;
 76:   MPI_Request   *sendreqs, barrier;
 77:   PetscSegBuffer segrank, segdata;
 78:   PetscBool      barrier_started;

 80:   PetscCommDuplicate(comm, &comm, &tag);
 81:   MPI_Type_get_extent(dtype, &lb, &unitbytes);
 82:   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
 83:   tdata = (char *)todata;
 84:   PetscMalloc1(nto, &sendreqs);
 85:   for (i = 0; i < nto; i++) MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i);
 86:   PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank);
 87:   PetscSegBufferCreate(unitbytes, 4 * count, &segdata);

 89:   nrecvs  = 0;
 90:   barrier = MPI_REQUEST_NULL;
 91:   /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
 92:    * but we need to work around it. */
 93:   barrier_started = PETSC_FALSE;
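  /* Dynamic sparse data exchange: keep probing for and receiving rendezvous messages;
   * once all of this rank's synchronous sends have completed (i.e. each one has been
   * matched by a receive), start a nonblocking barrier.  When that barrier completes,
   * every rank's sends have been matched, so no further messages can arrive and the
   * loop terminates. */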
 94:   for (done = 0; !done;) {
 95:     PetscMPIInt flag;
 96:     MPI_Status  status;
 97:     MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status);
 98:     if (flag) { /* incoming message */
 99:       PetscMPIInt *recvrank;
100:       void        *buf;
101:       PetscSegBufferGet(segrank, 1, &recvrank);
102:       PetscSegBufferGet(segdata, count, &buf);
103:       *recvrank = status.MPI_SOURCE;
104:       MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE);
105:       nrecvs++;
106:     }
107:     if (!barrier_started) {
108:       PetscMPIInt sent, nsends;
109:       PetscMPIIntCast(nto, &nsends);
110:       MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE);
111:       if (sent) {
112:         MPI_Ibarrier(comm, &barrier);
113:         barrier_started = PETSC_TRUE;
114:         PetscFree(sendreqs);
115:       }
116:     } else {
117:       MPI_Test(&barrier, &done, MPI_STATUS_IGNORE);
118:     }
119:   }
120:   *nfrom = nrecvs;
121:   PetscSegBufferExtractAlloc(segrank, fromranks);
122:   PetscSegBufferDestroy(&segrank);
123:   PetscSegBufferExtractAlloc(segdata, fromdata);
124:   PetscSegBufferDestroy(&segdata);
125:   PetscCommDestroy(&comm);
126:   return 0;
127: }
128: #endif

130: static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
131: {
132:   PetscMPIInt       size, rank, *iflags, nrecvs, tag, *franks, i, flg;
133:   MPI_Aint          lb, unitbytes;
134:   char             *tdata, *fdata;
135:   MPI_Request      *reqs, *sendreqs;
136:   MPI_Status       *statuses;
137:   PetscCommCounter *counter;

139:   MPI_Comm_size(comm, &size);
140:   MPI_Comm_rank(comm, &rank);
141:   PetscCommDuplicate(comm, &comm, &tag);
 142:   MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg);
 143:   PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_CORRUPT, "Inner PETSc communicator does not have its tag/name counter attribute set");
144:   if (!counter->iflags) {
145:     PetscCalloc1(size, &counter->iflags);
146:     iflags = counter->iflags;
147:   } else {
148:     iflags = counter->iflags;
149:     PetscArrayzero(iflags, size);
150:   }
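  /* Mark each destination rank with 1, then sum the flag vectors over all ranks:
   * after the reduction, iflags[rank] is the number of processes that will send a
   * message to this rank. */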
151:   for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
152:   MPIU_Allreduce(MPI_IN_PLACE, iflags, size, MPI_INT, MPI_SUM, comm);
153:   nrecvs = iflags[rank];
 154:   MPI_Type_get_extent(dtype, &lb, &unitbytes);
 155:   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
156:   PetscMalloc(nrecvs * count * unitbytes, &fdata);
157:   tdata = (char *)todata;
158:   PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses);
159:   sendreqs = reqs + nrecvs;
160:   for (i = 0; i < nrecvs; i++) MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i);
161:   for (i = 0; i < nto; i++) MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i);
162:   MPI_Waitall(nto + nrecvs, reqs, statuses);
163:   PetscMalloc1(nrecvs, &franks);
164:   for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
165:   PetscFree2(reqs, statuses);
166:   PetscCommDestroy(&comm);

168:   *nfrom             = nrecvs;
169:   *fromranks         = franks;
170:   *(void **)fromdata = fdata;
171:   return 0;
172: }

174: #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
175: static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
176: {
177:   PetscMPIInt       size, *iflags, nrecvs, tag, *franks, i, flg;
178:   MPI_Aint          lb, unitbytes;
179:   char             *tdata, *fdata;
180:   MPI_Request      *reqs, *sendreqs;
181:   MPI_Status       *statuses;
182:   PetscCommCounter *counter;

184:   MPI_Comm_size(comm, &size);
185:   PetscCommDuplicate(comm, &comm, &tag);
 186:   MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg);
 187:   PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_CORRUPT, "Inner PETSc communicator does not have its tag/name counter attribute set");
188:   if (!counter->iflags) {
189:     PetscCalloc1(size, &counter->iflags);
190:     iflags = counter->iflags;
191:   } else {
192:     iflags = counter->iflags;
193:     PetscArrayzero(iflags, size);
194:   }
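  /* As in the allreduce variant, mark each destination rank with 1, but use
   * MPI_Reduce_scatter_block so each rank directly receives only its own entry of
   * the summed vector, i.e. the number of processes that will send to it. */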
195:   for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
196:   MPI_Reduce_scatter_block(iflags, &nrecvs, 1, MPI_INT, MPI_SUM, comm);
 197:   MPI_Type_get_extent(dtype, &lb, &unitbytes);
 198:   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
199:   PetscMalloc(nrecvs * count * unitbytes, &fdata);
200:   tdata = (char *)todata;
201:   PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses);
202:   sendreqs = reqs + nrecvs;
203:   for (i = 0; i < nrecvs; i++) MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i);
204:   for (i = 0; i < nto; i++) MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i);
205:   MPI_Waitall(nto + nrecvs, reqs, statuses);
206:   PetscMalloc1(nrecvs, &franks);
207:   for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
208:   PetscFree2(reqs, statuses);
209:   PetscCommDestroy(&comm);

211:   *nfrom             = nrecvs;
212:   *fromranks         = franks;
213:   *(void **)fromdata = fdata;
214:   return 0;
215: }
216: #endif

218: /*@C
219:    PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

221:    Collective

223:    Input Parameters:
224: +  comm - communicator
225: .  count - number of entries to send/receive (must match on all ranks)
226: .  dtype - datatype to send/receive from each rank (must match on all ranks)
227: .  nto - number of ranks to send data to
228: .  toranks - ranks to send to (array of length nto)
229: -  todata - data to send to each rank (packed)

231:    Output Parameters:
 232: +  nfrom - number of ranks from which messages will be received
 233: .  fromranks - ranks from which messages are received (length nfrom; caller should `PetscFree()`)
 234: -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for `PetscFree()`)

236:    Level: developer

238:    Options Database Key:
239: .  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

241:    Notes:
242:    This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
243:    `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other constant-size data.

245:    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

 247:    References:
 248: .  * - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
 249:    The MPI_Ibarrier-based implementation uses the algorithm from this paper.

251: .seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`, `PetscCommBuildTwoSidedSetType()`, `PetscCommBuildTwoSidedType`
252: @*/
253: PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
254: {
255:   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

257:   PetscSysInitializePackage();
258:   PetscLogEventSync(PETSC_BuildTwoSided, comm);
259:   PetscLogEventBegin(PETSC_BuildTwoSided, 0, 0, 0, 0);
260:   PetscCommBuildTwoSidedGetType(comm, &buildtype);
261:   switch (buildtype) {
262:   case PETSC_BUILDTWOSIDED_IBARRIER:
263: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
264:     PetscCommBuildTwoSided_Ibarrier(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata);
265:     break;
266: #else
267:     SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
268: #endif
269:   case PETSC_BUILDTWOSIDED_ALLREDUCE:
270:     PetscCommBuildTwoSided_Allreduce(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata);
271:     break;
272:   case PETSC_BUILDTWOSIDED_REDSCATTER:
273: #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
274:     PetscCommBuildTwoSided_RedScatter(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata);
275:     break;
276: #else
277:     SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
278: #endif
279:   default:
280:     SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
281:   }
282:   PetscLogEventEnd(PETSC_BuildTwoSided, 0, 0, 0, 0);
283:   return 0;
284: }
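/* A minimal usage sketch (not part of mpits.c): each rank sends one PetscMPIInt (say,
 * an upcoming message length) to the ranks it will later talk to and discovers which
 * ranks target it.  The names ExampleDiscoverSenders, nto, toranks and tolens are
 * illustrative. */
#include <petscsys.h>

static PetscErrorCode ExampleDiscoverSenders(MPI_Comm comm, PetscMPIInt nto, const PetscMPIInt toranks[], const PetscMPIInt tolens[])
{
  PetscMPIInt nfrom, *fromranks, *fromlens;

  /* one MPI_INT per target rank; on return fromlens[i] holds the value sent by fromranks[i] */
  PetscCommBuildTwoSided(comm, 1, MPI_INT, nto, toranks, tolens, &nfrom, &fromranks, &fromlens);
  /* ... post receives of length fromlens[i] from fromranks[i] ... */
  PetscFree(fromranks);
  PetscFree(fromlens);
  return 0;
}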

286: static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
287: {
288:   PetscMPIInt  i, *tag;
289:   MPI_Aint     lb, unitbytes;
290:   MPI_Request *sendreq, *recvreq;

292:   PetscMalloc1(ntags, &tag);
293:   if (ntags > 0) PetscCommDuplicate(comm, &comm, &tag[0]);
294:   for (i = 1; i < ntags; i++) PetscCommGetNewTag(comm, &tag[i]);

296:   /* Perform complete initial rendezvous */
297:   PetscCommBuildTwoSided(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata);

299:   PetscMalloc1(nto * ntags, &sendreq);
300:   PetscMalloc1(*nfrom * ntags, &recvreq);

 302:   MPI_Type_get_extent(dtype, &lb, &unitbytes);
 303:   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
304:   for (i = 0; i < nto; i++) {
305:     PetscMPIInt k;
306:     for (k = 0; k < ntags; k++) sendreq[i * ntags + k] = MPI_REQUEST_NULL;
307:     (*send)(comm, tag, i, toranks[i], ((char *)todata) + count * unitbytes * i, sendreq + i * ntags, ctx);
308:   }
309:   for (i = 0; i < *nfrom; i++) {
310:     void       *header = (*(char **)fromdata) + count * unitbytes * i;
311:     PetscMPIInt k;
312:     for (k = 0; k < ntags; k++) recvreq[i * ntags + k] = MPI_REQUEST_NULL;
313:     (*recv)(comm, tag, (*fromranks)[i], header, recvreq + i * ntags, ctx);
314:   }
315:   PetscFree(tag);
316:   PetscCommDestroy(&comm);
317:   *toreqs   = sendreq;
318:   *fromreqs = recvreq;
319:   return 0;
320: }

322: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)

324: static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
325: {
326:   PetscMPIInt    nrecvs, tag, *tags, done, i;
327:   MPI_Aint       lb, unitbytes;
328:   char          *tdata;
329:   MPI_Request   *sendreqs, *usendreqs, *req, barrier;
330:   PetscSegBuffer segrank, segdata, segreq;
331:   PetscBool      barrier_started;

333:   PetscCommDuplicate(comm, &comm, &tag);
334:   PetscMalloc1(ntags, &tags);
335:   for (i = 0; i < ntags; i++) PetscCommGetNewTag(comm, &tags[i]);
 336:   MPI_Type_get_extent(dtype, &lb, &unitbytes);
 337:   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
338:   tdata = (char *)todata;
339:   PetscMalloc1(nto, &sendreqs);
340:   PetscMalloc1(nto * ntags, &usendreqs);
341:   /* Post synchronous sends */
342:   for (i = 0; i < nto; i++) MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i);
343:   /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
344:    * synchronous messages above. */
345:   for (i = 0; i < nto; i++) {
346:     PetscMPIInt k;
347:     for (k = 0; k < ntags; k++) usendreqs[i * ntags + k] = MPI_REQUEST_NULL;
348:     (*send)(comm, tags, i, toranks[i], tdata + count * unitbytes * i, usendreqs + i * ntags, ctx);
349:   }

351:   PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank);
352:   PetscSegBufferCreate(unitbytes, 4 * count, &segdata);
353:   PetscSegBufferCreate(sizeof(MPI_Request), 4, &segreq);

355:   nrecvs  = 0;
356:   barrier = MPI_REQUEST_NULL;
357:   /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
358:    * but we need to work around it. */
359:   barrier_started = PETSC_FALSE;
360:   for (done = 0; !done;) {
361:     PetscMPIInt flag;
362:     MPI_Status  status;
363:     MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status);
364:     if (flag) { /* incoming message */
365:       PetscMPIInt *recvrank, k;
366:       void        *buf;
367:       PetscSegBufferGet(segrank, 1, &recvrank);
368:       PetscSegBufferGet(segdata, count, &buf);
369:       *recvrank = status.MPI_SOURCE;
370:       MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE);
371:       PetscSegBufferGet(segreq, ntags, &req);
372:       for (k = 0; k < ntags; k++) req[k] = MPI_REQUEST_NULL;
373:       (*recv)(comm, tags, status.MPI_SOURCE, buf, req, ctx);
374:       nrecvs++;
375:     }
376:     if (!barrier_started) {
377:       PetscMPIInt sent, nsends;
378:       PetscMPIIntCast(nto, &nsends);
379:       MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE);
380:       if (sent) {
381:         MPI_Ibarrier(comm, &barrier);
382:         barrier_started = PETSC_TRUE;
383:       }
384:     } else {
385:       MPI_Test(&barrier, &done, MPI_STATUS_IGNORE);
386:     }
387:   }
388:   *nfrom = nrecvs;
389:   PetscSegBufferExtractAlloc(segrank, fromranks);
390:   PetscSegBufferDestroy(&segrank);
391:   PetscSegBufferExtractAlloc(segdata, fromdata);
392:   PetscSegBufferDestroy(&segdata);
393:   *toreqs = usendreqs;
394:   PetscSegBufferExtractAlloc(segreq, fromreqs);
395:   PetscSegBufferDestroy(&segreq);
396:   PetscFree(sendreqs);
397:   PetscFree(tags);
398:   PetscCommDestroy(&comm);
399:   return 0;
400: }
401: #endif

403: /*@C
404:    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

406:    Collective

408:    Input Parameters:
409: +  comm - communicator
410: .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
411: .  dtype - datatype to send/receive from each rank (must match on all ranks)
412: .  nto - number of ranks to send data to
413: .  toranks - ranks to send to (array of length nto)
414: .  todata - data to send to each rank (packed)
415: .  ntags - number of tags needed by send/recv callbacks
416: .  send - callback invoked on sending process when ready to send primary payload
417: .  recv - callback invoked on receiving process after delivery of rendezvous message
418: -  ctx - context for callbacks

420:    Output Parameters:
 421: +  nfrom - number of ranks from which messages will be received
 422: .  fromranks - ranks from which messages are received (length nfrom; caller should `PetscFree()`)
 423: -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for `PetscFree()`)

425:    Level: developer

427:    Notes:
428:    This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
429:    `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.

431:    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

 433:    References:
 434: .  * - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
 435:    The MPI_Ibarrier-based implementation uses the algorithm from this paper.

437: .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
438: @*/
439: PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
440: {
441:   MPI_Request *toreqs, *fromreqs;

443:   PetscCommBuildTwoSidedFReq(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, &toreqs, &fromreqs, send, recv, ctx);
444:   MPI_Waitall(nto * ntags, toreqs, MPI_STATUSES_IGNORE);
445:   MPI_Waitall(*nfrom * ntags, fromreqs, MPI_STATUSES_IGNORE);
446:   PetscFree(toreqs);
447:   PetscFree(fromreqs);
448:   return 0;
449: }
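/* A minimal sketch (not part of mpits.c) of send/recv callbacks for
 * PetscCommBuildTwoSidedF(), assuming ntags == 1 and a fixed payload size per target.
 * The names PayloadCtx, ExampleSend and ExampleRecv are illustrative. */
#include <petscsys.h>

typedef struct {
  PetscMPIInt  paylen;  /* payload entries per message */
  PetscScalar *sendbuf; /* nto*paylen entries, packed in toranks order */
  PetscScalar *recvbuf; /* preallocated; filled in the order messages are processed */
  PetscMPIInt  nrecvd;  /* number of receives posted so far */
} PayloadCtx;

static PetscErrorCode ExampleSend(MPI_Comm comm, const PetscMPIInt tags[], PetscMPIInt i, PetscMPIInt rank, void *header, MPI_Request req[], void *ctx)
{
  PayloadCtx *p = (PayloadCtx *)ctx;

  /* post the payload for target i; PetscCommBuildTwoSidedF() waits on req[0] */
  MPI_Isend(p->sendbuf + (size_t)i * p->paylen, p->paylen, MPIU_SCALAR, rank, tags[0], comm, &req[0]);
  return 0;
}

static PetscErrorCode ExampleRecv(MPI_Comm comm, const PetscMPIInt tags[], PetscMPIInt rank, void *header, MPI_Request req[], void *ctx)
{
  PayloadCtx *p = (PayloadCtx *)ctx;

  /* post the matching receive; PetscCommBuildTwoSidedF() waits on req[0] */
  MPI_Irecv(p->recvbuf + (size_t)p->nrecvd++ * p->paylen, p->paylen, MPIU_SCALAR, rank, tags[0], comm, &req[0]);
  return 0;
}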

451: /*@C
452:    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

454:    Collective

456:    Input Parameters:
457: +  comm - communicator
458: .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
459: .  dtype - datatype to send/receive from each rank (must match on all ranks)
460: .  nto - number of ranks to send data to
461: .  toranks - ranks to send to (array of length nto)
462: .  todata - data to send to each rank (packed)
463: .  ntags - number of tags needed by send/recv callbacks
464: .  send - callback invoked on sending process when ready to send primary payload
465: .  recv - callback invoked on receiving process after delivery of rendezvous message
466: -  ctx - context for callbacks

468:    Output Parameters:
 469: +  nfrom - number of ranks from which messages will be received
 470: .  fromranks - ranks from which messages are received (length nfrom; caller should `PetscFree()`)
 471: .  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for `PetscFree()`)
472: .  toreqs - array of nto*ntags sender requests (caller must wait on these, then `PetscFree()`)
473: -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then `PetscFree()`)

475:    Level: developer

477:    Notes:
478:    This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
479:    `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.

481:    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

 483:    References:
 484: .  * - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
 485:    The MPI_Ibarrier-based implementation uses the algorithm from this paper.

487: .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
488: @*/
489: PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
490: {
491:   PetscErrorCode (*f)(MPI_Comm, PetscMPIInt, MPI_Datatype, PetscMPIInt, const PetscMPIInt[], const void *, PetscMPIInt *, PetscMPIInt **, void *, PetscMPIInt, MPI_Request **, MPI_Request **, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx);
492:   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
493:   PetscMPIInt            i, size;

495:   PetscSysInitializePackage();
 496:   MPI_Comm_size(comm, &size);
 497:   for (i = 0; i < nto; i++) PetscCheck(toranks[i] >= 0 && size > toranks[i], comm, PETSC_ERR_ARG_OUTOFRANGE, "toranks[%d] %d not in comm size %d", i, toranks[i], size);
498:   PetscLogEventSync(PETSC_BuildTwoSidedF, comm);
499:   PetscLogEventBegin(PETSC_BuildTwoSidedF, 0, 0, 0, 0);
500:   PetscCommBuildTwoSidedGetType(comm, &buildtype);
501:   switch (buildtype) {
502:   case PETSC_BUILDTWOSIDED_IBARRIER:
503: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
504:     f = PetscCommBuildTwoSidedFReq_Ibarrier;
505:     break;
506: #else
507:     SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
508: #endif
509:   case PETSC_BUILDTWOSIDED_ALLREDUCE:
510:   case PETSC_BUILDTWOSIDED_REDSCATTER:
511:     f = PetscCommBuildTwoSidedFReq_Reference;
512:     break;
513:   default:
514:     SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
515:   }
516:   (*f)(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, toreqs, fromreqs, send, recv, ctx);
517:   PetscLogEventEnd(PETSC_BuildTwoSidedF, 0, 0, 0, 0);
518:   return 0;
519: }