Actual source code: matstash.c


  2: #include <petsc/private/matimpl.h>

  4: #define DEFAULT_STASH_SIZE 10000

  6: static PetscErrorCode       MatStashScatterBegin_Ref(Mat, MatStash *, PetscInt *);
  7: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
  8: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *);
  9: #if !defined(PETSC_HAVE_MPIUNI)
 10: static PetscErrorCode MatStashScatterBegin_BTS(Mat, MatStash *, PetscInt *);
 11: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
 12: static PetscErrorCode MatStashScatterEnd_BTS(MatStash *);
 13: #endif

 15: /*
 16:   MatStashCreate_Private - Creates a stash, currently used for all the parallel
 17:   matrix implementations. The stash is where elements of a matrix destined
 18:   to be stored on other processors are kept until matrix assembly is done.

 20:   This is a simple-minded stash; it simply adds entries to the end of the stash.

 22:   Input Parameters:
 23:   comm - communicator, required for scatters.
 24:   bs   - stash block size, used when stashing blocks of values

 26:   Output Parameter:
 27:   stash    - the newly created stash
 28: */
 29: PetscErrorCode MatStashCreate_Private(MPI_Comm comm, PetscInt bs, MatStash *stash)
 30: {
 31:   PetscInt  max, *opt, nopt, i;
 32:   PetscBool flg;

 34:   /* Require 2 tags, get the second using PetscCommGetNewTag() */
 35:   stash->comm = comm;

 37:   PetscCommGetNewTag(stash->comm, &stash->tag1);
 38:   PetscCommGetNewTag(stash->comm, &stash->tag2);
 39:   MPI_Comm_size(stash->comm, &stash->size);
 40:   MPI_Comm_rank(stash->comm, &stash->rank);
 41:   PetscMalloc1(2 * stash->size, &stash->flg_v);
 42:   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;

 44:   nopt = stash->size;
 45:   PetscMalloc1(nopt, &opt);
 46:   PetscOptionsGetIntArray(NULL, NULL, "-matstash_initial_size", opt, &nopt, &flg);
 47:   if (flg) {
 48:     if (nopt == 1) max = opt[0];
 49:     else if (nopt == stash->size) max = opt[stash->rank];
 50:     else if (stash->rank < nopt) max = opt[stash->rank];
 51:     else max = 0; /* Use default */
 52:     stash->umax = max;
 53:   } else {
 54:     stash->umax = 0;
 55:   }
 56:   PetscFree(opt);
 57:   if (bs <= 0) bs = 1;

 59:   stash->bs         = bs;
 60:   stash->nmax       = 0;
 61:   stash->oldnmax    = 0;
 62:   stash->n          = 0;
 63:   stash->reallocs   = -1;
 64:   stash->space_head = NULL;
 65:   stash->space      = NULL;

 67:   stash->send_waits  = NULL;
 68:   stash->recv_waits  = NULL;
 69:   stash->send_status = NULL;
 70:   stash->nsends      = 0;
 71:   stash->nrecvs      = 0;
 72:   stash->svalues     = NULL;
 73:   stash->rvalues     = NULL;
 74:   stash->rindices    = NULL;
 75:   stash->nprocessed  = 0;
 76:   stash->reproduce   = PETSC_FALSE;
 77:   stash->blocktype   = MPI_DATATYPE_NULL;

 79:   PetscOptionsGetBool(NULL, NULL, "-matstash_reproduce", &stash->reproduce, NULL);
 80: #if !defined(PETSC_HAVE_MPIUNI)
 81:   flg = PETSC_FALSE;
 82:   PetscOptionsGetBool(NULL, NULL, "-matstash_legacy", &flg, NULL);
 83:   if (!flg) {
 84:     stash->ScatterBegin   = MatStashScatterBegin_BTS;
 85:     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
 86:     stash->ScatterEnd     = MatStashScatterEnd_BTS;
 87:     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
 88:   } else {
 89: #endif
 90:     stash->ScatterBegin   = MatStashScatterBegin_Ref;
 91:     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
 92:     stash->ScatterEnd     = MatStashScatterEnd_Ref;
 93:     stash->ScatterDestroy = NULL;
 94: #if !defined(PETSC_HAVE_MPIUNI)
 95:   }
 96: #endif
 97:   return 0;
 98: }
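
/* Editor's note: the following is an illustrative sketch, not part of matstash.c. It shows,
   roughly, how a parallel matrix implementation could create, size, and later destroy its
   (scalar) stash; the helper name and the fixed size hint here are assumptions. */
static PetscErrorCode ExampleStashLifecycle(Mat A)
{
  MPI_Comm comm;

  PetscObjectGetComm((PetscObject)A, &comm);
  MatStashCreate_Private(comm, 1, &A->stash);       /* bs = 1 gives a scalar (non-blocked) stash */
  MatStashSetInitialSize_Private(&A->stash, 10000); /* optional hint; otherwise DEFAULT_STASH_SIZE or -matstash_initial_size applies */
  /* ... MatSetValues()-like code stashes off-process entries between here and assembly ... */
  MatStashDestroy_Private(&A->stash);
  return 0;
}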

100: /*
101:    MatStashDestroy_Private - Destroy the stash
102: */
103: PetscErrorCode MatStashDestroy_Private(MatStash *stash)
104: {
105:   PetscMatStashSpaceDestroy(&stash->space_head);
106:   if (stash->ScatterDestroy) (*stash->ScatterDestroy)(stash);

108:   stash->space = NULL;

110:   PetscFree(stash->flg_v);
111:   return 0;
112: }

114: /*
115:    MatStashScatterEnd_Private - This is called as the final stage of
116:    scatter. The final stages of message passing are done here, and
117:    all the memory used for message passing is cleaned up. This
118:    routine also resets the stash, and deallocates the memory used
119:    for the stash. It also keeps track of the current memory usage
120:    so that the same value can be used the next time through.
121: */
122: PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
123: {
124:   (*stash->ScatterEnd)(stash);
125:   return 0;
126: }

128: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
129: {
130:   PetscInt    nsends = stash->nsends, bs2, oldnmax, i;
131:   MPI_Status *send_status;

133:   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
134:   /* wait on sends */
135:   if (nsends) {
136:     PetscMalloc1(2 * nsends, &send_status);
137:     MPI_Waitall(2 * nsends, stash->send_waits, send_status);
138:     PetscFree(send_status);
139:   }

141:   /* Now update oldnmax to be approximately 10% more than the maximum n used, this way the
142:      wastage of space is reduced the next time this stash is used.
143:      Also update oldnmax only if it increases */
144:   if (stash->n) {
145:     bs2     = stash->bs * stash->bs;
146:     oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
147:     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
148:   }

150:   stash->nmax       = 0;
151:   stash->n          = 0;
152:   stash->reallocs   = -1;
153:   stash->nprocessed = 0;

155:   PetscMatStashSpaceDestroy(&stash->space_head);

157:   stash->space = NULL;

159:   PetscFree(stash->send_waits);
160:   PetscFree(stash->recv_waits);
161:   PetscFree2(stash->svalues, stash->sindices);
162:   PetscFree(stash->rvalues[0]);
163:   PetscFree(stash->rvalues);
164:   PetscFree(stash->rindices[0]);
165:   PetscFree(stash->rindices);
166:   return 0;
167: }

169: /*
170:    MatStashGetInfo_Private - Gets the relevant statistics of the stash

172:    Input Parameter:
173:    stash    - the stash
174:    Output Parameters:
175:    nstash   - the number of values stored (n*bs*bs); reallocs - the number of additional mallocs incurred.

177: */
178: PetscErrorCode MatStashGetInfo_Private(MatStash *stash, PetscInt *nstash, PetscInt *reallocs)
179: {
180:   PetscInt bs2 = stash->bs * stash->bs;

182:   if (nstash) *nstash = stash->n * bs2;
183:   if (reallocs) {
184:     if (stash->reallocs < 0) *reallocs = 0;
185:     else *reallocs = stash->reallocs;
186:   }
187:   return 0;
188: }
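
/* Editor's note: an illustrative sketch, not part of matstash.c. The parallel formats report
   stash statistics from MatAssemblyBegin() in roughly this way; the wrapper name is an assumption. */
static PetscErrorCode ExampleReportStash(Mat mat)
{
  PetscInt nstash, reallocs;

  MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs);
  PetscInfo(mat, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs);
  return 0;
}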

190: /*
191:    MatStashSetInitialSize_Private - Sets the initial size of the stash

193:    Input Parameters:
194:    stash  - the stash
195:    max    - the value that is used as the maximum size of the stash;
196:             this value is used while allocating memory.
197: */
198: PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash, PetscInt max)
199: {
200:   stash->umax = max;
201:   return 0;
202: }

204: /* MatStashExpand_Private - Expand the stash. This function is called
205:    when the space in the stash is not sufficient to add the new values
206:    being inserted into the stash.

208:    Input Parameters:
209:    stash - the stash
210:    incr  - the minimum increase requested

212:    Note:
213:    This routine doubles the currently used memory.
214:  */
215: static PetscErrorCode MatStashExpand_Private(MatStash *stash, PetscInt incr)
216: {
217:   PetscInt newnmax, bs2 = stash->bs * stash->bs;

219:   /* allocate a larger stash */
220:   if (!stash->oldnmax && !stash->nmax) { /* new stash */
221:     if (stash->umax) newnmax = stash->umax / bs2;
222:     else newnmax = DEFAULT_STASH_SIZE / bs2;
223:   } else if (!stash->nmax) { /* reusing stash */
224:     if (stash->umax > stash->oldnmax) newnmax = stash->umax / bs2;
225:     else newnmax = stash->oldnmax / bs2;
226:   } else newnmax = stash->nmax * 2;
227:   if (newnmax < (stash->nmax + incr)) newnmax += 2 * incr;

229:   /* Get a MatStashSpace and attach it to stash */
230:   PetscMatStashSpaceGet(bs2, newnmax, &stash->space);
231:   if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
232:     stash->space_head = stash->space;
233:   }

235:   stash->reallocs++;
236:   stash->nmax = newnmax;
237:   return 0;
238: }
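
/* Editor's note: an illustrative restatement of the sizing policy above, not part of matstash.c;
   it is only meant to make the arithmetic easy to follow in isolation. For example, a brand-new
   stash with no user hint and bs = 2 (bs2 = 4) starts with room for 10000/4 = 2500 blocks. */
static PetscInt ExampleNewStashSize(PetscInt nmax, PetscInt oldnmax, PetscInt umax, PetscInt bs2, PetscInt incr)
{
  PetscInt newnmax;

  if (!oldnmax && !nmax) newnmax = (umax ? umax : DEFAULT_STASH_SIZE) / bs2; /* brand-new stash */
  else if (!nmax) newnmax = (umax > oldnmax ? umax : oldnmax) / bs2;         /* reusing an emptied stash */
  else newnmax = nmax * 2;                                                   /* double the current size */
  if (newnmax < nmax + incr) newnmax += 2 * incr;                            /* still honor the requested increment */
  return newnmax;
}
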
239: /*
240:   MatStashValuesRow_Private - inserts values into the stash. This function
241:   expects the values to be row oriented. Multiple columns belonging to the same row
242:   can be inserted with a single call to this function.

244:   Input Parameters:
245:   stash  - the stash
246:   row    - the global row corresponding to the values
247:   n      - the number of elements inserted. All elements belong to the above row.
248:   idxn   - the global column indices corresponding to each of the values.
249:   values - the values inserted
250: */
251: PetscErrorCode MatStashValuesRow_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscBool ignorezeroentries)
252: {
253:   PetscInt           i, k, cnt = 0;
254:   PetscMatStashSpace space = stash->space;

256:   /* Check and see if we have sufficient memory */
257:   if (!space || space->local_remaining < n) MatStashExpand_Private(stash, n);
258:   space = stash->space;
259:   k     = space->local_used;
260:   for (i = 0; i < n; i++) {
261:     if (ignorezeroentries && values && values[i] == 0.0) continue;
262:     space->idx[k] = row;
263:     space->idy[k] = idxn[i];
264:     space->val[k] = values ? values[i] : 0.0;
265:     k++;
266:     cnt++;
267:   }
268:   stash->n += cnt;
269:   space->local_used += cnt;
270:   space->local_remaining -= cnt;
271:   return 0;
272: }
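
/* Editor's note: an illustrative sketch, not part of matstash.c. MatSetValues() for the parallel
   formats stashes rows this rank does not own in roughly this way; everything except the stash
   call itself is an assumption here. */
static PetscErrorCode ExampleStashOffProcessRow(Mat mat, PetscInt row, PetscInt n, const PetscInt cols[], const PetscScalar vals[])
{
  if (row < mat->rmap->rstart || row >= mat->rmap->rend) { /* not owned locally: keep it until assembly */
    MatStashValuesRow_Private(&mat->stash, row, n, cols, vals, PETSC_FALSE);
  }
  return 0;
}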

274: /*
275:   MatStashValuesCol_Private - inserts values into the stash. This function
276:   expects the values to be column oriented. Multiple columns belonging to the same row
277:   can be inserted with a single call to this function.

279:   Input Parameters:
280:   stash   - the stash
281:   row     - the global row corresponding to the values
282:   n       - the number of elements inserted. All elements belong to the above row.
283:   idxn    - the global column indices corresponding to each of the values.
284:   values  - the values inserted
285:   stepval - the consecutive values are separated by a distance of stepval;
286:             this happens because the input is column oriented.
287: */
288: PetscErrorCode MatStashValuesCol_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt stepval, PetscBool ignorezeroentries)
289: {
290:   PetscInt           i, k, cnt = 0;
291:   PetscMatStashSpace space = stash->space;

293:   /* Check and see if we have sufficient memory */
294:   if (!space || space->local_remaining < n) MatStashExpand_Private(stash, n);
295:   space = stash->space;
296:   k     = space->local_used;
297:   for (i = 0; i < n; i++) {
298:     if (ignorezeroentries && values && values[i * stepval] == 0.0) continue;
299:     space->idx[k] = row;
300:     space->idy[k] = idxn[i];
301:     space->val[k] = values ? values[i * stepval] : 0.0;
302:     k++;
303:     cnt++;
304:   }
305:   stash->n += cnt;
306:   space->local_used += cnt;
307:   space->local_remaining -= cnt;
308:   return 0;
309: }

311: /*
312:   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
313:   This function expects the values to be row oriented. Multiple columns belonging
314:   to the same block-row can be inserted with a single call to this function.
315:   This function extracts the sub-block of values based on the dimensions of
316:   the original input block, and the row, col values corresponding to the blocks.

318:   Input Parameters:
319:   stash  - the stash
320:   row    - the global block-row corresponding to the values
321:   n      - the number of blocks inserted. All blocks belong to the above block-row.
322:   idxn   - the global block-column indices corresponding to each of the blocks of
323:            values. Each block is of size bs*bs.
324:   values - the values inserted
325:   rmax   - the number of block-rows in the original block.
326:   cmax   - the number of block-columns in the original block.
327:   idx    - the index of the current block-row in the original block.
328: */
329: PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx)
330: {
331:   PetscInt           i, j, k, bs2, bs = stash->bs, l;
332:   const PetscScalar *vals;
333:   PetscScalar       *array;
334:   PetscMatStashSpace space = stash->space;

336:   if (!space || space->local_remaining < n) MatStashExpand_Private(stash, n);
337:   space = stash->space;
338:   l     = space->local_used;
339:   bs2   = bs * bs;
340:   for (i = 0; i < n; i++) {
341:     space->idx[l] = row;
342:     space->idy[l] = idxn[i];
343:     /* Now copy over the block of values. Store the values column oriented.
344:        This enables inserting multiple blocks belonging to a row with a single
345:        function call */
346:     array = space->val + bs2 * l;
347:     vals  = values + idx * bs2 * n + bs * i;
348:     for (j = 0; j < bs; j++) {
349:       for (k = 0; k < bs; k++) array[k * bs] = values ? vals[k] : 0.0;
350:       array++;
351:       vals += cmax * bs;
352:     }
353:     l++;
354:   }
355:   stash->n += n;
356:   space->local_used += n;
357:   space->local_remaining -= n;
358:   return 0;
359: }

361: /*
362:   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
363:   This function expects the values to be column oriented. Multiple columns belonging
364:   to the same block-row can be inserted with a single call to this function.
365:   This function extracts the sub-block of values based on the dimensions of
366:   the original input block, and the row, col values corresponding to the blocks.

368:   Input Parameters:
369:   stash  - the stash
370:   row    - the global block-row corresponding to the values
371:   n      - the number of blocks inserted. All blocks belong to the above block-row.
372:   idxn   - the global block-column indices corresponding to each of the blocks of
373:            values. Each block is of size bs*bs.
374:   values - the values inserted
375:   rmax   - the number of block-rows in the original block.
376:   cmax   - the number of block-columns in the original block.
377:   idx    - the index of the current block-row in the original block.
378: */
379: PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx)
380: {
381:   PetscInt           i, j, k, bs2, bs = stash->bs, l;
382:   const PetscScalar *vals;
383:   PetscScalar       *array;
384:   PetscMatStashSpace space = stash->space;

386:   if (!space || space->local_remaining < n) MatStashExpand_Private(stash, n);
387:   space = stash->space;
388:   l     = space->local_used;
389:   bs2   = bs * bs;
390:   for (i = 0; i < n; i++) {
391:     space->idx[l] = row;
392:     space->idy[l] = idxn[i];
393:     /* Now copy over the block of values. Store the values column oriented.
394:      This enables inserting multiple blocks belonging to a row with a single
395:      function call */
396:     array = space->val + bs2 * l;
397:     vals  = values + idx * bs2 * n + bs * i;
398:     for (j = 0; j < bs; j++) {
399:       for (k = 0; k < bs; k++) array[k] = values ? vals[k] : 0.0;
400:       array += bs;
401:       vals += rmax * bs;
402:     }
403:     l++;
404:   }
405:   stash->n += n;
406:   space->local_used += n;
407:   space->local_remaining -= n;
408:   return 0;
409: }
410: /*
411:   MatStashScatterBegin_Private - Initiates the transfer of values to the
412:   correct owners. This function goes through the stash, checks the
413:   owner of each stashed value, and sends the values off to the owning
414:   processors.

416:   Input Parameters:
417:   stash  - the stash
418:   owners - an array of size 'no-of-procs' which gives the ownership range
419:            for each process.

421:   Note:
422:     The 'owners' array, in the case of the blocked stash, has the
423:   ranges specified in blocked global indices, and for the regular stash in
424:   the ordinary (non-blocked) global indices.
425: */
426: PetscErrorCode MatStashScatterBegin_Private(Mat mat, MatStash *stash, PetscInt *owners)
427: {
428:   (*stash->ScatterBegin)(mat, stash, owners);
429:   return 0;
430: }

432: static PetscErrorCode MatStashScatterBegin_Ref(Mat mat, MatStash *stash, PetscInt *owners)
433: {
434:   PetscInt          *owner, *startv, *starti, tag1 = stash->tag1, tag2 = stash->tag2, bs2;
435:   PetscInt           size = stash->size, nsends;
436:   PetscInt           count, *sindices, **rindices, i, j, idx, lastidx, l;
437:   PetscScalar      **rvalues, *svalues;
438:   MPI_Comm           comm = stash->comm;
439:   MPI_Request       *send_waits, *recv_waits, *recv_waits1, *recv_waits2;
440:   PetscMPIInt       *sizes, *nlengths, nreceives;
441:   PetscInt          *sp_idx, *sp_idy;
442:   PetscScalar       *sp_val;
443:   PetscMatStashSpace space, space_next;

445:   { /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
446:     InsertMode addv;
447:     MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat));
449:     mat->insertmode = addv; /* in case this processor had no cache */
450:   }

452:   bs2 = stash->bs * stash->bs;

454:   /*  first count number of contributors to each processor */
455:   PetscCalloc1(size, &nlengths);
456:   PetscMalloc1(stash->n + 1, &owner);

458:   i = j   = 0;
459:   lastidx = -1;
460:   space   = stash->space_head;
461:   while (space) {
462:     space_next = space->next;
463:     sp_idx     = space->idx;
464:     for (l = 0; l < space->local_used; l++) {
465:       /* if indices are NOT locally sorted, need to start search at the beginning */
466:       if (lastidx > (idx = sp_idx[l])) j = 0;
467:       lastidx = idx;
468:       for (; j < size; j++) {
469:         if (idx >= owners[j] && idx < owners[j + 1]) {
470:           nlengths[j]++;
471:           owner[i] = j;
472:           break;
473:         }
474:       }
475:       i++;
476:     }
477:     space = space_next;
478:   }

480:   /* Now check what procs get messages - and compute nsends. */
481:   PetscCalloc1(size, &sizes);
482:   for (i = 0, nsends = 0; i < size; i++) {
483:     if (nlengths[i]) {
484:       sizes[i] = 1;
485:       nsends++;
486:     }
487:   }

489:   {
490:     PetscMPIInt *onodes, *olengths;
491:     /* Determine the number of messages to expect, their lengths, and the ranks they come from */
492:     PetscGatherNumberOfMessages(comm, sizes, nlengths, &nreceives);
493:     PetscGatherMessageLengths(comm, nsends, nreceives, nlengths, &onodes, &olengths);
494:     /* since row and column indices are sent together (clubbed), lengths are multiplied by 2 */
495:     for (i = 0; i < nreceives; i++) olengths[i] *= 2;
496:     PetscPostIrecvInt(comm, tag1, nreceives, onodes, olengths, &rindices, &recv_waits1);
497:     /* values have bs2 entries each (and remove the earlier factor of 2) */
498:     for (i = 0; i < nreceives; i++) olengths[i] = olengths[i] * bs2 / 2;
499:     PetscPostIrecvScalar(comm, tag2, nreceives, onodes, olengths, &rvalues, &recv_waits2);
500:     PetscFree(onodes);
501:     PetscFree(olengths);
502:   }

504:   /* do sends:
505:       1) startv[i] and starti[i] give the starting indices in svalues and sindices for data going to
506:          the ith processor
507:   */
508:   PetscMalloc2(bs2 * stash->n, &svalues, 2 * (stash->n + 1), &sindices);
509:   PetscMalloc1(2 * nsends, &send_waits);
510:   PetscMalloc2(size, &startv, size, &starti);
511:   /* use 2 sends per processor: one with the indices (all_i and all_j), one with the values (all_a) */
512:   startv[0] = 0;
513:   starti[0] = 0;
514:   for (i = 1; i < size; i++) {
515:     startv[i] = startv[i - 1] + nlengths[i - 1];
516:     starti[i] = starti[i - 1] + 2 * nlengths[i - 1];
517:   }

519:   i     = 0;
520:   space = stash->space_head;
521:   while (space) {
522:     space_next = space->next;
523:     sp_idx     = space->idx;
524:     sp_idy     = space->idy;
525:     sp_val     = space->val;
526:     for (l = 0; l < space->local_used; l++) {
527:       j = owner[i];
528:       if (bs2 == 1) {
529:         svalues[startv[j]] = sp_val[l];
530:       } else {
531:         PetscInt     k;
532:         PetscScalar *buf1, *buf2;
533:         buf1 = svalues + bs2 * startv[j];
534:         buf2 = space->val + bs2 * l;
535:         for (k = 0; k < bs2; k++) buf1[k] = buf2[k];
536:       }
537:       sindices[starti[j]]               = sp_idx[l];
538:       sindices[starti[j] + nlengths[j]] = sp_idy[l];
539:       startv[j]++;
540:       starti[j]++;
541:       i++;
542:     }
543:     space = space_next;
544:   }
545:   startv[0] = 0;
546:   for (i = 1; i < size; i++) startv[i] = startv[i - 1] + nlengths[i - 1];

548:   for (i = 0, count = 0; i < size; i++) {
549:     if (sizes[i]) {
550:       MPI_Isend(sindices + 2 * startv[i], 2 * nlengths[i], MPIU_INT, i, tag1, comm, send_waits + count++);
551:       MPI_Isend(svalues + bs2 * startv[i], bs2 * nlengths[i], MPIU_SCALAR, i, tag2, comm, send_waits + count++);
552:     }
553:   }
554: #if defined(PETSC_USE_INFO)
555:   PetscInfo(NULL, "No of messages: %" PetscInt_FMT " \n", nsends);
556:   for (i = 0; i < size; i++) {
557:     if (sizes[i]) PetscInfo(NULL, "Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n", i, (size_t)(nlengths[i] * (bs2 * sizeof(PetscScalar) + 2 * sizeof(PetscInt))));
558:   }
559: #endif
560:   PetscFree(nlengths);
561:   PetscFree(owner);
562:   PetscFree2(startv, starti);
563:   PetscFree(sizes);

565:   /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
566:   PetscMalloc1(2 * nreceives, &recv_waits);

568:   for (i = 0; i < nreceives; i++) {
569:     recv_waits[2 * i]     = recv_waits1[i];
570:     recv_waits[2 * i + 1] = recv_waits2[i];
571:   }
572:   stash->recv_waits = recv_waits;

574:   PetscFree(recv_waits1);
575:   PetscFree(recv_waits2);

577:   stash->svalues         = svalues;
578:   stash->sindices        = sindices;
579:   stash->rvalues         = rvalues;
580:   stash->rindices        = rindices;
581:   stash->send_waits      = send_waits;
582:   stash->nsends          = nsends;
583:   stash->nrecvs          = nreceives;
584:   stash->reproduce_count = 0;
585:   return 0;
586: }

588: /*
589:    MatStashScatterGetMesg_Private - This function waits on the receives posted
590:    in the function MatStashScatterBegin_Private() and returns one message at
591:    a time to the calling function. If no messages are left, it indicates this
592:    by setting flg = 0, else it sets flg = 1.

594:    Input Parameter:
595:    stash - the stash

597:    Output Parameters:
598:    nvals - the number of entries in the current message.
599:    rows  - an array of row indices (or blocked indices) corresponding to the values
600:   cols  - an array of column indices (or blocked indices) corresponding to the values
601:    vals  - the values
602:   flg   - 0 indicates no more messages are left, and the current call has no values associated with it;
603:           1 indicates that the current call successfully received a message, and the
604:              other output parameters nvals, rows, cols, vals are set appropriately.
605: */
606: PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
607: {
608:   (*stash->ScatterGetMesg)(stash, nvals, rows, cols, vals, flg);
609:   return 0;
610: }
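
/* Editor's note: an illustrative sketch, not part of matstash.c. This is roughly how the parallel
   formats drive the stash across MatAssemblyBegin()/MatAssemblyEnd(); using MatSetValues() to apply
   each received entry is a stand-in for the format-specific insertion code. */
static PetscErrorCode ExampleDrainStash(Mat mat)
{
  PetscMPIInt  n;
  PetscInt    *row, *col, flg, i;
  PetscScalar *val;

  MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range); /* post the sends and receives */
  while (1) {
    MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg);
    if (!flg) break;                                                          /* all messages processed */
    for (i = 0; i < n; i++) MatSetValues(mat, 1, &row[i], 1, &col[i], &val[i], mat->insertmode);
  }
  MatStashScatterEnd_Private(&mat->stash); /* wait on sends and reset the stash */
  return 0;
}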

612: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
613: {
614:   PetscMPIInt i, *flg_v = stash->flg_v, i1, i2;
615:   PetscInt    bs2;
616:   MPI_Status  recv_status;
617:   PetscBool   match_found = PETSC_FALSE;

619:   *flg = 0; /* When a message is discovered this is reset to 1 */
620:   /* Return if no more messages to process */
621:   if (stash->nprocessed == stash->nrecvs) return 0;

623:   bs2 = stash->bs * stash->bs;
624:   /* If a matching pair of receives is found, process them, and return the data to
625:      the calling function. Until then keep receiving messages */
626:   while (!match_found) {
627:     if (stash->reproduce) {
628:       i = stash->reproduce_count++;
629:       MPI_Wait(stash->recv_waits + i, &recv_status);
630:     } else {
631:       MPI_Waitany(2 * stash->nrecvs, stash->recv_waits, &i, &recv_status);
632:     }

635:     /* Now pack the received message into a structure which is usable by others */
636:     if (i % 2) {
637:       MPI_Get_count(&recv_status, MPIU_SCALAR, nvals);
638:       flg_v[2 * recv_status.MPI_SOURCE] = i / 2;
639:       *nvals                            = *nvals / bs2;
640:     } else {
641:       MPI_Get_count(&recv_status, MPIU_INT, nvals);
642:       flg_v[2 * recv_status.MPI_SOURCE + 1] = i / 2;
643:       *nvals                                = *nvals / 2; /* This message has both row indices and col indices */
644:     }

646:     /* Check if we have both messages from this proc */
647:     i1 = flg_v[2 * recv_status.MPI_SOURCE];
648:     i2 = flg_v[2 * recv_status.MPI_SOURCE + 1];
649:     if (i1 != -1 && i2 != -1) {
650:       *rows = stash->rindices[i2];
651:       *cols = *rows + *nvals;
652:       *vals = stash->rvalues[i1];
653:       *flg  = 1;
654:       stash->nprocessed++;
655:       match_found = PETSC_TRUE;
656:     }
657:   }
658:   return 0;
659: }

661: #if !defined(PETSC_HAVE_MPIUNI)
662: typedef struct {
663:   PetscInt    row;
664:   PetscInt    col;
665:   PetscScalar vals[1]; /* Actually an array of length bs2 */
666: } MatStashBlock;
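
/* Editor's note: an illustrative helper, not part of matstash.c. It shows the addressing convention
   used throughout the BTS code below: packed blocks are spaced stash->blocktype_size bytes apart,
   because the trailing vals[] member really holds bs*bs scalars (plus any alignment padding). */
static inline MatStashBlock *ExampleGetBlock(MatStash *stash, char *blocks, size_t i)
{
  return (MatStashBlock *)&blocks[i * stash->blocktype_size];
}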

668: static PetscErrorCode MatStashSortCompress_Private(MatStash *stash, InsertMode insertmode)
669: {
670:   PetscMatStashSpace space;
671:   PetscInt           n = stash->n, bs = stash->bs, bs2 = bs * bs, cnt, *row, *col, *perm, rowstart, i;
672:   PetscScalar      **valptr;

674:   PetscMalloc4(n, &row, n, &col, n, &valptr, n, &perm);
675:   for (space = stash->space_head, cnt = 0; space; space = space->next) {
676:     for (i = 0; i < space->local_used; i++) {
677:       row[cnt]    = space->idx[i];
678:       col[cnt]    = space->idy[i];
679:       valptr[cnt] = &space->val[i * bs2];
680:       perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
681:       cnt++;
682:     }
683:   }
685:   PetscSortIntWithArrayPair(n, row, col, perm);
686:   /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
687:   for (rowstart = 0, cnt = 0, i = 1; i <= n; i++) {
688:     if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
689:       PetscInt colstart;
690:       PetscSortIntWithArray(i - rowstart, &col[rowstart], &perm[rowstart]);
691:       for (colstart = rowstart; colstart < i;) { /* Compress multiple insertions to the same location */
692:         PetscInt       j, l;
693:         MatStashBlock *block;
694:         PetscSegBufferGet(stash->segsendblocks, 1, &block);
695:         block->row = row[rowstart];
696:         block->col = col[colstart];
697:         PetscArraycpy(block->vals, valptr[perm[colstart]], bs2);
698:         for (j = colstart + 1; j < i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
699:           if (insertmode == ADD_VALUES) {
700:             for (l = 0; l < bs2; l++) block->vals[l] += valptr[perm[j]][l];
701:           } else {
702:             PetscArraycpy(block->vals, valptr[perm[j]], bs2);
703:           }
704:         }
705:         colstart = j;
706:       }
707:       rowstart = i;
708:     }
709:   }
710:   PetscFree4(row, col, valptr, perm);
711:   return 0;
712: }

714: static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
715: {
716:   if (stash->blocktype == MPI_DATATYPE_NULL) {
717:     PetscInt     bs2 = PetscSqr(stash->bs);
718:     PetscMPIInt  blocklens[2];
719:     MPI_Aint     displs[2];
720:     MPI_Datatype types[2], stype;
721:     /*
722:         DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
723:        std::complex itself has standard layout, and so does DummyBlock, recursively.
724:        To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
725:        though they can have different alignment, e.g., 16 bytes for double complex, instead of 8 bytes as in GCC's libstdc++.
726:        offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

728:        We can test if std::complex has standard layout with the following code:
729:        #include <iostream>
730:        #include <type_traits>
731:        #include <complex>
732:        int main() {
733:          std::cout << std::boolalpha;
734:          std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
735:        }
736:        Output: true
737:      */
738:     struct DummyBlock {
739:       PetscInt    row, col;
740:       PetscScalar vals;
741:     };

743:     stash->blocktype_size = offsetof(struct DummyBlock, vals) + bs2 * sizeof(PetscScalar);
744:     if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
745:       stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
746:     }
747:     PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segsendblocks);
748:     PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segrecvblocks);
749:     PetscSegBufferCreate(sizeof(MatStashFrame), 1, &stash->segrecvframe);
750:     blocklens[0] = 2;
751:     blocklens[1] = bs2;
752:     displs[0]    = offsetof(struct DummyBlock, row);
753:     displs[1]    = offsetof(struct DummyBlock, vals);
754:     types[0]     = MPIU_INT;
755:     types[1]     = MPIU_SCALAR;
756:     MPI_Type_create_struct(2, blocklens, displs, types, &stype);
757:     MPI_Type_commit(&stype);
758:     MPI_Type_create_resized(stype, 0, stash->blocktype_size, &stash->blocktype);
759:     MPI_Type_commit(&stash->blocktype);
760:     MPI_Type_free(&stype);
761:   }
762:   return 0;
763: }
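
/* Editor's note: a worked example of the layout computed above, not part of matstash.c. With 4-byte
   PetscInt, 8-byte real PetscScalar, and bs = 1: offsetof(vals) = 8, so blocktype_size = 8 + 1*8 = 16,
   and no padding is needed. With 8-byte PetscInt (--with-64-bit-indices) and 4-byte single-precision
   PetscScalar, bs = 1 gives 16 + 4 = 20 bytes, which is padded up to 24 so that PetscInt alignment
   still holds when blocks are packed back to back with this stride. */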

765: /* Callback invoked after target rank has initiated receive of rendezvous message.
766:  * Here we post the main sends.
767:  */
768: static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rankid, PetscMPIInt rank, void *sdata, MPI_Request req[], void *ctx)
769: {
770:   MatStash       *stash = (MatStash *)ctx;
771:   MatStashHeader *hdr   = (MatStashHeader *)sdata;

774:   MPI_Isend(stash->sendframes[rankid].buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]);
775:   stash->sendframes[rankid].count   = hdr->count;
776:   stash->sendframes[rankid].pending = 1;
777:   return 0;
778: }

780: /*
781:     Callback invoked by target after receiving rendezvous message.
782:     Here we post the main recvs.
783:  */
784: static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rank, void *rdata, MPI_Request req[], void *ctx)
785: {
786:   MatStash       *stash = (MatStash *)ctx;
787:   MatStashHeader *hdr   = (MatStashHeader *)rdata;
788:   MatStashFrame  *frame;

790:   PetscSegBufferGet(stash->segrecvframe, 1, &frame);
791:   PetscSegBufferGet(stash->segrecvblocks, hdr->count, &frame->buffer);
792:   MPI_Irecv(frame->buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]);
793:   frame->count   = hdr->count;
794:   frame->pending = 1;
795:   return 0;
796: }

798: /*
799:  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
800:  */
801: static PetscErrorCode MatStashScatterBegin_BTS(Mat mat, MatStash *stash, PetscInt owners[])
802: {
803:   size_t nblocks;
804:   char  *sendblocks;

806:   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
807:     InsertMode addv;
808:     MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat));
810:   }

812:   MatStashBlockTypeSetUp(stash);
813:   MatStashSortCompress_Private(stash, mat->insertmode);
814:   PetscSegBufferGetSize(stash->segsendblocks, &nblocks);
815:   PetscSegBufferExtractInPlace(stash->segsendblocks, &sendblocks);
816:   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent to before */
817:     PetscInt i;
818:     size_t   b;
819:     for (i = 0, b = 0; i < stash->nsendranks; i++) {
820:       stash->sendframes[i].buffer = &sendblocks[b * stash->blocktype_size];
821:       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
822:       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
823:       for (; b < nblocks; b++) {
824:         MatStashBlock *sendblock_b = (MatStashBlock *)&sendblocks[b * stash->blocktype_size];
826:         if (sendblock_b->row >= owners[stash->sendranks[i] + 1]) break;
827:         stash->sendhdr[i].count++;
828:       }
829:     }
830:   } else { /* Dynamically count and pack (first time) */
831:     PetscInt sendno;
832:     size_t   i, rowstart;

834:     /* Count number of send ranks and allocate for sends */
835:     stash->nsendranks = 0;
836:     for (rowstart = 0; rowstart < nblocks;) {
837:       PetscInt       owner;
838:       MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
839:       PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner);
840:       if (owner < 0) owner = -(owner + 2);
841:       for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
842:         MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
843:         if (sendblock_i->row >= owners[owner + 1]) break;
844:       }
845:       stash->nsendranks++;
846:       rowstart = i;
847:     }
848:     PetscMalloc3(stash->nsendranks, &stash->sendranks, stash->nsendranks, &stash->sendhdr, stash->nsendranks, &stash->sendframes);

850:     /* Set up sendhdrs and sendframes */
851:     sendno = 0;
852:     for (rowstart = 0; rowstart < nblocks;) {
853:       PetscInt       owner;
854:       MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
855:       PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner);
856:       if (owner < 0) owner = -(owner + 2);
857:       stash->sendranks[sendno] = owner;
858:       for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
859:         MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
860:         if (sendblock_i->row >= owners[owner + 1]) break;
861:       }
862:       stash->sendframes[sendno].buffer  = sendblock_rowstart;
863:       stash->sendframes[sendno].pending = 0;
864:       stash->sendhdr[sendno].count      = i - rowstart;
865:       sendno++;
866:       rowstart = i;
867:     }
869:   }

871:   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
872:    * message or a dummy entry of some sort. */
873:   if (mat->insertmode == INSERT_VALUES) {
874:     size_t i;
875:     for (i = 0; i < nblocks; i++) {
876:       MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
877:       sendblock_i->row           = -(sendblock_i->row + 1);
878:     }
879:   }

881:   if (stash->first_assembly_done) {
882:     PetscMPIInt i, tag;
883:     PetscCommGetNewTag(stash->comm, &tag);
884:     for (i = 0; i < stash->nrecvranks; i++) MatStashBTSRecv_Private(stash->comm, &tag, stash->recvranks[i], &stash->recvhdr[i], &stash->recvreqs[i], stash);
885:     for (i = 0; i < stash->nsendranks; i++) MatStashBTSSend_Private(stash->comm, &tag, i, stash->sendranks[i], &stash->sendhdr[i], &stash->sendreqs[i], stash);
886:     stash->use_status = PETSC_TRUE; /* Use count from message status. */
887:   } else {
888:     PetscCommBuildTwoSidedFReq(stash->comm, 1, MPIU_INT, stash->nsendranks, stash->sendranks, (PetscInt *)stash->sendhdr, &stash->nrecvranks, &stash->recvranks, (PetscInt *)&stash->recvhdr, 1, &stash->sendreqs, &stash->recvreqs, MatStashBTSSend_Private, MatStashBTSRecv_Private, stash);
889:     PetscMalloc2(stash->nrecvranks, &stash->some_indices, stash->nrecvranks, &stash->some_statuses);
890:     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
891:   }

893:   PetscSegBufferExtractInPlace(stash->segrecvframe, &stash->recvframes);
894:   stash->recvframe_active    = NULL;
895:   stash->recvframe_i         = 0;
896:   stash->some_i              = 0;
897:   stash->some_count          = 0;
898:   stash->recvcount           = 0;
899:   stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
900:   stash->insertmode          = &mat->insertmode;
901:   return 0;
902: }

904: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash, PetscMPIInt *n, PetscInt **row, PetscInt **col, PetscScalar **val, PetscInt *flg)
905: {
906:   MatStashBlock *block;

908:   *flg = 0;
909:   while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
910:     if (stash->some_i == stash->some_count) {
911:       if (stash->recvcount == stash->nrecvranks) return 0; /* Done */
912:       MPI_Waitsome(stash->nrecvranks, stash->recvreqs, &stash->some_count, stash->some_indices, stash->use_status ? stash->some_statuses : MPI_STATUSES_IGNORE);
913:       stash->some_i = 0;
914:     }
915:     stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
916:     stash->recvframe_count  = stash->recvframe_active->count; /* From header; maximum count */
917:     if (stash->use_status) {                                  /* Count what was actually sent */
918:       MPI_Get_count(&stash->some_statuses[stash->some_i], stash->blocktype, &stash->recvframe_count);
919:     }
920:     if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
921:       block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[0];
922:       if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
925:     }
926:     stash->some_i++;
927:     stash->recvcount++;
928:     stash->recvframe_i = 0;
929:   }
930:   *n    = 1;
931:   block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[stash->recvframe_i * stash->blocktype_size];
932:   if (block->row < 0) block->row = -(block->row + 1);
933:   *row = &block->row;
934:   *col = &block->col;
935:   *val = block->vals;
936:   stash->recvframe_i++;
937:   *flg = 1;
938:   return 0;
939: }

941: static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
942: {
943:   MPI_Waitall(stash->nsendranks, stash->sendreqs, MPI_STATUSES_IGNORE);
944:   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
945:     PetscSegBufferExtractInPlace(stash->segrecvblocks, NULL);
946:   } else { /* No reuse, so collect everything. */
947:     MatStashScatterDestroy_BTS(stash);
948:   }

950:   /* Now update oldnmax to be approximately 10% more than the maximum n used, this way the
951:      wastage of space is reduced the next time this stash is used.
952:      Also update oldnmax only if it increases */
953:   if (stash->n) {
954:     PetscInt bs2     = stash->bs * stash->bs;
955:     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
956:     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
957:   }

959:   stash->nmax       = 0;
960:   stash->n          = 0;
961:   stash->reallocs   = -1;
962:   stash->nprocessed = 0;

964:   PetscMatStashSpaceDestroy(&stash->space_head);

966:   stash->space = NULL;

968:   return 0;
969: }

971: PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
972: {
973:   PetscSegBufferDestroy(&stash->segsendblocks);
974:   PetscSegBufferDestroy(&stash->segrecvframe);
975:   stash->recvframes = NULL;
976:   PetscSegBufferDestroy(&stash->segrecvblocks);
977:   if (stash->blocktype != MPI_DATATYPE_NULL) MPI_Type_free(&stash->blocktype);
978:   stash->nsendranks = 0;
979:   stash->nrecvranks = 0;
980:   PetscFree3(stash->sendranks, stash->sendhdr, stash->sendframes);
981:   PetscFree(stash->sendreqs);
982:   PetscFree(stash->recvreqs);
983:   PetscFree(stash->recvranks);
984:   PetscFree(stash->recvhdr);
985:   PetscFree2(stash->some_indices, stash->some_statuses);
986:   return 0;
987: }
988: #endif