Actual source code: aomemscalable.c


  2: /*
  3:     The memory scalable AO application ordering routines. These store the
  4:   orderings on each processor for that processor's range of values
  5: */

  7: #include <../src/vec/is/ao/aoimpl.h>

  9: typedef struct {
 10:   PetscInt   *app_loc;   /* app_loc[i] is the partner for the ith local PETSc slot */
 11:   PetscInt   *petsc_loc; /* petsc_loc[j] is the partner for the jth local app slot */
 12:   PetscLayout map;       /* determines the local sizes of ao */
 13: } AO_MemoryScalable;
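/*
   Illustrative sketch (added for clarity, not part of the original source): with 2 ranks and
   N = 4 split evenly, rank 0 owns global indices 0 and 1 of the layout `map`. There app_loc[i]
   holds the application index paired with global PETSc index i, and petsc_loc[j] holds the
   PETSc index paired with global application index j; rank 1 stores the analogous data for
   global indices 2 and 3.
*/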

 15: /*
 16:        All processors ship the data to process 0 to be printed; note that this is not scalable because
 17:        process 0 allocates space for all the ordering entries across all the processes
 18: */
 19: PetscErrorCode AOView_MemoryScalable(AO ao, PetscViewer viewer)
 20: {
 21:   PetscMPIInt        rank, size;
 22:   AO_MemoryScalable *aomems = (AO_MemoryScalable *)ao->data;
 23:   PetscBool          iascii;
 24:   PetscMPIInt        tag_app, tag_petsc;
 25:   PetscLayout        map = aomems->map;
 26:   PetscInt          *app, *app_loc, *petsc, *petsc_loc, len, i, j;
 27:   MPI_Status         status;

 29:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);

 32:   MPI_Comm_rank(PetscObjectComm((PetscObject)ao), &rank);
 33:   MPI_Comm_size(PetscObjectComm((PetscObject)ao), &size);

 35:   PetscObjectGetNewTag((PetscObject)ao, &tag_app);
 36:   PetscObjectGetNewTag((PetscObject)ao, &tag_petsc);

 38:   if (rank == 0) {
 39:     PetscViewerASCIIPrintf(viewer, "Number of elements in ordering %" PetscInt_FMT "\n", ao->N);
 40:     PetscViewerASCIIPrintf(viewer, "PETSc->App  App->PETSc\n");

 42:     PetscMalloc2(map->N, &app, map->N, &petsc);
 43:     len = map->n;
 44:     /* print local AO */
 45:     PetscViewerASCIIPrintf(viewer, "Process [%d]\n", rank);
 46:     for (i = 0; i < len; i++) PetscViewerASCIIPrintf(viewer, "%3" PetscInt_FMT "  %3" PetscInt_FMT "    %3" PetscInt_FMT "  %3" PetscInt_FMT "\n", i, aomems->app_loc[i], i, aomems->petsc_loc[i]);

 48:     /* recv and print off-processor's AO */
 49:     for (i = 1; i < size; i++) {
 50:       len       = map->range[i + 1] - map->range[i];
 51:       app_loc   = app + map->range[i];
 52:       petsc_loc = petsc + map->range[i];
 53:       MPI_Recv(app_loc, (PetscMPIInt)len, MPIU_INT, i, tag_app, PetscObjectComm((PetscObject)ao), &status);
 54:       MPI_Recv(petsc_loc, (PetscMPIInt)len, MPIU_INT, i, tag_petsc, PetscObjectComm((PetscObject)ao), &status);
 55:       PetscViewerASCIIPrintf(viewer, "Process [%" PetscInt_FMT "]\n", i);
 56:       for (j = 0; j < len; j++) PetscViewerASCIIPrintf(viewer, "%3" PetscInt_FMT "  %3" PetscInt_FMT "    %3" PetscInt_FMT "  %3" PetscInt_FMT "\n", map->range[i] + j, app_loc[j], map->range[i] + j, petsc_loc[j]);
 57:     }
 58:     PetscFree2(app, petsc);

 60:   } else {
 61:     /* send values */
 62:     MPI_Send((void *)aomems->app_loc, map->n, MPIU_INT, 0, tag_app, PetscObjectComm((PetscObject)ao));
 63:     MPI_Send((void *)aomems->petsc_loc, map->n, MPIU_INT, 0, tag_petsc, PetscObjectComm((PetscObject)ao));
 64:   }
 65:   PetscViewerFlush(viewer);
 66:   return 0;
 67: }
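/*
   Usage sketch (added for illustration; assumes an already-created AO named `ao`): this viewer
   is normally reached through the generic interface, e.g.

     AOView(ao, PETSC_VIEWER_STDOUT_WORLD);   // collective; rank 0 gathers and prints the full ordering

   or from the command line via -ao_view, handled by AOViewFromOptions() further below.
*/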

 69: PetscErrorCode AODestroy_MemoryScalable(AO ao)
 70: {
 71:   AO_MemoryScalable *aomems = (AO_MemoryScalable *)ao->data;

 73:   PetscFree2(aomems->app_loc, aomems->petsc_loc);
 74:   PetscLayoutDestroy(&aomems->map);
 75:   PetscFree(aomems);
 76:   return 0;
 77: }

 79: /*
 80:    Input Parameters:
 81: +   ao - the application ordering context
 82: .   n  - the number of integers in ia[]
 83: .   ia - the integers; these are replaced with their mapped values
 84: -   maploc - app_loc or petsc_loc in struct "AO_MemoryScalable"

 86:    Output Parameter:
 87: .   ia - the mapped integers
 88:  */
 89: PetscErrorCode AOMap_MemoryScalable_private(AO ao, PetscInt n, PetscInt *ia, const PetscInt *maploc)
 90: {
 91:   AO_MemoryScalable *aomems = (AO_MemoryScalable *)ao->data;
 92:   MPI_Comm           comm;
 93:   PetscMPIInt        rank, size, tag1, tag2;
 94:   PetscInt          *owner, *start, *sizes, nsends, nreceives;
 95:   PetscInt           nmax, count, *sindices, *rindices, i, j, idx, lastidx, *sindices2, *rindices2;
 96:   const PetscInt    *owners = aomems->map->range;
 97:   MPI_Request       *send_waits, *recv_waits, *send_waits2, *recv_waits2;
 98:   MPI_Status         recv_status;
 99:   PetscMPIInt        nindices, source, widx;
100:   PetscInt          *rbuf, *sbuf;
101:   MPI_Status        *send_status, *send_status2;

103:   PetscObjectGetComm((PetscObject)ao, &comm);
104:   MPI_Comm_rank(comm, &rank);
105:   MPI_Comm_size(comm, &size);

107:   /*  first count number of contributors to each processor */
108:   PetscMalloc1(size, &start);
109:   PetscCalloc2(2 * size, &sizes, n, &owner);

111:   j       = 0;
112:   lastidx = -1;
113:   for (i = 0; i < n; i++) {
114:     if (ia[i] < 0) owner[i] = -1;      /* mark negative entries (which are not to be mapped) with a special negative value */
115:     if (ia[i] >= ao->N) owner[i] = -2; /* mark out of range entries with special negative value */
116:     else {
117:       /* if indices are NOT locally sorted, need to start search at the beginning */
118:       if (lastidx > (idx = ia[i])) j = 0;
119:       lastidx = idx;
120:       for (; j < size; j++) {
121:         if (idx >= owners[j] && idx < owners[j + 1]) {
122:           sizes[2 * j]++;       /* num of indices to be sent */
123:           sizes[2 * j + 1] = 1; /* send to proc[j] */
124:           owner[i]         = j;
125:           break;
126:         }
127:       }
128:     }
129:   }
130:   sizes[2 * rank] = sizes[2 * rank + 1] = 0; /* do not receive from self! */
131:   nsends                                = 0;
132:   for (i = 0; i < size; i++) nsends += sizes[2 * i + 1];

 134:   /* inform other processors of the number of messages and max length */
135:   PetscMaxSum(comm, sizes, &nmax, &nreceives);
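  /* clarifying note (an assumption based on how the results are used below): PetscMaxSum()
     reduces the sizes[] pairs over all ranks so that, on this rank, nmax is the length of the
     longest single message it will receive and nreceives is the number of messages it will
     receive; these size the receive buffers allocated next */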

137:   /* allocate arrays */
138:   PetscObjectGetNewTag((PetscObject)ao, &tag1);
139:   PetscObjectGetNewTag((PetscObject)ao, &tag2);

141:   PetscMalloc2(nreceives * nmax, &rindices, nreceives, &recv_waits);
142:   PetscMalloc2(nsends * nmax, &rindices2, nsends, &recv_waits2);

144:   PetscMalloc3(n, &sindices, nsends, &send_waits, nsends, &send_status);
145:   PetscMalloc3(n, &sindices2, nreceives, &send_waits2, nreceives, &send_status2);

 147:   /* post 1st receives: receive others' requests
 148:      since we don't know how long each individual message is, we
149:      allocate the largest needed buffer for each receive. Potentially
150:      this is a lot of wasted space.
151:   */
152:   for (i = 0, count = 0; i < nreceives; i++) MPI_Irecv(rindices + nmax * i, nmax, MPIU_INT, MPI_ANY_SOURCE, tag1, comm, recv_waits + count++);

154:   /* do 1st sends:
 155:       1) start[i] gives the starting index in sindices for stuff going to
156:          the ith processor
157:   */
158:   start[0] = 0;
159:   for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
160:   for (i = 0; i < n; i++) {
161:     j = owner[i];
162:     if (j == -1) continue; /* do not remap negative entries in ia[] */
163:     else if (j == -2) { /* out of range entries get mapped to -1 */ ia[i] = -1;
164:       continue;
165:     } else if (j != rank) {
166:       sindices[start[j]++] = ia[i];
167:     } else { /* compute my own map */
168:       ia[i] = maploc[ia[i] - owners[rank]];
169:     }
170:   }

172:   start[0] = 0;
173:   for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
174:   for (i = 0, count = 0; i < size; i++) {
175:     if (sizes[2 * i + 1]) {
176:       /* send my request to others */
177:       MPI_Isend(sindices + start[i], sizes[2 * i], MPIU_INT, i, tag1, comm, send_waits + count);
 178:       /* post a receive for the answer to my request */
179:       MPI_Irecv(sindices2 + start[i], sizes[2 * i], MPIU_INT, i, tag2, comm, recv_waits2 + count);
180:       count++;
181:     }
182:   }

185:   /* wait on 1st sends */
186:   if (nsends) MPI_Waitall(nsends, send_waits, send_status);

 188:   /* 1st recvs: others' requests */
189:   for (j = 0; j < nreceives; j++) {
 190:     MPI_Waitany(nreceives, recv_waits, &widx, &recv_status); /* widx: index of the request that completed */
191:     MPI_Get_count(&recv_status, MPIU_INT, &nindices);
192:     rbuf   = rindices + nmax * widx; /* global index */
193:     source = recv_status.MPI_SOURCE;

195:     /* compute mapping */
196:     sbuf = rbuf;
197:     for (i = 0; i < nindices; i++) sbuf[i] = maploc[rbuf[i] - owners[rank]];

199:     /* send mapping back to the sender */
200:     MPI_Isend(sbuf, nindices, MPIU_INT, source, tag2, comm, send_waits2 + widx);
201:   }

203:   /* wait on 2nd sends */
204:   if (nreceives) MPI_Waitall(nreceives, send_waits2, send_status2);

 206:   /* 2nd recvs: the answers to my requests */
207:   for (j = 0; j < nsends; j++) {
208:     MPI_Waitany(nsends, recv_waits2, &widx, &recv_status);
209:     MPI_Get_count(&recv_status, MPIU_INT, &nindices);
210:     source = recv_status.MPI_SOURCE;
211:     /* pack output ia[] */
212:     rbuf  = sindices2 + start[source];
213:     count = 0;
214:     for (i = 0; i < n; i++) {
215:       if (source == owner[i]) ia[i] = rbuf[count++];
216:     }
217:   }

219:   /* free arrays */
220:   PetscFree(start);
221:   PetscFree2(sizes, owner);
222:   PetscFree2(rindices, recv_waits);
223:   PetscFree2(rindices2, recv_waits2);
224:   PetscFree3(sindices, send_waits, send_status);
225:   PetscFree3(sindices2, send_waits2, send_status2);
226:   return 0;
227: }

229: PetscErrorCode AOPetscToApplication_MemoryScalable(AO ao, PetscInt n, PetscInt *ia)
230: {
231:   AO_MemoryScalable *aomems  = (AO_MemoryScalable *)ao->data;
232:   PetscInt          *app_loc = aomems->app_loc;

234:   AOMap_MemoryScalable_private(ao, n, ia, app_loc);
235:   return 0;
236: }

238: PetscErrorCode AOApplicationToPetsc_MemoryScalable(AO ao, PetscInt n, PetscInt *ia)
239: {
240:   AO_MemoryScalable *aomems    = (AO_MemoryScalable *)ao->data;
241:   PetscInt          *petsc_loc = aomems->petsc_loc;

243:   AOMap_MemoryScalable_private(ao, n, ia, petsc_loc);
244:   return 0;
245: }
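/*
   Usage sketch (added for illustration; `ao` and the index values are assumptions): the two
   wrappers above are reached through the generic AO interface and map an integer array in
   place, e.g.

     PetscInt idx[3] = {0, 3, 5};
     AOApplicationToPetsc(ao, 3, idx);   // idx[] now holds the corresponding PETSc indices
     AOPetscToApplication(ao, 3, idx);   // maps back to the application numbering
*/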

247: static struct _AOOps AOOps_MemoryScalable = {
248:   PetscDesignatedInitializer(view, AOView_MemoryScalable),
249:   PetscDesignatedInitializer(destroy, AODestroy_MemoryScalable),
250:   PetscDesignatedInitializer(petsctoapplication, AOPetscToApplication_MemoryScalable),
251:   PetscDesignatedInitializer(applicationtopetsc, AOApplicationToPetsc_MemoryScalable),
252: };

254: PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm, PetscInt napp, const PetscInt from_array[], const PetscInt to_array[], AO ao, PetscInt *aomap_loc)
255: {
256:   AO_MemoryScalable *aomems  = (AO_MemoryScalable *)ao->data;
257:   PetscLayout        map     = aomems->map;
258:   PetscInt           n_local = map->n, i, j;
259:   PetscMPIInt        rank, size, tag;
260:   PetscInt          *owner, *start, *sizes, nsends, nreceives;
261:   PetscInt           nmax, count, *sindices, *rindices, idx, lastidx;
262:   PetscInt          *owners = aomems->map->range;
263:   MPI_Request       *send_waits, *recv_waits;
264:   MPI_Status         recv_status;
265:   PetscMPIInt        nindices, widx;
266:   PetscInt          *rbuf;
267:   PetscInt           n = napp, ip, ia;
268:   MPI_Status        *send_status;

270:   PetscArrayzero(aomap_loc, n_local);

272:   MPI_Comm_rank(comm, &rank);
273:   MPI_Comm_size(comm, &size);

275:   /*  first count number of contributors (of from_array[]) to each processor */
276:   PetscCalloc1(2 * size, &sizes);
277:   PetscMalloc1(n, &owner);

279:   j       = 0;
280:   lastidx = -1;
281:   for (i = 0; i < n; i++) {
282:     /* if indices are NOT locally sorted, need to start search at the beginning */
283:     if (lastidx > (idx = from_array[i])) j = 0;
284:     lastidx = idx;
285:     for (; j < size; j++) {
286:       if (idx >= owners[j] && idx < owners[j + 1]) {
287:         sizes[2 * j] += 2;    /* num of indices to be sent - in pairs (ip,ia) */
288:         sizes[2 * j + 1] = 1; /* send to proc[j] */
289:         owner[i]         = j;
290:         break;
291:       }
292:     }
293:   }
294:   sizes[2 * rank] = sizes[2 * rank + 1] = 0; /* do not receive from self! */
295:   nsends                                = 0;
296:   for (i = 0; i < size; i++) nsends += sizes[2 * i + 1];

 298:   /* inform other processors of the number of messages and max length */
299:   PetscMaxSum(comm, sizes, &nmax, &nreceives);

301:   /* allocate arrays */
302:   PetscObjectGetNewTag((PetscObject)ao, &tag);
303:   PetscMalloc2(nreceives * nmax, &rindices, nreceives, &recv_waits);
304:   PetscMalloc3(2 * n, &sindices, nsends, &send_waits, nsends, &send_status);
305:   PetscMalloc1(size, &start);

307:   /* post receives: */
308:   for (i = 0; i < nreceives; i++) MPI_Irecv(rindices + nmax * i, nmax, MPIU_INT, MPI_ANY_SOURCE, tag, comm, recv_waits + i);

310:   /* do sends:
 311:       1) start[i] gives the starting index in sindices for stuff going to
312:          the ith processor
313:   */
314:   start[0] = 0;
315:   for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
316:   for (i = 0; i < n; i++) {
317:     j = owner[i];
318:     if (j != rank) {
319:       ip                   = from_array[i];
320:       ia                   = to_array[i];
321:       sindices[start[j]++] = ip;
322:       sindices[start[j]++] = ia;
323:     } else { /* compute my own map */
324:       ip            = from_array[i] - owners[rank];
325:       ia            = to_array[i];
326:       aomap_loc[ip] = ia;
327:     }
328:   }

330:   start[0] = 0;
331:   for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
332:   for (i = 0, count = 0; i < size; i++) {
333:     if (sizes[2 * i + 1]) {
334:       MPI_Isend(sindices + start[i], sizes[2 * i], MPIU_INT, i, tag, comm, send_waits + count);
335:       count++;
336:     }
337:   }

340:   /* wait on sends */
341:   if (nsends) MPI_Waitall(nsends, send_waits, send_status);

343:   /* recvs */
344:   count = 0;
345:   for (j = nreceives; j > 0; j--) {
346:     MPI_Waitany(nreceives, recv_waits, &widx, &recv_status);
347:     MPI_Get_count(&recv_status, MPIU_INT, &nindices);
348:     rbuf = rindices + nmax * widx; /* global index */

350:     /* compute local mapping */
351:     for (i = 0; i < nindices; i += 2) {       /* pack aomap_loc */
352:       ip            = rbuf[i] - owners[rank]; /* local index */
353:       ia            = rbuf[i + 1];
354:       aomap_loc[ip] = ia;
355:     }
356:     count++;
357:   }

359:   PetscFree(start);
360:   PetscFree3(sindices, send_waits, send_status);
361:   PetscFree2(rindices, recv_waits);
362:   PetscFree(owner);
363:   PetscFree(sizes);
364:   return 0;
365: }

367: PETSC_EXTERN PetscErrorCode AOCreate_MemoryScalable(AO ao)
368: {
369:   IS                 isapp = ao->isapp, ispetsc = ao->ispetsc;
370:   const PetscInt    *mypetsc, *myapp;
371:   PetscInt           napp, n_local, N, i, start, *petsc, *lens, *disp;
372:   MPI_Comm           comm;
373:   AO_MemoryScalable *aomems;
374:   PetscLayout        map;
375:   PetscMPIInt        size, rank;

378:   /* create special struct aomems */
379:   PetscNew(&aomems);
380:   ao->data = (void *)aomems;
381:   PetscMemcpy(ao->ops, &AOOps_MemoryScalable, sizeof(struct _AOOps));
382:   PetscObjectChangeTypeName((PetscObject)ao, AOMEMORYSCALABLE);

384:   /* transmit all local lengths of isapp to all processors */
385:   PetscObjectGetComm((PetscObject)isapp, &comm);
386:   MPI_Comm_size(comm, &size);
387:   MPI_Comm_rank(comm, &rank);
388:   PetscMalloc2(size, &lens, size, &disp);
389:   ISGetLocalSize(isapp, &napp);
390:   MPI_Allgather(&napp, 1, MPIU_INT, lens, 1, MPIU_INT, comm);

392:   N = 0;
393:   for (i = 0; i < size; i++) {
394:     disp[i] = N;
395:     N += lens[i];
396:   }

 398:   /* If ispetsc is NULL then use the "natural" numbering */
399:   if (napp) {
400:     if (!ispetsc) {
401:       start = disp[rank];
402:       PetscMalloc1(napp + 1, &petsc);
403:       for (i = 0; i < napp; i++) petsc[i] = start + i;
404:     } else {
405:       ISGetIndices(ispetsc, &mypetsc);
406:       petsc = (PetscInt *)mypetsc;
407:     }
408:   } else {
409:     petsc = NULL;
410:   }

412:   /* create a map with global size N - used to determine the local sizes of ao - shall we use local napp instead of N? */
413:   PetscLayoutCreate(comm, &map);
414:   map->bs = 1;
415:   map->N  = N;
416:   PetscLayoutSetUp(map);

418:   ao->N       = N;
419:   ao->n       = map->n;
420:   aomems->map = map;

422:   /* create distributed indices app_loc: petsc->app and petsc_loc: app->petsc */
423:   n_local = map->n;
424:   PetscCalloc2(n_local, &aomems->app_loc, n_local, &aomems->petsc_loc);
425:   ISGetIndices(isapp, &myapp);

427:   AOCreateMemoryScalable_private(comm, napp, petsc, myapp, ao, aomems->app_loc);
428:   AOCreateMemoryScalable_private(comm, napp, myapp, petsc, ao, aomems->petsc_loc);

430:   ISRestoreIndices(isapp, &myapp);
431:   if (napp) {
432:     if (ispetsc) {
433:       ISRestoreIndices(ispetsc, &mypetsc);
434:     } else {
435:       PetscFree(petsc);
436:     }
437:   }
438:   PetscFree2(lens, disp);
439:   return 0;
440: }

442: /*@C
443:    AOCreateMemoryScalable - Creates a memory scalable application ordering using two integer arrays.

445:    Collective

447:    Input Parameters:
448: +  comm - MPI communicator that is to share the `AO`
449: .  napp - size of integer arrays
450: .  myapp - integer array that defines an ordering
451: -  mypetsc - integer array that defines another ordering (may be NULL to
452:              indicate the natural ordering, that is 0,1,2,3,...)

454:    Output Parameter:
455: .  aoout - the new application ordering

457:    Level: beginner

459:     Note:
 460:     The arrays myapp and mypetsc must contain all the integers 0 to napp-1 with no duplicates; that is, there cannot be any "holes"
461:     in the indices. Use `AOCreateMapping()` or `AOCreateMappingIS()` if you wish to have "holes" in the indices.
 462:     Compared with `AOCreateBasic()`, this routine uses less memory at the cost of more message communication.

464: .seealso: [](sec_ao), [](sec_scatter), `AO`, `AOCreateMemoryScalableIS()`, `AODestroy()`, `AOPetscToApplication()`, `AOApplicationToPetsc()`
465: @*/
466: PetscErrorCode AOCreateMemoryScalable(MPI_Comm comm, PetscInt napp, const PetscInt myapp[], const PetscInt mypetsc[], AO *aoout)
467: {
468:   IS              isapp, ispetsc;
469:   const PetscInt *app = myapp, *petsc = mypetsc;

471:   ISCreateGeneral(comm, napp, app, PETSC_USE_POINTER, &isapp);
472:   if (mypetsc) {
473:     ISCreateGeneral(comm, napp, petsc, PETSC_USE_POINTER, &ispetsc);
474:   } else {
475:     ispetsc = NULL;
476:   }
477:   AOCreateMemoryScalableIS(isapp, ispetsc, aoout);
478:   ISDestroy(&isapp);
479:   if (mypetsc) ISDestroy(&ispetsc);
480:   return 0;
481: }
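/*
   Usage sketch (added for illustration; the two-rank layout and values are assumptions): each
   rank passes its own slice of the application ordering, e.g. on a 2-process run

     PetscInt app[2] = {2, 1};                                       // rank 1 would supply {0, 3}
     AO       ao;
     AOCreateMemoryScalable(PETSC_COMM_WORLD, 2, app, NULL, &ao);    // NULL -> natural PETSc ordering
     ...
     AODestroy(&ao);
*/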

483: /*@C
484:    AOCreateMemoryScalableIS - Creates a memory scalable application ordering using two index sets.

486:    Collective

488:    Input Parameters:
489: +  isapp - index set that defines an ordering
490: -  ispetsc - index set that defines another ordering (may be NULL to use the
491:              natural ordering)

493:    Output Parameter:
494: .  aoout - the new application ordering

496:    Level: beginner

498:     Notes:
 499:     The index sets isapp and ispetsc must contain all the integers 0 to napp-1 (where napp is the length of the index sets) with no duplicates;
 500:     that is, there cannot be any "holes".

 502:     Compared with `AOCreateBasicIS()`, this routine uses less memory at the cost of more message communication.

504: .seealso: [](sec_ao), [](sec_scatter), `AO`, `AOCreateBasicIS()`, `AOCreateMemoryScalable()`, `AODestroy()`
505: @*/
506: PetscErrorCode AOCreateMemoryScalableIS(IS isapp, IS ispetsc, AO *aoout)
507: {
508:   MPI_Comm comm;
509:   AO       ao;

511:   PetscObjectGetComm((PetscObject)isapp, &comm);
512:   AOCreate(comm, &ao);
513:   AOSetIS(ao, isapp, ispetsc);
514:   AOSetType(ao, AOMEMORYSCALABLE);
515:   AOViewFromOptions(ao, NULL, "-ao_view");
516:   *aoout = ao;
517:   return 0;
518: }
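/*
   Usage sketch (added for illustration; `comm`, `napp`, and `first` are assumed/hypothetical):
   the same ordering can be built from index sets instead of raw arrays, e.g.

     IS isapp;
     AO ao;
     ISCreateStride(comm, napp, first, 1, &isapp);   // hypothetical application ordering for this rank
     AOCreateMemoryScalableIS(isapp, NULL, &ao);     // NULL -> natural PETSc ordering
     ISDestroy(&isapp);
     AODestroy(&ao);
*/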