Actual source code: aomemscalable.c
/*
   The memory scalable AO application ordering routines. These store the
   orderings on each processor for that processor's range of values.
*/

#include <../src/vec/is/ao/aoimpl.h>
typedef struct {
  PetscInt   *app_loc;   /* app_loc[i] is the partner for the ith local PETSc slot */
  PetscInt   *petsc_loc; /* petsc_loc[j] is the partner for the jth local app slot */
  PetscLayout map;       /* determines the local sizes of ao */
} AO_MemoryScalable;

/*
   All processors ship the data to process 0 to be printed; note that this is not scalable because
   process 0 allocates space for all the ordering entries across all the processes.
*/
PetscErrorCode AOView_MemoryScalable(AO ao, PetscViewer viewer)
{
  PetscMPIInt        rank, size;
  AO_MemoryScalable *aomems = (AO_MemoryScalable *)ao->data;
  PetscBool          iascii;
  PetscMPIInt        tag_app, tag_petsc;
  PetscLayout        map = aomems->map;
  PetscInt          *app, *app_loc, *petsc, *petsc_loc, len, i, j;
  MPI_Status         status;

  PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
  if (!iascii) return 0; /* only ASCII viewers are handled here */

  MPI_Comm_rank(PetscObjectComm((PetscObject)ao), &rank);
  MPI_Comm_size(PetscObjectComm((PetscObject)ao), &size);

  PetscObjectGetNewTag((PetscObject)ao, &tag_app);
  PetscObjectGetNewTag((PetscObject)ao, &tag_petsc);

  if (rank == 0) {
    PetscViewerASCIIPrintf(viewer, "Number of elements in ordering %" PetscInt_FMT "\n", ao->N);
    PetscViewerASCIIPrintf(viewer, "PETSc->App App->PETSc\n");

    PetscMalloc2(map->N, &app, map->N, &petsc);
    len = map->n;
    /* print the local AO */
    PetscViewerASCIIPrintf(viewer, "Process [%d]\n", rank);
    for (i = 0; i < len; i++) PetscViewerASCIIPrintf(viewer, "%3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT "\n", i, aomems->app_loc[i], i, aomems->petsc_loc[i]);

    /* receive and print the off-process AOs */
    for (i = 1; i < size; i++) {
      len       = map->range[i + 1] - map->range[i];
      app_loc   = app + map->range[i];
      petsc_loc = petsc + map->range[i];
      MPI_Recv(app_loc, (PetscMPIInt)len, MPIU_INT, i, tag_app, PetscObjectComm((PetscObject)ao), &status);
      MPI_Recv(petsc_loc, (PetscMPIInt)len, MPIU_INT, i, tag_petsc, PetscObjectComm((PetscObject)ao), &status);
      PetscViewerASCIIPrintf(viewer, "Process [%" PetscInt_FMT "]\n", i);
      for (j = 0; j < len; j++) PetscViewerASCIIPrintf(viewer, "%3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT " %3" PetscInt_FMT "\n", map->range[i] + j, app_loc[j], map->range[i] + j, petsc_loc[j]);
    }
    PetscFree2(app, petsc);
  } else {
    /* send values to process 0 */
    MPI_Send((void *)aomems->app_loc, map->n, MPIU_INT, 0, tag_app, PetscObjectComm((PetscObject)ao));
    MPI_Send((void *)aomems->petsc_loc, map->n, MPIU_INT, 0, tag_petsc, PetscObjectComm((PetscObject)ao));
  }
  PetscViewerFlush(viewer);
  return 0;
}

PetscErrorCode AODestroy_MemoryScalable(AO ao)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable *)ao->data;

  PetscFree2(aomems->app_loc, aomems->petsc_loc);
  PetscLayoutDestroy(&aomems->map);
  PetscFree(aomems);
  return 0;
}

/*
   Input Parameters:
+  ao     - the application ordering context
.  n      - the number of integers in ia[]
.  ia     - the integers; these are replaced with their mapped values
-  maploc - app_loc or petsc_loc in struct "AO_MemoryScalable"

   Output Parameter:
.  ia     - the mapped integers
*/
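
/*
   Communication pattern (summarizing the code below): each process determines the owner of every
   index in ia[] from the AO's PetscLayout, ships those indices to their owning processes (first
   send/receive round), the owners translate them through their local maploc[] table, and the
   translated values are returned (second round) and scattered back into ia[] on the requesting
   process. Indices owned locally are mapped in place without any communication.
*/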
PetscErrorCode AOMap_MemoryScalable_private(AO ao, PetscInt n, PetscInt *ia, const PetscInt *maploc)
{
  AO_MemoryScalable *aomems = (AO_MemoryScalable *)ao->data;
  MPI_Comm           comm;
  PetscMPIInt        rank, size, tag1, tag2;
  PetscInt          *owner, *start, *sizes, nsends, nreceives;
  PetscInt           nmax, count, *sindices, *rindices, i, j, idx, lastidx, *sindices2, *rindices2;
  const PetscInt    *owners = aomems->map->range;
  MPI_Request       *send_waits, *recv_waits, *send_waits2, *recv_waits2;
  MPI_Status         recv_status;
  PetscMPIInt        nindices, source, widx;
  PetscInt          *rbuf, *sbuf;
  MPI_Status        *send_status, *send_status2;

  PetscObjectGetComm((PetscObject)ao, &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  /* first count number of contributors to each processor */
  PetscMalloc1(size, &start);
  PetscCalloc2(2 * size, &sizes, n, &owner);

  j       = 0;
  lastidx = -1;
  for (i = 0; i < n; i++) {
    if (ia[i] < 0) {
      owner[i] = -1; /* mark negative entries (which are not to be mapped) with a special negative value */
    } else if (ia[i] >= ao->N) {
      owner[i] = -2; /* mark out of range entries with special negative value */
    } else {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = ia[i])) j = 0;
      lastidx = idx;
      for (; j < size; j++) {
        if (idx >= owners[j] && idx < owners[j + 1]) {
          sizes[2 * j]++;       /* num of indices to be sent */
          sizes[2 * j + 1] = 1; /* send to proc[j] */
          owner[i] = j;
          break;
        }
      }
    }
  }
  sizes[2 * rank] = sizes[2 * rank + 1] = 0; /* do not receive from self! */
  nsends = 0;
  for (i = 0; i < size; i++) nsends += sizes[2 * i + 1];

  /* inform other processors of number of messages and max length */
  PetscMaxSum(comm, sizes, &nmax, &nreceives);

  /* allocate arrays */
  PetscObjectGetNewTag((PetscObject)ao, &tag1);
  PetscObjectGetNewTag((PetscObject)ao, &tag2);

  PetscMalloc2(nreceives * nmax, &rindices, nreceives, &recv_waits);
  PetscMalloc2(nsends * nmax, &rindices2, nsends, &recv_waits2);

  PetscMalloc3(n, &sindices, nsends, &send_waits, nsends, &send_status);
  PetscMalloc3(n, &sindices2, nreceives, &send_waits2, nreceives, &send_status2);

  /* post 1st receives: receive the others' requests.
     Since we don't know how long each individual message is, we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.
  */
  for (i = 0, count = 0; i < nreceives; i++) MPI_Irecv(rindices + nmax * i, nmax, MPIU_INT, MPI_ANY_SOURCE, tag1, comm, recv_waits + count++);

  /* do 1st sends:
     1) start[i] gives the starting index in sindices for stuff going to
        the ith processor
  */
  start[0] = 0;
  for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
  for (i = 0; i < n; i++) {
    j = owner[i];
    if (j == -1) continue; /* do not remap negative entries in ia[] */
    else if (j == -2) {    /* out of range entries get mapped to -1 */
      ia[i] = -1;
      continue;
    } else if (j != rank) {
      sindices[start[j]++] = ia[i];
    } else { /* compute my own map */
      ia[i] = maploc[ia[i] - owners[rank]];
    }
  }

  start[0] = 0;
  for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
  for (i = 0, count = 0; i < size; i++) {
    if (sizes[2 * i + 1]) {
      /* send my request to others */
      MPI_Isend(sindices + start[i], sizes[2 * i], MPIU_INT, i, tag1, comm, send_waits + count);
      /* post a receive for the answer to my request */
      MPI_Irecv(sindices2 + start[i], sizes[2 * i], MPIU_INT, i, tag2, comm, recv_waits2 + count);
      count++;
    }
  }

  /* wait on 1st sends */
  if (nsends) MPI_Waitall(nsends, send_waits, send_status);

  /* 1st recvs: process the others' requests */
  for (j = 0; j < nreceives; j++) {
    MPI_Waitany(nreceives, recv_waits, &widx, &recv_status); /* widx: index of the handle for the operation that completed */
    MPI_Get_count(&recv_status, MPIU_INT, &nindices);
    rbuf   = rindices + nmax * widx; /* global indices */
    source = recv_status.MPI_SOURCE;

    /* compute the mapping */
    sbuf = rbuf;
    for (i = 0; i < nindices; i++) sbuf[i] = maploc[rbuf[i] - owners[rank]];

    /* send the mapping back to the sender */
    MPI_Isend(sbuf, nindices, MPIU_INT, source, tag2, comm, send_waits2 + widx);
  }

  /* wait on 2nd sends */
  if (nreceives) MPI_Waitall(nreceives, send_waits2, send_status2);

  /* 2nd recvs: receive the answers to my requests */
  for (j = 0; j < nsends; j++) {
    MPI_Waitany(nsends, recv_waits2, &widx, &recv_status);
    MPI_Get_count(&recv_status, MPIU_INT, &nindices);
    source = recv_status.MPI_SOURCE;
    /* pack the output ia[] */
    rbuf  = sindices2 + start[source];
    count = 0;
    for (i = 0; i < n; i++) {
      if (source == owner[i]) ia[i] = rbuf[count++];
    }
  }

  /* free arrays */
  PetscFree(start);
  PetscFree2(sizes, owner);
  PetscFree2(rindices, recv_waits);
  PetscFree2(rindices2, recv_waits2);
  PetscFree3(sindices, send_waits, send_status);
  PetscFree3(sindices2, send_waits2, send_status2);
  return 0;
}

PetscErrorCode AOPetscToApplication_MemoryScalable(AO ao, PetscInt n, PetscInt *ia)
{
  AO_MemoryScalable *aomems  = (AO_MemoryScalable *)ao->data;
  PetscInt          *app_loc = aomems->app_loc;

  AOMap_MemoryScalable_private(ao, n, ia, app_loc);
  return 0;
}

PetscErrorCode AOApplicationToPetsc_MemoryScalable(AO ao, PetscInt n, PetscInt *ia)
{
  AO_MemoryScalable *aomems    = (AO_MemoryScalable *)ao->data;
  PetscInt          *petsc_loc = aomems->petsc_loc;

  AOMap_MemoryScalable_private(ao, n, ia, petsc_loc);
  return 0;
}

static struct _AOOps AOOps_MemoryScalable = {
  PetscDesignatedInitializer(view, AOView_MemoryScalable),
  PetscDesignatedInitializer(destroy, AODestroy_MemoryScalable),
  PetscDesignatedInitializer(petsctoapplication, AOPetscToApplication_MemoryScalable),
  PetscDesignatedInitializer(applicationtopetsc, AOApplicationToPetsc_MemoryScalable),
};
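
/*
   AOCreateMemoryScalable_private - fills the local portion of one of the two lookup tables
   (app_loc or petsc_loc, passed in as aomap_loc): each (from_array[i], to_array[i]) pair is
   shipped to the process that owns from_array[i] according to the AO's PetscLayout, and that
   process records to_array[i] in the corresponding local slot of aomap_loc.
*/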
PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm, PetscInt napp, const PetscInt from_array[], const PetscInt to_array[], AO ao, PetscInt *aomap_loc)
{
  AO_MemoryScalable *aomems  = (AO_MemoryScalable *)ao->data;
  PetscLayout        map     = aomems->map;
  PetscInt           n_local = map->n, i, j;
  PetscMPIInt        rank, size, tag;
  PetscInt          *owner, *start, *sizes, nsends, nreceives;
  PetscInt           nmax, count, *sindices, *rindices, idx, lastidx;
  PetscInt          *owners = aomems->map->range;
  MPI_Request       *send_waits, *recv_waits;
  MPI_Status         recv_status;
  PetscMPIInt        nindices, widx;
  PetscInt          *rbuf;
  PetscInt           n = napp, ip, ia;
  MPI_Status        *send_status;

  PetscArrayzero(aomap_loc, n_local);

  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  /* first count number of contributors (of from_array[]) to each processor */
  PetscCalloc1(2 * size, &sizes);
  PetscMalloc1(n, &owner);

  j       = 0;
  lastidx = -1;
  for (i = 0; i < n; i++) {
    /* if indices are NOT locally sorted, need to start search at the beginning */
    if (lastidx > (idx = from_array[i])) j = 0;
    lastidx = idx;
    for (; j < size; j++) {
      if (idx >= owners[j] && idx < owners[j + 1]) {
        sizes[2 * j] += 2;    /* num of indices to be sent - in pairs (ip,ia) */
        sizes[2 * j + 1] = 1; /* send to proc[j] */
        owner[i] = j;
        break;
      }
    }
  }
  sizes[2 * rank] = sizes[2 * rank + 1] = 0; /* do not receive from self! */
  nsends = 0;
  for (i = 0; i < size; i++) nsends += sizes[2 * i + 1];

  /* inform other processors of number of messages and max length */
  PetscMaxSum(comm, sizes, &nmax, &nreceives);

  /* allocate arrays */
  PetscObjectGetNewTag((PetscObject)ao, &tag);
  PetscMalloc2(nreceives * nmax, &rindices, nreceives, &recv_waits);
  PetscMalloc3(2 * n, &sindices, nsends, &send_waits, nsends, &send_status);
  PetscMalloc1(size, &start);

  /* post receives: */
  for (i = 0; i < nreceives; i++) MPI_Irecv(rindices + nmax * i, nmax, MPIU_INT, MPI_ANY_SOURCE, tag, comm, recv_waits + i);

  /* do sends:
     1) start[i] gives the starting index in sindices for stuff going to
        the ith processor
  */
  start[0] = 0;
  for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
  for (i = 0; i < n; i++) {
    j = owner[i];
    if (j != rank) {
      ip = from_array[i];
      ia = to_array[i];
      sindices[start[j]++] = ip;
      sindices[start[j]++] = ia;
    } else { /* compute my own map */
      ip = from_array[i] - owners[rank];
      ia = to_array[i];
      aomap_loc[ip] = ia;
    }
  }

  start[0] = 0;
  for (i = 1; i < size; i++) start[i] = start[i - 1] + sizes[2 * i - 2];
  for (i = 0, count = 0; i < size; i++) {
    if (sizes[2 * i + 1]) {
      MPI_Isend(sindices + start[i], sizes[2 * i], MPIU_INT, i, tag, comm, send_waits + count);
      count++;
    }
  }

  /* wait on sends */
  if (nsends) MPI_Waitall(nsends, send_waits, send_status);

  /* recvs */
  count = 0;
  for (j = nreceives; j > 0; j--) {
    MPI_Waitany(nreceives, recv_waits, &widx, &recv_status);
    MPI_Get_count(&recv_status, MPIU_INT, &nindices);
    rbuf = rindices + nmax * widx; /* global indices */

    /* compute the local mapping */
    for (i = 0; i < nindices; i += 2) { /* pack aomap_loc */
      ip = rbuf[i] - owners[rank];      /* local index */
      ia = rbuf[i + 1];
      aomap_loc[ip] = ia;
    }
    count++;
  }

  PetscFree(start);
  PetscFree3(sindices, send_waits, send_status);
  PetscFree2(rindices, recv_waits);
  PetscFree(owner);
  PetscFree(sizes);
  return 0;
}
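
/*
   AOCreate_MemoryScalable - builds the AOMEMORYSCALABLE data structure from ao->isapp and
   ao->ispetsc: it gathers the local lengths of isapp from all processes, creates a PetscLayout
   with that global size, and then fills the distributed app_loc and petsc_loc tables by calling
   AOCreateMemoryScalable_private() in both directions.
*/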
PETSC_EXTERN PetscErrorCode AOCreate_MemoryScalable(AO ao)
{
  IS                 isapp = ao->isapp, ispetsc = ao->ispetsc;
  const PetscInt    *mypetsc, *myapp;
  PetscInt           napp, n_local, N, i, start, *petsc, *lens, *disp;
  MPI_Comm           comm;
  AO_MemoryScalable *aomems;
  PetscLayout        map;
  PetscMPIInt        size, rank;

  /* create the special struct aomems */
  PetscNew(&aomems);
  ao->data = (void *)aomems;
  PetscMemcpy(ao->ops, &AOOps_MemoryScalable, sizeof(struct _AOOps));
  PetscObjectChangeTypeName((PetscObject)ao, AOMEMORYSCALABLE);

  /* transmit all local lengths of isapp to all processors */
  PetscObjectGetComm((PetscObject)isapp, &comm);
  MPI_Comm_size(comm, &size);
  MPI_Comm_rank(comm, &rank);
  PetscMalloc2(size, &lens, size, &disp);
  ISGetLocalSize(isapp, &napp);
  MPI_Allgather(&napp, 1, MPIU_INT, lens, 1, MPIU_INT, comm);

  N = 0;
  for (i = 0; i < size; i++) {
    disp[i] = N;
    N += lens[i];
  }

  /* If ispetsc is NULL then use the "natural" numbering */
  if (napp) {
    if (!ispetsc) {
      start = disp[rank];
      PetscMalloc1(napp + 1, &petsc);
      for (i = 0; i < napp; i++) petsc[i] = start + i;
    } else {
      ISGetIndices(ispetsc, &mypetsc);
      petsc = (PetscInt *)mypetsc;
    }
  } else {
    petsc = NULL;
  }

  /* create a map with global size N - used to determine the local sizes of ao - shall we use local napp instead of N? */
  PetscLayoutCreate(comm, &map);
  map->bs = 1;
  map->N  = N;
  PetscLayoutSetUp(map);

  ao->N       = N;
  ao->n       = map->n;
  aomems->map = map;

  /* create the distributed indices app_loc: petsc->app and petsc_loc: app->petsc */
  n_local = map->n;
  PetscCalloc2(n_local, &aomems->app_loc, n_local, &aomems->petsc_loc);
  ISGetIndices(isapp, &myapp);

  AOCreateMemoryScalable_private(comm, napp, petsc, myapp, ao, aomems->app_loc);
  AOCreateMemoryScalable_private(comm, napp, myapp, petsc, ao, aomems->petsc_loc);

  ISRestoreIndices(isapp, &myapp);
  if (napp) {
    if (ispetsc) {
      ISRestoreIndices(ispetsc, &mypetsc);
    } else {
      PetscFree(petsc);
    }
  }
  PetscFree2(lens, disp);
  return 0;
}

/*@C
  AOCreateMemoryScalable - Creates a memory scalable application ordering using two integer arrays.

  Collective

  Input Parameters:
+ comm    - MPI communicator that is to share the `AO`
. napp    - size of the integer arrays
. myapp   - integer array that defines an ordering
- mypetsc - integer array that defines another ordering (may be NULL to
            indicate the natural ordering, that is 0,1,2,3,...)

  Output Parameter:
. aoout - the new application ordering

  Level: beginner

  Note:
  The arrays myapp and mypetsc must contain all the integers 0 to napp-1 with no duplicates; that is, there cannot be any "holes"
  in the indices. Use `AOCreateMapping()` or `AOCreateMappingIS()` if you wish to have "holes" in the indices.
  Compared with `AOCreateBasic()`, this routine trades extra message communication for a smaller memory footprint.
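
  Example Usage:
  A minimal sketch of typical use (the index values here are arbitrary, chosen only for illustration):
.vb
  AO       ao;
  PetscInt app[] = {3, 2, 1, 0}; /* this process's portion of the application ordering */
  PetscInt idx[] = {0, 3};       /* application indices to be translated */

  AOCreateMemoryScalable(PETSC_COMM_WORLD, 4, app, NULL, &ao); /* NULL -> natural PETSc ordering */
  AOApplicationToPetsc(ao, 2, idx);                            /* idx[] now holds the PETSc indices */
  AODestroy(&ao);
.ve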

.seealso: [](sec_ao), [](sec_scatter), `AO`, `AOCreateMemoryScalableIS()`, `AODestroy()`, `AOPetscToApplication()`, `AOApplicationToPetsc()`
@*/
PetscErrorCode AOCreateMemoryScalable(MPI_Comm comm, PetscInt napp, const PetscInt myapp[], const PetscInt mypetsc[], AO *aoout)
{
  IS              isapp, ispetsc;
  const PetscInt *app = myapp, *petsc = mypetsc;

  ISCreateGeneral(comm, napp, app, PETSC_USE_POINTER, &isapp);
  if (mypetsc) {
    ISCreateGeneral(comm, napp, petsc, PETSC_USE_POINTER, &ispetsc);
  } else {
    ispetsc = NULL;
  }
  AOCreateMemoryScalableIS(isapp, ispetsc, aoout);
  ISDestroy(&isapp);
  if (mypetsc) ISDestroy(&ispetsc);
  return 0;
}

/*@C
  AOCreateMemoryScalableIS - Creates a memory scalable application ordering using two index sets.

  Collective

  Input Parameters:
+ isapp   - index set that defines an ordering
- ispetsc - index set that defines another ordering (may be NULL to use the
            natural ordering)

  Output Parameter:
. aoout - the new application ordering

  Level: beginner

  Notes:
  The index sets isapp and ispetsc must contain all the integers 0 to napp-1 (where napp is the length of the index sets) with no duplicates;
  that is, there cannot be any "holes".

  Compared with `AOCreateBasicIS()`, this routine trades extra message communication for a smaller memory footprint.
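
  Example Usage:
  A minimal sketch of typical use (the indices are arbitrary and the natural PETSc ordering is used):
.vb
  IS       isapp;
  AO       ao;
  PetscInt app[] = {3, 2, 1, 0};

  ISCreateGeneral(PETSC_COMM_WORLD, 4, app, PETSC_COPY_VALUES, &isapp);
  AOCreateMemoryScalableIS(isapp, NULL, &ao); /* NULL ispetsc -> natural ordering */
  ISDestroy(&isapp);
  AODestroy(&ao);
.ve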

.seealso: [](sec_ao), [](sec_scatter), `AO`, `AOCreateBasicIS()`, `AOCreateMemoryScalable()`, `AODestroy()`
@*/
PetscErrorCode AOCreateMemoryScalableIS(IS isapp, IS ispetsc, AO *aoout)
{
  MPI_Comm comm;
  AO       ao;

  PetscObjectGetComm((PetscObject)isapp, &comm);
  AOCreate(comm, &ao);
  AOSetIS(ao, isapp, ispetsc);
  AOSetType(ao, AOMEMORYSCALABLE);
  AOViewFromOptions(ao, NULL, "-ao_view");
  *aoout = ao;
  return 0;
}