Actual source code: sbaijov.c
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix.
4: Used for finding submatrices that are shared across processors.
5: */
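/*
   For orientation (illustrative sketch, not part of the original source): these routines
   back MatIncreaseOverlap() for MATMPISBAIJ matrices, e.g.

     IS is[1];
     ... build is[0] containing some rows of the MATMPISBAIJ matrix C ...
     MatIncreaseOverlap(C, 1, is, 2);   grow each index set by 2 levels of overlap

   after which is[0] also contains the rows coupled (through nonzeros) to the original set.
*/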
6: #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat, PetscInt, IS *);
10: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat, PetscInt *, PetscInt, PetscInt *, PetscBT *);
12: PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C, PetscInt is_max, IS is[], PetscInt ov)
13: {
14: PetscInt i, N = C->cmap->N, bs = C->rmap->bs, M = C->rmap->N, Mbs = M / bs, *nidx, isz, iov;
15: IS *is_new, *is_row;
16: Mat *submats;
17: Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
18: Mat_SeqSBAIJ *asub_i;
19: PetscBT table;
20: PetscInt *ai, brow, nz, nis, l, nmax, nstages_local, nstages, max_no, pos;
21: const PetscInt *idx;
22: PetscBool flg;
24: PetscMalloc1(is_max, &is_new);
25: /* Convert the indices into block format */
26: ISCompressIndicesGeneral(N, C->rmap->n, bs, is_max, is, is_new);
29: /* ----- previous non-scalable implementation ----- */
30: flg = PETSC_FALSE;
31: PetscOptionsHasName(NULL, NULL, "-IncreaseOverlap_old", &flg);
32: if (flg) { /* previous non-scalable implementation */
33: printf("use previous non-scalable implementation...\n");
34: for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPISBAIJ_Once(C, is_max, is_new);
35: } else { /* implementation using modified BAIJ routines */
37: PetscMalloc1(Mbs + 1, &nidx);
38: PetscBTCreate(Mbs, &table); /* for column search */
40: /* Create is_row */
41: PetscMalloc1(is_max, &is_row);
42: ISCreateStride(PETSC_COMM_SELF, Mbs, 0, 1, &is_row[0]);
44: for (i = 1; i < is_max; i++) { is_row[i] = is_row[0]; /* reuse is_row[0] */ }
46: /* Allocate memory to hold all the submatrices - Modified from MatCreateSubMatrices_MPIBAIJ() */
47: PetscMalloc1(is_max + 1, &submats);
49: /* Determine the number of stages over which the submatrices are created */
50: nmax = 20 * 1000000 / (c->Nbs * sizeof(PetscInt));
51: if (!nmax) nmax = 1;
52: nstages_local = is_max / nmax + ((is_max % nmax) ? 1 : 0);
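/* illustration (not from the source): if nmax works out to 3 and is_max = 7,
   then nstages_local = 7/3 + 1 = 3, i.e. stages of 3, 3 and 1 index sets */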
54: /* Make sure every processor loops through the same number of stages */
55: MPIU_Allreduce(&nstages_local, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
57: for (iov = 0; iov < ov; ++iov) {
58: /* 1) Get submats for column search */
59: for (i = 0, pos = 0; i < nstages; i++) {
60: if (pos + nmax <= is_max) max_no = nmax;
61: else if (pos == is_max) max_no = 0;
62: else max_no = is_max - pos;
63: c->ijonly = PETSC_TRUE; /* only matrix data structures are requested */
64: /* The resulting submatrices should be BAIJ, not SBAIJ, hence we change this value to trigger that */
65: PetscStrcpy(((PetscObject)c->A)->type_name, MATSEQBAIJ);
66: MatCreateSubMatrices_MPIBAIJ_local(C, max_no, is_row + pos, is_new + pos, MAT_INITIAL_MATRIX, submats + pos);
67: PetscStrcpy(((PetscObject)c->A)->type_name, MATSEQSBAIJ);
68: pos += max_no;
69: }
71: /* 2) Row search */
72: MatIncreaseOverlap_MPIBAIJ_Once(C, is_max, is_new);
74: /* 3) Column search */
75: for (i = 0; i < is_max; i++) {
76: asub_i = (Mat_SeqSBAIJ *)submats[i]->data;
77: ai = asub_i->i;
79: /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ_Once() into the table */
80: PetscBTMemzero(Mbs, table);
82: ISGetIndices(is_new[i], &idx);
83: ISGetLocalSize(is_new[i], &nis);
84: for (l = 0; l < nis; l++) {
85: PetscBTSet(table, idx[l]);
86: nidx[l] = idx[l];
87: }
88: isz = nis;
90: /* add column entries to table */
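/* submats[i] was built (ijonly) with column set is_new[i], so any block row of submats[i]
   holding a nonzero couples to the current index set and is added to the overlap */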
91: for (brow = 0; brow < Mbs; brow++) {
92: nz = ai[brow + 1] - ai[brow];
93: if (nz) {
94: if (!PetscBTLookupSet(table, brow)) nidx[isz++] = brow;
95: }
96: }
97: ISRestoreIndices(is_new[i], &idx);
98: ISDestroy(&is_new[i]);
100: /* create updated is_new */
101: ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, PETSC_COPY_VALUES, is_new + i);
102: }
104: /* Free tmp spaces */
105: for (i = 0; i < is_max; i++) MatDestroy(&submats[i]);
106: }
108: PetscBTDestroy(&table);
109: PetscFree(submats);
110: ISDestroy(&is_row[0]);
111: PetscFree(is_row);
112: PetscFree(nidx);
113: }
115: for (i = 0; i < is_max; i++) ISDestroy(&is[i]);
116: ISExpandIndicesGeneral(N, N, bs, is_max, is_new, is);
118: for (i = 0; i < is_max; i++) ISDestroy(&is_new[i]);
119: PetscFree(is_new);
120: return 0;
121: }
123: typedef enum {
124: MINE,
125: OTHER
126: } WhoseOwner;
127: /* data1, odata1 and odata2 are packed in the format (for communication):
128: data[0] = is_max, no of is
129: data[1] = size of is[0]
130: ...
131: data[is_max] = size of is[is_max-1]
132: data[is_max + 1] = data(is[0])
133: ...
134: data[is_max + 1 + sum(size of is[k], k=0,...,i-1)] = data(is[i])
135: ...
136: data2 is packed in the format (for creating output is[]):
137: data[0] = is_max, no of is
138: data[1] = size of is[0]
139: ...
140: data[is_max] = size of is[is_max-1]
141: data[is_max + 1] = data(is[0])
142: ...
143: data[is_max + 1 + Mbs*i] = data(is[i])
144: ...
145: */
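/* illustration (not from the source): for is_max = 2 with is[0] = {3,7,9} and is[1] = {1,4},
   data1 for one processor is laid out as
       2, 3, 2,   3, 7, 9,   1, 4
   while data2 reserves a fixed slot of Mbs entries per index set:
       2, |is[0]|, |is[1]|,   <Mbs slots for is[0]>, <Mbs slots for is[1]>               */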
146: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C, PetscInt is_max, IS is[])
147: {
148: Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
149: PetscMPIInt size, rank, tag1, tag2, *len_s, nrqr, nrqs, *id_r1, *len_r1, flag, len, *iwork;
150: const PetscInt *idx_i;
151: PetscInt idx, isz, col, *n, *data1, **data1_start, *data2, *data2_i, *data, *data_i;
152: PetscInt Mbs, i, j, k, *odata1, *odata2;
153: PetscInt proc_id, **odata2_ptr, *ctable = NULL, *btable, len_max, len_est;
154: PetscInt proc_end = 0, len_unused, nodata2;
155: PetscInt ois_max; /* max no. of is[] over all processors */
156: char *t_p;
157: MPI_Comm comm;
158: MPI_Request *s_waits1, *s_waits2, r_req;
159: MPI_Status *s_status, r_status;
160: PetscBT *table; /* mark indices of this processor's is[] */
161: PetscBT table_i;
162: PetscBT otable; /* mark indices of other processors' is[] */
163: PetscInt bs = C->rmap->bs, Bn = c->B->cmap->n, Bnbs = Bn / bs, *Bowners;
164: IS garray_local, garray_gl;
166: PetscObjectGetComm((PetscObject)C, &comm);
167: size = c->size;
168: rank = c->rank;
169: Mbs = c->Mbs;
171: PetscObjectGetNewTag((PetscObject)C, &tag1);
172: PetscObjectGetNewTag((PetscObject)C, &tag2);
174: /* create tables used in
175: step 1: table[i] - mark c->garray of proc [i]
176: step 3: table[i] - mark indices of is[i] when whose=MINE
177: table[0] - mark indices of is[] when whose=OTHER */
178: len = PetscMax(is_max, size);
179: PetscMalloc2(len, &table, (Mbs / PETSC_BITS_PER_BYTE + 1) * len, &t_p);
180: for (i = 0; i < len; i++) table[i] = t_p + (Mbs / PETSC_BITS_PER_BYTE + 1) * i;
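/* note: t_p is one contiguous buffer and each table[i] is an Mbs-bit table carved out of it,
   so the whole set is released by the single PetscFree2(table, t_p) below */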
182: MPIU_Allreduce(&is_max, &ois_max, 1, MPIU_INT, MPI_MAX, comm);
184: /* 1. Send this processor's is[] to other processors */
185: /*---------------------------------------------------*/
186: /* allocate spaces */
187: PetscMalloc1(is_max, &n);
188: len = 0;
189: for (i = 0; i < is_max; i++) {
190: ISGetLocalSize(is[i], &n[i]);
191: len += n[i];
192: }
193: if (!len) {
194: is_max = 0;
195: } else {
196: len += 1 + is_max; /* max length of data1 for one processor */
197: }
199: PetscMalloc1(size * len + 1, &data1);
200: PetscMalloc1(size, &data1_start);
201: for (i = 0; i < size; i++) data1_start[i] = data1 + i * len;
203: PetscMalloc4(size, &len_s, size, &btable, size, &iwork, size + 1, &Bowners);
205: /* gather c->garray from all processors */
206: ISCreateGeneral(comm, Bnbs, c->garray, PETSC_COPY_VALUES, &garray_local);
207: ISAllGather(garray_local, &garray_gl);
208: ISDestroy(&garray_local);
209: MPI_Allgather(&Bnbs, 1, MPIU_INT, Bowners + 1, 1, MPIU_INT, comm);
211: Bowners[0] = 0;
212: for (i = 0; i < size; i++) Bowners[i + 1] += Bowners[i];
214: if (is_max) {
215: /* hash table ctable which maps a global block row to its owning proc_id */
216: PetscMalloc1(Mbs, &ctable);
217: for (proc_id = 0, j = 0; proc_id < size; proc_id++) {
218: for (; j < C->rmap->range[proc_id + 1] / bs; j++) ctable[j] = proc_id;
219: }
221: /* hash tables marking c->garray */
222: ISGetIndices(garray_gl, &idx_i);
223: for (i = 0; i < size; i++) {
224: table_i = table[i];
225: PetscBTMemzero(Mbs, table_i);
226: for (j = Bowners[i]; j < Bowners[i + 1]; j++) { /* go through B cols of proc[i]*/
227: PetscBTSet(table_i, idx_i[j]);
228: }
229: }
230: ISRestoreIndices(garray_gl, &idx_i);
231: } /* if (is_max) */
232: ISDestroy(&garray_gl);
234: /* evaluate communication - message destinations, lengths, and buffer space */
235: for (i = 0; i < size; i++) len_s[i] = 0;
237: /* header of data1 */
238: for (proc_id = 0; proc_id < size; proc_id++) {
239: iwork[proc_id] = 0;
240: *data1_start[proc_id] = is_max;
241: data1_start[proc_id]++;
242: for (j = 0; j < is_max; j++) {
243: if (proc_id == rank) {
244: *data1_start[proc_id] = n[j];
245: } else {
246: *data1_start[proc_id] = 0;
247: }
248: data1_start[proc_id]++;
249: }
250: }
252: for (i = 0; i < is_max; i++) {
253: ISGetIndices(is[i], &idx_i);
254: for (j = 0; j < n[i]; j++) {
255: idx = idx_i[j];
256: *data1_start[rank] = idx;
257: data1_start[rank]++; /* for local processing */
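/* idx is sent to its owner proc_end and to any lower-ranked proc whose off-diagonal part
   references idx (marked in table[proc_id] from its garray); since only the upper triangle
   is stored, higher-ranked procs cannot reference idx and are skipped */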
258: proc_end = ctable[idx];
259: for (proc_id = 0; proc_id <= proc_end; proc_id++) { /* for others to process */
260: if (proc_id == rank) continue; /* done before this loop */
261: if (proc_id < proc_end && !PetscBTLookup(table[proc_id], idx)) continue; /* no need for sending idx to [proc_id] */
262: *data1_start[proc_id] = idx;
263: data1_start[proc_id]++;
264: len_s[proc_id]++;
265: }
266: }
267: /* update header data */
268: for (proc_id = 0; proc_id < size; proc_id++) {
269: if (proc_id == rank) continue;
270: *(data1 + proc_id * len + 1 + i) = len_s[proc_id] - iwork[proc_id];
271: iwork[proc_id] = len_s[proc_id];
272: }
273: ISRestoreIndices(is[i], &idx_i);
274: }
276: nrqs = 0;
277: nrqr = 0;
278: for (i = 0; i < size; i++) {
279: data1_start[i] = data1 + i * len;
280: if (len_s[i]) {
281: nrqs++;
282: len_s[i] += 1 + is_max; /* add the header length (1 + is_max) */
283: }
284: }
286: for (i = 0; i < is_max; i++) ISDestroy(&is[i]);
287: PetscFree(n);
288: PetscFree(ctable);
290: /* Determine the number of messages to expect, their lengths, and the from-ids */
291: PetscGatherNumberOfMessages(comm, NULL, len_s, &nrqr);
292: PetscGatherMessageLengths(comm, nrqs, nrqr, len_s, &id_r1, &len_r1);
294: /* Now post the sends */
295: PetscMalloc2(size, &s_waits1, size, &s_waits2);
296: k = 0;
297: for (proc_id = 0; proc_id < size; proc_id++) { /* send data1 to processor [proc_id] */
298: if (len_s[proc_id]) {
299: MPI_Isend(data1_start[proc_id], len_s[proc_id], MPIU_INT, proc_id, tag1, comm, s_waits1 + k);
300: k++;
301: }
302: }
304: /* 2. Receive other processors' is[], process them, then send the results back */
305: /*-----------------------------------------------------*/
306: len = 0;
307: for (i = 0; i < nrqr; i++) {
308: if (len_r1[i] > len) len = len_r1[i];
309: }
310: PetscFree(len_r1);
311: PetscFree(id_r1);
313: for (proc_id = 0; proc_id < size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;
315: PetscMalloc1(len + 1, &odata1);
316: PetscMalloc1(size, &odata2_ptr);
317: PetscBTCreate(Mbs, &otable);
319: len_max = ois_max * (Mbs + 1); /* max space storing all is[] for each receive */
320: len_est = 2 * len_max; /* estimated space of storing is[] for all receiving messages */
321: PetscMalloc1(len_est + 1, &odata2);
322: nodata2 = 0; /* nodata2+1 = number of odata2 buffers allocated and tracked in odata2_ptr[] */
324: odata2_ptr[nodata2] = odata2;
326: len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */
328: k = 0;
329: while (k < nrqr) {
330: /* Receive messages */
331: MPI_Iprobe(MPI_ANY_SOURCE, tag1, comm, &flag, &r_status);
332: if (flag) {
333: MPI_Get_count(&r_status, MPIU_INT, &len);
334: proc_id = r_status.MPI_SOURCE;
335: MPI_Irecv(odata1, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req);
336: MPI_Wait(&r_req, &r_status);
338: /* Process messages */
339: /* make sure there is enough unused space in odata2 array */
340: if (len_unused < len_max) { /* allocate more space for odata2 */
341: PetscMalloc1(len_est + 1, &odata2);
343: odata2_ptr[++nodata2] = odata2;
345: len_unused = len_est;
346: }
348: MatIncreaseOverlap_MPISBAIJ_Local(C, odata1, OTHER, odata2, &otable);
349: len = 1 + odata2[0];
350: for (i = 0; i < odata2[0]; i++) len += odata2[1 + i];
352: /* Send messages back */
353: MPI_Isend(odata2, len, MPIU_INT, proc_id, tag2, comm, s_waits2 + k);
354: k++;
355: odata2 += len;
356: len_unused -= len;
357: len_s[proc_id] = len; /* length of the message sent back to [proc_id] by this proc */
358: }
359: }
360: PetscFree(odata1);
361: PetscBTDestroy(&otable);
363: /* 3. Do local work on this processor's is[] */
364: /*-------------------------------------------*/
365: /* make sure there is enough unused space in odata2(=data) array */
366: len_max = is_max * (Mbs + 1); /* max space storing all is[] for this processor */
367: if (len_unused < len_max) { /* allocate more space for odata2 */
368: PetscMalloc1(len_est + 1, &odata2);
370: odata2_ptr[++nodata2] = odata2;
371: }
373: data = odata2;
374: MatIncreaseOverlap_MPISBAIJ_Local(C, data1_start[rank], MINE, data, table);
375: PetscFree(data1_start);
377: /* 4. Receive work done on other processors, then merge */
378: /*------------------------------------------------------*/
379: /* get the max length of the messages that this processor expects to recv */
380: MPIU_Allreduce(len_s, iwork, size, MPI_INT, MPI_MAX, comm);
381: PetscMalloc1(iwork[rank] + 1, &data2);
382: PetscFree4(len_s, btable, iwork, Bowners);
384: k = 0;
385: while (k < nrqs) {
386: /* Receive messages */
387: MPI_Iprobe(MPI_ANY_SOURCE, tag2, comm, &flag, &r_status);
388: if (flag) {
389: MPI_Get_count(&r_status, MPIU_INT, &len);
391: proc_id = r_status.MPI_SOURCE;
393: MPI_Irecv(data2, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req);
394: MPI_Wait(&r_req, &r_status);
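/* a reply longer than its (1 + is_max)-entry header means the sender found new indices;
   merge them into data, using table[i] to skip indices that are already present */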
395: if (len > 1 + is_max) { /* Add data2 into data */
396: data2_i = data2 + 1 + is_max;
397: for (i = 0; i < is_max; i++) {
398: table_i = table[i];
399: data_i = data + 1 + is_max + Mbs * i;
400: isz = data[1 + i];
401: for (j = 0; j < data2[1 + i]; j++) {
402: col = data2_i[j];
403: if (!PetscBTLookupSet(table_i, col)) data_i[isz++] = col;
404: }
405: data[1 + i] = isz;
406: if (i < is_max - 1) data2_i += data2[1 + i];
407: }
408: }
409: k++;
410: }
411: }
412: PetscFree(data2);
413: PetscFree2(table, t_p);
415: /* wait for the phase 1 sends to complete */
416: PetscMalloc1(size, &s_status);
417: if (nrqs) MPI_Waitall(nrqs, s_waits1, s_status);
418: PetscFree(data1);
420: /* wait for the phase 2 sends to complete */
421: if (nrqr) MPI_Waitall(nrqr, s_waits2, s_status);
422: PetscFree2(s_waits1, s_waits2);
423: PetscFree(s_status);
425: /* 5. Create new is[] */
426: /*--------------------*/
427: for (i = 0; i < is_max; i++) {
428: data_i = data + 1 + is_max + Mbs * i;
429: ISCreateGeneral(PETSC_COMM_SELF, data[1 + i], data_i, PETSC_COPY_VALUES, is + i);
430: }
431: for (k = 0; k <= nodata2; k++) PetscFree(odata2_ptr[k]);
432: PetscFree(odata2_ptr);
433: return 0;
434: }
436: /*
437: MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
438: the work on the local processor.
440: Inputs:
441: C - MATMPISBAIJ matrix;
442: data - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
443: whose - whose is[] is to be processed:
444: MINE: this processor's is[]
445: OTHER: other processor's is[]
446: Output:
447: nidx - whose = MINE:
448: holds input and newly found indices in the same format as data
449: whose = OTHER:
450: only holds the newly found indices
451: table - table[i]: marks the indices of is[i], i=0,...,is_max-1. Used only in the case 'whose=MINE'.
452: */
453: /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
454: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C, PetscInt *data, PetscInt whose, PetscInt *nidx, PetscBT *table)
455: {
456: Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
457: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)(c->A)->data;
458: Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)(c->B)->data;
459: PetscInt row, mbs, Mbs, *nidx_i, col, col_max, isz, isz0, *ai, *aj, *bi, *bj, *garray, rstart, l;
460: PetscInt a_start, a_end, b_start, b_end, i, j, k, is_max, *idx_i, n;
461: PetscBT table0; /* mark the indices of input is[] for look up */
462: PetscBT table_i; /* points to i-th table. When whose=OTHER, a single table is used for all is[] */
464: Mbs = c->Mbs;
465: mbs = a->mbs;
466: ai = a->i;
467: aj = a->j;
468: bi = b->i;
469: bj = b->j;
470: garray = c->garray;
471: rstart = c->rstartbs;
472: is_max = data[0];
474: PetscBTCreate(Mbs, &table0);
476: nidx[0] = is_max;
477: idx_i = data + is_max + 1; /* ptr to input is[0] array */
478: nidx_i = nidx + is_max + 1; /* ptr to output is[0] array */
479: for (i = 0; i < is_max; i++) { /* for each is */
480: isz = 0;
481: n = data[1 + i]; /* size of input is[i] */
483: /* initialize and set table_i (marks idx and nidx) and table0 (marks only idx) */
484: if (whose == MINE) { /* process this processor's is[] */
485: table_i = table[i];
486: nidx_i = nidx + 1 + is_max + Mbs * i;
487: } else { /* process other processor's is[] - only use one temp table */
488: table_i = table[0];
489: }
490: PetscBTMemzero(Mbs, table_i);
491: PetscBTMemzero(Mbs, table0);
492: if (n == 0) {
493: nidx[1 + i] = 0; /* size of new is[i] */
494: continue;
495: }
497: isz0 = 0;
498: col_max = 0;
499: for (j = 0; j < n; j++) {
500: col = idx_i[j];
502: if (!PetscBTLookupSet(table_i, col)) {
503: PetscBTSet(table0, col);
504: if (whose == MINE) nidx_i[isz0] = col;
505: if (col_max < col) col_max = col;
506: isz0++;
507: }
508: }
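/* the early breaks in the column searches below rely on block column indices being stored in
   ascending order within each row: once col > col_max, no remaining column can lie in input is[i] */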
510: if (whose == MINE) isz = isz0;
511: k = 0; /* no. of indices from input is[i] that have been examined */
512: for (row = 0; row < mbs; row++) {
513: a_start = ai[row];
514: a_end = ai[row + 1];
515: b_start = bi[row];
516: b_end = bi[row + 1];
517: if (PetscBTLookup(table0, row + rstart)) { /* row is on input is[i]:
518: do row search: collect all col in this row */
519: for (l = a_start; l < a_end; l++) { /* Amat */
520: col = aj[l] + rstart;
521: if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
522: }
523: for (l = b_start; l < b_end; l++) { /* Bmat */
524: col = garray[bj[l]];
525: if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
526: }
527: k++;
528: if (k >= isz0) break; /* for (row=0; row<mbs; row++) */
529: } else { /* row is not on input is[i]:
530: do col search: add row onto nidx_i if one of this row's cols is in input is[i] */
531: for (l = a_start; l < a_end; l++) { /* Amat */
532: col = aj[l] + rstart;
533: if (col > col_max) break;
534: if (PetscBTLookup(table0, col)) {
535: if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
536: break; /* out of the loop over l (Amat columns) */
537: }
538: }
539: for (l = b_start; l < b_end; l++) { /* Bmat */
540: col = garray[bj[l]];
541: if (col > col_max) break;
542: if (PetscBTLookup(table0, col)) {
543: if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
544: break; /* out of the loop over l (Bmat columns) */
545: }
546: }
547: }
548: }
550: if (i < is_max - 1) {
551: idx_i += n; /* ptr to input is[i+1] array */
552: nidx_i += isz; /* ptr to output is[i+1] array */
553: }
554: nidx[1 + i] = isz; /* size of new is[i] */
555: } /* for each is */
556: PetscBTDestroy(&table0);
557: return 0;
558: }