Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3: and to extract the submatrices that are shared across processes.
4: */
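/*
   A minimal usage sketch (not part of the original source; the calling sequence below is hypothetical):
   the static routines in this file are reached through the public MatIncreaseOverlap() entry point
   on an assembled MPIAIJ matrix.

     Mat C;        // parallel MPIAIJ matrix, already assembled
     IS  rows;     // index set of locally selected rows
     ISCreateStride(PETSC_COMM_SELF, 10, 0, 1, &rows);  // hypothetical selection: rows 0..9
     MatIncreaseOverlap(C, 1, &rows, 2);                // grow the index set by two levels of overlap
     ISDestroy(&rows);
*/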
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
21: /*
22: Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
24: The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the block size bs, which would offer higher performance, but
25: that is a very major recoding job.
27: Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
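   For example (hypothetical values): with bs = 2, an IS holding the point indices {2, 3, 7}
   touches the blocks {1, 3}, so it is replaced by the block IS {1, 3}, which expands back to
   the point indices {2, 3, 6, 7}.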
28: */
29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30: {
31: for (PetscInt i = 0; i < imax; i++) {
32: if (!is[i]) break;
33: PetscInt n = 0, N, Nmax, Nmin;
34: const PetscInt *idx;
35: PetscInt *nidx = NULL;
36: MPI_Comm comm;
37: PetscBT bt;
39: ISGetLocalSize(is[i], &N);
40: if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
41: ISGetIndices(is[i], &idx);
42: ISGetMinMax(is[i], &Nmin, &Nmax);
43: Nmin = Nmin / bs;
44: Nmax = Nmax / bs;
45: PetscBTCreate(Nmax - Nmin, &bt);
46: for (PetscInt j = 0; j < N; j++) {
47: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
48: }
49: PetscMalloc1(n, &nidx);
50: n = 0;
51: PetscBTMemzero(Nmax - Nmin, bt);
52: for (PetscInt j = 0; j < N; j++) {
53: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
54: }
55: PetscBTDestroy(&bt);
56: ISRestoreIndices(is[i], &idx);
57: }
58: PetscObjectGetComm((PetscObject)is[i], &comm);
59: ISDestroy(is + i);
60: ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i);
61: }
62: return 0;
63: }
65: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
66: {
67: PetscInt i;
70: for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once(C, imax, is);
71: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) ISAdjustForBlockSize(C->rmap->bs, imax, is);
72: return 0;
73: }
75: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
76: {
77: PetscInt i;
80: for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is);
81: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) ISAdjustForBlockSize(C->rmap->bs, imax, is);
82: return 0;
83: }
85: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
86: {
87: MPI_Comm comm;
88: PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
89: PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
90: PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
91: const PetscInt *indices_i, **indices;
92: PetscLayout rmap;
93: PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner;
94: PetscSF sf;
95: PetscSFNode *remote;
97: PetscObjectGetComm((PetscObject)mat, &comm);
98: MPI_Comm_rank(comm, &rank);
99: MPI_Comm_size(comm, &size);
100: /* get row map to determine where rows should be going */
101: MatGetLayouts(mat, &rmap, NULL);
102: /* retrieve the IS data and put it all together so that we
103: * can optimize communication
104: * */
105: PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length);
106: for (i = 0, tlength = 0; i < nidx; i++) {
107: ISGetLocalSize(is[i], &length[i]);
108: tlength += length[i];
109: ISGetIndices(is[i], &indices[i]);
110: }
111: /* find these rows on remote processors */
112: PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids);
113: PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp);
114: nrrows = 0;
115: for (i = 0; i < nidx; i++) {
116: length_i = length[i];
117: indices_i = indices[i];
118: for (j = 0; j < length_i; j++) {
119: owner = -1;
120: PetscLayoutFindOwner(rmap, indices_i[j], &owner);
121: /* remote processors */
122: if (owner != rank) {
123: tosizes_temp[owner]++; /* number of rows to owner */
124: rrow_ranks[nrrows] = owner; /* processor */
125: rrow_isids[nrrows] = i; /* is id */
126: remoterows[nrrows++] = indices_i[j]; /* row */
127: }
128: }
129: ISRestoreIndices(is[i], &indices[i]);
130: }
131: PetscFree2(*(PetscInt ***)&indices, length);
132: /* test if we need to exchange messages
133: * generally speaking, we do not need to exchange
134: * data when overlap is 1
135: * */
136: MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm);
137: /* we do not have any messages
138: * It usually corresponds to overlap 1
139: * */
140: if (!reducednrrows) {
141: PetscFree3(toranks, tosizes, tosizes_temp);
142: PetscFree3(remoterows, rrow_ranks, rrow_isids);
143: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
144: return 0;
145: }
146: nto = 0;
147: /* send sizes and ranks for building a two-sided communication */
148: for (i = 0; i < size; i++) {
149: if (tosizes_temp[i]) {
150: tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
151: tosizes_temp[i] = nto; /* a map from processor to index */
152: toranks[nto++] = i; /* processor */
153: }
154: }
155: PetscMalloc1(nto + 1, &toffsets);
156: toffsets[0] = 0;
157: for (i = 0; i < nto; i++) {
158: toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
159: tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */
160: }
161: /* send information to other processors */
162: PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes);
163: nrecvrows = 0;
164: for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
165: PetscMalloc1(nrecvrows, &remote);
166: nrecvrows = 0;
167: for (i = 0; i < nfrom; i++) {
168: for (j = 0; j < fromsizes[2 * i]; j++) {
169: remote[nrecvrows].rank = fromranks[i];
170: remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
171: }
172: }
173: PetscSFCreate(comm, &sf);
174: PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
175: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
176: PetscSFSetType(sf, PETSCSFBASIC);
177: PetscSFSetFromOptions(sf);
178: /* message pair <no of is, row> */
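  /* e.g. (hypothetical values) rows 7 and 12 of is[0], both owned by the same remote rank, are packed as {0,7, 0,12} */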
179: PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata);
180: for (i = 0; i < nrrows; i++) {
181: owner = rrow_ranks[i]; /* processor */
182: j = tosizes_temp[owner]; /* index */
183: todata[toffsets[j]++] = rrow_isids[i];
184: todata[toffsets[j]++] = remoterows[i];
185: }
186: PetscFree3(toranks, tosizes, tosizes_temp);
187: PetscFree3(remoterows, rrow_ranks, rrow_isids);
188: PetscFree(toffsets);
189: PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
190: PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
191: PetscSFDestroy(&sf);
192: /* send the rows belonging to remote processes so that we can get the overlap data back */
193: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata);
194: PetscFree2(todata, fromdata);
195: PetscFree(fromsizes);
196: PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes);
197: PetscFree(fromranks);
198: nrecvrows = 0;
199: for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
200: PetscCalloc1(nrecvrows, &todata);
201: PetscMalloc1(nrecvrows, &remote);
202: nrecvrows = 0;
203: for (i = 0; i < nto; i++) {
204: for (j = 0; j < tosizes[2 * i]; j++) {
205: remote[nrecvrows].rank = toranks[i];
206: remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
207: }
208: }
209: PetscSFCreate(comm, &sf);
210: PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
211: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
212: PetscSFSetType(sf, PETSCSFBASIC);
213: PetscSFSetFromOptions(sf);
214: /* overlap communication and computation */
215: PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
216: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
217: PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
218: PetscSFDestroy(&sf);
219: PetscFree2(sbdata, sbsizes);
220: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata);
221: PetscFree(toranks);
222: PetscFree(tosizes);
223: PetscFree(todata);
224: return 0;
225: }
227: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
228: {
229: PetscInt *isz, isz_i, i, j, is_id, data_size;
230: PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
231: const PetscInt *indices_i_temp;
232: MPI_Comm *iscomms;
234: max_lsize = 0;
235: PetscMalloc1(nidx, &isz);
236: for (i = 0; i < nidx; i++) {
237: ISGetLocalSize(is[i], &lsize);
238: max_lsize = lsize > max_lsize ? lsize : max_lsize;
239: isz[i] = lsize;
240: }
241: PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms);
242: for (i = 0; i < nidx; i++) {
243: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
244: ISGetIndices(is[i], &indices_i_temp);
245: PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]);
246: ISRestoreIndices(is[i], &indices_i_temp);
247: ISDestroy(&is[i]);
248: }
249: /* retrieve information to get row id and its overlap */
250: for (i = 0; i < nrecvs;) {
251: is_id = recvdata[i++];
252: data_size = recvdata[i++];
253: indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
254: isz_i = isz[is_id];
255: for (j = 0; j < data_size; j++) {
256: col = recvdata[i++];
257: indices_i[isz_i++] = col;
258: }
259: isz[is_id] = isz_i;
260: }
261: /* remove duplicate entries */
262: for (i = 0; i < nidx; i++) {
263: indices_i = indices_temp + (max_lsize + nrecvs) * i;
264: isz_i = isz[i];
265: PetscSortRemoveDupsInt(&isz_i, indices_i);
266: ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]);
267: PetscCommDestroy(&iscomms[i]);
268: }
269: PetscFree(isz);
270: PetscFree2(indices_temp, iscomms);
271: return 0;
272: }
274: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
275: {
276: PetscLayout rmap, cmap;
277: PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
278: PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
279: PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
280: const PetscInt *gcols, *ai, *aj, *bi, *bj;
281: Mat amat, bmat;
282: PetscMPIInt rank;
283: PetscBool done;
284: MPI_Comm comm;
286: PetscObjectGetComm((PetscObject)mat, &comm);
287: MPI_Comm_rank(comm, &rank);
288: MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
289: /* Even if the matrix is symmetric, we do not exploit the symmetry here */
290: MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
292: MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
294: /* total number of nonzero values is used to estimate the memory usage in the next step */
295: tnz = ai[an] + bi[bn];
296: MatGetLayouts(mat, &rmap, &cmap);
297: PetscLayoutGetRange(rmap, &rstart, NULL);
298: PetscLayoutGetRange(cmap, &cstart, NULL);
299: /* to find the longest message */
300: max_fszs = 0;
301: for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
302: /* is there a better way to estimate the number of nonzeros in the matrix??? */
303: PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp);
304: for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
305: rows_pos = 0;
306: totalrows = 0;
307: for (i = 0; i < nfrom; i++) {
308: PetscArrayzero(rows_pos_i, nidx);
309: /* group data together */
310: for (j = 0; j < fromsizes[2 * i]; j += 2) {
311: is_id = fromrows[rows_pos++]; /* no of is */
312: rows_i = rows_data[is_id];
313: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
314: }
315: /* estimate a space to avoid multiple allocations */
316: for (j = 0; j < nidx; j++) {
317: indvc_ij = 0;
318: rows_i = rows_data[j];
319: for (l = 0; l < rows_pos_i[j]; l++) {
320: row = rows_i[l] - rstart;
321: start = ai[row];
322: end = ai[row + 1];
323: for (k = start; k < end; k++) { /* Amat */
324: col = aj[k] + cstart;
325: indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
326: }
327: start = bi[row];
328: end = bi[row + 1];
329: for (k = start; k < end; k++) { /* Bmat */
330: col = gcols[bj[k]];
331: indices_tmp[indvc_ij++] = col;
332: }
333: }
334: PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
335: indv_counts[i * nidx + j] = indvc_ij;
336: totalrows += indvc_ij;
337: }
338: }
339: /* message triple <no of is, number of rows, rows> */
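  /* e.g. (hypothetical values) a reply for is[0] whose overlap is the 4 columns {3,5,8,9} is packed as {0,4, 3,5,8,9} */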
340: PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes);
341: totalrows = 0;
342: rows_pos = 0;
343: /* regroup the rows (same logic as above) and pack the messages */
344: for (i = 0; i < nfrom; i++) {
345: PetscArrayzero(rows_pos_i, nidx);
346: for (j = 0; j < fromsizes[2 * i]; j += 2) {
347: is_id = fromrows[rows_pos++];
348: rows_i = rows_data[is_id];
349: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
350: }
351: /* add data */
352: for (j = 0; j < nidx; j++) {
353: if (!indv_counts[i * nidx + j]) continue;
354: indvc_ij = 0;
355: sbdata[totalrows++] = j;
356: sbdata[totalrows++] = indv_counts[i * nidx + j];
357: sbsizes[2 * i] += 2;
358: rows_i = rows_data[j];
359: for (l = 0; l < rows_pos_i[j]; l++) {
360: row = rows_i[l] - rstart;
361: start = ai[row];
362: end = ai[row + 1];
363: for (k = start; k < end; k++) { /* Amat */
364: col = aj[k] + cstart;
365: indices_tmp[indvc_ij++] = col;
366: }
367: start = bi[row];
368: end = bi[row + 1];
369: for (k = start; k < end; k++) { /* Bmat */
370: col = gcols[bj[k]];
371: indices_tmp[indvc_ij++] = col;
372: }
373: }
374: PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
375: sbsizes[2 * i] += indvc_ij;
376: PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij);
377: totalrows += indvc_ij;
378: }
379: }
380: PetscMalloc1(nfrom + 1, &offsets);
381: offsets[0] = 0;
382: for (i = 0; i < nfrom; i++) {
383: offsets[i + 1] = offsets[i] + sbsizes[2 * i];
384: sbsizes[2 * i + 1] = offsets[i];
385: }
386: PetscFree(offsets);
387: if (sbrowsizes) *sbrowsizes = sbsizes;
388: if (sbrows) *sbrows = sbdata;
389: PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp);
390: MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
391: MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
392: return 0;
393: }
395: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
396: {
397: const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
398: PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
399: PetscInt lsize, lsize_tmp;
400: PetscMPIInt rank, owner;
401: Mat amat, bmat;
402: PetscBool done;
403: PetscLayout cmap, rmap;
404: MPI_Comm comm;
406: PetscObjectGetComm((PetscObject)mat, &comm);
407: MPI_Comm_rank(comm, &rank);
408: MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
409: MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
411: MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
413: /* is this a safe way to compute the number of nonzero values? */
414: tnz = ai[an] + bi[bn];
415: MatGetLayouts(mat, &rmap, &cmap);
416: PetscLayoutGetRange(rmap, &rstart, NULL);
417: PetscLayoutGetRange(cmap, &cstart, NULL);
418: /* this is a better way to estimate memory than the old implementation,
419: * which used the global size of the matrix
420: * */
421: PetscMalloc1(tnz, &indices_temp);
422: for (i = 0; i < nidx; i++) {
423: MPI_Comm iscomm;
425: ISGetLocalSize(is[i], &lsize);
426: ISGetIndices(is[i], &indices);
427: lsize_tmp = 0;
428: for (j = 0; j < lsize; j++) {
429: owner = -1;
430: row = indices[j];
431: PetscLayoutFindOwner(rmap, row, &owner);
432: if (owner != rank) continue;
433: /* local number */
434: row -= rstart;
435: start = ai[row];
436: end = ai[row + 1];
437: for (k = start; k < end; k++) { /* Amat */
438: col = aj[k] + cstart;
439: indices_temp[lsize_tmp++] = col;
440: }
441: start = bi[row];
442: end = bi[row + 1];
443: for (k = start; k < end; k++) { /* Bmat */
444: col = gcols[bj[k]];
445: indices_temp[lsize_tmp++] = col;
446: }
447: }
448: ISRestoreIndices(is[i], &indices);
449: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL);
450: ISDestroy(&is[i]);
451: PetscSortRemoveDupsInt(&lsize_tmp, indices_temp);
452: ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]);
453: PetscCommDestroy(&iscomm);
454: }
455: PetscFree(indices_temp);
456: MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
457: MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
458: return 0;
459: }
461: /*
462: Sample message format:
463: If a processor A wants processor B to process some elements corresponding
464: to index sets is[1],is[5]
465: mesg [0] = 2 (no of index sets in the mesg)
466: -----------
467: mesg [1] = 1 => is[1]
468: mesg [2] = sizeof(is[1]);
469: -----------
470: mesg [3] = 5 => is[5]
471: mesg [4] = sizeof(is[5]);
472: -----------
473: mesg [5] ... mesg [n]    data for is[1]
474: -----------
475: mesg [n+1] ... mesg [m]  data for is[5]
476: -----------
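    For example (hypothetical values): if A asks B to process the 3 rows {10,11,12} of is[1] and
    the 2 rows {40,41} of is[5], the message is {2, 1,3, 5,2, 10,11,12, 40,41}.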
480: Notes:
481: nrqs - no of requests sent (or to be sent out)
482: nrqr - no of requests received (which have to be or which have been processed)
483: */
484: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
485: {
486: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
487: PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
488: const PetscInt **idx, *idx_i;
489: PetscInt *n, **data, len;
490: #if defined(PETSC_USE_CTABLE)
491: PetscTable *table_data, table_data_i;
492: PetscInt *tdata, tcount, tcount_max;
493: #else
494: PetscInt *data_i, *d_p;
495: #endif
496: PetscMPIInt size, rank, tag1, tag2, proc = 0;
497: PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
498: PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
499: PetscBT *table;
500: MPI_Comm comm;
501: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
502: MPI_Status *recv_status;
503: MPI_Comm *iscomms;
504: char *t_p;
506: PetscObjectGetComm((PetscObject)C, &comm);
507: size = c->size;
508: rank = c->rank;
509: M = C->rmap->N;
511: PetscObjectGetNewTag((PetscObject)C, &tag1);
512: PetscObjectGetNewTag((PetscObject)C, &tag2);
514: PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n);
516: for (i = 0; i < imax; i++) {
517: ISGetIndices(is[i], &idx[i]);
518: ISGetLocalSize(is[i], &n[i]);
519: }
521: /* evaluate communication - mesg to whom, length of mesg, and buffer space
522: required. Based on this, buffers are allocated, and data is copied into them */
523: PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4);
524: for (i = 0; i < imax; i++) {
525: PetscArrayzero(w4, size); /* initialise work vector*/
526: idx_i = idx[i];
527: len = n[i];
528: for (j = 0; j < len; j++) {
529: row = idx_i[j];
531: PetscLayoutFindOwner(C->rmap, row, &proc);
532: w4[proc]++;
533: }
534: for (j = 0; j < size; j++) {
535: if (w4[j]) {
536: w1[j] += w4[j];
537: w3[j]++;
538: }
539: }
540: }
542: nrqs = 0; /* no of outgoing messages */
543: msz = 0; /* total mesg length (for all procs) */
544: w1[rank] = 0; /* no mesg sent to itself */
545: w3[rank] = 0;
546: for (i = 0; i < size; i++) {
547: if (w1[i]) {
548: w2[i] = 1;
549: nrqs++;
550: } /* there exists a message to proc i */
551: }
552: /* pa - is list of processors to communicate with */
553: PetscMalloc1(nrqs, &pa);
554: for (i = 0, j = 0; i < size; i++) {
555: if (w1[i]) {
556: pa[j] = i;
557: j++;
558: }
559: }
561: /* Each message would have a header = 1 + 2*(no of IS) + data */
562: for (i = 0; i < nrqs; i++) {
563: j = pa[i];
564: w1[j] += w2[j] + 2 * w3[j];
565: msz += w1[j];
566: }
568: /* Determine the number of messages to expect, their lengths, and their from-ids */
569: PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
570: PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);
572: /* Now post the Irecvs corresponding to these messages */
573: PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1);
575: /* Allocate Memory for outgoing messages */
576: PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr);
577: PetscArrayzero(outdat, size);
578: PetscArrayzero(ptr, size);
580: {
581: PetscInt *iptr = tmp, ict = 0;
582: for (i = 0; i < nrqs; i++) {
583: j = pa[i];
584: iptr += ict;
585: outdat[j] = iptr;
586: ict = w1[j];
587: }
588: }
590: /* Form the outgoing messages */
591: /* plug in the headers */
592: for (i = 0; i < nrqs; i++) {
593: j = pa[i];
594: outdat[j][0] = 0;
595: PetscArrayzero(outdat[j] + 1, 2 * w3[j]);
596: ptr[j] = outdat[j] + 2 * w3[j] + 1;
597: }
599: /* Memory for doing local proc's work */
600: {
601: PetscInt M_BPB_imax = 0;
602: #if defined(PETSC_USE_CTABLE)
603: PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
604: PetscMalloc1(imax, &table_data);
605: for (i = 0; i < imax; i++) PetscTableCreate(n[i], M, &table_data[i]);
606: PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p);
607: for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
608: #else
609: PetscInt Mimax = 0;
610: PetscIntMultError(M, imax, &Mimax);
611: PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
612: PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p);
613: for (i = 0; i < imax; i++) {
614: table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
615: data[i] = d_p + M * i;
616: }
617: #endif
618: }
620: /* Parse the IS and update local tables and the outgoing buf with the data */
621: {
622: PetscInt n_i, isz_i, *outdat_j, ctr_j;
623: PetscBT table_i;
625: for (i = 0; i < imax; i++) {
626: PetscArrayzero(ctr, size);
627: n_i = n[i];
628: table_i = table[i];
629: idx_i = idx[i];
630: #if defined(PETSC_USE_CTABLE)
631: table_data_i = table_data[i];
632: #else
633: data_i = data[i];
634: #endif
635: isz_i = isz[i];
636: for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
637: row = idx_i[j];
638: PetscLayoutFindOwner(C->rmap, row, &proc);
639: if (proc != rank) { /* copy to the outgoing buffer */
640: ctr[proc]++;
641: *ptr[proc] = row;
642: ptr[proc]++;
643: } else if (!PetscBTLookupSet(table_i, row)) {
644: #if defined(PETSC_USE_CTABLE)
645: PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
646: #else
647: data_i[isz_i] = row; /* Update the local table */
648: #endif
649: isz_i++;
650: }
651: }
652: /* Update the headers for the current IS */
653: for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
654: if ((ctr_j = ctr[j])) {
655: outdat_j = outdat[j];
656: k = ++outdat_j[0];
657: outdat_j[2 * k] = ctr_j;
658: outdat_j[2 * k - 1] = i;
659: }
660: }
661: isz[i] = isz_i;
662: }
663: }
665: /* Now post the sends */
666: PetscMalloc1(nrqs, &s_waits1);
667: for (i = 0; i < nrqs; ++i) {
668: j = pa[i];
669: MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i);
670: }
672: /* No longer need the original indices */
673: PetscMalloc1(imax, &iscomms);
674: for (i = 0; i < imax; ++i) {
675: ISRestoreIndices(is[i], idx + i);
676: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
677: }
678: PetscFree2(*(PetscInt ***)&idx, n);
680: for (i = 0; i < imax; ++i) ISDestroy(&is[i]);
682: /* Do Local work */
683: #if defined(PETSC_USE_CTABLE)
684: MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data);
685: #else
686: MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL);
687: #endif
689: /* Receive messages */
690: PetscMalloc1(nrqr, &recv_status);
691: MPI_Waitall(nrqr, r_waits1, recv_status);
692: MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);
694: /* Phase 1 sends are complete - deallocate buffers */
695: PetscFree4(outdat, ptr, tmp, ctr);
696: PetscFree4(w1, w2, w3, w4);
698: PetscMalloc1(nrqr, &xdata);
699: PetscMalloc1(nrqr, &isz1);
700: MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1);
701: PetscFree(rbuf[0]);
702: PetscFree(rbuf);
704: /* Send the data back */
705: /* Do a global reduction to determine the buffer space required for incoming messages */
706: {
707: PetscMPIInt *rw1;
709: PetscCalloc1(size, &rw1);
711: for (i = 0; i < nrqr; ++i) {
712: proc = recv_status[i].MPI_SOURCE;
715: rw1[proc] = isz1[i];
716: }
717: PetscFree(onodes1);
718: PetscFree(olengths1);
720: /* Determine the number of messages to expect, their lengths, and their from-ids */
721: PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2);
722: PetscFree(rw1);
723: }
724: /* Now post the Irecvs corresponding to these messages */
725: PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2);
727: /* Now post the sends */
728: PetscMalloc1(nrqr, &s_waits2);
729: for (i = 0; i < nrqr; ++i) {
730: j = recv_status[i].MPI_SOURCE;
731: MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i);
732: }
734: /* receive work done on other processors */
735: {
736: PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
737: PetscMPIInt idex;
738: PetscBT table_i;
740: for (i = 0; i < nrqs; ++i) {
741: MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE);
742: /* Process the message */
743: rbuf2_i = rbuf2[idex];
744: ct1 = 2 * rbuf2_i[0] + 1;
745: jmax = rbuf2[idex][0];
746: for (j = 1; j <= jmax; j++) {
747: max = rbuf2_i[2 * j];
748: is_no = rbuf2_i[2 * j - 1];
749: isz_i = isz[is_no];
750: table_i = table[is_no];
751: #if defined(PETSC_USE_CTABLE)
752: table_data_i = table_data[is_no];
753: #else
754: data_i = data[is_no];
755: #endif
756: for (k = 0; k < max; k++, ct1++) {
757: row = rbuf2_i[ct1];
758: if (!PetscBTLookupSet(table_i, row)) {
759: #if defined(PETSC_USE_CTABLE)
760: PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
761: #else
762: data_i[isz_i] = row;
763: #endif
764: isz_i++;
765: }
766: }
767: isz[is_no] = isz_i;
768: }
769: }
771: MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
772: }
774: #if defined(PETSC_USE_CTABLE)
775: tcount_max = 0;
776: for (i = 0; i < imax; ++i) {
777: table_data_i = table_data[i];
778: PetscTableGetCount(table_data_i, &tcount);
779: if (tcount_max < tcount) tcount_max = tcount;
780: }
781: PetscMalloc1(tcount_max + 1, &tdata);
782: #endif
784: for (i = 0; i < imax; ++i) {
785: #if defined(PETSC_USE_CTABLE)
786: PetscTablePosition tpos;
787: table_data_i = table_data[i];
789: PetscTableGetHeadPosition(table_data_i, &tpos);
790: while (tpos) {
791: PetscTableGetNext(table_data_i, &tpos, &k, &j);
792: tdata[--j] = --k;
793: }
794: ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i);
795: #else
796: ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i);
797: #endif
798: PetscCommDestroy(&iscomms[i]);
799: }
801: PetscFree(iscomms);
802: PetscFree(onodes2);
803: PetscFree(olengths2);
805: PetscFree(pa);
806: PetscFree(rbuf2[0]);
807: PetscFree(rbuf2);
808: PetscFree(s_waits1);
809: PetscFree(r_waits1);
810: PetscFree(s_waits2);
811: PetscFree(r_waits2);
812: PetscFree(recv_status);
813: if (xdata) {
814: PetscFree(xdata[0]);
815: PetscFree(xdata);
816: }
817: PetscFree(isz1);
818: #if defined(PETSC_USE_CTABLE)
819: for (i = 0; i < imax; i++) PetscTableDestroy((PetscTable *)&table_data[i]);
820: PetscFree(table_data);
821: PetscFree(tdata);
822: PetscFree4(table, data, isz, t_p);
823: #else
824: PetscFree5(table, data, isz, d_p, t_p);
825: #endif
826: return 0;
827: }
829: /*
830: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
831: the work on the local processor.
833: Inputs:
834: C - MAT_MPIAIJ;
835: imax - total no of index sets processed at a time;
836: table - an array of char - size = m bits.
838: Output:
839: isz - array containing the count of the solution elements corresponding
840: to each index set;
841: data or table_data - pointer to the solutions
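    For example (hypothetical values): if is[0] currently holds the global rows {5, 9} and the
    locally owned row 5 couples to the global columns {5, 6, 20}, this routine appends 6 and 20
    (5 is already marked in table[0]) to data[0] (or table_data[0]) and increments isz[0].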
842: */
843: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
844: {
845: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
846: Mat A = c->A, B = c->B;
847: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
848: PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
849: PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
850: PetscBT table_i;
851: #if defined(PETSC_USE_CTABLE)
852: PetscTable table_data_i;
853: PetscTablePosition tpos;
854: PetscInt tcount, *tdata;
855: #else
856: PetscInt *data_i;
857: #endif
859: rstart = C->rmap->rstart;
860: cstart = C->cmap->rstart;
861: ai = a->i;
862: aj = a->j;
863: bi = b->i;
864: bj = b->j;
865: garray = c->garray;
867: for (i = 0; i < imax; i++) {
868: #if defined(PETSC_USE_CTABLE)
869: /* copy existing entries of table_data_i into tdata[] */
870: table_data_i = table_data[i];
871: PetscTableGetCount(table_data_i, &tcount);
874: PetscMalloc1(tcount, &tdata);
875: PetscTableGetHeadPosition(table_data_i, &tpos);
876: while (tpos) {
877: PetscTableGetNext(table_data_i, &tpos, &row, &j);
878: tdata[--j] = --row;
880: }
881: #else
882: data_i = data[i];
883: #endif
884: table_i = table[i];
885: isz_i = isz[i];
886: max = isz[i];
888: for (j = 0; j < max; j++) {
889: #if defined(PETSC_USE_CTABLE)
890: row = tdata[j] - rstart;
891: #else
892: row = data_i[j] - rstart;
893: #endif
894: start = ai[row];
895: end = ai[row + 1];
896: for (k = start; k < end; k++) { /* Amat */
897: val = aj[k] + cstart;
898: if (!PetscBTLookupSet(table_i, val)) {
899: #if defined(PETSC_USE_CTABLE)
900: PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
901: #else
902: data_i[isz_i] = val;
903: #endif
904: isz_i++;
905: }
906: }
907: start = bi[row];
908: end = bi[row + 1];
909: for (k = start; k < end; k++) { /* Bmat */
910: val = garray[bj[k]];
911: if (!PetscBTLookupSet(table_i, val)) {
912: #if defined(PETSC_USE_CTABLE)
913: PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
914: #else
915: data_i[isz_i] = val;
916: #endif
917: isz_i++;
918: }
919: }
920: }
921: isz[i] = isz_i;
923: #if defined(PETSC_USE_CTABLE)
924: PetscFree(tdata);
925: #endif
926: }
927: return 0;
928: }
930: /*
931: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
932: and return the output
934: Input:
935: C - the matrix
936: nrqr - no of messages being processed.
937: rbuf - an array of pointers to the received requests
939: Output:
940: xdata - array of messages to be sent back
941: isz1 - size of each message
943: For better efficiency perhaps we should malloc each xdata[i] separately;
944: then, if a re-malloc is required, we need only copy the data for that one row
945: rather than all previous rows, as is done now where a single large chunk of
946: memory is used.
948: */
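/*
   For example (hypothetical values): a received request {1, 0, 2, 5, 9} (one IS, namely is[0],
   asking about the two rows 5 and 9) is answered with {1, 0, k, r_1, ..., r_k}, where the r's
   are rows 5 and 9 together with the distinct global columns they couple to, and k is their count.
*/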
949: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
950: {
951: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
952: Mat A = c->A, B = c->B;
953: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
954: PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
955: PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
956: PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
957: PetscInt *rbuf_i, kmax, rbuf_0;
958: PetscBT xtable;
960: m = C->rmap->N;
961: rstart = C->rmap->rstart;
962: cstart = C->cmap->rstart;
963: ai = a->i;
964: aj = a->j;
965: bi = b->i;
966: bj = b->j;
967: garray = c->garray;
969: for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
970: rbuf_i = rbuf[i];
971: rbuf_0 = rbuf_i[0];
972: ct += rbuf_0;
973: for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
974: }
976: if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
977: else max1 = 1;
978: mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
979: if (nrqr) {
980: PetscMalloc1(mem_estimate, &xdata[0]);
981: ++no_malloc;
982: }
983: PetscBTCreate(m, &xtable);
984: PetscArrayzero(isz1, nrqr);
986: ct3 = 0;
987: for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
988: rbuf_i = rbuf[i];
989: rbuf_0 = rbuf_i[0];
990: ct1 = 2 * rbuf_0 + 1;
991: ct2 = ct1;
992: ct3 += ct1;
993: for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
994: PetscBTMemzero(m, xtable);
995: oct2 = ct2;
996: kmax = rbuf_i[2 * j];
997: for (k = 0; k < kmax; k++, ct1++) {
998: row = rbuf_i[ct1];
999: if (!PetscBTLookupSet(xtable, row)) {
1000: if (!(ct3 < mem_estimate)) {
1001: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1002: PetscMalloc1(new_estimate, &tmp);
1003: PetscArraycpy(tmp, xdata[0], mem_estimate);
1004: PetscFree(xdata[0]);
1005: xdata[0] = tmp;
1006: mem_estimate = new_estimate;
1007: ++no_malloc;
1008: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1009: }
1010: xdata[i][ct2++] = row;
1011: ct3++;
1012: }
1013: }
1014: for (k = oct2, max2 = ct2; k < max2; k++) {
1015: row = xdata[i][k] - rstart;
1016: start = ai[row];
1017: end = ai[row + 1];
1018: for (l = start; l < end; l++) {
1019: val = aj[l] + cstart;
1020: if (!PetscBTLookupSet(xtable, val)) {
1021: if (!(ct3 < mem_estimate)) {
1022: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1023: PetscMalloc1(new_estimate, &tmp);
1024: PetscArraycpy(tmp, xdata[0], mem_estimate);
1025: PetscFree(xdata[0]);
1026: xdata[0] = tmp;
1027: mem_estimate = new_estimate;
1028: ++no_malloc;
1029: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1030: }
1031: xdata[i][ct2++] = val;
1032: ct3++;
1033: }
1034: }
1035: start = bi[row];
1036: end = bi[row + 1];
1037: for (l = start; l < end; l++) {
1038: val = garray[bj[l]];
1039: if (!PetscBTLookupSet(xtable, val)) {
1040: if (!(ct3 < mem_estimate)) {
1041: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1042: PetscMalloc1(new_estimate, &tmp);
1043: PetscArraycpy(tmp, xdata[0], mem_estimate);
1044: PetscFree(xdata[0]);
1045: xdata[0] = tmp;
1046: mem_estimate = new_estimate;
1047: ++no_malloc;
1048: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1049: }
1050: xdata[i][ct2++] = val;
1051: ct3++;
1052: }
1053: }
1054: }
1055: /* Update the header*/
1056: xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1057: xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1058: }
1059: xdata[i][0] = rbuf_0;
1060: if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1061: isz1[i] = ct2; /* size of each message */
1062: }
1063: PetscBTDestroy(&xtable);
1064: PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc);
1065: return 0;
1066: }
1067: /* -------------------------------------------------------------------------*/
1068: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1069: /*
1070: Every processor gets the entire matrix
1071: */
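/*
   A minimal usage sketch (an assumption about typical callers, not taken from this file): a
   sequential copy of the whole parallel matrix can be requested on every rank through the
   public submatrix interface, e.g.

     Mat      A;              // parallel MPIAIJ matrix, already assembled
     Mat     *Aseq;
     IS       allrows, allcols;
     PetscInt M, N;
     MatGetSize(A, &M, &N);
     ISCreateStride(PETSC_COMM_SELF, M, 0, 1, &allrows);
     ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &allcols);
     MatCreateSubMatrices(A, 1, &allrows, &allcols, MAT_INITIAL_MATRIX, &Aseq);
     // ... use Aseq[0], which holds a copy of all of A ...
     MatDestroySubMatrices(1, &Aseq);
     ISDestroy(&allrows);
     ISDestroy(&allcols);
*/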
1072: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1073: {
1074: Mat B;
1075: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1076: Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1077: PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1078: PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1079: PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1081: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1082: MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank);
1083: if (scall == MAT_INITIAL_MATRIX) {
1084: /* ----------------------------------------------------------------
1085: Tell every processor the number of nonzeros per row
1086: */
1087: PetscMalloc1(A->rmap->N, &lens);
1088: for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1089: PetscMalloc2(size, &recvcounts, size, &displs);
1090: for (i = 0; i < size; i++) {
1091: recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1092: displs[i] = A->rmap->range[i];
1093: }
1094: MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1095: /* ---------------------------------------------------------------
1096: Create the sequential matrix of the same type as the local block diagonal
1097: */
1098: MatCreate(PETSC_COMM_SELF, &B);
1099: MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE);
1100: MatSetBlockSizesFromMats(B, A, A);
1101: MatSetType(B, ((PetscObject)a->A)->type_name);
1102: MatSeqAIJSetPreallocation(B, 0, lens);
1103: PetscCalloc1(2, Bin);
1104: **Bin = B;
1105: b = (Mat_SeqAIJ *)B->data;
1107: /*--------------------------------------------------------------------
1108: Copy my part of matrix column indices over
1109: */
1110: sendcount = ad->nz + bd->nz;
1111: jsendbuf = b->j + b->i[rstarts[rank]];
1112: a_jsendbuf = ad->j;
1113: b_jsendbuf = bd->j;
1114: n = A->rmap->rend - A->rmap->rstart;
1115: cnt = 0;
1116: for (i = 0; i < n; i++) {
1117: /* put in lower diagonal portion */
1118: m = bd->i[i + 1] - bd->i[i];
1119: while (m > 0) {
1120: /* is it above diagonal (in bd (compressed) numbering) */
1121: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1122: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1123: m--;
1124: }
1126: /* put in diagonal portion */
1127: for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1129: /* put in upper diagonal portion */
1130: while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1131: }
1134: /*--------------------------------------------------------------------
1135: Gather all column indices to all processors
1136: */
1137: for (i = 0; i < size; i++) {
1138: recvcounts[i] = 0;
1139: for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1140: }
1141: displs[0] = 0;
1142: for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1143: MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1144: /*--------------------------------------------------------------------
1145: Assemble the matrix into usable form (note numerical values not yet set)
1146: */
1147: /* set the b->ilen (length of each row) values */
1148: PetscArraycpy(b->ilen, lens, A->rmap->N);
1149: /* set the b->i indices */
1150: b->i[0] = 0;
1151: for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1152: PetscFree(lens);
1153: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
1154: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
1156: } else {
1157: B = **Bin;
1158: b = (Mat_SeqAIJ *)B->data;
1159: }
1161: /*--------------------------------------------------------------------
1162: Copy my part of matrix numerical values into the values location
1163: */
1164: if (flag == MAT_GET_VALUES) {
1165: const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1166: MatScalar *sendbuf, *recvbuf;
1168: MatSeqAIJGetArrayRead(a->A, &ada);
1169: MatSeqAIJGetArrayRead(a->B, &bda);
1170: sendcount = ad->nz + bd->nz;
1171: sendbuf = b->a + b->i[rstarts[rank]];
1172: a_sendbuf = ada;
1173: b_sendbuf = bda;
1174: b_sendj = bd->j;
1175: n = A->rmap->rend - A->rmap->rstart;
1176: cnt = 0;
1177: for (i = 0; i < n; i++) {
1178: /* put in lower diagonal portion */
1179: m = bd->i[i + 1] - bd->i[i];
1180: while (m > 0) {
1181: /* is it above diagonal (in bd (compressed) numbering) */
1182: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1183: sendbuf[cnt++] = *b_sendbuf++;
1184: m--;
1185: b_sendj++;
1186: }
1188: /* put in diagonal portion */
1189: for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1191: /* put in upper diagonal portion */
1192: while (m-- > 0) {
1193: sendbuf[cnt++] = *b_sendbuf++;
1194: b_sendj++;
1195: }
1196: }
1199: /* -----------------------------------------------------------------
1200: Gather all numerical values to all processors
1201: */
1202: if (!recvcounts) PetscMalloc2(size, &recvcounts, size, &displs);
1203: for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1204: displs[0] = 0;
1205: for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1206: recvbuf = b->a;
1207: MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A));
1208: MatSeqAIJRestoreArrayRead(a->A, &ada);
1209: MatSeqAIJRestoreArrayRead(a->B, &bda);
1210: } /* end of (flag == MAT_GET_VALUES) */
1211: PetscFree2(recvcounts, displs);
1213: MatPropagateSymmetryOptions(A, B);
1214: return 0;
1215: }
1217: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1218: {
1219: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
1220: Mat submat, A = c->A, B = c->B;
1221: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1222: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1223: PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1224: const PetscInt *icol, *irow;
1225: PetscInt nrow, ncol, start;
1226: PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1227: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1228: PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1229: PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1230: PetscInt *lens, rmax, ncols, *cols, Crow;
1231: #if defined(PETSC_USE_CTABLE)
1232: PetscTable cmap, rmap;
1233: PetscInt *cmap_loc, *rmap_loc;
1234: #else
1235: PetscInt *cmap, *rmap;
1236: #endif
1237: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1238: PetscInt *cworkB, lwrite, *subcols, *row2proc;
1239: PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1240: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1241: MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
1242: MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1243: MPI_Status *r_status3 = NULL, *r_status4, *s_status4;
1244: MPI_Comm comm;
1245: PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1246: PetscMPIInt *onodes1, *olengths1, idex, end;
1247: Mat_SubSppt *smatis1;
1248: PetscBool isrowsorted, iscolsorted;
1253: MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
1254: MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a);
1255: PetscObjectGetComm((PetscObject)C, &comm);
1256: size = c->size;
1257: rank = c->rank;
1259: ISSorted(iscol[0], &iscolsorted);
1260: ISSorted(isrow[0], &isrowsorted);
1261: ISGetIndices(isrow[0], &irow);
1262: ISGetLocalSize(isrow[0], &nrow);
1263: if (allcolumns) {
1264: icol = NULL;
1265: ncol = C->cmap->N;
1266: } else {
1267: ISGetIndices(iscol[0], &icol);
1268: ISGetLocalSize(iscol[0], &ncol);
1269: }
1271: if (scall == MAT_INITIAL_MATRIX) {
1272: PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1274: /* Get some new tags to keep the communication clean */
1275: tag1 = ((PetscObject)C)->tag;
1276: PetscObjectGetNewTag((PetscObject)C, &tag2);
1277: PetscObjectGetNewTag((PetscObject)C, &tag3);
1279: /* evaluate communication - mesg to whom, length of mesg, and buffer space
1280: required. Based on this, buffers are allocated, and data is copied into them */
1281: PetscCalloc2(size, &w1, size, &w2);
1282: PetscMalloc1(nrow, &row2proc);
1284: /* w1[proc] = num of rows owned by proc -- to be requested */
1285: proc = 0;
1286: nrqs = 0; /* num of outgoing messages */
1287: for (j = 0; j < nrow; j++) {
1288: row = irow[j];
1289: if (!isrowsorted) proc = 0;
1290: while (row >= C->rmap->range[proc + 1]) proc++;
1291: w1[proc]++;
1292: row2proc[j] = proc; /* map row index to proc */
1294: if (proc != rank && !w2[proc]) {
1295: w2[proc] = 1;
1296: nrqs++;
1297: }
1298: }
1299: w1[rank] = 0; /* rows owned by self will not be requested */
1301: PetscMalloc1(nrqs, &pa); /*(proc -array)*/
1302: for (proc = 0, j = 0; proc < size; proc++) {
1303: if (w1[proc]) pa[j++] = proc;
1304: }
1306: /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1307: msz = 0; /* total mesg length (for all procs) */
1308: for (i = 0; i < nrqs; i++) {
1309: proc = pa[i];
1310: w1[proc] += 3;
1311: msz += w1[proc];
1312: }
1313: PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);
1315: /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1316: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1317: PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
1319: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1320: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1321: PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);
1323: /* Now post the Irecvs corresponding to these messages */
1324: PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);
1326: PetscFree(onodes1);
1327: PetscFree(olengths1);
1329: /* Allocate Memory for outgoing messages */
1330: PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
1331: PetscArrayzero(sbuf1, size);
1332: PetscArrayzero(ptr, size);
1334: /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1335: iptr = tmp;
1336: for (i = 0; i < nrqs; i++) {
1337: proc = pa[i];
1338: sbuf1[proc] = iptr;
1339: iptr += w1[proc];
1340: }
1342: /* Form the outgoing messages */
1343: /* Initialize the header space */
1344: for (i = 0; i < nrqs; i++) {
1345: proc = pa[i];
1346: PetscArrayzero(sbuf1[proc], 3);
1347: ptr[proc] = sbuf1[proc] + 3;
1348: }
1350: /* Parse the isrow and copy data into outbuf */
1351: PetscArrayzero(ctr, size);
1352: for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1353: proc = row2proc[j];
1354: if (proc != rank) { /* copy to the outgoing buf*/
1355: *ptr[proc] = irow[j];
1356: ctr[proc]++;
1357: ptr[proc]++;
1358: }
1359: }
1361: /* Update the headers for the current IS */
1362: for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1363: if ((ctr_j = ctr[j])) {
1364: sbuf1_j = sbuf1[j];
1365: k = ++sbuf1_j[0];
1366: sbuf1_j[2 * k] = ctr_j;
1367: sbuf1_j[2 * k - 1] = 0;
1368: }
1369: }
1371: /* Now post the sends */
1372: PetscMalloc1(nrqs, &s_waits1);
1373: for (i = 0; i < nrqs; ++i) {
1374: proc = pa[i];
1375: MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i);
1376: }
1378: /* Post Receives to capture the buffer size */
1379: PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2);
1380: PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);
1382: if (nrqs) rbuf2[0] = tmp + msz;
1383: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1385: for (i = 0; i < nrqs; ++i) {
1386: proc = pa[i];
1387: MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i);
1388: }
1390: PetscFree2(w1, w2);
1392: /* Send to other procs the buf size they should allocate */
1393: /* Receive messages*/
1394: PetscMalloc1(nrqr, &r_status1);
1395: PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);
1397: MPI_Waitall(nrqr, r_waits1, r_status1);
1398: for (i = 0; i < nrqr; ++i) {
1399: req_size[i] = 0;
1400: rbuf1_i = rbuf1[i];
1401: start = 2 * rbuf1_i[0] + 1;
1402: MPI_Get_count(r_status1 + i, MPIU_INT, &end);
1403: PetscMalloc1(end, &sbuf2[i]);
1404: sbuf2_i = sbuf2[i];
1405: for (j = start; j < end; j++) {
1406: k = rbuf1_i[j] - rstart;
1407: ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1408: sbuf2_i[j] = ncols;
1409: req_size[i] += ncols;
1410: }
1411: req_source1[i] = r_status1[i].MPI_SOURCE;
1413: /* form the header */
1414: sbuf2_i[0] = req_size[i];
1415: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1417: MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
1418: }
1420: PetscFree(r_status1);
1421: PetscFree(r_waits1);
1423: /* rbuf2 is received, Post recv column indices a->j */
1424: MPI_Waitall(nrqs, r_waits2, r_status2);
1426: PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3);
1427: for (i = 0; i < nrqs; ++i) {
1428: PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
1429: req_source2[i] = r_status2[i].MPI_SOURCE;
1430: MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
1431: }
1433: /* Wait on sends1 and sends2 */
1434: PetscMalloc1(nrqs, &s_status1);
1435: MPI_Waitall(nrqs, s_waits1, s_status1);
1436: PetscFree(s_waits1);
1437: PetscFree(s_status1);
1439: MPI_Waitall(nrqr, s_waits2, s_status2);
1440: PetscFree4(r_status2, s_waits2, r_waits2, s_status2);
1442: /* Now allocate sending buffers for a->j, and send them off */
1443: PetscMalloc1(nrqr, &sbuf_aj);
1444: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1445: if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
1446: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1448: for (i = 0; i < nrqr; i++) { /* for each requested message */
1449: rbuf1_i = rbuf1[i];
1450: sbuf_aj_i = sbuf_aj[i];
1451: ct1 = 2 * rbuf1_i[0] + 1;
1452: ct2 = 0;
1455: kmax = rbuf1[i][2];
1456: for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1457: row = rbuf1_i[ct1] - rstart;
1458: nzA = ai[row + 1] - ai[row];
1459: nzB = bi[row + 1] - bi[row];
1460: ncols = nzA + nzB;
1461: cworkA = aj + ai[row];
1462: cworkB = bj + bi[row];
1464: /* load the column indices for this row into cols*/
1465: cols = sbuf_aj_i + ct2;
1467: lwrite = 0;
1468: for (l = 0; l < nzB; l++) {
1469: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1470: }
1471: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1472: for (l = 0; l < nzB; l++) {
1473: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1474: }
1476: ct2 += ncols;
1477: }
1478: MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
1479: }
1481: /* create column map (cmap): global col of C -> local col of submat */
1482: #if defined(PETSC_USE_CTABLE)
1483: if (!allcolumns) {
1484: PetscTableCreate(ncol, C->cmap->N, &cmap);
1485: PetscCalloc1(C->cmap->n, &cmap_loc);
1486: for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1487: if (icol[j] >= cstart && icol[j] < cend) {
1488: cmap_loc[icol[j] - cstart] = j + 1;
1489: } else { /* use PetscTable for non-local col indices */
1490: PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES);
1491: }
1492: }
1493: } else {
1494: cmap = NULL;
1495: cmap_loc = NULL;
1496: }
1497: PetscCalloc1(C->rmap->n, &rmap_loc);
1498: #else
1499: if (!allcolumns) {
1500: PetscCalloc1(C->cmap->N, &cmap);
1501: for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1502: } else {
1503: cmap = NULL;
1504: }
1505: #endif
1507: /* Create lens for MatSeqAIJSetPreallocation() */
1508: PetscCalloc1(nrow, &lens);
1510: /* Compute lens from local part of C */
1511: for (j = 0; j < nrow; j++) {
1512: row = irow[j];
1513: proc = row2proc[j];
1514: if (proc == rank) {
1515: /* diagonal part A = c->A */
1516: ncols = ai[row - rstart + 1] - ai[row - rstart];
1517: cols = aj + ai[row - rstart];
1518: if (!allcolumns) {
1519: for (k = 0; k < ncols; k++) {
1520: #if defined(PETSC_USE_CTABLE)
1521: tcol = cmap_loc[cols[k]];
1522: #else
1523: tcol = cmap[cols[k] + cstart];
1524: #endif
1525: if (tcol) lens[j]++;
1526: }
1527: } else { /* allcolumns */
1528: lens[j] = ncols;
1529: }
1531: /* off-diagonal part B = c->B */
1532: ncols = bi[row - rstart + 1] - bi[row - rstart];
1533: cols = bj + bi[row - rstart];
1534: if (!allcolumns) {
1535: for (k = 0; k < ncols; k++) {
1536: #if defined(PETSC_USE_CTABLE)
1537: PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1538: #else
1539: tcol = cmap[bmap[cols[k]]];
1540: #endif
1541: if (tcol) lens[j]++;
1542: }
1543: } else { /* allcolumns */
1544: lens[j] += ncols;
1545: }
1546: }
1547: }
1549: /* Create row map (rmap): global row of C -> local row of submat */
1550: #if defined(PETSC_USE_CTABLE)
1551: PetscTableCreate(nrow, C->rmap->N, &rmap);
1552: for (j = 0; j < nrow; j++) {
1553: row = irow[j];
1554: proc = row2proc[j];
1555: if (proc == rank) { /* a local row */
1556: rmap_loc[row - rstart] = j;
1557: } else {
1558: PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES);
1559: }
1560: }
1561: #else
1562: PetscCalloc1(C->rmap->N, &rmap);
1563: for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1564: #endif
1566: /* Update lens from offproc data */
1567: /* recv a->j is done */
1568: MPI_Waitall(nrqs, r_waits3, r_status3);
1569: for (i = 0; i < nrqs; i++) {
1570: proc = pa[i];
1571: sbuf1_i = sbuf1[proc];
1573: ct1 = 2 + 1;
1574: ct2 = 0;
1575: rbuf2_i = rbuf2[i]; /* received length of C->j */
1576: rbuf3_i = rbuf3[i]; /* received C->j */
1579: max1 = sbuf1_i[2];
1580: for (k = 0; k < max1; k++, ct1++) {
1581: #if defined(PETSC_USE_CTABLE)
1582: PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row);
1583: row--;
1585: #else
1586: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1587: #endif
1588: /* Now, store row index of submat in sbuf1_i[ct1] */
1589: sbuf1_i[ct1] = row;
1591: nnz = rbuf2_i[ct1];
1592: if (!allcolumns) {
1593: for (l = 0; l < nnz; l++, ct2++) {
1594: #if defined(PETSC_USE_CTABLE)
1595: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1596: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1597: } else {
1598: PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1599: }
1600: #else
1601: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1602: #endif
1603: if (tcol) lens[row]++;
1604: }
1605: } else { /* allcolumns */
1606: lens[row] += nnz;
1607: }
1608: }
1609: }
1610: MPI_Waitall(nrqr, s_waits3, s_status3);
1611: PetscFree4(r_waits3, s_waits3, r_status3, s_status3);
1613: /* Create the submatrices */
1614: MatCreate(PETSC_COMM_SELF, &submat);
1615: MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE);
1617: ISGetBlockSize(isrow[0], &i);
1618: ISGetBlockSize(iscol[0], &j);
1619: MatSetBlockSizes(submat, i, j);
1620: MatSetType(submat, ((PetscObject)A)->type_name);
1621: MatSeqAIJSetPreallocation(submat, 0, lens);
1623: /* create struct Mat_SubSppt and attach it to submat */
1624: PetscNew(&smatis1);
1625: subc = (Mat_SeqAIJ *)submat->data;
1626: subc->submatis1 = smatis1;
1628: smatis1->id = 0;
1629: smatis1->nrqs = nrqs;
1630: smatis1->nrqr = nrqr;
1631: smatis1->rbuf1 = rbuf1;
1632: smatis1->rbuf2 = rbuf2;
1633: smatis1->rbuf3 = rbuf3;
1634: smatis1->sbuf2 = sbuf2;
1635: smatis1->req_source2 = req_source2;
1637: smatis1->sbuf1 = sbuf1;
1638: smatis1->ptr = ptr;
1639: smatis1->tmp = tmp;
1640: smatis1->ctr = ctr;
1642: smatis1->pa = pa;
1643: smatis1->req_size = req_size;
1644: smatis1->req_source1 = req_source1;
1646: smatis1->allcolumns = allcolumns;
1647: smatis1->singleis = PETSC_TRUE;
1648: smatis1->row2proc = row2proc;
1649: smatis1->rmap = rmap;
1650: smatis1->cmap = cmap;
1651: #if defined(PETSC_USE_CTABLE)
1652: smatis1->rmap_loc = rmap_loc;
1653: smatis1->cmap_loc = cmap_loc;
1654: #endif
1656: smatis1->destroy = submat->ops->destroy;
1657: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1658: submat->factortype = C->factortype;
1660: /* compute rmax */
1661: rmax = 0;
1662: for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1664: } else { /* scall == MAT_REUSE_MATRIX */
1665: submat = submats[0];
1668: subc = (Mat_SeqAIJ *)submat->data;
1669: rmax = subc->rmax;
1670: smatis1 = subc->submatis1;
1671: nrqs = smatis1->nrqs;
1672: nrqr = smatis1->nrqr;
1673: rbuf1 = smatis1->rbuf1;
1674: rbuf2 = smatis1->rbuf2;
1675: rbuf3 = smatis1->rbuf3;
1676: req_source2 = smatis1->req_source2;
1678: sbuf1 = smatis1->sbuf1;
1679: sbuf2 = smatis1->sbuf2;
1680: ptr = smatis1->ptr;
1681: tmp = smatis1->tmp;
1682: ctr = smatis1->ctr;
1684: pa = smatis1->pa;
1685: req_size = smatis1->req_size;
1686: req_source1 = smatis1->req_source1;
1688: allcolumns = smatis1->allcolumns;
1689: row2proc = smatis1->row2proc;
1690: rmap = smatis1->rmap;
1691: cmap = smatis1->cmap;
1692: #if defined(PETSC_USE_CTABLE)
1693: rmap_loc = smatis1->rmap_loc;
1694: cmap_loc = smatis1->cmap_loc;
1695: #endif
1696: }
1698: /* Post recv matrix values */
1699: PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals);
1700: PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4);
1701: PetscObjectGetNewTag((PetscObject)C, &tag4);
1702: for (i = 0; i < nrqs; ++i) {
1703: PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
1704: MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
1705: }
1707: /* Allocate sending buffers for a->a, and send them off */
1708: PetscMalloc1(nrqr, &sbuf_aa);
1709: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1710: if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
1711: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1713: for (i = 0; i < nrqr; i++) {
1714: rbuf1_i = rbuf1[i];
1715: sbuf_aa_i = sbuf_aa[i];
1716: ct1 = 2 * rbuf1_i[0] + 1;
1717: ct2 = 0;
1720: kmax = rbuf1_i[2];
1721: for (k = 0; k < kmax; k++, ct1++) {
1722: row = rbuf1_i[ct1] - rstart;
1723: nzA = ai[row + 1] - ai[row];
1724: nzB = bi[row + 1] - bi[row];
1725: ncols = nzA + nzB;
1726: cworkB = bj + bi[row];
1727: vworkA = a_a + ai[row];
1728: vworkB = b_a + bi[row];
1730:       /* load the column values for this row into vals */
1731: vals = sbuf_aa_i + ct2;
1733: lwrite = 0;
1734: for (l = 0; l < nzB; l++) {
1735: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1736: }
1737: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1738: for (l = 0; l < nzB; l++) {
1739: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1740: }
1742: ct2 += ncols;
1743: }
1744: MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
1745: }
1747: /* Assemble submat */
1748: /* First assemble the local rows */
1749: for (j = 0; j < nrow; j++) {
1750: row = irow[j];
1751: proc = row2proc[j];
1752: if (proc == rank) {
1753: Crow = row - rstart; /* local row index of C */
1754: #if defined(PETSC_USE_CTABLE)
1755: row = rmap_loc[Crow]; /* row index of submat */
1756: #else
1757: row = rmap[row];
1758: #endif
1760: if (allcolumns) {
1761: /* diagonal part A = c->A */
1762: ncols = ai[Crow + 1] - ai[Crow];
1763: cols = aj + ai[Crow];
1764: vals = a_a + ai[Crow];
1765: i = 0;
1766: for (k = 0; k < ncols; k++) {
1767: subcols[i] = cols[k] + cstart;
1768: subvals[i++] = vals[k];
1769: }
1771: /* off-diagonal part B = c->B */
1772: ncols = bi[Crow + 1] - bi[Crow];
1773: cols = bj + bi[Crow];
1774: vals = b_a + bi[Crow];
1775: for (k = 0; k < ncols; k++) {
1776: subcols[i] = bmap[cols[k]];
1777: subvals[i++] = vals[k];
1778: }
1780: MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);
1782: } else { /* !allcolumns */
1783: #if defined(PETSC_USE_CTABLE)
1784: /* diagonal part A = c->A */
1785: ncols = ai[Crow + 1] - ai[Crow];
1786: cols = aj + ai[Crow];
1787: vals = a_a + ai[Crow];
1788: i = 0;
1789: for (k = 0; k < ncols; k++) {
1790: tcol = cmap_loc[cols[k]];
1791: if (tcol) {
1792: subcols[i] = --tcol;
1793: subvals[i++] = vals[k];
1794: }
1795: }
1797: /* off-diagonal part B = c->B */
1798: ncols = bi[Crow + 1] - bi[Crow];
1799: cols = bj + bi[Crow];
1800: vals = b_a + bi[Crow];
1801: for (k = 0; k < ncols; k++) {
1802: PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1803: if (tcol) {
1804: subcols[i] = --tcol;
1805: subvals[i++] = vals[k];
1806: }
1807: }
1808: #else
1809: /* diagonal part A = c->A */
1810: ncols = ai[Crow + 1] - ai[Crow];
1811: cols = aj + ai[Crow];
1812: vals = a_a + ai[Crow];
1813: i = 0;
1814: for (k = 0; k < ncols; k++) {
1815: tcol = cmap[cols[k] + cstart];
1816: if (tcol) {
1817: subcols[i] = --tcol;
1818: subvals[i++] = vals[k];
1819: }
1820: }
1822: /* off-diagonal part B = c->B */
1823: ncols = bi[Crow + 1] - bi[Crow];
1824: cols = bj + bi[Crow];
1825: vals = b_a + bi[Crow];
1826: for (k = 0; k < ncols; k++) {
1827: tcol = cmap[bmap[cols[k]]];
1828: if (tcol) {
1829: subcols[i] = --tcol;
1830: subvals[i++] = vals[k];
1831: }
1832: }
1833: #endif
1834: MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);
1835: }
1836: }
1837: }
1839: /* Now assemble the off-proc rows */
1840: for (i = 0; i < nrqs; i++) { /* for each requested message */
1841: /* recv values from other processes */
1842: MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i);
1843: proc = pa[idex];
1844: sbuf1_i = sbuf1[proc];
1846:     ct1 = 2 + 1; /* skip the header [nis=1, is#, row count] of sbuf1_i */
1847: ct2 = 0; /* count of received C->j */
1848: ct3 = 0; /* count of received C->j that will be inserted into submat */
1849:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from the other process */
1850:     rbuf3_i = rbuf3[idex]; /* received C->j from the other process */
1851:     rbuf4_i = rbuf4[idex]; /* received C->a from the other process */
1854: max1 = sbuf1_i[2]; /* num of rows */
1855:     for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1856: row = sbuf1_i[ct1]; /* row index of submat */
1857: if (!allcolumns) {
1858: idex = 0;
1859: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1860: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1861:           for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1862: #if defined(PETSC_USE_CTABLE)
1863: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1864: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1865: } else {
1866: PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1867: }
1868: #else
1869: tcol = cmap[rbuf3_i[ct2]];
1870: #endif
1871: if (tcol) {
1872: subcols[idex] = --tcol; /* may not be sorted */
1873: subvals[idex++] = rbuf4_i[ct2];
1875:               /* We receive all of C's entries for this row, but only a subset of them is inserted into submat.
1876:                  For reuse, we overwrite the received C->j with the positions (in rbuf4_i) of the entries that were inserted into submat */
1877: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1878: }
1879: }
1880: MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES);
1881: } else { /* scall == MAT_REUSE_MATRIX */
1882: submat = submats[0];
1883: subc = (Mat_SeqAIJ *)submat->data;
1885: nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1886: for (l = 0; l < nnz; l++) {
1887: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1888: subvals[idex++] = rbuf4_i[ct2];
1889: }
1891: bj = subc->j + subc->i[row]; /* sorted column indices */
1892: MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES);
1893: }
1894: } else { /* allcolumns */
1895: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1896: MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES);
1897: ct2 += nnz;
1898: }
1899: }
1900: }
1902:   /* wait for the sends of a->a to complete */
1903: MPI_Waitall(nrqr, s_waits4, s_status4);
1904: PetscFree4(r_waits4, s_waits4, r_status4, s_status4);
1906: MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY);
1907: MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY);
1908: submats[0] = submat;
1910: /* Restore the indices */
1911: ISRestoreIndices(isrow[0], &irow);
1912: if (!allcolumns) ISRestoreIndices(iscol[0], &icol);
1914: /* Destroy allocated memory */
1915: for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
1916: PetscFree3(rbuf4, subcols, subvals);
1917: if (sbuf_aa) {
1918: PetscFree(sbuf_aa[0]);
1919: PetscFree(sbuf_aa);
1920: }
1922: if (scall == MAT_INITIAL_MATRIX) {
1923: PetscFree(lens);
1924: if (sbuf_aj) {
1925: PetscFree(sbuf_aj[0]);
1926: PetscFree(sbuf_aj);
1927: }
1928: }
1929: MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
1930: MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a);
1931: return 0;
1932: }
1934: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1935: {
1936: PetscInt ncol;
1937: PetscBool colflag, allcolumns = PETSC_FALSE;
1939: /* Allocate memory to hold all the submatrices */
1940: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(2, submat);
1942: /* Check for special case: each processor gets entire matrix columns */
1943: ISIdentity(iscol[0], &colflag);
1944: ISGetLocalSize(iscol[0], &ncol);
1945: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1947: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat);
1948: return 0;
1949: }
1951: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1952: {
1953: PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1954: PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE;
1955: Mat_SeqAIJ *subc;
1956: Mat_SubSppt *smat;
1958: /* Check for special case: each processor has a single IS */
1959: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1960: MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
1961:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with a non-single IS */
1962: return 0;
1963: }
1965:   /* Determine nmax, then collect global wantallmatrix and nstages */
1966: if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1967: else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1968: if (!nmax) nmax = 1;
1970: if (scall == MAT_INITIAL_MATRIX) {
1971: /* Collect global wantallmatrix and nstages */
1972: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1973: ISIdentity(*isrow, &rowflag);
1974: ISIdentity(*iscol, &colflag);
1975: ISGetLocalSize(*isrow, &nrow);
1976: ISGetLocalSize(*iscol, &ncol);
1977: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1978: wantallmatrix = PETSC_TRUE;
1980: PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL);
1981: }
1982: }
1984:     /* Determine the number of stages through which the submatrices are extracted.
1985:        Each stage extracts at most nmax submatrices.
1986:        nmax is determined by the matrix column dimension:
1987:        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1988:     */
1989: nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1991: in[0] = -1 * (PetscInt)wantallmatrix;
1992: in[1] = nstages;
1993: MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
1994: wantallmatrix = (PetscBool)(-out[0]);
1995: nstages = out[1]; /* Make sure every processor loops through the global nstages */
1997: } else { /* MAT_REUSE_MATRIX */
1998: if (ismax) {
1999: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2000: smat = subc->submatis1;
2001: } else { /* (*submat)[0] is a dummy matrix */
2002: smat = (Mat_SubSppt *)(*submat)[0]->data;
2003: }
2004: if (!smat) {
2005: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2006: wantallmatrix = PETSC_TRUE;
2007: } else if (smat->singleis) {
2008: MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
2009: return 0;
2010: } else {
2011: nstages = smat->nstages;
2012: }
2013: }
2015: if (wantallmatrix) {
2016: MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat);
2017: return 0;
2018: }
2020: /* Allocate memory to hold all the submatrices and dummy submatrices */
2021: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(ismax + nstages, submat);
2023: for (i = 0, pos = 0; i < nstages; i++) {
2024: if (pos + nmax <= ismax) max_no = nmax;
2025: else if (pos >= ismax) max_no = 0;
2026: else max_no = ismax - pos;
2028: MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos);
2029: if (!max_no) {
2030: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2031: smat = (Mat_SubSppt *)(*submat)[pos]->data;
2032: smat->nstages = nstages;
2033: }
2034: pos++; /* advance to next dummy matrix if any */
2035: } else pos += max_no;
2036: }
2038: if (ismax && scall == MAT_INITIAL_MATRIX) {
2039: /* save nstages for reuse */
2040: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2041: smat = subc->submatis1;
2042: smat->nstages = nstages;
2043: }
2044: return 0;
2045: }
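
For orientation, here is a minimal usage sketch (not part of mpiov.c) of how a caller reaches the routine above through the public MatCreateSubMatrices() entry point, first with MAT_INITIAL_MATRIX and then with MAT_REUSE_MATRIX so that the communication pattern cached in Mat_SubSppt is reused; the helper name ExampleExtractBlocks and its arguments are hypothetical.

/* Hypothetical helper: extract n sequential submatrices of the MPIAIJ matrix C,
   reusing the cached communication data on later calls. */
PetscErrorCode ExampleExtractBlocks(Mat C, PetscInt n, IS isrow[], IS iscol[], PetscBool firsttime, Mat **subs)
{
  if (firsttime) {
    MatCreateSubMatrices(C, n, isrow, iscol, MAT_INITIAL_MATRIX, subs); /* allocates *subs and the Mat_SubSppt scaffolding */
  } else {
    MatCreateSubMatrices(C, n, isrow, iscol, MAT_REUSE_MATRIX, subs); /* reuses buffers and the sparsity of each submatrix */
  }
  return 0;
}

When the submatrices are no longer needed they are released with MatDestroySubMatrices(n, subs), which also takes care of the dummy matrices created above.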
2047: /* -------------------------------------------------------------------------*/
2048: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2049: {
2050: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
2051: Mat A = c->A;
2052: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2053: const PetscInt **icol, **irow;
2054: PetscInt *nrow, *ncol, start;
2055: PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2056: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2057: PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2058: PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2059: PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2060: #if defined(PETSC_USE_CTABLE)
2061: PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2062: #else
2063: PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2064: #endif
2065: const PetscInt *irow_i;
2066: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2067: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2068: MPI_Request *r_waits4, *s_waits3, *s_waits4;
2069: MPI_Comm comm;
2070: PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2071: PetscMPIInt *onodes1, *olengths1, end;
2072: PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2073: Mat_SubSppt *smat_i;
2074: PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2075: PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2077: PetscObjectGetComm((PetscObject)C, &comm);
2078: size = c->size;
2079: rank = c->rank;
2081: PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns);
2082: PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted);
2084: for (i = 0; i < ismax; i++) {
2085: ISSorted(iscol[i], &issorted[i]);
2086: if (!issorted[i]) iscsorted = issorted[i];
2088: ISSorted(isrow[i], &issorted[i]);
2090: ISGetIndices(isrow[i], &irow[i]);
2091: ISGetLocalSize(isrow[i], &nrow[i]);
2093: /* Check for special case: allcolumn */
2094: ISIdentity(iscol[i], &colflag);
2095: ISGetLocalSize(iscol[i], &ncol[i]);
2096: if (colflag && ncol[i] == C->cmap->N) {
2097: allcolumns[i] = PETSC_TRUE;
2098: icol[i] = NULL;
2099: } else {
2100: allcolumns[i] = PETSC_FALSE;
2101: ISGetIndices(iscol[i], &icol[i]);
2102: }
2103: }
2105: if (scall == MAT_REUSE_MATRIX) {
2106: /* Assumes new rows are same length as the old rows */
2107: for (i = 0; i < ismax; i++) {
2109: subc = (Mat_SeqAIJ *)submats[i]->data;
2112:       /* Initialize the matrix as if it were empty */
2113: PetscArrayzero(subc->ilen, submats[i]->rmap->n);
2115: smat_i = subc->submatis1;
2117: nrqs = smat_i->nrqs;
2118: nrqr = smat_i->nrqr;
2119: rbuf1 = smat_i->rbuf1;
2120: rbuf2 = smat_i->rbuf2;
2121: rbuf3 = smat_i->rbuf3;
2122: req_source2 = smat_i->req_source2;
2124: sbuf1 = smat_i->sbuf1;
2125: sbuf2 = smat_i->sbuf2;
2126: ptr = smat_i->ptr;
2127: tmp = smat_i->tmp;
2128: ctr = smat_i->ctr;
2130: pa = smat_i->pa;
2131: req_size = smat_i->req_size;
2132: req_source1 = smat_i->req_source1;
2134: allcolumns[i] = smat_i->allcolumns;
2135: row2proc[i] = smat_i->row2proc;
2136: rmap[i] = smat_i->rmap;
2137: cmap[i] = smat_i->cmap;
2138: }
2140: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2142: smat_i = (Mat_SubSppt *)submats[0]->data;
2144: nrqs = smat_i->nrqs;
2145: nrqr = smat_i->nrqr;
2146: rbuf1 = smat_i->rbuf1;
2147: rbuf2 = smat_i->rbuf2;
2148: rbuf3 = smat_i->rbuf3;
2149: req_source2 = smat_i->req_source2;
2151: sbuf1 = smat_i->sbuf1;
2152: sbuf2 = smat_i->sbuf2;
2153: ptr = smat_i->ptr;
2154: tmp = smat_i->tmp;
2155: ctr = smat_i->ctr;
2157: pa = smat_i->pa;
2158: req_size = smat_i->req_size;
2159: req_source1 = smat_i->req_source1;
2161: allcolumns[0] = PETSC_FALSE;
2162: }
2163: } else { /* scall == MAT_INITIAL_MATRIX */
2164: /* Get some new tags to keep the communication clean */
2165: PetscObjectGetNewTag((PetscObject)C, &tag2);
2166: PetscObjectGetNewTag((PetscObject)C, &tag3);
2168:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
2169:        required. Based on this, buffers are allocated, and data copied into them */
2170: PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4); /* mesg size, initialize work vectors */
2172: for (i = 0; i < ismax; i++) {
2173: jmax = nrow[i];
2174: irow_i = irow[i];
2176: PetscMalloc1(jmax, &row2proc_i);
2177: row2proc[i] = row2proc_i;
2179: if (issorted[i]) proc = 0;
2180: for (j = 0; j < jmax; j++) {
2181: if (!issorted[i]) proc = 0;
2182: row = irow_i[j];
2183: while (row >= C->rmap->range[proc + 1]) proc++;
2184: w4[proc]++;
2185: row2proc_i[j] = proc; /* map row index to proc */
2186: }
2187: for (j = 0; j < size; j++) {
2188: if (w4[j]) {
2189: w1[j] += w4[j];
2190: w3[j]++;
2191: w4[j] = 0;
2192: }
2193: }
2194: }
2196: nrqs = 0; /* no of outgoing messages */
2197: msz = 0; /* total mesg length (for all procs) */
2198: w1[rank] = 0; /* no mesg sent to self */
2199: w3[rank] = 0;
2200: for (i = 0; i < size; i++) {
2201: if (w1[i]) {
2202: w2[i] = 1;
2203: nrqs++;
2204: } /* there exists a message to proc i */
2205: }
2206:     PetscMalloc1(nrqs, &pa); /* (proc-array) */
2207: for (i = 0, j = 0; i < size; i++) {
2208: if (w1[i]) {
2209: pa[j] = i;
2210: j++;
2211: }
2212: }
2214: /* Each message would have a header = 1 + 2*(no of IS) + data */
2215: for (i = 0; i < nrqs; i++) {
2216: j = pa[i];
2217: w1[j] += w2[j] + 2 * w3[j];
2218: msz += w1[j];
2219: }
2220: PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);
2222:     /* Determine the number of messages to expect, their lengths, and their from-ids */
2223: PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
2224: PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);
2226: /* Now post the Irecvs corresponding to these messages */
2227: PetscObjectGetNewTag((PetscObject)C, &tag0);
2228: PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);
2230: /* Allocate Memory for outgoing messages */
2231: PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
2232: PetscArrayzero(sbuf1, size);
2233: PetscArrayzero(ptr, size);
2235: {
2236: PetscInt *iptr = tmp;
2237: k = 0;
2238: for (i = 0; i < nrqs; i++) {
2239: j = pa[i];
2240: iptr += k;
2241: sbuf1[j] = iptr;
2242: k = w1[j];
2243: }
2244: }
2246: /* Form the outgoing messages. Initialize the header space */
2247: for (i = 0; i < nrqs; i++) {
2248: j = pa[i];
2249: sbuf1[j][0] = 0;
2250: PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]);
2251: ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2252: }
2254: /* Parse the isrow and copy data into outbuf */
2255: for (i = 0; i < ismax; i++) {
2256: row2proc_i = row2proc[i];
2257: PetscArrayzero(ctr, size);
2258: irow_i = irow[i];
2259: jmax = nrow[i];
2260: for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2261: proc = row2proc_i[j];
2262:         if (proc != rank) { /* copy to the outgoing buf */
2263: ctr[proc]++;
2264: *ptr[proc] = irow_i[j];
2265: ptr[proc]++;
2266: }
2267: }
2268: /* Update the headers for the current IS */
2269: for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2270: if ((ctr_j = ctr[j])) {
2271: sbuf1_j = sbuf1[j];
2272: k = ++sbuf1_j[0];
2273: sbuf1_j[2 * k] = ctr_j;
2274: sbuf1_j[2 * k - 1] = i;
2275: }
2276: }
2277: }
2279: /* Now post the sends */
2280: PetscMalloc1(nrqs, &s_waits1);
2281: for (i = 0; i < nrqs; ++i) {
2282: j = pa[i];
2283: MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i);
2284: }
2286: /* Post Receives to capture the buffer size */
2287: PetscMalloc1(nrqs, &r_waits2);
2288: PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);
2289: if (nrqs) rbuf2[0] = tmp + msz;
2290: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2291: for (i = 0; i < nrqs; ++i) {
2292: j = pa[i];
2293: MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i);
2294: }
2296: /* Send to other procs the buf size they should allocate */
2297:     /* Receive messages */
2298: PetscMalloc1(nrqr, &s_waits2);
2299: PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);
2300: {
2301: PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2302: PetscInt *sbuf2_i;
2304: MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE);
2305: for (i = 0; i < nrqr; ++i) {
2306: req_size[i] = 0;
2307: rbuf1_i = rbuf1[i];
2308: start = 2 * rbuf1_i[0] + 1;
2309: end = olengths1[i];
2310: PetscMalloc1(end, &sbuf2[i]);
2311: sbuf2_i = sbuf2[i];
2312: for (j = start; j < end; j++) {
2313: id = rbuf1_i[j] - rstart;
2314: ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2315: sbuf2_i[j] = ncols;
2316: req_size[i] += ncols;
2317: }
2318: req_source1[i] = onodes1[i];
2319: /* form the header */
2320: sbuf2_i[0] = req_size[i];
2321: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2323: MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
2324: }
2325: }
2327: PetscFree(onodes1);
2328: PetscFree(olengths1);
2329: PetscFree(r_waits1);
2330: PetscFree4(w1, w2, w3, w4);
2332:     /* Receive messages */
2333: PetscMalloc1(nrqs, &r_waits3);
2334: MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE);
2335: for (i = 0; i < nrqs; ++i) {
2336: PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
2337: req_source2[i] = pa[i];
2338: MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
2339: }
2340: PetscFree(r_waits2);
2342: /* Wait on sends1 and sends2 */
2343: MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);
2344: MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
2345: PetscFree(s_waits1);
2346: PetscFree(s_waits2);
2348: /* Now allocate sending buffers for a->j, and send them off */
2349: PetscMalloc1(nrqr, &sbuf_aj);
2350: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2351: if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
2352: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2354: PetscMalloc1(nrqr, &s_waits3);
2355: {
2356: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2357: PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2358: PetscInt cend = C->cmap->rend;
2359: PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2361: for (i = 0; i < nrqr; i++) {
2362: rbuf1_i = rbuf1[i];
2363: sbuf_aj_i = sbuf_aj[i];
2364: ct1 = 2 * rbuf1_i[0] + 1;
2365: ct2 = 0;
2366: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2367: kmax = rbuf1[i][2 * j];
2368: for (k = 0; k < kmax; k++, ct1++) {
2369: row = rbuf1_i[ct1] - rstart;
2370: nzA = a_i[row + 1] - a_i[row];
2371: nzB = b_i[row + 1] - b_i[row];
2372: ncols = nzA + nzB;
2373: cworkA = a_j + a_i[row];
2374: cworkB = b_j + b_i[row];
2376: /* load the column indices for this row into cols */
2377: cols = sbuf_aj_i + ct2;
2379: lwrite = 0;
2380: for (l = 0; l < nzB; l++) {
2381: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2382: }
2383: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2384: for (l = 0; l < nzB; l++) {
2385: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2386: }
2388: ct2 += ncols;
2389: }
2390: }
2391: MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
2392: }
2393: }
2395: /* create col map: global col of C -> local col of submatrices */
2396: {
2397: const PetscInt *icol_i;
2398: #if defined(PETSC_USE_CTABLE)
2399: for (i = 0; i < ismax; i++) {
2400: if (!allcolumns[i]) {
2401: PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]);
2403: jmax = ncol[i];
2404: icol_i = icol[i];
2405: cmap_i = cmap[i];
2406: for (j = 0; j < jmax; j++) PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES);
2407: } else cmap[i] = NULL;
2408: }
2409: #else
2410: for (i = 0; i < ismax; i++) {
2411: if (!allcolumns[i]) {
2412: PetscCalloc1(C->cmap->N, &cmap[i]);
2413: jmax = ncol[i];
2414: icol_i = icol[i];
2415: cmap_i = cmap[i];
2416: for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2417: } else cmap[i] = NULL;
2418: }
2419: #endif
2420: }
2422: /* Create lens which is required for MatCreate... */
2423: for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2424: PetscMalloc1(ismax, &lens);
2426: if (ismax) PetscCalloc1(j, &lens[0]);
2427: for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2429: /* Update lens from local data */
2430: for (i = 0; i < ismax; i++) {
2431: row2proc_i = row2proc[i];
2432: jmax = nrow[i];
2433: if (!allcolumns[i]) cmap_i = cmap[i];
2434: irow_i = irow[i];
2435: lens_i = lens[i];
2436: for (j = 0; j < jmax; j++) {
2437: row = irow_i[j];
2438: proc = row2proc_i[j];
2439: if (proc == rank) {
2440: MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2441: if (!allcolumns[i]) {
2442: for (k = 0; k < ncols; k++) {
2443: #if defined(PETSC_USE_CTABLE)
2444: PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2445: #else
2446: tcol = cmap_i[cols[k]];
2447: #endif
2448: if (tcol) lens_i[j]++;
2449: }
2450: } else { /* allcolumns */
2451: lens_i[j] = ncols;
2452: }
2453: MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2454: }
2455: }
2456: }
2458: /* Create row map: global row of C -> local row of submatrices */
2459: #if defined(PETSC_USE_CTABLE)
2460: for (i = 0; i < ismax; i++) {
2461: PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]);
2462: irow_i = irow[i];
2463: jmax = nrow[i];
2464: for (j = 0; j < jmax; j++) PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES);
2465: }
2466: #else
2467: for (i = 0; i < ismax; i++) {
2468: PetscCalloc1(C->rmap->N, &rmap[i]);
2469: rmap_i = rmap[i];
2470: irow_i = irow[i];
2471: jmax = nrow[i];
2472: for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2473: }
2474: #endif
2476:     /* Update lens from off-proc data */
2477: {
2478: PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2480: MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE);
2481: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2482: sbuf1_i = sbuf1[pa[tmp2]];
2483: jmax = sbuf1_i[0];
2484: ct1 = 2 * jmax + 1;
2485: ct2 = 0;
2486: rbuf2_i = rbuf2[tmp2];
2487: rbuf3_i = rbuf3[tmp2];
2488: for (j = 1; j <= jmax; j++) {
2489: is_no = sbuf1_i[2 * j - 1];
2490: max1 = sbuf1_i[2 * j];
2491: lens_i = lens[is_no];
2492: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2493: rmap_i = rmap[is_no];
2494: for (k = 0; k < max1; k++, ct1++) {
2495: #if defined(PETSC_USE_CTABLE)
2496: PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row);
2497: row--;
2499: #else
2500:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2501: #endif
2502: max2 = rbuf2_i[ct1];
2503: for (l = 0; l < max2; l++, ct2++) {
2504: if (!allcolumns[is_no]) {
2505: #if defined(PETSC_USE_CTABLE)
2506: PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2507: #else
2508: tcol = cmap_i[rbuf3_i[ct2]];
2509: #endif
2510: if (tcol) lens_i[row]++;
2511: } else { /* allcolumns */
2512: lens_i[row]++; /* lens_i[row] += max2 ? */
2513: }
2514: }
2515: }
2516: }
2517: }
2518: }
2519: PetscFree(r_waits3);
2520: MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE);
2521: PetscFree(s_waits3);
2523: /* Create the submatrices */
2524: for (i = 0; i < ismax; i++) {
2525: PetscInt rbs, cbs;
2527: ISGetBlockSize(isrow[i], &rbs);
2528: ISGetBlockSize(iscol[i], &cbs);
2530: MatCreate(PETSC_COMM_SELF, submats + i);
2531: MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE);
2533: MatSetBlockSizes(submats[i], rbs, cbs);
2534: MatSetType(submats[i], ((PetscObject)A)->type_name);
2535: MatSeqAIJSetPreallocation(submats[i], 0, lens[i]);
2537:       /* create struct Mat_SubSppt and attach it to submats[i] */
2538: PetscNew(&smat_i);
2539: subc = (Mat_SeqAIJ *)submats[i]->data;
2540: subc->submatis1 = smat_i;
2542: smat_i->destroy = submats[i]->ops->destroy;
2543: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2544: submats[i]->factortype = C->factortype;
2546: smat_i->id = i;
2547: smat_i->nrqs = nrqs;
2548: smat_i->nrqr = nrqr;
2549: smat_i->rbuf1 = rbuf1;
2550: smat_i->rbuf2 = rbuf2;
2551: smat_i->rbuf3 = rbuf3;
2552: smat_i->sbuf2 = sbuf2;
2553: smat_i->req_source2 = req_source2;
2555: smat_i->sbuf1 = sbuf1;
2556: smat_i->ptr = ptr;
2557: smat_i->tmp = tmp;
2558: smat_i->ctr = ctr;
2560: smat_i->pa = pa;
2561: smat_i->req_size = req_size;
2562: smat_i->req_source1 = req_source1;
2564: smat_i->allcolumns = allcolumns[i];
2565: smat_i->singleis = PETSC_FALSE;
2566: smat_i->row2proc = row2proc[i];
2567: smat_i->rmap = rmap[i];
2568: smat_i->cmap = cmap[i];
2569: }
2571:     if (!ismax) { /* Create dummy submats[0] so that struct subc can be reused */
2572: MatCreate(PETSC_COMM_SELF, &submats[0]);
2573: MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE);
2574: MatSetType(submats[0], MATDUMMY);
2576:       /* create struct Mat_SubSppt and attach it to submats[0] */
2577: PetscNew(&smat_i);
2578: submats[0]->data = (void *)smat_i;
2580: smat_i->destroy = submats[0]->ops->destroy;
2581: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2582: submats[0]->factortype = C->factortype;
2584: smat_i->id = 0;
2585: smat_i->nrqs = nrqs;
2586: smat_i->nrqr = nrqr;
2587: smat_i->rbuf1 = rbuf1;
2588: smat_i->rbuf2 = rbuf2;
2589: smat_i->rbuf3 = rbuf3;
2590: smat_i->sbuf2 = sbuf2;
2591: smat_i->req_source2 = req_source2;
2593: smat_i->sbuf1 = sbuf1;
2594: smat_i->ptr = ptr;
2595: smat_i->tmp = tmp;
2596: smat_i->ctr = ctr;
2598: smat_i->pa = pa;
2599: smat_i->req_size = req_size;
2600: smat_i->req_source1 = req_source1;
2602: smat_i->allcolumns = PETSC_FALSE;
2603: smat_i->singleis = PETSC_FALSE;
2604: smat_i->row2proc = NULL;
2605: smat_i->rmap = NULL;
2606: smat_i->cmap = NULL;
2607: }
2609: if (ismax) PetscFree(lens[0]);
2610: PetscFree(lens);
2611: if (sbuf_aj) {
2612: PetscFree(sbuf_aj[0]);
2613: PetscFree(sbuf_aj);
2614: }
2616:   } /* end of scall == MAT_INITIAL_MATRIX */
2618: /* Post recv matrix values */
2619: PetscObjectGetNewTag((PetscObject)C, &tag4);
2620: PetscMalloc1(nrqs, &rbuf4);
2621: PetscMalloc1(nrqs, &r_waits4);
2622: for (i = 0; i < nrqs; ++i) {
2623: PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
2624: MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
2625: }
2627: /* Allocate sending buffers for a->a, and send them off */
2628: PetscMalloc1(nrqr, &sbuf_aa);
2629: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2630: if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
2631: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2633: PetscMalloc1(nrqr, &s_waits4);
2634: {
2635: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2636: PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2637: PetscInt cend = C->cmap->rend;
2638: PetscInt *b_j = b->j;
2639: PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2641: MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
2642: MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a);
2643: for (i = 0; i < nrqr; i++) {
2644: rbuf1_i = rbuf1[i];
2645: sbuf_aa_i = sbuf_aa[i];
2646: ct1 = 2 * rbuf1_i[0] + 1;
2647: ct2 = 0;
2648: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2649: kmax = rbuf1_i[2 * j];
2650: for (k = 0; k < kmax; k++, ct1++) {
2651: row = rbuf1_i[ct1] - rstart;
2652: nzA = a_i[row + 1] - a_i[row];
2653: nzB = b_i[row + 1] - b_i[row];
2654: ncols = nzA + nzB;
2655: cworkB = b_j + b_i[row];
2656: vworkA = a_a + a_i[row];
2657: vworkB = b_a + b_i[row];
2659:           /* load the column values for this row into vals */
2660: vals = sbuf_aa_i + ct2;
2662: lwrite = 0;
2663: for (l = 0; l < nzB; l++) {
2664: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2665: }
2666: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2667: for (l = 0; l < nzB; l++) {
2668: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2669: }
2671: ct2 += ncols;
2672: }
2673: }
2674: MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
2675: }
2676: MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
2677: MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a);
2678: }
2680: /* Assemble the matrices */
2681: /* First assemble the local rows */
2682: for (i = 0; i < ismax; i++) {
2683: row2proc_i = row2proc[i];
2684: subc = (Mat_SeqAIJ *)submats[i]->data;
2685: imat_ilen = subc->ilen;
2686: imat_j = subc->j;
2687: imat_i = subc->i;
2688: imat_a = subc->a;
2690: if (!allcolumns[i]) cmap_i = cmap[i];
2691: rmap_i = rmap[i];
2692: irow_i = irow[i];
2693: jmax = nrow[i];
2694: for (j = 0; j < jmax; j++) {
2695: row = irow_i[j];
2696: proc = row2proc_i[j];
2697: if (proc == rank) {
2698: old_row = row;
2699: #if defined(PETSC_USE_CTABLE)
2700: PetscTableFind(rmap_i, row + 1, &row);
2701: row--;
2702: #else
2703: row = rmap_i[row];
2704: #endif
2705: ilen_row = imat_ilen[row];
2706: MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);
2707: mat_i = imat_i[row];
2708: mat_a = imat_a + mat_i;
2709: mat_j = imat_j + mat_i;
2710: if (!allcolumns[i]) {
2711: for (k = 0; k < ncols; k++) {
2712: #if defined(PETSC_USE_CTABLE)
2713: PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2714: #else
2715: tcol = cmap_i[cols[k]];
2716: #endif
2717: if (tcol) {
2718: *mat_j++ = tcol - 1;
2719: *mat_a++ = vals[k];
2720: ilen_row++;
2721: }
2722: }
2723: } else { /* allcolumns */
2724: for (k = 0; k < ncols; k++) {
2725: *mat_j++ = cols[k]; /* global col index! */
2726: *mat_a++ = vals[k];
2727: ilen_row++;
2728: }
2729: }
2730: MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);
2732: imat_ilen[row] = ilen_row;
2733: }
2734: }
2735: }
2737:   /* Now assemble the off-proc rows */
2738: MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE);
2739: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2740: sbuf1_i = sbuf1[pa[tmp2]];
2741: jmax = sbuf1_i[0];
2742: ct1 = 2 * jmax + 1;
2743: ct2 = 0;
2744: rbuf2_i = rbuf2[tmp2];
2745: rbuf3_i = rbuf3[tmp2];
2746: rbuf4_i = rbuf4[tmp2];
2747: for (j = 1; j <= jmax; j++) {
2748: is_no = sbuf1_i[2 * j - 1];
2749: rmap_i = rmap[is_no];
2750: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2751: subc = (Mat_SeqAIJ *)submats[is_no]->data;
2752: imat_ilen = subc->ilen;
2753: imat_j = subc->j;
2754: imat_i = subc->i;
2755: imat_a = subc->a;
2756: max1 = sbuf1_i[2 * j];
2757: for (k = 0; k < max1; k++, ct1++) {
2758: row = sbuf1_i[ct1];
2759: #if defined(PETSC_USE_CTABLE)
2760: PetscTableFind(rmap_i, row + 1, &row);
2761: row--;
2762: #else
2763: row = rmap_i[row];
2764: #endif
2765: ilen = imat_ilen[row];
2766: mat_i = imat_i[row];
2767: mat_a = imat_a + mat_i;
2768: mat_j = imat_j + mat_i;
2769: max2 = rbuf2_i[ct1];
2770: if (!allcolumns[is_no]) {
2771: for (l = 0; l < max2; l++, ct2++) {
2772: #if defined(PETSC_USE_CTABLE)
2773: PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2774: #else
2775: tcol = cmap_i[rbuf3_i[ct2]];
2776: #endif
2777: if (tcol) {
2778: *mat_j++ = tcol - 1;
2779: *mat_a++ = rbuf4_i[ct2];
2780: ilen++;
2781: }
2782: }
2783: } else { /* allcolumns */
2784: for (l = 0; l < max2; l++, ct2++) {
2785: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2786: *mat_a++ = rbuf4_i[ct2];
2787: ilen++;
2788: }
2789: }
2790: imat_ilen[row] = ilen;
2791: }
2792: }
2793: }
2795: if (!iscsorted) { /* sort column indices of the rows */
2796: for (i = 0; i < ismax; i++) {
2797: subc = (Mat_SeqAIJ *)submats[i]->data;
2798: imat_j = subc->j;
2799: imat_i = subc->i;
2800: imat_a = subc->a;
2801: imat_ilen = subc->ilen;
2803: if (allcolumns[i]) continue;
2804: jmax = nrow[i];
2805: for (j = 0; j < jmax; j++) {
2806: mat_i = imat_i[j];
2807: mat_a = imat_a + mat_i;
2808: mat_j = imat_j + mat_i;
2809: PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a);
2810: }
2811: }
2812: }
2814: PetscFree(r_waits4);
2815: MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE);
2816: PetscFree(s_waits4);
2818: /* Restore the indices */
2819: for (i = 0; i < ismax; i++) {
2820: ISRestoreIndices(isrow[i], irow + i);
2821: if (!allcolumns[i]) ISRestoreIndices(iscol[i], icol + i);
2822: }
2824: for (i = 0; i < ismax; i++) {
2825: MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY);
2826: MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY);
2827: }
2829: /* Destroy allocated memory */
2830: if (sbuf_aa) {
2831: PetscFree(sbuf_aa[0]);
2832: PetscFree(sbuf_aa);
2833: }
2834: PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted);
2836: for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
2837: PetscFree(rbuf4);
2839: PetscFree4(row2proc, cmap, rmap, allcolumns);
2840: return 0;
2841: }
2843: /*
2844:     Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2845:     Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2846:     of [0,m), dcolemb is in [0,n), and ocolemb is in [0,N-n).
2847: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2848: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2849: state, and needs to be "assembled" later by compressing B's column space.
2851: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2852: Following this call, C->A & C->B have been created, even if empty.
2853: */
2854: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2855: {
2856: /* If making this function public, change the error returned in this function away from _PLIB. */
2857: Mat_MPIAIJ *aij;
2858: Mat_SeqAIJ *Baij;
2859: PetscBool seqaij, Bdisassembled;
2860: PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2861: PetscScalar v;
2862: const PetscInt *rowindices, *colindices;
2864: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2865: if (A) {
2866: PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);
2868: if (rowemb) {
2869: ISGetLocalSize(rowemb, &m);
2871: } else {
2873: }
2874: if (dcolemb) {
2875: ISGetLocalSize(dcolemb, &n);
2877: } else {
2879: }
2880: }
2881: if (B) {
2882: PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);
2884: if (rowemb) {
2885: ISGetLocalSize(rowemb, &m);
2887: } else {
2889: }
2890: if (ocolemb) {
2891: ISGetLocalSize(ocolemb, &n);
2893: } else {
2895: }
2896: }
2898: aij = (Mat_MPIAIJ *)C->data;
2899: if (!aij->A) {
2900: /* Mimic parts of MatMPIAIJSetPreallocation() */
2901: MatCreate(PETSC_COMM_SELF, &aij->A);
2902: MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n);
2903: MatSetBlockSizesFromMats(aij->A, C, C);
2904: MatSetType(aij->A, MATSEQAIJ);
2905: }
2906: if (A) {
2907: MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A);
2908: } else {
2909: MatSetUp(aij->A);
2910: }
2911: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2912: /*
2913: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2914: need to "disassemble" B -- convert it to using C's global indices.
2915: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2917: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2918:        we call MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2920: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2921: At least avoid calling MatSetValues() and the implied searches?
2922: */
2924: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2925: #if defined(PETSC_USE_CTABLE)
2926: PetscTableDestroy(&aij->colmap);
2927: #else
2928: PetscFree(aij->colmap);
2929:       /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2930: #endif
2931: ngcol = 0;
2932: if (aij->lvec) VecGetSize(aij->lvec, &ngcol);
2933: if (aij->garray) { PetscFree(aij->garray); }
2934: VecDestroy(&aij->lvec);
2935: VecScatterDestroy(&aij->Mvctx);
2936: }
2937: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) MatDestroy(&aij->B);
2938: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) MatZeroEntries(aij->B);
2939: }
2940: Bdisassembled = PETSC_FALSE;
2941: if (!aij->B) {
2942: MatCreate(PETSC_COMM_SELF, &aij->B);
2943: MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N);
2944: MatSetBlockSizesFromMats(aij->B, B, B);
2945: MatSetType(aij->B, MATSEQAIJ);
2946: Bdisassembled = PETSC_TRUE;
2947: }
2948: if (B) {
2949: Baij = (Mat_SeqAIJ *)B->data;
2950: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2951: PetscMalloc1(B->rmap->n, &nz);
2952: for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2953: MatSeqAIJSetPreallocation(aij->B, 0, nz);
2954: PetscFree(nz);
2955: }
2957: PetscLayoutGetRange(C->rmap, &rstart, &rend);
2958: shift = rend - rstart;
2959: count = 0;
2960: rowindices = NULL;
2961: colindices = NULL;
2962: if (rowemb) ISGetIndices(rowemb, &rowindices);
2963: if (ocolemb) ISGetIndices(ocolemb, &colindices);
2964: for (i = 0; i < B->rmap->n; i++) {
2965: PetscInt row;
2966: row = i;
2967: if (rowindices) row = rowindices[i];
2968: for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2969: col = Baij->j[count];
2970: if (colindices) col = colindices[col];
2971: if (Bdisassembled && col >= rstart) col += shift;
2972: v = Baij->a[count];
2973: MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES);
2974: ++count;
2975: }
2976: }
2977: /* No assembly for aij->B is necessary. */
2978: /* FIXME: set aij->B's nonzerostate correctly. */
2979: } else {
2980: MatSetUp(aij->B);
2981: }
2982: C->preallocated = PETSC_TRUE;
2983: C->was_assembled = PETSC_FALSE;
2984: C->assembled = PETSC_FALSE;
2985: /*
2986: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2987: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2988: */
2989: return 0;
2990: }
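
The "disassembled" column mapping applied in the loop above can be summarized by a tiny sketch (hypothetical, not part of this file): a compact off-diagonal column index below rstart is kept as is, while one at or above rstart is shifted past the locally owned block. For instance, with rstart = 4, rend = 7 (so shift = 3) and N = 10 global columns, compact columns 0..6 map to global columns 0, 1, 2, 3, 7, 8, 9.

/* Hypothetical restatement of the shift applied when Bdisassembled is true */
static PetscInt ExampleDisassembleColumn(PetscInt col, PetscInt rstart, PetscInt rend)
{
  return (col >= rstart) ? col + (rend - rstart) : col;
}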
2992: /*
2993: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2994: */
2995: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2996: {
2997: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3001: /* FIXME: make sure C is assembled */
3002: *A = aij->A;
3003: *B = aij->B;
3004: /* Note that we don't incref *A and *B, so be careful! */
3005: return 0;
3006: }
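
As a concrete reading of the comment above, the sketch below (hypothetical, not part of the source) walks the off-diagonal block B returned by MatGetSeqMats_MPIAIJ() and translates each compact column index j into the global column garray[j] of C.

/* Hypothetical illustration: print the global column index behind each stored entry of B */
PetscErrorCode ExamplePrintBGlobalColumns(Mat C)
{
  Mat         A, B;
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ *b;
  PetscInt    i, k;

  MatGetSeqMats_MPIAIJ(C, &A, &B);
  (void)A; /* the diagonal block is not needed here */
  b = (Mat_SeqAIJ *)B->data;
  for (i = 0; i < B->rmap->n; i++) {
    for (k = b->i[i]; k < b->i[i + 1]; k++) PetscPrintf(PETSC_COMM_SELF, "local row %" PetscInt_FMT " -> global col %" PetscInt_FMT "\n", i, aij->garray[b->j[k]]);
  }
  return 0;
}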
3008: /*
3009: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3010: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3011: */
3012: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3013: {
3014: PetscMPIInt size, flag;
3015: PetscInt i, ii, cismax, ispar;
3016: Mat *A, *B;
3017: IS *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3019: if (!ismax) return 0;
3021: for (i = 0, cismax = 0; i < ismax; ++i) {
3022: MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag);
3024: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3025: if (size > 1) ++cismax;
3026: }
3028: /*
3029: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3030:      ispar is the maximum of cismax over C's comm; it is nonzero exactly when some rank holds a parallel IS.
3031: */
3032: MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
3033: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3034: (*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat);
3035: return 0;
3036: }
3038: /* if (ispar) */
3039: /*
3040: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3041: These are used to extract the off-diag portion of the resulting parallel matrix.
3042: The row IS for the off-diag portion is the same as for the diag portion,
3043: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3044: */
3045: PetscMalloc2(cismax, &cisrow, cismax, &ciscol);
3046: PetscMalloc1(cismax, &ciscol_p);
3047: for (i = 0, ii = 0; i < ismax; ++i) {
3048: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3049: if (size > 1) {
3050: /*
3051: TODO: This is the part that's ***NOT SCALABLE***.
3052: To fix this we need to extract just the indices of C's nonzero columns
3053: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3054: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3055: to be done without serializing on the IS list, so, most likely, it is best
3056: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3057: */
3058: ISGetNonlocalIS(iscol[i], &(ciscol[ii]));
3059: /* Now we have to
3060: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3061: were sorted on each rank, concatenated they might no longer be sorted;
3062: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3063: indices in the nondecreasing order to the original index positions.
3064: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3065: */
3066: ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii);
3067: ISSort(ciscol[ii]);
3068: ++ii;
3069: }
3070: }
3071: PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p);
3072: for (i = 0, ii = 0; i < ismax; ++i) {
3073: PetscInt j, issize;
3074: const PetscInt *indices;
3076: /*
3077: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3078: */
3079: ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i);
3080: ISSort(isrow[i]);
3081: ISGetLocalSize(isrow[i], &issize);
3082: ISGetIndices(isrow[i], &indices);
3083: for (j = 1; j < issize; ++j) {
3085: }
3086: ISRestoreIndices(isrow[i], &indices);
3087: ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i);
3088: ISSort(iscol[i]);
3089: ISGetLocalSize(iscol[i], &issize);
3090: ISGetIndices(iscol[i], &indices);
3091: for (j = 1; j < issize; ++j) {
3093: }
3094: ISRestoreIndices(iscol[i], &indices);
3095: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3096: if (size > 1) {
3097: cisrow[ii] = isrow[i];
3098: ++ii;
3099: }
3100: }
3101: /*
3102: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3103: array of sequential matrices underlying the resulting parallel matrices.
3104: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3105: contain duplicates.
3107: There are as many diag matrices as there are original index sets. There are only as many parallel
3108:      and off-diag matrices as there are parallel (comm size > 1) index sets.
3110: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3111: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3112: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3113:      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3114: with A[i] and B[ii] extracted from the corresponding MPI submat.
3115: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3116: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3117: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3118:      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3119: values into A[i] and B[ii] sitting inside the corresponding submat.
3120: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3121: A[i], B[ii], AA[i] or BB[ii] matrices.
3122: */
3123: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3124: if (scall == MAT_INITIAL_MATRIX) PetscMalloc1(ismax, submat);
3126: /* Now obtain the sequential A and B submatrices separately. */
3127: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3128: (*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A);
3129: (*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B);
3131: /*
3132: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3133: matrices A & B have been extracted directly into the parallel matrices containing them, or
3134: simply into the sequential matrix identical with the corresponding A (if size == 1).
3135: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3136: to have the same sparsity pattern.
3137: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3138: must be constructed for C. This is done by setseqmat(s).
3139: */
3140: for (i = 0, ii = 0; i < ismax; ++i) {
3141: /*
3142: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3143: That way we can avoid sorting and computing permutations when reusing.
3144: To this end:
3145: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3146: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3147:        be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3148: */
3149: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3151: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3152: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3153: if (size > 1) {
3154: if (scall == MAT_INITIAL_MATRIX) {
3155: MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i);
3156: MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE);
3157: MatSetType((*submat)[i], MATMPIAIJ);
3158: PetscLayoutSetUp((*submat)[i]->rmap);
3159: PetscLayoutSetUp((*submat)[i]->cmap);
3160: }
3161: /*
3162: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3163: */
3164: {
3165: Mat AA = A[i], BB = B[ii];
3167: if (AA || BB) {
3168: setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB);
3169: MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY);
3170: MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY);
3171: }
3172: MatDestroy(&AA);
3173: }
3174: ISDestroy(ciscol + ii);
3175: ISDestroy(ciscol_p + ii);
3176: ++ii;
3177: } else { /* if (size == 1) */
3178: if (scall == MAT_REUSE_MATRIX) MatDestroy(&(*submat)[i]);
3179: if (isrow_p[i] || iscol_p[i]) {
3180: MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i);
3181: setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]);
3182:         /* Otherwise A is extracted straight into (*submat)[i]. */
3183:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3184: MatDestroy(A + i);
3185: } else (*submat)[i] = A[i];
3186: }
3187: ISDestroy(&isrow_p[i]);
3188: ISDestroy(&iscol_p[i]);
3189: }
3190: PetscFree2(cisrow, ciscol);
3191: PetscFree2(isrow_p, iscol_p);
3192: PetscFree(ciscol_p);
3193: PetscFree(A);
3194: MatDestroySubMatrices(cismax, &B);
3195: return 0;
3196: }
3198: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3199: {
3200: MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ);
3201: return 0;
3202: }
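
Finally, a hedged sketch of how the parallel path above is reached from user code through the public MatCreateSubMatricesMPI() interface; the helper name and the assumption that isrow/iscol live on a multi-rank subcommunicator are illustrative only.

/* Hypothetical usage: extract one parallel submatrix whose index sets live on a subcomm of C */
PetscErrorCode ExampleExtractParallelBlock(Mat C, IS isrow, IS iscol, Mat **subs)
{
  /* isrow and iscol must share a communicator; when it has more than one rank, the routine
     above assembles the diag (A) and off-diag (B) pieces into an MPIAIJ submatrix */
  MatCreateSubMatricesMPI(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, subs);
  return 0;
}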