Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:    and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

 21: /*
 22:    Takes a general IS and builds a block version of it that contains the given IS plus any additional values needed to fill out the blocks

 24:    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the block size bs and would then offer higher performance, but
 25:    that is a very major recoding job.

 27:    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
 28: */
 29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
 30: {
 31:   for (PetscInt i = 0; i < imax; i++) {
 32:     if (!is[i]) break;
 33:     PetscInt        n = 0, N, Nmax, Nmin;
 34:     const PetscInt *idx;
 35:     PetscInt       *nidx = NULL;
 36:     MPI_Comm        comm;
 37:     PetscBT         bt;

 39:     ISGetLocalSize(is[i], &N);
 40:     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
 41:       ISGetIndices(is[i], &idx);
 42:       ISGetMinMax(is[i], &Nmin, &Nmax);
 43:       Nmin = Nmin / bs;
 44:       Nmax = Nmax / bs;
 45:       PetscBTCreate(Nmax - Nmin, &bt);
 46:       for (PetscInt j = 0; j < N; j++) {
 47:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
 48:       }
 49:       PetscMalloc1(n, &nidx);
 50:       n = 0;
 51:       PetscBTMemzero(Nmax - Nmin, bt);
 52:       for (PetscInt j = 0; j < N; j++) {
 53:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
 54:       }
 55:       PetscBTDestroy(&bt);
 56:       ISRestoreIndices(is[i], &idx);
 57:     }
 58:     PetscObjectGetComm((PetscObject)is[i], &comm);
 59:     ISDestroy(is + i);
 60:     ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i);
 61:   }
 62:   return 0;
 63: }
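
/*
   Worked example (illustrative numbers, not taken from this file): with bs = 2, an IS
   containing the entries {3, 4, 7} touches the blocks 3/2 = 1, 4/2 = 2 and 7/2 = 3, so the
   routine above replaces it with a blocked IS over the blocks {1, 2, 3}, i.e. the point
   entries {2, 3, 4, 5, 6, 7}. A minimal sketch of building such a blocked IS directly:

     PetscInt blocks[] = {1, 2, 3};
     IS       bis;
     ISCreateBlock(PETSC_COMM_SELF, 2, 3, blocks, PETSC_COPY_VALUES, &bis);
     ISDestroy(&bis);
*/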

 65: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
 66: {
 67:   PetscInt i;

 70:   for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once(C, imax, is);
 71:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) ISAdjustForBlockSize(C->rmap->bs, imax, is);
 72:   return 0;
 73: }
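
/*
   Illustrative usage sketch (A, nlocal and rstart are hypothetical names, not defined in this
   file); callers normally reach the routine above through the generic MatIncreaseOverlap()
   interface on a MATMPIAIJ matrix rather than calling it directly:

     IS is[1];                        // rows this process is interested in
     ISCreateStride(PETSC_COMM_SELF, nlocal, rstart, 1, &is[0]);
     MatIncreaseOverlap(A, 1, is, 2); // grow each index set by an overlap of 2
     ISDestroy(&is[0]);
*/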

 75: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
 76: {
 77:   PetscInt i;

 80:   for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is);
 81:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) ISAdjustForBlockSize(C->rmap->bs, imax, is);
 82:   return 0;
 83: }

 85: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
 86: {
 87:   MPI_Comm        comm;
 88:   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
 89:   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
 90:   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
 91:   const PetscInt *indices_i, **indices;
 92:   PetscLayout     rmap;
 93:   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
 94:   PetscSF         sf;
 95:   PetscSFNode    *remote;

 97:   PetscObjectGetComm((PetscObject)mat, &comm);
 98:   MPI_Comm_rank(comm, &rank);
 99:   MPI_Comm_size(comm, &size);
100:   /* get row map to determine where rows should be going */
101:   MatGetLayouts(mat, &rmap, NULL);
 102:   /* retrieve IS data and put it all together so that we can optimize communication */
105:   PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length);
106:   for (i = 0, tlength = 0; i < nidx; i++) {
107:     ISGetLocalSize(is[i], &length[i]);
108:     tlength += length[i];
109:     ISGetIndices(is[i], &indices[i]);
110:   }
111:   /* find these rows on remote processors */
112:   PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids);
113:   PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp);
114:   nrrows = 0;
115:   for (i = 0; i < nidx; i++) {
116:     length_i  = length[i];
117:     indices_i = indices[i];
118:     for (j = 0; j < length_i; j++) {
119:       owner = -1;
120:       PetscLayoutFindOwner(rmap, indices_i[j], &owner);
121:       /* remote processors */
122:       if (owner != rank) {
123:         tosizes_temp[owner]++;               /* number of rows to owner */
124:         rrow_ranks[nrrows]   = owner;        /* processor */
125:         rrow_isids[nrrows]   = i;            /* is id */
126:         remoterows[nrrows++] = indices_i[j]; /* row */
127:       }
128:     }
129:     ISRestoreIndices(is[i], &indices[i]);
130:   }
131:   PetscFree2(*(PetscInt ***)&indices, length);
 132:   /* test if we need to exchange messages; generally speaking, no data exchange is needed when the overlap is 1 */
136:   MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm);
 137:   /* there are no messages to exchange; this usually corresponds to an overlap of 1 */
140:   if (!reducednrrows) {
141:     PetscFree3(toranks, tosizes, tosizes_temp);
142:     PetscFree3(remoterows, rrow_ranks, rrow_isids);
143:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
144:     return 0;
145:   }
146:   nto = 0;
147:   /* send sizes and ranks for building a two-sided communication */
148:   for (i = 0; i < size; i++) {
149:     if (tosizes_temp[i]) {
150:       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
151:       tosizes_temp[i]  = nto;                 /* a map from processor to index */
152:       toranks[nto++]   = i;                   /* processor */
153:     }
154:   }
155:   PetscMalloc1(nto + 1, &toffsets);
156:   toffsets[0] = 0;
157:   for (i = 0; i < nto; i++) {
158:     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
159:     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
160:   }
161:   /* send information to other processors */
162:   PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes);
163:   nrecvrows = 0;
164:   for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
165:   PetscMalloc1(nrecvrows, &remote);
166:   nrecvrows = 0;
167:   for (i = 0; i < nfrom; i++) {
168:     for (j = 0; j < fromsizes[2 * i]; j++) {
169:       remote[nrecvrows].rank    = fromranks[i];
170:       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
171:     }
172:   }
173:   PetscSFCreate(comm, &sf);
174:   PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
 175:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided communication */
176:   PetscSFSetType(sf, PETSCSFBASIC);
177:   PetscSFSetFromOptions(sf);
 178:   /* message pairs <IS id, row> */
179:   PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata);
180:   for (i = 0; i < nrrows; i++) {
181:     owner                 = rrow_ranks[i];       /* processor */
182:     j                     = tosizes_temp[owner]; /* index */
183:     todata[toffsets[j]++] = rrow_isids[i];
184:     todata[toffsets[j]++] = remoterows[i];
185:   }
186:   PetscFree3(toranks, tosizes, tosizes_temp);
187:   PetscFree3(remoterows, rrow_ranks, rrow_isids);
188:   PetscFree(toffsets);
189:   PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
190:   PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
191:   PetscSFDestroy(&sf);
 192:   /* send rows belonging to remote processors so that we can get the overlapping data back */
193:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata);
194:   PetscFree2(todata, fromdata);
195:   PetscFree(fromsizes);
196:   PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes);
197:   PetscFree(fromranks);
198:   nrecvrows = 0;
199:   for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
200:   PetscCalloc1(nrecvrows, &todata);
201:   PetscMalloc1(nrecvrows, &remote);
202:   nrecvrows = 0;
203:   for (i = 0; i < nto; i++) {
204:     for (j = 0; j < tosizes[2 * i]; j++) {
205:       remote[nrecvrows].rank    = toranks[i];
206:       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
207:     }
208:   }
209:   PetscSFCreate(comm, &sf);
210:   PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
 211:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided communication */
212:   PetscSFSetType(sf, PETSCSFBASIC);
213:   PetscSFSetFromOptions(sf);
214:   /* overlap communication and computation */
215:   PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
216:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
217:   PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
218:   PetscSFDestroy(&sf);
219:   PetscFree2(sbdata, sbsizes);
220:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata);
221:   PetscFree(toranks);
222:   PetscFree(tosizes);
223:   PetscFree(todata);
224:   return 0;
225: }
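
/*
   The communication pattern above is: (1) PetscCommBuildTwoSided() lets every rank discover
   which ranks will send to it and how much, then (2) a PETSCSFBASIC star forest moves the
   payload while the local work is overlapped with it. A minimal sketch of step (1), assuming
   nto, toranks and tosizes have been filled exactly as in the routine above:

     PetscMPIInt nfrom, *fromranks;
     PetscInt   *fromsizes;
     // tosizes holds pairs <length, offset>, hence the count argument of 2
     PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes);
     // ... post receives of length fromsizes[2*i] from rank fromranks[i] ...
     PetscFree(fromranks);
     PetscFree(fromsizes);
*/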

227: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
228: {
229:   PetscInt       *isz, isz_i, i, j, is_id, data_size;
230:   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
231:   const PetscInt *indices_i_temp;
232:   MPI_Comm       *iscomms;

234:   max_lsize = 0;
235:   PetscMalloc1(nidx, &isz);
236:   for (i = 0; i < nidx; i++) {
237:     ISGetLocalSize(is[i], &lsize);
238:     max_lsize = lsize > max_lsize ? lsize : max_lsize;
239:     isz[i]    = lsize;
240:   }
241:   PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms);
242:   for (i = 0; i < nidx; i++) {
243:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
244:     ISGetIndices(is[i], &indices_i_temp);
245:     PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]);
246:     ISRestoreIndices(is[i], &indices_i_temp);
247:     ISDestroy(&is[i]);
248:   }
 249:   /* parse the received data: for each IS id, append its overlap rows */
250:   for (i = 0; i < nrecvs;) {
251:     is_id     = recvdata[i++];
252:     data_size = recvdata[i++];
253:     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
254:     isz_i     = isz[is_id];
255:     for (j = 0; j < data_size; j++) {
256:       col                = recvdata[i++];
257:       indices_i[isz_i++] = col;
258:     }
259:     isz[is_id] = isz_i;
260:   }
 261:   /* remove duplicate entries */
262:   for (i = 0; i < nidx; i++) {
263:     indices_i = indices_temp + (max_lsize + nrecvs) * i;
264:     isz_i     = isz[i];
265:     PetscSortRemoveDupsInt(&isz_i, indices_i);
266:     ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]);
267:     PetscCommDestroy(&iscomms[i]);
268:   }
269:   PetscFree(isz);
270:   PetscFree2(indices_temp, iscomms);
271:   return 0;
272: }
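
/*
   Illustrative layout of recvdata as parsed above (made-up numbers): the stream
     { 0, 3, 10, 12, 15,   2, 2, 40, 41 }
   means "append the 3 rows 10, 12, 15 to is[0] and the 2 rows 40, 41 to is[2]"; duplicates
   are removed with PetscSortRemoveDupsInt() before each IS is rebuilt.
*/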

274: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
275: {
276:   PetscLayout     rmap, cmap;
277:   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
278:   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
279:   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
280:   const PetscInt *gcols, *ai, *aj, *bi, *bj;
281:   Mat             amat, bmat;
282:   PetscMPIInt     rank;
283:   PetscBool       done;
284:   MPI_Comm        comm;

286:   PetscObjectGetComm((PetscObject)mat, &comm);
287:   MPI_Comm_rank(comm, &rank);
288:   MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
 289:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
290:   MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
292:   MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
294:   /* total number of nonzero values is used to estimate the memory usage in the next step */
295:   tnz = ai[an] + bi[bn];
296:   MatGetLayouts(mat, &rmap, &cmap);
297:   PetscLayoutGetRange(rmap, &rstart, NULL);
298:   PetscLayoutGetRange(cmap, &cstart, NULL);
299:   /* to find the longest message */
300:   max_fszs = 0;
301:   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
 302:   /* is there a better way to estimate the number of nonzeros in the matrix? */
303:   PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp);
304:   for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
305:   rows_pos  = 0;
306:   totalrows = 0;
307:   for (i = 0; i < nfrom; i++) {
308:     PetscArrayzero(rows_pos_i, nidx);
309:     /* group data together */
310:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
311:       is_id                       = fromrows[rows_pos++]; /* no of is */
312:       rows_i                      = rows_data[is_id];
313:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
314:     }
 315:     /* estimate the space needed to avoid multiple allocations */
316:     for (j = 0; j < nidx; j++) {
317:       indvc_ij = 0;
318:       rows_i   = rows_data[j];
319:       for (l = 0; l < rows_pos_i[j]; l++) {
320:         row   = rows_i[l] - rstart;
321:         start = ai[row];
322:         end   = ai[row + 1];
323:         for (k = start; k < end; k++) { /* Amat */
324:           col                     = aj[k] + cstart;
325:           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
326:         }
327:         start = bi[row];
328:         end   = bi[row + 1];
329:         for (k = start; k < end; k++) { /* Bmat */
330:           col                     = gcols[bj[k]];
331:           indices_tmp[indvc_ij++] = col;
332:         }
333:       }
334:       PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
335:       indv_counts[i * nidx + j] = indvc_ij;
336:       totalrows += indvc_ij;
337:     }
338:   }
 339:   /* message triple <IS id, number of rows, rows> */
340:   PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes);
341:   totalrows = 0;
342:   rows_pos  = 0;
 343:   /* reuse the same grouping logic as above */
344:   for (i = 0; i < nfrom; i++) {
345:     PetscArrayzero(rows_pos_i, nidx);
346:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
347:       is_id                       = fromrows[rows_pos++];
348:       rows_i                      = rows_data[is_id];
349:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
350:     }
351:     /* add data  */
352:     for (j = 0; j < nidx; j++) {
353:       if (!indv_counts[i * nidx + j]) continue;
354:       indvc_ij            = 0;
355:       sbdata[totalrows++] = j;
356:       sbdata[totalrows++] = indv_counts[i * nidx + j];
357:       sbsizes[2 * i] += 2;
358:       rows_i = rows_data[j];
359:       for (l = 0; l < rows_pos_i[j]; l++) {
360:         row   = rows_i[l] - rstart;
361:         start = ai[row];
362:         end   = ai[row + 1];
363:         for (k = start; k < end; k++) { /* Amat */
364:           col                     = aj[k] + cstart;
365:           indices_tmp[indvc_ij++] = col;
366:         }
367:         start = bi[row];
368:         end   = bi[row + 1];
369:         for (k = start; k < end; k++) { /* Bmat */
370:           col                     = gcols[bj[k]];
371:           indices_tmp[indvc_ij++] = col;
372:         }
373:       }
374:       PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
375:       sbsizes[2 * i] += indvc_ij;
376:       PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij);
377:       totalrows += indvc_ij;
378:     }
379:   }
380:   PetscMalloc1(nfrom + 1, &offsets);
381:   offsets[0] = 0;
382:   for (i = 0; i < nfrom; i++) {
383:     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
384:     sbsizes[2 * i + 1] = offsets[i];
385:   }
386:   PetscFree(offsets);
387:   if (sbrowsizes) *sbrowsizes = sbsizes;
388:   if (sbrows) *sbrows = sbdata;
389:   PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp);
390:   MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
391:   MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
392:   return 0;
393: }
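
/*
   Illustrative layout of the reply built above (made-up numbers): for one destination rank
   that requested rows belonging to is[1] and is[4], sbdata contains triples
     { 1, 3, 20, 21, 22,   4, 2, 50, 51 }
   i.e. <IS id, number of entries, entries>, while sbsizes stores for every destination the
   pair <total length, offset into sbdata>.
*/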

395: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
396: {
397:   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
398:   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
399:   PetscInt        lsize, lsize_tmp;
400:   PetscMPIInt     rank, owner;
401:   Mat             amat, bmat;
402:   PetscBool       done;
403:   PetscLayout     cmap, rmap;
404:   MPI_Comm        comm;

406:   PetscObjectGetComm((PetscObject)mat, &comm);
407:   MPI_Comm_rank(comm, &rank);
408:   MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
409:   MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
411:   MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
 413:   /* is this a safe way to compute the number of nonzero values? */
414:   tnz = ai[an] + bi[bn];
415:   MatGetLayouts(mat, &rmap, &cmap);
416:   PetscLayoutGetRange(rmap, &rstart, NULL);
417:   PetscLayoutGetRange(cmap, &cstart, NULL);
 418:   /* this estimates memory better than the old implementation, which used the global size of the matrix */
421:   PetscMalloc1(tnz, &indices_temp);
422:   for (i = 0; i < nidx; i++) {
423:     MPI_Comm iscomm;

425:     ISGetLocalSize(is[i], &lsize);
426:     ISGetIndices(is[i], &indices);
427:     lsize_tmp = 0;
428:     for (j = 0; j < lsize; j++) {
429:       owner = -1;
430:       row   = indices[j];
431:       PetscLayoutFindOwner(rmap, row, &owner);
432:       if (owner != rank) continue;
433:       /* local number */
434:       row -= rstart;
435:       start = ai[row];
436:       end   = ai[row + 1];
437:       for (k = start; k < end; k++) { /* Amat */
438:         col                       = aj[k] + cstart;
439:         indices_temp[lsize_tmp++] = col;
440:       }
441:       start = bi[row];
442:       end   = bi[row + 1];
443:       for (k = start; k < end; k++) { /* Bmat */
444:         col                       = gcols[bj[k]];
445:         indices_temp[lsize_tmp++] = col;
446:       }
447:     }
448:     ISRestoreIndices(is[i], &indices);
449:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL);
450:     ISDestroy(&is[i]);
451:     PetscSortRemoveDupsInt(&lsize_tmp, indices_temp);
452:     ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]);
453:     PetscCommDestroy(&iscomm);
454:   }
455:   PetscFree(indices_temp);
456:   MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
457:   MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
458:   return 0;
459: }
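
/*
   Worked example (made-up data): if this rank owns global row 5 and that row has nonzeros in
   columns {3, 5, 9}, then any is[i] containing 5 gains the entries 3 and 9 in the loop above;
   entries of is[i] owned by other ranks are skipped here and are handled instead by the
   send/receive routines of the scalable path.
*/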

461: /*
462:   Sample message format:
463:   If a processor A wants processor B to process some elements corresponding
464:   to index sets is[1],is[5]
465:   mesg [0] = 2   (no of index sets in the mesg)
466:   -----------
467:   mesg [1] = 1 => is[1]
468:   mesg [2] = sizeof(is[1]);
469:   -----------
470:   mesg [3] = 5  => is[5]
471:   mesg [4] = sizeof(is[5]);
472:   -----------
 473:   mesg [5]
 474:   mesg [n]   data for is[1]
 475:   -----------
 476:   mesg [n+1]
 477:   mesg [m]   data for is[5]
478:   -----------

480:   Notes:
481:   nrqs - no of requests sent (or to be sent out)
482:   nrqr - no of requests received (which have to be or which have been processed)
483: */
484: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
485: {
486:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
487:   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
488:   const PetscInt **idx, *idx_i;
489:   PetscInt        *n, **data, len;
490: #if defined(PETSC_USE_CTABLE)
491:   PetscTable *table_data, table_data_i;
492:   PetscInt   *tdata, tcount, tcount_max;
493: #else
494:   PetscInt *data_i, *d_p;
495: #endif
496:   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
497:   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
498:   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
499:   PetscBT     *table;
500:   MPI_Comm     comm;
501:   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
502:   MPI_Status  *recv_status;
503:   MPI_Comm    *iscomms;
504:   char        *t_p;

506:   PetscObjectGetComm((PetscObject)C, &comm);
507:   size = c->size;
508:   rank = c->rank;
509:   M    = C->rmap->N;

511:   PetscObjectGetNewTag((PetscObject)C, &tag1);
512:   PetscObjectGetNewTag((PetscObject)C, &tag2);

514:   PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n);

516:   for (i = 0; i < imax; i++) {
517:     ISGetIndices(is[i], &idx[i]);
518:     ISGetLocalSize(is[i], &n[i]);
519:   }

 521:   /* evaluate communication - message to whom, length of message, and buffer space
 522:      required. Based on this, buffers are allocated, and data copied into them */
523:   PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4);
524:   for (i = 0; i < imax; i++) {
 525:     PetscArrayzero(w4, size); /* initialize work vector */
526:     idx_i = idx[i];
527:     len   = n[i];
528:     for (j = 0; j < len; j++) {
529:       row = idx_i[j];
531:       PetscLayoutFindOwner(C->rmap, row, &proc);
532:       w4[proc]++;
533:     }
534:     for (j = 0; j < size; j++) {
535:       if (w4[j]) {
536:         w1[j] += w4[j];
537:         w3[j]++;
538:       }
539:     }
540:   }

542:   nrqs     = 0; /* no of outgoing messages */
 543:   msz      = 0; /* total mesg length (for all procs) */
 544:   w1[rank] = 0; /* no mesg sent to itself */
545:   w3[rank] = 0;
546:   for (i = 0; i < size; i++) {
547:     if (w1[i]) {
548:       w2[i] = 1;
549:       nrqs++;
550:     } /* there exists a message to proc i */
551:   }
552:   /* pa - is list of processors to communicate with */
553:   PetscMalloc1(nrqs, &pa);
554:   for (i = 0, j = 0; i < size; i++) {
555:     if (w1[i]) {
556:       pa[j] = i;
557:       j++;
558:     }
559:   }

561:   /* Each message would have a header = 1 + 2*(no of IS) + data */
562:   for (i = 0; i < nrqs; i++) {
563:     j = pa[i];
564:     w1[j] += w2[j] + 2 * w3[j];
565:     msz += w1[j];
566:   }

 568:   /* Determine the number of messages to expect, their lengths, and the from-ids */
569:   PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
570:   PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);

572:   /* Now post the Irecvs corresponding to these messages */
573:   PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1);

575:   /* Allocate Memory for outgoing messages */
576:   PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr);
577:   PetscArrayzero(outdat, size);
578:   PetscArrayzero(ptr, size);

580:   {
581:     PetscInt *iptr = tmp, ict = 0;
582:     for (i = 0; i < nrqs; i++) {
583:       j = pa[i];
584:       iptr += ict;
585:       outdat[j] = iptr;
586:       ict       = w1[j];
587:     }
588:   }

590:   /* Form the outgoing messages */
591:   /* plug in the headers */
592:   for (i = 0; i < nrqs; i++) {
593:     j            = pa[i];
594:     outdat[j][0] = 0;
595:     PetscArrayzero(outdat[j] + 1, 2 * w3[j]);
596:     ptr[j] = outdat[j] + 2 * w3[j] + 1;
597:   }

599:   /* Memory for doing local proc's work */
600:   {
601:     PetscInt M_BPB_imax = 0;
602: #if defined(PETSC_USE_CTABLE)
603:     PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
604:     PetscMalloc1(imax, &table_data);
605:     for (i = 0; i < imax; i++) PetscTableCreate(n[i], M, &table_data[i]);
606:     PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p);
607:     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
608: #else
609:     PetscInt Mimax = 0;
610:     PetscIntMultError(M, imax, &Mimax);
611:     PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
612:     PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p);
613:     for (i = 0; i < imax; i++) {
614:       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
615:       data[i]  = d_p + M * i;
616:     }
617: #endif
618:   }

620:   /* Parse the IS and update local tables and the outgoing buf with the data */
621:   {
622:     PetscInt n_i, isz_i, *outdat_j, ctr_j;
623:     PetscBT  table_i;

625:     for (i = 0; i < imax; i++) {
626:       PetscArrayzero(ctr, size);
627:       n_i     = n[i];
628:       table_i = table[i];
629:       idx_i   = idx[i];
630: #if defined(PETSC_USE_CTABLE)
631:       table_data_i = table_data[i];
632: #else
633:       data_i = data[i];
634: #endif
635:       isz_i = isz[i];
636:       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
637:         row = idx_i[j];
638:         PetscLayoutFindOwner(C->rmap, row, &proc);
639:         if (proc != rank) { /* copy to the outgoing buffer */
640:           ctr[proc]++;
641:           *ptr[proc] = row;
642:           ptr[proc]++;
643:         } else if (!PetscBTLookupSet(table_i, row)) {
644: #if defined(PETSC_USE_CTABLE)
645:           PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
646: #else
647:           data_i[isz_i] = row; /* Update the local table */
648: #endif
649:           isz_i++;
650:         }
651:       }
652:       /* Update the headers for the current IS */
653:       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
654:         if ((ctr_j = ctr[j])) {
655:           outdat_j            = outdat[j];
656:           k                   = ++outdat_j[0];
657:           outdat_j[2 * k]     = ctr_j;
658:           outdat_j[2 * k - 1] = i;
659:         }
660:       }
661:       isz[i] = isz_i;
662:     }
663:   }

665:   /*  Now  post the sends */
666:   PetscMalloc1(nrqs, &s_waits1);
667:   for (i = 0; i < nrqs; ++i) {
668:     j = pa[i];
669:     MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i);
670:   }

672:   /* No longer need the original indices */
673:   PetscMalloc1(imax, &iscomms);
674:   for (i = 0; i < imax; ++i) {
675:     ISRestoreIndices(is[i], idx + i);
676:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
677:   }
678:   PetscFree2(*(PetscInt ***)&idx, n);

680:   for (i = 0; i < imax; ++i) ISDestroy(&is[i]);

682:     /* Do Local work */
683: #if defined(PETSC_USE_CTABLE)
684:   MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data);
685: #else
686:   MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL);
687: #endif

689:   /* Receive messages */
690:   PetscMalloc1(nrqr, &recv_status);
691:   MPI_Waitall(nrqr, r_waits1, recv_status);
692:   MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);

694:   /* Phase 1 sends are complete - deallocate buffers */
695:   PetscFree4(outdat, ptr, tmp, ctr);
696:   PetscFree4(w1, w2, w3, w4);

698:   PetscMalloc1(nrqr, &xdata);
699:   PetscMalloc1(nrqr, &isz1);
700:   MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1);
701:   PetscFree(rbuf[0]);
702:   PetscFree(rbuf);

704:   /* Send the data back */
 705:   /* Do a global reduction to know the buffer space required for incoming messages */
706:   {
707:     PetscMPIInt *rw1;

709:     PetscCalloc1(size, &rw1);

711:     for (i = 0; i < nrqr; ++i) {
712:       proc = recv_status[i].MPI_SOURCE;

715:       rw1[proc] = isz1[i];
716:     }
717:     PetscFree(onodes1);
718:     PetscFree(olengths1);

 720:     /* Determine the number of messages to expect, their lengths, and the from-ids */
721:     PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2);
722:     PetscFree(rw1);
723:   }
724:   /* Now post the Irecvs corresponding to these messages */
725:   PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2);

727:   /* Now  post the sends */
728:   PetscMalloc1(nrqr, &s_waits2);
729:   for (i = 0; i < nrqr; ++i) {
730:     j = recv_status[i].MPI_SOURCE;
731:     MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i);
732:   }

734:   /* receive work done on other processors */
735:   {
736:     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
737:     PetscMPIInt idex;
738:     PetscBT     table_i;

740:     for (i = 0; i < nrqs; ++i) {
741:       MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE);
742:       /* Process the message */
743:       rbuf2_i = rbuf2[idex];
744:       ct1     = 2 * rbuf2_i[0] + 1;
745:       jmax    = rbuf2[idex][0];
746:       for (j = 1; j <= jmax; j++) {
747:         max     = rbuf2_i[2 * j];
748:         is_no   = rbuf2_i[2 * j - 1];
749:         isz_i   = isz[is_no];
750:         table_i = table[is_no];
751: #if defined(PETSC_USE_CTABLE)
752:         table_data_i = table_data[is_no];
753: #else
754:         data_i = data[is_no];
755: #endif
756:         for (k = 0; k < max; k++, ct1++) {
757:           row = rbuf2_i[ct1];
758:           if (!PetscBTLookupSet(table_i, row)) {
759: #if defined(PETSC_USE_CTABLE)
760:             PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
761: #else
762:             data_i[isz_i] = row;
763: #endif
764:             isz_i++;
765:           }
766:         }
767:         isz[is_no] = isz_i;
768:       }
769:     }

771:     MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
772:   }

774: #if defined(PETSC_USE_CTABLE)
775:   tcount_max = 0;
776:   for (i = 0; i < imax; ++i) {
777:     table_data_i = table_data[i];
778:     PetscTableGetCount(table_data_i, &tcount);
779:     if (tcount_max < tcount) tcount_max = tcount;
780:   }
781:   PetscMalloc1(tcount_max + 1, &tdata);
782: #endif

784:   for (i = 0; i < imax; ++i) {
785: #if defined(PETSC_USE_CTABLE)
786:     PetscTablePosition tpos;
787:     table_data_i = table_data[i];

789:     PetscTableGetHeadPosition(table_data_i, &tpos);
790:     while (tpos) {
791:       PetscTableGetNext(table_data_i, &tpos, &k, &j);
792:       tdata[--j] = --k;
793:     }
794:     ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i);
795: #else
796:     ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i);
797: #endif
798:     PetscCommDestroy(&iscomms[i]);
799:   }

801:   PetscFree(iscomms);
802:   PetscFree(onodes2);
803:   PetscFree(olengths2);

805:   PetscFree(pa);
806:   PetscFree(rbuf2[0]);
807:   PetscFree(rbuf2);
808:   PetscFree(s_waits1);
809:   PetscFree(r_waits1);
810:   PetscFree(s_waits2);
811:   PetscFree(r_waits2);
812:   PetscFree(recv_status);
813:   if (xdata) {
814:     PetscFree(xdata[0]);
815:     PetscFree(xdata);
816:   }
817:   PetscFree(isz1);
818: #if defined(PETSC_USE_CTABLE)
819:   for (i = 0; i < imax; i++) PetscTableDestroy((PetscTable *)&table_data[i]);
820:   PetscFree(table_data);
821:   PetscFree(tdata);
822:   PetscFree4(table, data, isz, t_p);
823: #else
824:   PetscFree5(table, data, isz, d_p, t_p);
825: #endif
826:   return 0;
827: }

829: /*
 830:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap(), to do
831:        the work on the local processor.

833:      Inputs:
834:       C      - MAT_MPIAIJ;
835:       imax - total no of index sets processed at a time;
836:       table  - an array of char - size = m bits.

838:      Output:
839:       isz    - array containing the count of the solution elements corresponding
840:                to each index set;
841:       data or table_data  - pointer to the solutions
842: */
843: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
844: {
845:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
846:   Mat         A = c->A, B = c->B;
847:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
848:   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
849:   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
850:   PetscBT     table_i;
851: #if defined(PETSC_USE_CTABLE)
852:   PetscTable         table_data_i;
853:   PetscTablePosition tpos;
854:   PetscInt           tcount, *tdata;
855: #else
856:   PetscInt *data_i;
857: #endif

859:   rstart = C->rmap->rstart;
860:   cstart = C->cmap->rstart;
861:   ai     = a->i;
862:   aj     = a->j;
863:   bi     = b->i;
864:   bj     = b->j;
865:   garray = c->garray;

867:   for (i = 0; i < imax; i++) {
868: #if defined(PETSC_USE_CTABLE)
869:     /* copy existing entries of table_data_i into tdata[] */
870:     table_data_i = table_data[i];
871:     PetscTableGetCount(table_data_i, &tcount);

874:     PetscMalloc1(tcount, &tdata);
875:     PetscTableGetHeadPosition(table_data_i, &tpos);
876:     while (tpos) {
877:       PetscTableGetNext(table_data_i, &tpos, &row, &j);
878:       tdata[--j] = --row;
880:     }
881: #else
882:     data_i = data[i];
883: #endif
884:     table_i = table[i];
885:     isz_i   = isz[i];
886:     max     = isz[i];

888:     for (j = 0; j < max; j++) {
889: #if defined(PETSC_USE_CTABLE)
890:       row = tdata[j] - rstart;
891: #else
892:       row = data_i[j] - rstart;
893: #endif
894:       start = ai[row];
895:       end   = ai[row + 1];
896:       for (k = start; k < end; k++) { /* Amat */
897:         val = aj[k] + cstart;
898:         if (!PetscBTLookupSet(table_i, val)) {
899: #if defined(PETSC_USE_CTABLE)
900:           PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
901: #else
902:           data_i[isz_i] = val;
903: #endif
904:           isz_i++;
905:         }
906:       }
907:       start = bi[row];
908:       end   = bi[row + 1];
909:       for (k = start; k < end; k++) { /* Bmat */
910:         val = garray[bj[k]];
911:         if (!PetscBTLookupSet(table_i, val)) {
912: #if defined(PETSC_USE_CTABLE)
913:           PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
914: #else
915:           data_i[isz_i] = val;
916: #endif
917:           isz_i++;
918:         }
919:       }
920:     }
921:     isz[i] = isz_i;

923: #if defined(PETSC_USE_CTABLE)
924:     PetscFree(tdata);
925: #endif
926:   }
927:   return 0;
928: }

930: /*
931:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
932:          and return the output

934:          Input:
935:            C    - the matrix
936:            nrqr - no of messages being processed.
937:            rbuf - an array of pointers to the received requests

939:          Output:
940:            xdata - array of messages to be sent back
941:            isz1  - size of each message

 943:   For better efficiency perhaps we should malloc each xdata[i] separately;
 944: then if a realloc is required we need only copy the data for that one row
 945: rather than all previous rows, as we do now where a single large chunk of
 946: memory is used.

948: */
949: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
950: {
951:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
952:   Mat         A = c->A, B = c->B;
953:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
954:   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
955:   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
956:   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
957:   PetscInt   *rbuf_i, kmax, rbuf_0;
958:   PetscBT     xtable;

960:   m      = C->rmap->N;
961:   rstart = C->rmap->rstart;
962:   cstart = C->cmap->rstart;
963:   ai     = a->i;
964:   aj     = a->j;
965:   bi     = b->i;
966:   bj     = b->j;
967:   garray = c->garray;

969:   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
970:     rbuf_i = rbuf[i];
971:     rbuf_0 = rbuf_i[0];
972:     ct += rbuf_0;
973:     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
974:   }

976:   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
977:   else max1 = 1;
978:   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
979:   if (nrqr) {
980:     PetscMalloc1(mem_estimate, &xdata[0]);
981:     ++no_malloc;
982:   }
983:   PetscBTCreate(m, &xtable);
984:   PetscArrayzero(isz1, nrqr);

986:   ct3 = 0;
 987:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
988:     rbuf_i = rbuf[i];
989:     rbuf_0 = rbuf_i[0];
990:     ct1    = 2 * rbuf_0 + 1;
991:     ct2    = ct1;
992:     ct3 += ct1;
993:     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
994:       PetscBTMemzero(m, xtable);
995:       oct2 = ct2;
996:       kmax = rbuf_i[2 * j];
997:       for (k = 0; k < kmax; k++, ct1++) {
998:         row = rbuf_i[ct1];
999:         if (!PetscBTLookupSet(xtable, row)) {
1000:           if (!(ct3 < mem_estimate)) {
1001:             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1002:             PetscMalloc1(new_estimate, &tmp);
1003:             PetscArraycpy(tmp, xdata[0], mem_estimate);
1004:             PetscFree(xdata[0]);
1005:             xdata[0]     = tmp;
1006:             mem_estimate = new_estimate;
1007:             ++no_malloc;
1008:             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1009:           }
1010:           xdata[i][ct2++] = row;
1011:           ct3++;
1012:         }
1013:       }
1014:       for (k = oct2, max2 = ct2; k < max2; k++) {
1015:         row   = xdata[i][k] - rstart;
1016:         start = ai[row];
1017:         end   = ai[row + 1];
1018:         for (l = start; l < end; l++) {
1019:           val = aj[l] + cstart;
1020:           if (!PetscBTLookupSet(xtable, val)) {
1021:             if (!(ct3 < mem_estimate)) {
1022:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1023:               PetscMalloc1(new_estimate, &tmp);
1024:               PetscArraycpy(tmp, xdata[0], mem_estimate);
1025:               PetscFree(xdata[0]);
1026:               xdata[0]     = tmp;
1027:               mem_estimate = new_estimate;
1028:               ++no_malloc;
1029:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1030:             }
1031:             xdata[i][ct2++] = val;
1032:             ct3++;
1033:           }
1034:         }
1035:         start = bi[row];
1036:         end   = bi[row + 1];
1037:         for (l = start; l < end; l++) {
1038:           val = garray[bj[l]];
1039:           if (!PetscBTLookupSet(xtable, val)) {
1040:             if (!(ct3 < mem_estimate)) {
1041:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1042:               PetscMalloc1(new_estimate, &tmp);
1043:               PetscArraycpy(tmp, xdata[0], mem_estimate);
1044:               PetscFree(xdata[0]);
1045:               xdata[0]     = tmp;
1046:               mem_estimate = new_estimate;
1047:               ++no_malloc;
1048:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1049:             }
1050:             xdata[i][ct2++] = val;
1051:             ct3++;
1052:           }
1053:         }
1054:       }
1055:       /* Update the header*/
1056:       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1057:       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1058:     }
1059:     xdata[i][0] = rbuf_0;
1060:     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1061:     isz1[i] = ct2; /* size of each message */
1062:   }
1063:   PetscBTDestroy(&xtable);
1064:   PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc);
1065:   return 0;
1066: }
1067: /* -------------------------------------------------------------------------*/
1068: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1069: /*
1070:     Every processor gets the entire matrix
1071: */
1072: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1073: {
1074:   Mat         B;
1075:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1076:   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1077:   PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1078:   PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1079:   PetscInt    m, *b_sendj, *garray   = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

1081:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1082:   MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank);
1083:   if (scall == MAT_INITIAL_MATRIX) {
1084:     /* ----------------------------------------------------------------
1085:          Tell every processor the number of nonzeros per row
1086:     */
1087:     PetscMalloc1(A->rmap->N, &lens);
1088:     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1089:     PetscMalloc2(size, &recvcounts, size, &displs);
1090:     for (i = 0; i < size; i++) {
1091:       recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1092:       displs[i]     = A->rmap->range[i];
1093:     }
1094:     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1095:     /* ---------------------------------------------------------------
1096:          Create the sequential matrix of the same type as the local block diagonal
1097:     */
1098:     MatCreate(PETSC_COMM_SELF, &B);
1099:     MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE);
1100:     MatSetBlockSizesFromMats(B, A, A);
1101:     MatSetType(B, ((PetscObject)a->A)->type_name);
1102:     MatSeqAIJSetPreallocation(B, 0, lens);
1103:     PetscCalloc1(2, Bin);
1104:     **Bin = B;
1105:     b     = (Mat_SeqAIJ *)B->data;

1107:     /*--------------------------------------------------------------------
1108:        Copy my part of matrix column indices over
1109:     */
1110:     sendcount  = ad->nz + bd->nz;
1111:     jsendbuf   = b->j + b->i[rstarts[rank]];
1112:     a_jsendbuf = ad->j;
1113:     b_jsendbuf = bd->j;
1114:     n          = A->rmap->rend - A->rmap->rstart;
1115:     cnt        = 0;
1116:     for (i = 0; i < n; i++) {
1117:       /* put in lower diagonal portion */
1118:       m = bd->i[i + 1] - bd->i[i];
1119:       while (m > 0) {
1120:         /* is it above diagonal (in bd (compressed) numbering) */
1121:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1122:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1123:         m--;
1124:       }

1126:       /* put in diagonal portion */
1127:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

1129:       /* put in upper diagonal portion */
1130:       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1131:     }

1134:     /*--------------------------------------------------------------------
1135:        Gather all column indices to all processors
1136:     */
1137:     for (i = 0; i < size; i++) {
1138:       recvcounts[i] = 0;
1139:       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1140:     }
1141:     displs[0] = 0;
1142:     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1143:     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1144:     /*--------------------------------------------------------------------
1145:         Assemble the matrix into usable form (note that the numerical values are not yet set)
1146:     */
1147:     /* set the b->ilen (length of each row) values */
1148:     PetscArraycpy(b->ilen, lens, A->rmap->N);
1149:     /* set the b->i indices */
1150:     b->i[0] = 0;
1151:     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1152:     PetscFree(lens);
1153:     MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
1154:     MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);

1156:   } else {
1157:     B = **Bin;
1158:     b = (Mat_SeqAIJ *)B->data;
1159:   }

1161:   /*--------------------------------------------------------------------
1162:        Copy my part of matrix numerical values into the values location
1163:   */
1164:   if (flag == MAT_GET_VALUES) {
1165:     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1166:     MatScalar         *sendbuf, *recvbuf;

1168:     MatSeqAIJGetArrayRead(a->A, &ada);
1169:     MatSeqAIJGetArrayRead(a->B, &bda);
1170:     sendcount = ad->nz + bd->nz;
1171:     sendbuf   = b->a + b->i[rstarts[rank]];
1172:     a_sendbuf = ada;
1173:     b_sendbuf = bda;
1174:     b_sendj   = bd->j;
1175:     n         = A->rmap->rend - A->rmap->rstart;
1176:     cnt       = 0;
1177:     for (i = 0; i < n; i++) {
1178:       /* put in lower diagonal portion */
1179:       m = bd->i[i + 1] - bd->i[i];
1180:       while (m > 0) {
1181:         /* is it above diagonal (in bd (compressed) numbering) */
1182:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1183:         sendbuf[cnt++] = *b_sendbuf++;
1184:         m--;
1185:         b_sendj++;
1186:       }

1188:       /* put in diagonal portion */
1189:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

1191:       /* put in upper diagonal portion */
1192:       while (m-- > 0) {
1193:         sendbuf[cnt++] = *b_sendbuf++;
1194:         b_sendj++;
1195:       }
1196:     }

1199:     /* -----------------------------------------------------------------
1200:        Gather all numerical values to all processors
1201:     */
1202:     if (!recvcounts) PetscMalloc2(size, &recvcounts, size, &displs);
1203:     for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1204:     displs[0] = 0;
1205:     for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1206:     recvbuf = b->a;
1207:     MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A));
1208:     MatSeqAIJRestoreArrayRead(a->A, &ada);
1209:     MatSeqAIJRestoreArrayRead(a->B, &bda);
1210:   } /* end of (flag == MAT_GET_VALUES) */
1211:   PetscFree2(recvcounts, displs);

1213:   MatPropagateSymmetryOptions(A, B);
1214:   return 0;
1215: }
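
/*
   Illustrative sketch (hypothetical variable names; this routine is normally reached through
   higher-level PETSc interfaces rather than called directly). A common way to obtain a
   sequential copy of an entire parallel matrix on every process is to request all rows and
   columns:

     PetscInt M, N;
     IS       isrow, iscol;
     Mat     *submat;
     MatGetSize(A, &M, &N);
     ISCreateStride(PETSC_COMM_SELF, M, 0, 1, &isrow);
     ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol);
     MatCreateSubMatrices(A, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submat);
     // submat[0] now holds the whole matrix on this process
     MatDestroySubMatrices(1, &submat);
     ISDestroy(&isrow);
     ISDestroy(&iscol);
*/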

1217: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1218: {
1219:   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1220:   Mat             submat, A = c->A, B = c->B;
1221:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1222:   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1223:   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1224:   const PetscInt *icol, *irow;
1225:   PetscInt        nrow, ncol, start;
1226:   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1227:   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1228:   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1229:   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1230:   PetscInt       *lens, rmax, ncols, *cols, Crow;
1231: #if defined(PETSC_USE_CTABLE)
1232:   PetscTable cmap, rmap;
1233:   PetscInt  *cmap_loc, *rmap_loc;
1234: #else
1235:   PetscInt *cmap, *rmap;
1236: #endif
1237:   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1238:   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1239:   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1240:   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1241:   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1242:   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1243:   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1244:   MPI_Comm      comm;
1245:   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1246:   PetscMPIInt  *onodes1, *olengths1, idex, end;
1247:   Mat_SubSppt  *smatis1;
1248:   PetscBool     isrowsorted, iscolsorted;

1253:   MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
1254:   MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a);
1255:   PetscObjectGetComm((PetscObject)C, &comm);
1256:   size = c->size;
1257:   rank = c->rank;

1259:   ISSorted(iscol[0], &iscolsorted);
1260:   ISSorted(isrow[0], &isrowsorted);
1261:   ISGetIndices(isrow[0], &irow);
1262:   ISGetLocalSize(isrow[0], &nrow);
1263:   if (allcolumns) {
1264:     icol = NULL;
1265:     ncol = C->cmap->N;
1266:   } else {
1267:     ISGetIndices(iscol[0], &icol);
1268:     ISGetLocalSize(iscol[0], &ncol);
1269:   }

1271:   if (scall == MAT_INITIAL_MATRIX) {
1272:     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

1274:     /* Get some new tags to keep the communication clean */
1275:     tag1 = ((PetscObject)C)->tag;
1276:     PetscObjectGetNewTag((PetscObject)C, &tag2);
1277:     PetscObjectGetNewTag((PetscObject)C, &tag3);

1279:     /* evaluate communication - message to whom, length of message, and buffer space
1280:      required. Based on this, buffers are allocated, and data copied into them */
1281:     PetscCalloc2(size, &w1, size, &w2);
1282:     PetscMalloc1(nrow, &row2proc);

1284:     /* w1[proc] = num of rows owned by proc -- to be requested */
1285:     proc = 0;
1286:     nrqs = 0; /* num of outgoing messages */
1287:     for (j = 0; j < nrow; j++) {
1288:       row = irow[j];
1289:       if (!isrowsorted) proc = 0;
1290:       while (row >= C->rmap->range[proc + 1]) proc++;
1291:       w1[proc]++;
1292:       row2proc[j] = proc; /* map row index to proc */

1294:       if (proc != rank && !w2[proc]) {
1295:         w2[proc] = 1;
1296:         nrqs++;
1297:       }
1298:     }
1299:     w1[rank] = 0; /* rows owned by self will not be requested */

1301:     PetscMalloc1(nrqs, &pa); /*(proc -array)*/
1302:     for (proc = 0, j = 0; proc < size; proc++) {
1303:       if (w1[proc]) pa[j++] = proc;
1304:     }

1306:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1307:     msz = 0; /* total mesg length (for all procs) */
1308:     for (i = 0; i < nrqs; i++) {
1309:       proc = pa[i];
1310:       w1[proc] += 3;
1311:       msz += w1[proc];
1312:     }
1313:     PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);

1315:     /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1316:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1317:     PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);

1319:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1320:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1321:     PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);

1323:     /* Now post the Irecvs corresponding to these messages */
1324:     PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);

1326:     PetscFree(onodes1);
1327:     PetscFree(olengths1);

1329:     /* Allocate Memory for outgoing messages */
1330:     PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
1331:     PetscArrayzero(sbuf1, size);
1332:     PetscArrayzero(ptr, size);

1334:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1335:     iptr = tmp;
1336:     for (i = 0; i < nrqs; i++) {
1337:       proc        = pa[i];
1338:       sbuf1[proc] = iptr;
1339:       iptr += w1[proc];
1340:     }

1342:     /* Form the outgoing messages */
1343:     /* Initialize the header space */
1344:     for (i = 0; i < nrqs; i++) {
1345:       proc = pa[i];
1346:       PetscArrayzero(sbuf1[proc], 3);
1347:       ptr[proc] = sbuf1[proc] + 3;
1348:     }

1350:     /* Parse the isrow and copy data into outbuf */
1351:     PetscArrayzero(ctr, size);
1352:     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1353:       proc = row2proc[j];
1354:       if (proc != rank) { /* copy to the outgoing buf*/
1355:         *ptr[proc] = irow[j];
1356:         ctr[proc]++;
1357:         ptr[proc]++;
1358:       }
1359:     }

1361:     /* Update the headers for the current IS */
1362:     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1363:       if ((ctr_j = ctr[j])) {
1364:         sbuf1_j            = sbuf1[j];
1365:         k                  = ++sbuf1_j[0];
1366:         sbuf1_j[2 * k]     = ctr_j;
1367:         sbuf1_j[2 * k - 1] = 0;
1368:       }
1369:     }

1371:     /* Now post the sends */
1372:     PetscMalloc1(nrqs, &s_waits1);
1373:     for (i = 0; i < nrqs; ++i) {
1374:       proc = pa[i];
1375:       MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i);
1376:     }

1378:     /* Post Receives to capture the buffer size */
1379:     PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2);
1380:     PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);

1382:     if (nrqs) rbuf2[0] = tmp + msz;
1383:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

1385:     for (i = 0; i < nrqs; ++i) {
1386:       proc = pa[i];
1387:       MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i);
1388:     }

1390:     PetscFree2(w1, w2);

1392:     /* Send to other procs the buf size they should allocate */
1393:     /* Receive messages*/
1394:     PetscMalloc1(nrqr, &r_status1);
1395:     PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);

1397:     MPI_Waitall(nrqr, r_waits1, r_status1);
1398:     for (i = 0; i < nrqr; ++i) {
1399:       req_size[i] = 0;
1400:       rbuf1_i     = rbuf1[i];
1401:       start       = 2 * rbuf1_i[0] + 1;
1402:       MPI_Get_count(r_status1 + i, MPIU_INT, &end);
1403:       PetscMalloc1(end, &sbuf2[i]);
1404:       sbuf2_i = sbuf2[i];
1405:       for (j = start; j < end; j++) {
1406:         k          = rbuf1_i[j] - rstart;
1407:         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1408:         sbuf2_i[j] = ncols;
1409:         req_size[i] += ncols;
1410:       }
1411:       req_source1[i] = r_status1[i].MPI_SOURCE;

1413:       /* form the header */
1414:       sbuf2_i[0] = req_size[i];
1415:       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

1417:       MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
1418:     }

1420:     PetscFree(r_status1);
1421:     PetscFree(r_waits1);

1423:     /* rbuf2 is received; post recvs for column indices a->j */
1424:     MPI_Waitall(nrqs, r_waits2, r_status2);

1426:     PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3);
1427:     for (i = 0; i < nrqs; ++i) {
1428:       PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
1429:       req_source2[i] = r_status2[i].MPI_SOURCE;
1430:       MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
1431:     }

1433:     /* Wait on sends1 and sends2 */
1434:     PetscMalloc1(nrqs, &s_status1);
1435:     MPI_Waitall(nrqs, s_waits1, s_status1);
1436:     PetscFree(s_waits1);
1437:     PetscFree(s_status1);

1439:     MPI_Waitall(nrqr, s_waits2, s_status2);
1440:     PetscFree4(r_status2, s_waits2, r_waits2, s_status2);

1442:     /* Now allocate sending buffers for a->j, and send them off */
1443:     PetscMalloc1(nrqr, &sbuf_aj);
1444:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1445:     if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
1446:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

1448:     for (i = 0; i < nrqr; i++) { /* for each requested message */
1449:       rbuf1_i   = rbuf1[i];
1450:       sbuf_aj_i = sbuf_aj[i];
1451:       ct1       = 2 * rbuf1_i[0] + 1;
1452:       ct2       = 0;

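            /* Pack the global column indices of each requested row: off-diagonal columns below cstart, then the diagonal block (local indices shifted by cstart), then off-diagonal columns at or beyond cend, which leaves each row's indices in ascending global order */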
1455:       kmax = rbuf1[i][2];
1456:       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1457:         row    = rbuf1_i[ct1] - rstart;
1458:         nzA    = ai[row + 1] - ai[row];
1459:         nzB    = bi[row + 1] - bi[row];
1460:         ncols  = nzA + nzB;
1461:         cworkA = aj + ai[row];
1462:         cworkB = bj + bi[row];

1464:         /* load the column indices for this row into cols */
1465:         cols = sbuf_aj_i + ct2;

1467:         lwrite = 0;
1468:         for (l = 0; l < nzB; l++) {
1469:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1470:         }
1471:         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1472:         for (l = 0; l < nzB; l++) {
1473:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1474:         }

1476:         ct2 += ncols;
1477:       }
1478:       MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
1479:     }

1481:     /* create column map (cmap): global col of C -> local col of submat */
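          /* Column indices are stored shifted by one (j + 1) so that a lookup result of zero means the column is not part of the submatrix */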
1482: #if defined(PETSC_USE_CTABLE)
1483:     if (!allcolumns) {
1484:       PetscTableCreate(ncol, C->cmap->N, &cmap);
1485:       PetscCalloc1(C->cmap->n, &cmap_loc);
1486:       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1487:         if (icol[j] >= cstart && icol[j] < cend) {
1488:           cmap_loc[icol[j] - cstart] = j + 1;
1489:         } else { /* use PetscTable for non-local col indices */
1490:           PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES);
1491:         }
1492:       }
1493:     } else {
1494:       cmap     = NULL;
1495:       cmap_loc = NULL;
1496:     }
1497:     PetscCalloc1(C->rmap->n, &rmap_loc);
1498: #else
1499:     if (!allcolumns) {
1500:       PetscCalloc1(C->cmap->N, &cmap);
1501:       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1502:     } else {
1503:       cmap = NULL;
1504:     }
1505: #endif

1507:     /* Create lens for MatSeqAIJSetPreallocation() */
1508:     PetscCalloc1(nrow, &lens);

1510:     /* Compute lens from local part of C */
1511:     for (j = 0; j < nrow; j++) {
1512:       row  = irow[j];
1513:       proc = row2proc[j];
1514:       if (proc == rank) {
1515:         /* diagonal part A = c->A */
1516:         ncols = ai[row - rstart + 1] - ai[row - rstart];
1517:         cols  = aj + ai[row - rstart];
1518:         if (!allcolumns) {
1519:           for (k = 0; k < ncols; k++) {
1520: #if defined(PETSC_USE_CTABLE)
1521:             tcol = cmap_loc[cols[k]];
1522: #else
1523:             tcol = cmap[cols[k] + cstart];
1524: #endif
1525:             if (tcol) lens[j]++;
1526:           }
1527:         } else { /* allcolumns */
1528:           lens[j] = ncols;
1529:         }

1531:         /* off-diagonal part B = c->B */
1532:         ncols = bi[row - rstart + 1] - bi[row - rstart];
1533:         cols  = bj + bi[row - rstart];
1534:         if (!allcolumns) {
1535:           for (k = 0; k < ncols; k++) {
1536: #if defined(PETSC_USE_CTABLE)
1537:             PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1538: #else
1539:             tcol = cmap[bmap[cols[k]]];
1540: #endif
1541:             if (tcol) lens[j]++;
1542:           }
1543:         } else { /* allcolumns */
1544:           lens[j] += ncols;
1545:         }
1546:       }
1547:     }

1549:     /* Create row map (rmap): global row of C -> local row of submat */
1550: #if defined(PETSC_USE_CTABLE)
1551:     PetscTableCreate(nrow, C->rmap->N, &rmap);
1552:     for (j = 0; j < nrow; j++) {
1553:       row  = irow[j];
1554:       proc = row2proc[j];
1555:       if (proc == rank) { /* a local row */
1556:         rmap_loc[row - rstart] = j;
1557:       } else {
1558:         PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES);
1559:       }
1560:     }
1561: #else
1562:     PetscCalloc1(C->rmap->N, &rmap);
1563:     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1564: #endif

1566:     /* Update lens from offproc data */
1567:     /* the receives of a->j are done */
1568:     MPI_Waitall(nrqs, r_waits3, r_status3);
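          /* Layout of sbuf1_i for the single-IS case: [0] = number of ISs (1), [1] = IS id (0), [2] = number of requested rows; the requested global row indices start at offset 3, hence ct1 = 2 + 1 */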
1569:     for (i = 0; i < nrqs; i++) {
1570:       proc    = pa[i];
1571:       sbuf1_i = sbuf1[proc];
1573:       ct1     = 2 + 1;
1574:       ct2     = 0;
1575:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1576:       rbuf3_i = rbuf3[i]; /* received C->j */

1579:       max1 = sbuf1_i[2];
1580:       for (k = 0; k < max1; k++, ct1++) {
1581: #if defined(PETSC_USE_CTABLE)
1582:         PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row);
1583:         row--;
1585: #else
1586:         row = rmap[sbuf1_i[ct1]];      /* the row index in submat */
1587: #endif
1588:         /* Now, store row index of submat in sbuf1_i[ct1] */
1589:         sbuf1_i[ct1] = row;

1591:         nnz = rbuf2_i[ct1];
1592:         if (!allcolumns) {
1593:           for (l = 0; l < nnz; l++, ct2++) {
1594: #if defined(PETSC_USE_CTABLE)
1595:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1596:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1597:             } else {
1598:               PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1599:             }
1600: #else
1601:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1602: #endif
1603:             if (tcol) lens[row]++;
1604:           }
1605:         } else { /* allcolumns */
1606:           lens[row] += nnz;
1607:         }
1608:       }
1609:     }
1610:     MPI_Waitall(nrqr, s_waits3, s_status3);
1611:     PetscFree4(r_waits3, s_waits3, r_status3, s_status3);

1613:     /* Create the submatrices */
1614:     MatCreate(PETSC_COMM_SELF, &submat);
1615:     MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE);

1617:     ISGetBlockSize(isrow[0], &i);
1618:     ISGetBlockSize(iscol[0], &j);
1619:     MatSetBlockSizes(submat, i, j);
1620:     MatSetType(submat, ((PetscObject)A)->type_name);
1621:     MatSeqAIJSetPreallocation(submat, 0, lens);

1623:     /* create struct Mat_SubSppt and attach it to submat */
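          /* smatis1 keeps the communication buffers and the row/column maps alive so that a later call with MAT_REUSE_MATRIX can skip the symbolic work done above */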
1624:     PetscNew(&smatis1);
1625:     subc            = (Mat_SeqAIJ *)submat->data;
1626:     subc->submatis1 = smatis1;

1628:     smatis1->id          = 0;
1629:     smatis1->nrqs        = nrqs;
1630:     smatis1->nrqr        = nrqr;
1631:     smatis1->rbuf1       = rbuf1;
1632:     smatis1->rbuf2       = rbuf2;
1633:     smatis1->rbuf3       = rbuf3;
1634:     smatis1->sbuf2       = sbuf2;
1635:     smatis1->req_source2 = req_source2;

1637:     smatis1->sbuf1 = sbuf1;
1638:     smatis1->ptr   = ptr;
1639:     smatis1->tmp   = tmp;
1640:     smatis1->ctr   = ctr;

1642:     smatis1->pa          = pa;
1643:     smatis1->req_size    = req_size;
1644:     smatis1->req_source1 = req_source1;

1646:     smatis1->allcolumns = allcolumns;
1647:     smatis1->singleis   = PETSC_TRUE;
1648:     smatis1->row2proc   = row2proc;
1649:     smatis1->rmap       = rmap;
1650:     smatis1->cmap       = cmap;
1651: #if defined(PETSC_USE_CTABLE)
1652:     smatis1->rmap_loc = rmap_loc;
1653:     smatis1->cmap_loc = cmap_loc;
1654: #endif

1656:     smatis1->destroy     = submat->ops->destroy;
1657:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1658:     submat->factortype   = C->factortype;

1660:     /* compute rmax */
1661:     rmax = 0;
1662:     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);

1664:   } else { /* scall == MAT_REUSE_MATRIX */
1665:     submat = submats[0];

1668:     subc        = (Mat_SeqAIJ *)submat->data;
1669:     rmax        = subc->rmax;
1670:     smatis1     = subc->submatis1;
1671:     nrqs        = smatis1->nrqs;
1672:     nrqr        = smatis1->nrqr;
1673:     rbuf1       = smatis1->rbuf1;
1674:     rbuf2       = smatis1->rbuf2;
1675:     rbuf3       = smatis1->rbuf3;
1676:     req_source2 = smatis1->req_source2;

1678:     sbuf1 = smatis1->sbuf1;
1679:     sbuf2 = smatis1->sbuf2;
1680:     ptr   = smatis1->ptr;
1681:     tmp   = smatis1->tmp;
1682:     ctr   = smatis1->ctr;

1684:     pa          = smatis1->pa;
1685:     req_size    = smatis1->req_size;
1686:     req_source1 = smatis1->req_source1;

1688:     allcolumns = smatis1->allcolumns;
1689:     row2proc   = smatis1->row2proc;
1690:     rmap       = smatis1->rmap;
1691:     cmap       = smatis1->cmap;
1692: #if defined(PETSC_USE_CTABLE)
1693:     rmap_loc = smatis1->rmap_loc;
1694:     cmap_loc = smatis1->cmap_loc;
1695: #endif
1696:   }

1698:   /* Post recv matrix values */
1699:   PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals);
1700:   PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4);
1701:   PetscObjectGetNewTag((PetscObject)C, &tag4);
1702:   for (i = 0; i < nrqs; ++i) {
1703:     PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
1704:     MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
1705:   }

1707:   /* Allocate sending buffers for a->a, and send them off */
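        /* sbuf_aa is one contiguous allocation partitioned with the same per-process offsets (req_size[]) as sbuf_aj, so the values below line up with the column indices sent earlier */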
1708:   PetscMalloc1(nrqr, &sbuf_aa);
1709:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1710:   if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
1711:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

1713:   for (i = 0; i < nrqr; i++) {
1714:     rbuf1_i   = rbuf1[i];
1715:     sbuf_aa_i = sbuf_aa[i];
1716:     ct1       = 2 * rbuf1_i[0] + 1;
1717:     ct2       = 0;

1720:     kmax = rbuf1_i[2];
1721:     for (k = 0; k < kmax; k++, ct1++) {
1722:       row    = rbuf1_i[ct1] - rstart;
1723:       nzA    = ai[row + 1] - ai[row];
1724:       nzB    = bi[row + 1] - bi[row];
1725:       ncols  = nzA + nzB;
1726:       cworkB = bj + bi[row];
1727:       vworkA = a_a + ai[row];
1728:       vworkB = b_a + bi[row];

1730:       /* load the column values for this row into vals */
1731:       vals = sbuf_aa_i + ct2;

1733:       lwrite = 0;
1734:       for (l = 0; l < nzB; l++) {
1735:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1736:       }
1737:       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1738:       for (l = 0; l < nzB; l++) {
1739:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1740:       }

1742:       ct2 += ncols;
1743:     }
1744:     MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
1745:   }

1747:   /* Assemble submat */
1748:   /* First assemble the local rows */
1749:   for (j = 0; j < nrow; j++) {
1750:     row  = irow[j];
1751:     proc = row2proc[j];
1752:     if (proc == rank) {
1753:       Crow = row - rstart; /* local row index of C */
1754: #if defined(PETSC_USE_CTABLE)
1755:       row = rmap_loc[Crow]; /* row index of submat */
1756: #else
1757:       row = rmap[row];
1758: #endif

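            /* With allcolumns, ncol equals C->cmap->N, so C's global column indices can be used directly as column indices of submat */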
1760:       if (allcolumns) {
1761:         /* diagonal part A = c->A */
1762:         ncols = ai[Crow + 1] - ai[Crow];
1763:         cols  = aj + ai[Crow];
1764:         vals  = a_a + ai[Crow];
1765:         i     = 0;
1766:         for (k = 0; k < ncols; k++) {
1767:           subcols[i]   = cols[k] + cstart;
1768:           subvals[i++] = vals[k];
1769:         }

1771:         /* off-diagonal part B = c->B */
1772:         ncols = bi[Crow + 1] - bi[Crow];
1773:         cols  = bj + bi[Crow];
1774:         vals  = b_a + bi[Crow];
1775:         for (k = 0; k < ncols; k++) {
1776:           subcols[i]   = bmap[cols[k]];
1777:           subvals[i++] = vals[k];
1778:         }

1780:         MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);

1782:       } else { /* !allcolumns */
1783: #if defined(PETSC_USE_CTABLE)
1784:         /* diagonal part A = c->A */
1785:         ncols = ai[Crow + 1] - ai[Crow];
1786:         cols  = aj + ai[Crow];
1787:         vals  = a_a + ai[Crow];
1788:         i     = 0;
1789:         for (k = 0; k < ncols; k++) {
1790:           tcol = cmap_loc[cols[k]];
1791:           if (tcol) {
1792:             subcols[i]   = --tcol;
1793:             subvals[i++] = vals[k];
1794:           }
1795:         }

1797:         /* off-diagonal part B = c->B */
1798:         ncols = bi[Crow + 1] - bi[Crow];
1799:         cols  = bj + bi[Crow];
1800:         vals  = b_a + bi[Crow];
1801:         for (k = 0; k < ncols; k++) {
1802:           PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1803:           if (tcol) {
1804:             subcols[i]   = --tcol;
1805:             subvals[i++] = vals[k];
1806:           }
1807:         }
1808: #else
1809:         /* diagonal part A = c->A */
1810:         ncols = ai[Crow + 1] - ai[Crow];
1811:         cols  = aj + ai[Crow];
1812:         vals  = a_a + ai[Crow];
1813:         i     = 0;
1814:         for (k = 0; k < ncols; k++) {
1815:           tcol = cmap[cols[k] + cstart];
1816:           if (tcol) {
1817:             subcols[i]   = --tcol;
1818:             subvals[i++] = vals[k];
1819:           }
1820:         }

1822:         /* off-diagonal part B = c->B */
1823:         ncols = bi[Crow + 1] - bi[Crow];
1824:         cols  = bj + bi[Crow];
1825:         vals  = b_a + bi[Crow];
1826:         for (k = 0; k < ncols; k++) {
1827:           tcol = cmap[bmap[cols[k]]];
1828:           if (tcol) {
1829:             subcols[i]   = --tcol;
1830:             subvals[i++] = vals[k];
1831:           }
1832:         }
1833: #endif
1834:         MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);
1835:       }
1836:     }
1837:   }

1839:   /* Now assemble the off-proc rows */
1840:   for (i = 0; i < nrqs; i++) { /* for each requested message */
1841:     /* recv values from other processes */
1842:     MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i);
1843:     proc    = pa[idex];
1844:     sbuf1_i = sbuf1[proc];
1846:     ct1     = 2 + 1;
1847:     ct2     = 0;           /* count of received C->j */
1848:     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1849:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from process pa[idex] */
1850:     rbuf3_i = rbuf3[idex]; /* received C->j from process pa[idex] */
1851:     rbuf4_i = rbuf4[idex]; /* received C->a from process pa[idex] */

1854:     max1 = sbuf1_i[2];                  /* num of rows */
1855:     for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1856:       row = sbuf1_i[ct1];               /* row index of submat */
1857:       if (!allcolumns) {
1858:         idex = 0;
1859:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1860:           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1861:           for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1862: #if defined(PETSC_USE_CTABLE)
1863:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1864:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1865:             } else {
1866:               PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1867:             }
1868: #else
1869:             tcol = cmap[rbuf3_i[ct2]];
1870: #endif
1871:             if (tcol) {
1872:               subcols[idex]   = --tcol; /* may not be sorted */
1873:               subvals[idex++] = rbuf4_i[ct2];

1875:               /* We receive all column indices of this row of C, but only a subset of them is inserted into submat.
1876:                For reuse, we overwrite the received C->j with the positions (ct2) of the entries that get inserted into submat */
1877:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1878:             }
1879:           }
1880:           MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES);
1881:         } else { /* scall == MAT_REUSE_MATRIX && iscolsorted */
1882:           submat = submats[0];
1883:           subc   = (Mat_SeqAIJ *)submat->data;

1885:           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1886:           for (l = 0; l < nnz; l++) {
1887:             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1888:             subvals[idex++] = rbuf4_i[ct2];
1889:           }

1891:           bj = subc->j + subc->i[row]; /* sorted column indices */
1892:           MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES);
1893:         }
1894:       } else {              /* allcolumns */
1895:         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1896:         MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES);
1897:         ct2 += nnz;
1898:       }
1899:     }
1900:   }

1902:   /* the sends of a->a are done */
1903:   MPI_Waitall(nrqr, s_waits4, s_status4);
1904:   PetscFree4(r_waits4, s_waits4, r_status4, s_status4);

1906:   MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY);
1907:   MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY);
1908:   submats[0] = submat;

1910:   /* Restore the indices */
1911:   ISRestoreIndices(isrow[0], &irow);
1912:   if (!allcolumns) ISRestoreIndices(iscol[0], &icol);

1914:   /* Destroy allocated memory */
1915:   for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
1916:   PetscFree3(rbuf4, subcols, subvals);
1917:   if (sbuf_aa) {
1918:     PetscFree(sbuf_aa[0]);
1919:     PetscFree(sbuf_aa);
1920:   }

1922:   if (scall == MAT_INITIAL_MATRIX) {
1923:     PetscFree(lens);
1924:     if (sbuf_aj) {
1925:       PetscFree(sbuf_aj[0]);
1926:       PetscFree(sbuf_aj);
1927:     }
1928:   }
1929:   MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
1930:   MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a);
1931:   return 0;
1932: }

1934: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1935: {
1936:   PetscInt  ncol;
1937:   PetscBool colflag, allcolumns = PETSC_FALSE;

1939:   /* Allocate memory to hold all the submatrices */
1940:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(2, submat);

1942:   /* Check for the special case in which the column IS covers all columns of the matrix */
1943:   ISIdentity(iscol[0], &colflag);
1944:   ISGetLocalSize(iscol[0], &ncol);
1945:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1947:   MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat);
1948:   return 0;
1949: }

1951: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1952: {
1953:   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1954:   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1955:   Mat_SeqAIJ  *subc;
1956:   Mat_SubSppt *smat;

1958:   /* Check for special case: each processor has a single IS */
1959:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1960:     MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
1961:     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1962:     return 0;
1963:   }

1965:   /* Determine nmax, the maximum number of submatrices handled per stage */
1966:   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1967:   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1968:   if (!nmax) nmax = 1;

1970:   if (scall == MAT_INITIAL_MATRIX) {
1971:     /* Collect global wantallmatrix and nstages */
1972:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1973:       ISIdentity(*isrow, &rowflag);
1974:       ISIdentity(*iscol, &colflag);
1975:       ISGetLocalSize(*isrow, &nrow);
1976:       ISGetLocalSize(*iscol, &ncol);
1977:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1978:         wantallmatrix = PETSC_TRUE;

1980:         PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL);
1981:       }
1982:     }

1984:     /* Determine the number of stages in which the submatrices are created.
1985:        Each stage extracts at most nmax submatrices.
1986:        nmax is determined by the matrix column dimension:
1987:        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1988:     */
1989:     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

1991:     in[0] = -1 * (PetscInt)wantallmatrix;
1992:     in[1] = nstages;
1993:     MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
1994:     wantallmatrix = (PetscBool)(-out[0]);
1995:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

1997:   } else { /* MAT_REUSE_MATRIX */
1998:     if (ismax) {
1999:       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2000:       smat = subc->submatis1;
2001:     } else { /* (*submat)[0] is a dummy matrix */
2002:       smat = (Mat_SubSppt *)(*submat)[0]->data;
2003:     }
2004:     if (!smat) {
2005:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2006:       wantallmatrix = PETSC_TRUE;
2007:     } else if (smat->singleis) {
2008:       MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
2009:       return 0;
2010:     } else {
2011:       nstages = smat->nstages;
2012:     }
2013:   }

2015:   if (wantallmatrix) {
2016:     MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat);
2017:     return 0;
2018:   }

2020:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2021:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(ismax + nstages, submat);

2023:   for (i = 0, pos = 0; i < nstages; i++) {
2024:     if (pos + nmax <= ismax) max_no = nmax;
2025:     else if (pos >= ismax) max_no = 0;
2026:     else max_no = ismax - pos;

2028:     MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos);
2029:     if (!max_no) {
2030:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2031:         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2032:         smat->nstages = nstages;
2033:       }
2034:       pos++; /* advance to next dummy matrix if any */
2035:     } else pos += max_no;
2036:   }

2038:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2039:     /* save nstages for reuse */
2040:     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2041:     smat          = subc->submatis1;
2042:     smat->nstages = nstages;
2043:   }
2044:   return 0;
2045: }

2047: /* -------------------------------------------------------------------------*/
2048: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2049: {
2050:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2051:   Mat              A = c->A;
2052:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2053:   const PetscInt **icol, **irow;
2054:   PetscInt        *nrow, *ncol, start;
2055:   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2056:   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2057:   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2058:   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2059:   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2060: #if defined(PETSC_USE_CTABLE)
2061:   PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2062: #else
2063:   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2064: #endif
2065:   const PetscInt *irow_i;
2066:   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2067:   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2068:   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2069:   MPI_Comm        comm;
2070:   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2071:   PetscMPIInt    *onodes1, *olengths1, end;
2072:   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2073:   Mat_SubSppt    *smat_i;
2074:   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2075:   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;

2077:   PetscObjectGetComm((PetscObject)C, &comm);
2078:   size = c->size;
2079:   rank = c->rank;

2081:   PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns);
2082:   PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted);

2084:   for (i = 0; i < ismax; i++) {
2085:     ISSorted(iscol[i], &issorted[i]);
2086:     if (!issorted[i]) iscsorted = issorted[i];

2088:     ISSorted(isrow[i], &issorted[i]);

2090:     ISGetIndices(isrow[i], &irow[i]);
2091:     ISGetLocalSize(isrow[i], &nrow[i]);

2093:     /* Check for special case: allcolumn */
2094:     ISIdentity(iscol[i], &colflag);
2095:     ISGetLocalSize(iscol[i], &ncol[i]);
2096:     if (colflag && ncol[i] == C->cmap->N) {
2097:       allcolumns[i] = PETSC_TRUE;
2098:       icol[i]       = NULL;
2099:     } else {
2100:       allcolumns[i] = PETSC_FALSE;
2101:       ISGetIndices(iscol[i], &icol[i]);
2102:     }
2103:   }

2105:   if (scall == MAT_REUSE_MATRIX) {
2106:     /* Assumes the new rows have the same lengths as the old rows */
2107:     for (i = 0; i < ismax; i++) {
2109:       subc = (Mat_SeqAIJ *)submats[i]->data;

2112:       /* Initialize the matrix as if it were empty */
2113:       PetscArrayzero(subc->ilen, submats[i]->rmap->n);

2115:       smat_i = subc->submatis1;

2117:       nrqs        = smat_i->nrqs;
2118:       nrqr        = smat_i->nrqr;
2119:       rbuf1       = smat_i->rbuf1;
2120:       rbuf2       = smat_i->rbuf2;
2121:       rbuf3       = smat_i->rbuf3;
2122:       req_source2 = smat_i->req_source2;

2124:       sbuf1 = smat_i->sbuf1;
2125:       sbuf2 = smat_i->sbuf2;
2126:       ptr   = smat_i->ptr;
2127:       tmp   = smat_i->tmp;
2128:       ctr   = smat_i->ctr;

2130:       pa          = smat_i->pa;
2131:       req_size    = smat_i->req_size;
2132:       req_source1 = smat_i->req_source1;

2134:       allcolumns[i] = smat_i->allcolumns;
2135:       row2proc[i]   = smat_i->row2proc;
2136:       rmap[i]       = smat_i->rmap;
2137:       cmap[i]       = smat_i->cmap;
2138:     }

2140:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2142:       smat_i = (Mat_SubSppt *)submats[0]->data;

2144:       nrqs        = smat_i->nrqs;
2145:       nrqr        = smat_i->nrqr;
2146:       rbuf1       = smat_i->rbuf1;
2147:       rbuf2       = smat_i->rbuf2;
2148:       rbuf3       = smat_i->rbuf3;
2149:       req_source2 = smat_i->req_source2;

2151:       sbuf1 = smat_i->sbuf1;
2152:       sbuf2 = smat_i->sbuf2;
2153:       ptr   = smat_i->ptr;
2154:       tmp   = smat_i->tmp;
2155:       ctr   = smat_i->ctr;

2157:       pa          = smat_i->pa;
2158:       req_size    = smat_i->req_size;
2159:       req_source1 = smat_i->req_source1;

2161:       allcolumns[0] = PETSC_FALSE;
2162:     }
2163:   } else { /* scall == MAT_INITIAL_MATRIX */
2164:     /* Get some new tags to keep the communication clean */
2165:     PetscObjectGetNewTag((PetscObject)C, &tag2);
2166:     PetscObjectGetNewTag((PetscObject)C, &tag3);

2168:     /* evaluate communication - to whom messages go, their lengths, and the buffer space
2169:      required. Based on this, buffers are allocated and data is copied into them */
2170:     PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4); /* mesg size, initialize work vectors */

2172:     for (i = 0; i < ismax; i++) {
2173:       jmax   = nrow[i];
2174:       irow_i = irow[i];

2176:       PetscMalloc1(jmax, &row2proc_i);
2177:       row2proc[i] = row2proc_i;

2179:       if (issorted[i]) proc = 0;
2180:       for (j = 0; j < jmax; j++) {
2181:         if (!issorted[i]) proc = 0;
2182:         row = irow_i[j];
2183:         while (row >= C->rmap->range[proc + 1]) proc++;
2184:         w4[proc]++;
2185:         row2proc_i[j] = proc; /* map row index to proc */
2186:       }
2187:       for (j = 0; j < size; j++) {
2188:         if (w4[j]) {
2189:           w1[j] += w4[j];
2190:           w3[j]++;
2191:           w4[j] = 0;
2192:         }
2193:       }
2194:     }

2196:     nrqs     = 0; /* number of outgoing messages */
2197:     msz      = 0; /* total message length (for all procs) */
2198:     w1[rank] = 0; /* no message sent to self */
2199:     w3[rank] = 0;
2200:     for (i = 0; i < size; i++) {
2201:       if (w1[i]) {
2202:         w2[i] = 1;
2203:         nrqs++;
2204:       } /* there exists a message to proc i */
2205:     }
2206:     PetscMalloc1(nrqs, &pa); /* array of the processes to which messages are sent */
2207:     for (i = 0, j = 0; i < size; i++) {
2208:       if (w1[i]) {
2209:         pa[j] = i;
2210:         j++;
2211:       }
2212:     }

2214:     /* Each message would have a header = 1 + 2*(no of IS) + data */
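          /* Message layout: [0] = number of ISs requesting rows from the destination, then one (IS id, row count) pair per such IS, followed by the concatenated row indices */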
2215:     for (i = 0; i < nrqs; i++) {
2216:       j = pa[i];
2217:       w1[j] += w2[j] + 2 * w3[j];
2218:       msz += w1[j];
2219:     }
2220:     PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);

2222:     /* Determine the number of messages to expect, their lengths, and their source processes */
2223:     PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
2224:     PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);

2226:     /* Now post the Irecvs corresponding to these messages */
2227:     PetscObjectGetNewTag((PetscObject)C, &tag0);
2228:     PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);

2230:     /* Allocate Memory for outgoing messages */
2231:     PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
2232:     PetscArrayzero(sbuf1, size);
2233:     PetscArrayzero(ptr, size);

2235:     {
2236:       PetscInt *iptr = tmp;
2237:       k              = 0;
2238:       for (i = 0; i < nrqs; i++) {
2239:         j = pa[i];
2240:         iptr += k;
2241:         sbuf1[j] = iptr;
2242:         k        = w1[j];
2243:       }
2244:     }

2246:     /* Form the outgoing messages. Initialize the header space */
2247:     for (i = 0; i < nrqs; i++) {
2248:       j           = pa[i];
2249:       sbuf1[j][0] = 0;
2250:       PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]);
2251:       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2252:     }

2254:     /* Parse the isrow and copy data into outbuf */
2255:     for (i = 0; i < ismax; i++) {
2256:       row2proc_i = row2proc[i];
2257:       PetscArrayzero(ctr, size);
2258:       irow_i = irow[i];
2259:       jmax   = nrow[i];
2260:       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2261:         proc = row2proc_i[j];
2262:         if (proc != rank) { /* copy to the outgoing buffer */
2263:           ctr[proc]++;
2264:           *ptr[proc] = irow_i[j];
2265:           ptr[proc]++;
2266:         }
2267:       }
2268:       /* Update the headers for the current IS */
2269:       for (j = 0; j < size; j++) { /* this loop could also be optimized */
2270:         if ((ctr_j = ctr[j])) {
2271:           sbuf1_j            = sbuf1[j];
2272:           k                  = ++sbuf1_j[0];
2273:           sbuf1_j[2 * k]     = ctr_j;
2274:           sbuf1_j[2 * k - 1] = i;
2275:         }
2276:       }
2277:     }

2279:     /* Now post the sends */
2280:     PetscMalloc1(nrqs, &s_waits1);
2281:     for (i = 0; i < nrqs; ++i) {
2282:       j = pa[i];
2283:       MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i);
2284:     }

2286:     /* Post Receives to capture the buffer size */
2287:     PetscMalloc1(nrqs, &r_waits2);
2288:     PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);
2289:     if (nrqs) rbuf2[0] = tmp + msz;
2290:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2291:     for (i = 0; i < nrqs; ++i) {
2292:       j = pa[i];
2293:       MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i);
2294:     }

2296:     /* Send to the other processes the buffer sizes they should allocate */
2297:     /* Receive the request messages */
2298:     PetscMalloc1(nrqr, &s_waits2);
2299:     PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);
2300:     {
2301:       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2302:       PetscInt *sbuf2_i;

2304:       MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE);
2305:       for (i = 0; i < nrqr; ++i) {
2306:         req_size[i] = 0;
2307:         rbuf1_i     = rbuf1[i];
2308:         start       = 2 * rbuf1_i[0] + 1;
2309:         end         = olengths1[i];
2310:         PetscMalloc1(end, &sbuf2[i]);
2311:         sbuf2_i = sbuf2[i];
2312:         for (j = start; j < end; j++) {
2313:           id         = rbuf1_i[j] - rstart;
2314:           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2315:           sbuf2_i[j] = ncols;
2316:           req_size[i] += ncols;
2317:         }
2318:         req_source1[i] = onodes1[i];
2319:         /* form the header */
2320:         sbuf2_i[0] = req_size[i];
2321:         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

2323:         MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
2324:       }
2325:     }

2327:     PetscFree(onodes1);
2328:     PetscFree(olengths1);
2329:     PetscFree(r_waits1);
2330:     PetscFree4(w1, w2, w3, w4);

2332:     /* Wait for the buffer-size replies, then post receives for the column indices a->j */
2333:     PetscMalloc1(nrqs, &r_waits3);
2334:     MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE);
2335:     for (i = 0; i < nrqs; ++i) {
2336:       PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
2337:       req_source2[i] = pa[i];
2338:       MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
2339:     }
2340:     PetscFree(r_waits2);

2342:     /* Wait on sends1 and sends2 */
2343:     MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);
2344:     MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
2345:     PetscFree(s_waits1);
2346:     PetscFree(s_waits2);

2348:     /* Now allocate sending buffers for a->j, and send them off */
2349:     PetscMalloc1(nrqr, &sbuf_aj);
2350:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2351:     if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
2352:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

2354:     PetscMalloc1(nrqr, &s_waits3);
2355:     {
2356:       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2357:       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2358:       PetscInt  cend = C->cmap->rend;
2359:       PetscInt *a_j = a->j, *b_j = b->j, ctmp;

2361:       for (i = 0; i < nrqr; i++) {
2362:         rbuf1_i   = rbuf1[i];
2363:         sbuf_aj_i = sbuf_aj[i];
2364:         ct1       = 2 * rbuf1_i[0] + 1;
2365:         ct2       = 0;
2366:         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2367:           kmax = rbuf1[i][2 * j];
2368:           for (k = 0; k < kmax; k++, ct1++) {
2369:             row    = rbuf1_i[ct1] - rstart;
2370:             nzA    = a_i[row + 1] - a_i[row];
2371:             nzB    = b_i[row + 1] - b_i[row];
2372:             ncols  = nzA + nzB;
2373:             cworkA = a_j + a_i[row];
2374:             cworkB = b_j + b_i[row];

2376:             /* load the column indices for this row into cols */
2377:             cols = sbuf_aj_i + ct2;

2379:             lwrite = 0;
2380:             for (l = 0; l < nzB; l++) {
2381:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2382:             }
2383:             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2384:             for (l = 0; l < nzB; l++) {
2385:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2386:             }

2388:             ct2 += ncols;
2389:           }
2390:         }
2391:         MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
2392:       }
2393:     }

2395:     /* create col map: global col of C -> local col of submatrices */
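          /* As in the single-IS case, entries are stored as j + 1 so that a lookup result of zero means the column was not selected */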
2396:     {
2397:       const PetscInt *icol_i;
2398: #if defined(PETSC_USE_CTABLE)
2399:       for (i = 0; i < ismax; i++) {
2400:         if (!allcolumns[i]) {
2401:           PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]);

2403:           jmax   = ncol[i];
2404:           icol_i = icol[i];
2405:           cmap_i = cmap[i];
2406:           for (j = 0; j < jmax; j++) PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES);
2407:         } else cmap[i] = NULL;
2408:       }
2409: #else
2410:       for (i = 0; i < ismax; i++) {
2411:         if (!allcolumns[i]) {
2412:           PetscCalloc1(C->cmap->N, &cmap[i]);
2413:           jmax   = ncol[i];
2414:           icol_i = icol[i];
2415:           cmap_i = cmap[i];
2416:           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2417:         } else cmap[i] = NULL;
2418:       }
2419: #endif
2420:     }

2422:     /* Create lens which is required for MatCreate... */
2423:     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2424:     PetscMalloc1(ismax, &lens);

2426:     if (ismax) PetscCalloc1(j, &lens[0]);
2427:     for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];

2429:     /* Update lens from local data */
2430:     for (i = 0; i < ismax; i++) {
2431:       row2proc_i = row2proc[i];
2432:       jmax       = nrow[i];
2433:       if (!allcolumns[i]) cmap_i = cmap[i];
2434:       irow_i = irow[i];
2435:       lens_i = lens[i];
2436:       for (j = 0; j < jmax; j++) {
2437:         row  = irow_i[j];
2438:         proc = row2proc_i[j];
2439:         if (proc == rank) {
2440:           MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2441:           if (!allcolumns[i]) {
2442:             for (k = 0; k < ncols; k++) {
2443: #if defined(PETSC_USE_CTABLE)
2444:               PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2445: #else
2446:               tcol = cmap_i[cols[k]];
2447: #endif
2448:               if (tcol) lens_i[j]++;
2449:             }
2450:           } else { /* allcolumns */
2451:             lens_i[j] = ncols;
2452:           }
2453:           MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2454:         }
2455:       }
2456:     }

2458:     /* Create row map: global row of C -> local row of submatrices */
2459: #if defined(PETSC_USE_CTABLE)
2460:     for (i = 0; i < ismax; i++) {
2461:       PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]);
2462:       irow_i = irow[i];
2463:       jmax   = nrow[i];
2464:       for (j = 0; j < jmax; j++) PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES);
2465:     }
2466: #else
2467:     for (i = 0; i < ismax; i++) {
2468:       PetscCalloc1(C->rmap->N, &rmap[i]);
2469:       rmap_i = rmap[i];
2470:       irow_i = irow[i];
2471:       jmax   = nrow[i];
2472:       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2473:     }
2474: #endif

2476:     /* Update lens from offproc data */
2477:     {
2478:       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;

2480:       MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE);
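            /* Walk each reply in the same order as the corresponding request: sbuf1_i[2*j-1] is the IS id and sbuf1_i[2*j] the number of its rows; rbuf2_i[ct1] gives the nonzero count of the row requested at position ct1 and rbuf3_i holds its column indices */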
2481:       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2482:         sbuf1_i = sbuf1[pa[tmp2]];
2483:         jmax    = sbuf1_i[0];
2484:         ct1     = 2 * jmax + 1;
2485:         ct2     = 0;
2486:         rbuf2_i = rbuf2[tmp2];
2487:         rbuf3_i = rbuf3[tmp2];
2488:         for (j = 1; j <= jmax; j++) {
2489:           is_no  = sbuf1_i[2 * j - 1];
2490:           max1   = sbuf1_i[2 * j];
2491:           lens_i = lens[is_no];
2492:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2493:           rmap_i = rmap[is_no];
2494:           for (k = 0; k < max1; k++, ct1++) {
2495: #if defined(PETSC_USE_CTABLE)
2496:             PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row);
2497:             row--;
2499: #else
2500:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2501: #endif
2502:             max2 = rbuf2_i[ct1];
2503:             for (l = 0; l < max2; l++, ct2++) {
2504:               if (!allcolumns[is_no]) {
2505: #if defined(PETSC_USE_CTABLE)
2506:                 PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2507: #else
2508:                 tcol = cmap_i[rbuf3_i[ct2]];
2509: #endif
2510:                 if (tcol) lens_i[row]++;
2511:               } else {         /* allcolumns */
2512:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2513:               }
2514:             }
2515:           }
2516:         }
2517:       }
2518:     }
2519:     PetscFree(r_waits3);
2520:     MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE);
2521:     PetscFree(s_waits3);

2523:     /* Create the submatrices */
2524:     for (i = 0; i < ismax; i++) {
2525:       PetscInt rbs, cbs;

2527:       ISGetBlockSize(isrow[i], &rbs);
2528:       ISGetBlockSize(iscol[i], &cbs);

2530:       MatCreate(PETSC_COMM_SELF, submats + i);
2531:       MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE);

2533:       MatSetBlockSizes(submats[i], rbs, cbs);
2534:       MatSetType(submats[i], ((PetscObject)A)->type_name);
2535:       MatSeqAIJSetPreallocation(submats[i], 0, lens[i]);

2537:       /* create struct Mat_SubSppt and attach it to submat */
2538:       PetscNew(&smat_i);
2539:       subc            = (Mat_SeqAIJ *)submats[i]->data;
2540:       subc->submatis1 = smat_i;

2542:       smat_i->destroy          = submats[i]->ops->destroy;
2543:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2544:       submats[i]->factortype   = C->factortype;

2546:       smat_i->id          = i;
2547:       smat_i->nrqs        = nrqs;
2548:       smat_i->nrqr        = nrqr;
2549:       smat_i->rbuf1       = rbuf1;
2550:       smat_i->rbuf2       = rbuf2;
2551:       smat_i->rbuf3       = rbuf3;
2552:       smat_i->sbuf2       = sbuf2;
2553:       smat_i->req_source2 = req_source2;

2555:       smat_i->sbuf1 = sbuf1;
2556:       smat_i->ptr   = ptr;
2557:       smat_i->tmp   = tmp;
2558:       smat_i->ctr   = ctr;

2560:       smat_i->pa          = pa;
2561:       smat_i->req_size    = req_size;
2562:       smat_i->req_source1 = req_source1;

2564:       smat_i->allcolumns = allcolumns[i];
2565:       smat_i->singleis   = PETSC_FALSE;
2566:       smat_i->row2proc   = row2proc[i];
2567:       smat_i->rmap       = rmap[i];
2568:       smat_i->cmap       = cmap[i];
2569:     }

2571:     if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct when no submatrix is requested on this rank */
2572:       MatCreate(PETSC_COMM_SELF, &submats[0]);
2573:       MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE);
2574:       MatSetType(submats[0], MATDUMMY);

2576:       /* create struct Mat_SubSppt and attach it to submat */
2577:       PetscNew(&smat_i);
2578:       submats[0]->data = (void *)smat_i;

2580:       smat_i->destroy          = submats[0]->ops->destroy;
2581:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2582:       submats[0]->factortype   = C->factortype;

2584:       smat_i->id          = 0;
2585:       smat_i->nrqs        = nrqs;
2586:       smat_i->nrqr        = nrqr;
2587:       smat_i->rbuf1       = rbuf1;
2588:       smat_i->rbuf2       = rbuf2;
2589:       smat_i->rbuf3       = rbuf3;
2590:       smat_i->sbuf2       = sbuf2;
2591:       smat_i->req_source2 = req_source2;

2593:       smat_i->sbuf1 = sbuf1;
2594:       smat_i->ptr   = ptr;
2595:       smat_i->tmp   = tmp;
2596:       smat_i->ctr   = ctr;

2598:       smat_i->pa          = pa;
2599:       smat_i->req_size    = req_size;
2600:       smat_i->req_source1 = req_source1;

2602:       smat_i->allcolumns = PETSC_FALSE;
2603:       smat_i->singleis   = PETSC_FALSE;
2604:       smat_i->row2proc   = NULL;
2605:       smat_i->rmap       = NULL;
2606:       smat_i->cmap       = NULL;
2607:     }

2609:     if (ismax) PetscFree(lens[0]);
2610:     PetscFree(lens);
2611:     if (sbuf_aj) {
2612:       PetscFree(sbuf_aj[0]);
2613:       PetscFree(sbuf_aj);
2614:     }

2616:   } /* end of scall == MAT_INITIAL_MATRIX */

2618:   /* Post recv matrix values */
2619:   PetscObjectGetNewTag((PetscObject)C, &tag4);
2620:   PetscMalloc1(nrqs, &rbuf4);
2621:   PetscMalloc1(nrqs, &r_waits4);
2622:   for (i = 0; i < nrqs; ++i) {
2623:     PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
2624:     MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
2625:   }

2627:   /* Allocate sending buffers for a->a, and send them off */
2628:   PetscMalloc1(nrqr, &sbuf_aa);
2629:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2630:   if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
2631:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

2633:   PetscMalloc1(nrqr, &s_waits4);
2634:   {
2635:     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2636:     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2637:     PetscInt     cend = C->cmap->rend;
2638:     PetscInt    *b_j  = b->j;
2639:     PetscScalar *vworkA, *vworkB, *a_a, *b_a;

2641:     MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
2642:     MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a);
2643:     for (i = 0; i < nrqr; i++) {
2644:       rbuf1_i   = rbuf1[i];
2645:       sbuf_aa_i = sbuf_aa[i];
2646:       ct1       = 2 * rbuf1_i[0] + 1;
2647:       ct2       = 0;
2648:       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2649:         kmax = rbuf1_i[2 * j];
2650:         for (k = 0; k < kmax; k++, ct1++) {
2651:           row    = rbuf1_i[ct1] - rstart;
2652:           nzA    = a_i[row + 1] - a_i[row];
2653:           nzB    = b_i[row + 1] - b_i[row];
2654:           ncols  = nzA + nzB;
2655:           cworkB = b_j + b_i[row];
2656:           vworkA = a_a + a_i[row];
2657:           vworkB = b_a + b_i[row];

2659:           /* load the column values for this row into vals */
2660:           vals = sbuf_aa_i + ct2;

2662:           lwrite = 0;
2663:           for (l = 0; l < nzB; l++) {
2664:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2665:           }
2666:           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2667:           for (l = 0; l < nzB; l++) {
2668:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2669:           }

2671:           ct2 += ncols;
2672:         }
2673:       }
2674:       MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
2675:     }
2676:     MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
2677:     MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a);
2678:   }

2680:   /* Assemble the matrices */
2681:   /* First assemble the local rows */
2682:   for (i = 0; i < ismax; i++) {
2683:     row2proc_i = row2proc[i];
2684:     subc       = (Mat_SeqAIJ *)submats[i]->data;
2685:     imat_ilen  = subc->ilen;
2686:     imat_j     = subc->j;
2687:     imat_i     = subc->i;
2688:     imat_a     = subc->a;

2690:     if (!allcolumns[i]) cmap_i = cmap[i];
2691:     rmap_i = rmap[i];
2692:     irow_i = irow[i];
2693:     jmax   = nrow[i];
2694:     for (j = 0; j < jmax; j++) {
2695:       row  = irow_i[j];
2696:       proc = row2proc_i[j];
2697:       if (proc == rank) {
2698:         old_row = row;
2699: #if defined(PETSC_USE_CTABLE)
2700:         PetscTableFind(rmap_i, row + 1, &row);
2701:         row--;
2702: #else
2703:         row = rmap_i[row];
2704: #endif
2705:         ilen_row = imat_ilen[row];
2706:         MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);
2707:         mat_i = imat_i[row];
2708:         mat_a = imat_a + mat_i;
2709:         mat_j = imat_j + mat_i;
2710:         if (!allcolumns[i]) {
2711:           for (k = 0; k < ncols; k++) {
2712: #if defined(PETSC_USE_CTABLE)
2713:             PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2714: #else
2715:             tcol = cmap_i[cols[k]];
2716: #endif
2717:             if (tcol) {
2718:               *mat_j++ = tcol - 1;
2719:               *mat_a++ = vals[k];
2720:               ilen_row++;
2721:             }
2722:           }
2723:         } else { /* allcolumns */
2724:           for (k = 0; k < ncols; k++) {
2725:             *mat_j++ = cols[k]; /* global col index! */
2726:             *mat_a++ = vals[k];
2727:             ilen_row++;
2728:           }
2729:         }
2730:         MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);

2732:         imat_ilen[row] = ilen_row;
2733:       }
2734:     }
2735:   }

2737:   /* Now assemble the off-process rows */
2738:   MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE);
2739:   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2740:     sbuf1_i = sbuf1[pa[tmp2]];
2741:     jmax    = sbuf1_i[0];
2742:     ct1     = 2 * jmax + 1;
2743:     ct2     = 0;
2744:     rbuf2_i = rbuf2[tmp2];
2745:     rbuf3_i = rbuf3[tmp2];
2746:     rbuf4_i = rbuf4[tmp2];
2747:     for (j = 1; j <= jmax; j++) {
2748:       is_no  = sbuf1_i[2 * j - 1];
2749:       rmap_i = rmap[is_no];
2750:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2751:       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2752:       imat_ilen = subc->ilen;
2753:       imat_j    = subc->j;
2754:       imat_i    = subc->i;
2755:       imat_a    = subc->a;
2756:       max1      = sbuf1_i[2 * j];
2757:       for (k = 0; k < max1; k++, ct1++) {
2758:         row = sbuf1_i[ct1];
2759: #if defined(PETSC_USE_CTABLE)
2760:         PetscTableFind(rmap_i, row + 1, &row);
2761:         row--;
2762: #else
2763:         row = rmap_i[row];
2764: #endif
2765:         ilen  = imat_ilen[row];
2766:         mat_i = imat_i[row];
2767:         mat_a = imat_a + mat_i;
2768:         mat_j = imat_j + mat_i;
2769:         max2  = rbuf2_i[ct1];
2770:         if (!allcolumns[is_no]) {
2771:           for (l = 0; l < max2; l++, ct2++) {
2772: #if defined(PETSC_USE_CTABLE)
2773:             PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2774: #else
2775:             tcol = cmap_i[rbuf3_i[ct2]];
2776: #endif
2777:             if (tcol) {
2778:               *mat_j++ = tcol - 1;
2779:               *mat_a++ = rbuf4_i[ct2];
2780:               ilen++;
2781:             }
2782:           }
2783:         } else { /* allcolumns */
2784:           for (l = 0; l < max2; l++, ct2++) {
2785:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2786:             *mat_a++ = rbuf4_i[ct2];
2787:             ilen++;
2788:           }
2789:         }
2790:         imat_ilen[row] = ilen;
2791:       }
2792:     }
2793:   }

2795:   if (!iscsorted) { /* sort column indices of the rows */
2796:     for (i = 0; i < ismax; i++) {
2797:       subc      = (Mat_SeqAIJ *)submats[i]->data;
2798:       imat_j    = subc->j;
2799:       imat_i    = subc->i;
2800:       imat_a    = subc->a;
2801:       imat_ilen = subc->ilen;

2803:       if (allcolumns[i]) continue;
2804:       jmax = nrow[i];
2805:       for (j = 0; j < jmax; j++) {
2806:         mat_i = imat_i[j];
2807:         mat_a = imat_a + mat_i;
2808:         mat_j = imat_j + mat_i;
2809:         PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a);
2810:       }
2811:     }
2812:   }

2814:   PetscFree(r_waits4);
2815:   MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE);
2816:   PetscFree(s_waits4);

2818:   /* Restore the indices */
2819:   for (i = 0; i < ismax; i++) {
2820:     ISRestoreIndices(isrow[i], irow + i);
2821:     if (!allcolumns[i]) ISRestoreIndices(iscol[i], icol + i);
2822:   }

2824:   for (i = 0; i < ismax; i++) {
2825:     MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY);
2826:     MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY);
2827:   }

2829:   /* Destroy allocated memory */
2830:   if (sbuf_aa) {
2831:     PetscFree(sbuf_aa[0]);
2832:     PetscFree(sbuf_aa);
2833:   }
2834:   PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted);

2836:   for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
2837:   PetscFree(rbuf4);

2839:   PetscFree4(row2proc, cmap, rmap, allcolumns);
2840:   return 0;
2841: }

2843: /*
2844:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2845:  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2846:  of [0,m), dcolemb is in [0,n), and ocolemb is in [0,N-n).
2847:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2848:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2849:  state, and needs to be "assembled" later by compressing B's column space.

2851:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2852:  Following this call, C->A & C->B have been created, even if empty.
2853:  */
2854: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2855: {
2856:   /* If making this function public, change the error returned in this function away from _PLIB. */
2857:   Mat_MPIAIJ     *aij;
2858:   Mat_SeqAIJ     *Baij;
2859:   PetscBool       seqaij, Bdisassembled;
2860:   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2861:   PetscScalar     v;
2862:   const PetscInt *rowindices, *colindices;

2864:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2865:   if (A) {
2866:     PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);
2868:     if (rowemb) {
2869:       ISGetLocalSize(rowemb, &m);
2871:     } else {
2873:     }
2874:     if (dcolemb) {
2875:       ISGetLocalSize(dcolemb, &n);
2877:     } else {
2879:     }
2880:   }
2881:   if (B) {
2882:     PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);
2884:     if (rowemb) {
2885:       ISGetLocalSize(rowemb, &m);
2887:     } else {
2889:     }
2890:     if (ocolemb) {
2891:       ISGetLocalSize(ocolemb, &n);
2893:     } else {
2895:     }
2896:   }

2898:   aij = (Mat_MPIAIJ *)C->data;
2899:   if (!aij->A) {
2900:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2901:     MatCreate(PETSC_COMM_SELF, &aij->A);
2902:     MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n);
2903:     MatSetBlockSizesFromMats(aij->A, C, C);
2904:     MatSetType(aij->A, MATSEQAIJ);
2905:   }
2906:   if (A) {
2907:     MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A);
2908:   } else {
2909:     MatSetUp(aij->A);
2910:   }
2911:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2912:     /*
2913:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2914:       need to "disassemble" B -- convert it to using C's global indices.
2915:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2917:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2918:       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2920:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2921:       At least avoid calling MatSetValues() and the implied searches?
2922:     */

2924:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2925: #if defined(PETSC_USE_CTABLE)
2926:       PetscTableDestroy(&aij->colmap);
2927: #else
2928:       PetscFree(aij->colmap);
2929:       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2930: #endif
2931:       ngcol = 0;
2932:       if (aij->lvec) VecGetSize(aij->lvec, &ngcol);
2933:       if (aij->garray) { PetscFree(aij->garray); }
2934:       VecDestroy(&aij->lvec);
2935:       VecScatterDestroy(&aij->Mvctx);
2936:     }
2937:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) MatDestroy(&aij->B);
2938:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) MatZeroEntries(aij->B);
2939:   }
2940:   Bdisassembled = PETSC_FALSE;
2941:   if (!aij->B) {
2942:     MatCreate(PETSC_COMM_SELF, &aij->B);
2943:     MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N);
2944:     MatSetBlockSizesFromMats(aij->B, B, B);
2945:     MatSetType(aij->B, MATSEQAIJ);
2946:     Bdisassembled = PETSC_TRUE;
2947:   }
2948:   if (B) {
2949:     Baij = (Mat_SeqAIJ *)B->data;
2950:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2951:       PetscMalloc1(B->rmap->n, &nz);
2952:       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2953:       MatSeqAIJSetPreallocation(aij->B, 0, nz);
2954:       PetscFree(nz);
2955:     }

2957:     PetscLayoutGetRange(C->rmap, &rstart, &rend);
2958:     shift      = rend - rstart;
2959:     count      = 0;
2960:     rowindices = NULL;
2961:     colindices = NULL;
2962:     if (rowemb) ISGetIndices(rowemb, &rowindices);
2963:     if (ocolemb) ISGetIndices(ocolemb, &colindices);
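          /* Copy B's entries into aij->B; when aij->B was created above (Bdisassembled), its column indices are mapped into C's global column space as described in the header comment, by shifting indices at or above rstart up by shift */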
2964:     for (i = 0; i < B->rmap->n; i++) {
2965:       PetscInt row;
2966:       row = i;
2967:       if (rowindices) row = rowindices[i];
2968:       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2969:         col = Baij->j[count];
2970:         if (colindices) col = colindices[col];
2971:         if (Bdisassembled && col >= rstart) col += shift;
2972:         v = Baij->a[count];
2973:         MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES);
2974:         ++count;
2975:       }
2976:     }
2977:     /* No assembly for aij->B is necessary. */
2978:     /* FIXME: set aij->B's nonzerostate correctly. */
2979:   } else {
2980:     MatSetUp(aij->B);
2981:   }
2982:   C->preallocated  = PETSC_TRUE;
2983:   C->was_assembled = PETSC_FALSE;
2984:   C->assembled     = PETSC_FALSE;
2985:   /*
2986:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2987:       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
2988:    */
2989:   return 0;
2990: }

2992: /*
2993:   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
2994:  */
2995: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2996: {
2997:   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;

3001:   /* FIXME: make sure C is assembled */
3002:   *A = aij->A;
3003:   *B = aij->B;
3004:   /* Note that we don't incref *A and *B, so be careful! */
3005:   return 0;
3006: }
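/*
  A minimal sketch of how the pieces returned here are typically consumed through the public
  MatMPIAIJGetSeqAIJ() interface; the matrix C and the row number chosen below are placeholders.

    Mat             Ad, Ao;
    const PetscInt *garray, *cols;
    PetscInt        ncols, j;

    MatMPIAIJGetSeqAIJ(C, &Ad, &Ao, &garray);
    MatGetRow(Ao, 0, &ncols, &cols, NULL);    // inspect the first local row of the off-diagonal block
    for (j = 0; j < ncols; j++) {
      PetscInt gcol = garray[cols[j]];        // translate Ao's local column into a global column of C
      (void)gcol;
    }
    MatRestoreRow(Ao, 0, &ncols, &cols, NULL);

  As with MatGetSeqMats_MPIAIJ(), the caller does not own Ad, Ao, or garray and must not destroy them.
*/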

3008: /*
3009:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3010:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3011: */
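/*
  The function pointers parameterize the routine over the concrete AIJ flavor: getsubmats_seq extracts
  sequential submatrices, getlocalmats returns the diag/off-diag blocks of an MPI matrix, and
  setseqmat/setseqmats embed sequential pieces back into a (parallel) submatrix. For MPIAIJ these are bound in
  MatCreateSubMatricesMPI_MPIAIJ() below to MatCreateSubMatrices_MPIAIJ(), MatGetSeqMats_MPIAIJ(),
  MatSetSeqMat_SeqAIJ() and MatSetSeqMats_MPIAIJ().
*/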
3012: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3013: {
3014:   PetscMPIInt size, flag;
3015:   PetscInt    i, ii, cismax, ispar;
3016:   Mat        *A, *B;
3017:   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

3019:   if (!ismax) return 0;

3021:   for (i = 0, cismax = 0; i < ismax; ++i) {
3022:     MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag);
3024:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3025:     if (size > 1) ++cismax;
3026:   }

3028:   /*
3029:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3030:      ispar is the maximum of cismax over C's ranks, so it is nonzero iff at least one rank holds a parallel IS.
3031:   */
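  /*
     For example, with local cismax values {0, 0, 1, 0} on a 4-rank comm, the MPI_MAX reduction below yields
     ispar = 1 on every rank, so all ranks take the parallel code path together.
  */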
3032:   MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
3033:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3034:     (*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat);
3035:     return 0;
3036:   }

3038:   /* if (ispar) */
3039:   /*
3040:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3041:     These are used to extract the off-diag portion of the resulting parallel matrix.
3042:     The row IS for the off-diag portion is the same as for the diag portion,
3043:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3044:   */
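  /*
     A small illustration of the comment above: if iscol[i] is spread over two ranks as {0, 3} and {5, 1},
     the off-processor part gathered on rank 0 is {5, 1} and on rank 1 it is {0, 3}; each rank ends up with a
     copy of everything stored on the other ranks, which is why this construction does not scale.
  */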
3045:   PetscMalloc2(cismax, &cisrow, cismax, &ciscol);
3046:   PetscMalloc1(cismax, &ciscol_p);
3047:   for (i = 0, ii = 0; i < ismax; ++i) {
3048:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3049:     if (size > 1) {
3050:       /*
3051:          TODO: This is the part that's ***NOT SCALABLE***.
3052:          To fix this we need to extract just the indices of C's nonzero columns
3053:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3054:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3055:          to be done without serializing on the IS list, so, most likely, it is best
3056:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3057:       */
3058:       ISGetNonlocalIS(iscol[i], &(ciscol[ii]));
3059:       /* Now we have to
3060:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3061:              were sorted on each rank, concatenated they might no longer be sorted;
3062:          (b) use ISSortPermutation() to construct ciscol_p, the mapping from the
3063:              indices in nondecreasing order to their original positions.
3064:          If ciscol[ii] is already strictly increasing, the permutation IS is NULL.
3065:       */
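      /*
         For instance, if ciscol[ii] = {8, 2, 5}, ISSortPermutation() yields ciscol_p[ii] = {1, 2, 0} (the
         original positions of the entries taken in nondecreasing order) and ISSort() then turns ciscol[ii]
         into {2, 5, 8}; for an already increasing IS the permutation comes back as NULL.
      */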
3066:       ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii);
3067:       ISSort(ciscol[ii]);
3068:       ++ii;
3069:     }
3070:   }
3071:   PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p);
3072:   for (i = 0, ii = 0; i < ismax; ++i) {
3073:     PetscInt        j, issize;
3074:     const PetscInt *indices;

3076:     /*
3077:        Permute the indices into nondecreasing order. Reject row and col ISs that contain duplicate indices.
3078:      */
3079:     ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i);
3080:     ISSort(isrow[i]);
3081:     ISGetLocalSize(isrow[i], &issize);
3082:     ISGetIndices(isrow[i], &indices);
3083:     for (j = 1; j < issize; ++j) {
3085:     }
3086:     ISRestoreIndices(isrow[i], &indices);
3087:     ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i);
3088:     ISSort(iscol[i]);
3089:     ISGetLocalSize(iscol[i], &issize);
3090:     ISGetIndices(iscol[i], &indices);
3091:     for (j = 1; j < issize; ++j) {
3093:     }
3094:     ISRestoreIndices(iscol[i], &indices);
3095:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3096:     if (size > 1) {
3097:       cisrow[ii] = isrow[i];
3098:       ++ii;
3099:     }
3100:   }
3101:   /*
3102:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3103:     array of sequential matrices underlying the resulting parallel matrices.
3104:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3105:     contain duplicates.

3107:     There are as many diag matrices as there are original index sets. There are only as many parallel
3108:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3110:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3111:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3112:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3113:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3114:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3115:     - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3116:       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3117:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3118:       (or retrieve the previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3119:       values into A[i] and B[ii] sitting inside the corresponding submat.
3120:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3121:       A[i], B[ii], AA[i] or BB[ii] matrices.
3122:   */
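  /*
     Concretely: with ismax = 3 index sets of which the first and third are parallel, A has ismax = 3 entries
     indexed by i, while B, cisrow, ciscol and ciscol_p have cismax = 2 entries indexed by ii, which advances
     only when a parallel IS is encountered.
  */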
3123:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3124:   if (scall == MAT_INITIAL_MATRIX) PetscMalloc1(ismax, submat);

3126:   /* Now obtain the sequential A and B submatrices separately. */
3127:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3128:   (*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A);
3129:   (*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B);

3131:   /*
3132:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3133:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3134:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3135:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3136:     to have the same sparsity pattern.
3137:     Otherwise, A and/or B have to be properly embedded into the index spaces of the resulting parallel
3138:     submatrix, and the correct colmap must be constructed for it. This is done by setseqmat(s).
3139:   */
3140:   for (i = 0, ii = 0; i < ismax; ++i) {
3141:     /*
3142:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3143:        That way we can avoid sorting and computing permutations when reusing.
3144:        To this end:
3145:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3146:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3147:           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3148:     */
3149:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3151:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3152:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3153:     if (size > 1) {
3154:       if (scall == MAT_INITIAL_MATRIX) {
3155:         MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i);
3156:         MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE);
3157:         MatSetType((*submat)[i], MATMPIAIJ);
3158:         PetscLayoutSetUp((*submat)[i]->rmap);
3159:         PetscLayoutSetUp((*submat)[i]->cmap);
3160:       }
3161:       /*
3162:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3163:       */
3164:       {
3165:         Mat AA = A[i], BB = B[ii];

3167:         if (AA || BB) {
3168:           setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB);
3169:           MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY);
3170:           MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY);
3171:         }
3172:         MatDestroy(&AA);
3173:       }
3174:       ISDestroy(ciscol + ii);
3175:       ISDestroy(ciscol_p + ii);
3176:       ++ii;
3177:     } else { /* if (size == 1) */
3178:       if (scall == MAT_REUSE_MATRIX) MatDestroy(&(*submat)[i]);
3179:       if (isrow_p[i] || iscol_p[i]) {
3180:         MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i);
3181:         setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]);
3182:         /* Otherwise A is extracted straight into (*submat)[i]. */
3183:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3184:         MatDestroy(A + i);
3185:       } else (*submat)[i] = A[i];
3186:     }
3187:     ISDestroy(&isrow_p[i]);
3188:     ISDestroy(&iscol_p[i]);
3189:   }
3190:   PetscFree2(cisrow, ciscol);
3191:   PetscFree2(isrow_p, iscol_p);
3192:   PetscFree(ciscol_p);
3193:   PetscFree(A);
3194:   MatDestroySubMatrices(cismax, &B);
3195:   return 0;
3196: }

3198: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3199: {
3200:   MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ);
3201:   return 0;
3202: }
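/*
  A minimal usage sketch of the public entry point that, for MPIAIJ matrices, ends up in the routine above;
  the matrix C and the index sets isrow and iscol (possibly living on subcommunicators of C) are assumed to
  exist already.

    Mat *submats;

    MatCreateSubMatricesMPI(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats);
    // ... work with submats[0] ...
    MatDestroySubMatrices(1, &submats);
*/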