Actual source code: mpiptap.c


  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */
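      /*
        A minimal usage sketch of the public entry point that eventually dispatches to the
        routines in this file (illustrative only; a standard PETSc build is assumed, error
        checking and the assembly of A and P are omitted, and the fill value 2.0 is just an
        example):

          Mat A, P, C;
          // ... assemble A (n x n) and P (n x m) as MATMPIAIJ ...
          MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C);   // symbolic + numeric C = P^T*A*P
          // ... change the numerical values of A, keeping its nonzero pattern ...
          MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C);     // reuse the stored symbolic data
          MatDestroy(&C);

        Which implementation is used (scalable, nonscalable, allatonce, merged allatonce)
        is normally chosen through the MatProduct machinery, see MatProductSetAlgorithm()
        and MatProductSetFromOptions(); the exact runtime option name depends on the
        PETSc version.
      */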

  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <../src/mat/utils/freespace.h>
  9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10: #include <petscbt.h>
 11: #include <petsctime.h>
 12: #include <petsc/private/hashmapiv.h>
 13: #include <petsc/private/hashseti.h>
 14: #include <petscsf.h>

 16: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
 17: {
 18:   PetscBool         iascii;
 19:   PetscViewerFormat format;
 20:   Mat_APMPI        *ptap;

 22:   MatCheckProduct(A, 1);
 23:   ptap = (Mat_APMPI *)A->product->data;
 24:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer, &format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n");
 32:       } else if (ptap->algType == 2) {
 33:         PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n");
 34:       } else if (ptap->algType == 3) {
 35:         PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n");
 36:       }
 37:     }
 38:   }
 39:   return 0;
 40: }

 42: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
 43: {
 44:   Mat_APMPI           *ptap = (Mat_APMPI *)data;
 45:   Mat_Merge_SeqsToMPI *merge;

 47:   PetscFree2(ptap->startsj_s, ptap->startsj_r);
 48:   PetscFree(ptap->bufa);
 49:   MatDestroy(&ptap->P_loc);
 50:   MatDestroy(&ptap->P_oth);
 51:   MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 52:   MatDestroy(&ptap->Rd);
 53:   MatDestroy(&ptap->Ro);
 54:   if (ptap->AP_loc) { /* used by alg_rap */
 55:     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
 56:     PetscFree(ap->i);
 57:     PetscFree2(ap->j, ap->a);
 58:     MatDestroy(&ptap->AP_loc);
 59:   } else { /* used by alg_ptap */
 60:     PetscFree(ptap->api);
 61:     PetscFree(ptap->apj);
 62:   }
 63:   MatDestroy(&ptap->C_loc);
 64:   MatDestroy(&ptap->C_oth);
 65:   if (ptap->apa) PetscFree(ptap->apa);

 67:   MatDestroy(&ptap->Pt);

 69:   merge = ptap->merge;
 70:   if (merge) { /* used by alg_ptap */
 71:     PetscFree(merge->id_r);
 72:     PetscFree(merge->len_s);
 73:     PetscFree(merge->len_r);
 74:     PetscFree(merge->bi);
 75:     PetscFree(merge->bj);
 76:     PetscFree(merge->buf_ri[0]);
 77:     PetscFree(merge->buf_ri);
 78:     PetscFree(merge->buf_rj[0]);
 79:     PetscFree(merge->buf_rj);
 80:     PetscFree(merge->coi);
 81:     PetscFree(merge->coj);
 82:     PetscFree(merge->owners_co);
 83:     PetscLayoutDestroy(&merge->rowmap);
 84:     PetscFree(ptap->merge);
 85:   }
 86:   ISLocalToGlobalMappingDestroy(&ptap->ltog);

 88:   PetscSFDestroy(&ptap->sf);
 89:   PetscFree(ptap->c_othi);
 90:   PetscFree(ptap->c_rmti);
 91:   PetscFree(ptap);
 92:   return 0;
 93: }
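      /*
        Overview of the scalable algorithm implemented below (a restatement of the code):
        with the usual MPIAIJ splitting A = [Ad Ao] and P = [Pd Po] on each rank,

            AP_loc = Ad*P_loc + Ao*P_oth

        and this rank's contribution to C = P^T*A*P is [Rd; Ro]*AP_loc with Rd = Pd^T and
        Ro = Po^T.  The Rd*AP_loc part (C_loc) lands in locally owned rows of C, while the
        Ro*AP_loc part (C_oth) targets rows owned by other ranks and is added into C with
        MatSetValues(..., ADD_VALUES) during assembly.
      */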

 95: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
 96: {
 97:   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
 98:   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
 99:   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
100:   Mat_APMPI         *ptap;
101:   Mat                AP_loc, C_loc, C_oth;
102:   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
103:   PetscScalar       *apa;
104:   const PetscInt    *cols;
105:   const PetscScalar *vals;

107:   MatCheckProduct(C, 3);
108:   ptap = (Mat_APMPI *)C->product->data;

112:   MatZeroEntries(C);

114:   /* 1) get Rd = Pd^T, Ro = Po^T */
115:   if (ptap->reuse == MAT_REUSE_MATRIX) {
116:     MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd);
117:     MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro);
118:   }

120:   /* 2) get AP_loc */
121:   AP_loc = ptap->AP_loc;
122:   ap     = (Mat_SeqAIJ *)AP_loc->data;

124:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
125:   /*-----------------------------------------------------*/
126:   if (ptap->reuse == MAT_REUSE_MATRIX) {
127:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
128:     MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth);
129:     MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc);
130:   }

132:   /* 2-2) compute numeric A_loc*P - dominating part */
133:   /* ---------------------------------------------- */
134:   /* get data from symbolic products */
135:   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
136:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

138:   api = ap->i;
139:   apj = ap->j;
140:   ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj);
141:   for (i = 0; i < am; i++) {
142:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
143:     apnz = api[i + 1] - api[i];
144:     apa  = ap->a + api[i];
145:     PetscArrayzero(apa, apnz);
146:     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147:   }
148:   ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj);
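        /* AP_loc was column-compacted in the symbolic phase (MatSeqAIJCompactOutExtraColumns_SeqAIJ),
           so apj normally stores compacted local column indices.  The ISLocalToGlobalMappingApply
           before the loop temporarily expanded apj to global P-column indices so that AProw_scalable
           could merge rows of P_loc/P_oth directly; the call just above maps them back to the
           compacted indices expected by the Rd/Ro products below. */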

151:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152:   /* Always use the scalable kernels here, since this is the scalable MPI algorithm */
153:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc);
154:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth);

156:   C_loc = ptap->C_loc;
157:   C_oth = ptap->C_oth;

159:   /* add C_loc and C_oth to C */
160:   MatGetOwnershipRange(C, &rstart, &rend);

162:   /* C_loc -> C */
163:   cm    = C_loc->rmap->N;
164:   c_seq = (Mat_SeqAIJ *)C_loc->data;
165:   cols  = c_seq->j;
166:   vals  = c_seq->a;
167:   ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j);

169:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170:   /* when there are no off-processor parts. */
171:   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172:   /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() has to be used, which looks up each column index in a */
173:   /* table and performs additional bookkeeping. */
174:   if (C->assembled) {
175:     C->was_assembled = PETSC_TRUE;
176:     C->assembled     = PETSC_FALSE;
177:   }
178:   if (C->was_assembled) {
179:     for (i = 0; i < cm; i++) {
180:       ncols = c_seq->i[i + 1] - c_seq->i[i];
181:       row   = rstart + i;
182:       MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES);
183:       cols += ncols;
184:       vals += ncols;
185:     }
186:   } else {
187:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a);
188:   }
189:   ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j);

192:   /* C_oth -> C, off-processor part */
193:   cm    = C_oth->rmap->N;
194:   c_seq = (Mat_SeqAIJ *)C_oth->data;
195:   cols  = c_seq->j;
196:   vals  = c_seq->a;
197:   ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j);
198:   for (i = 0; i < cm; i++) {
199:     ncols = c_seq->i[i + 1] - c_seq->i[i];
200:     row   = p->garray[i];
201:     MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES);
202:     cols += ncols;
203:     vals += ncols;
204:   }
205:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
206:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

208:   ptap->reuse = MAT_REUSE_MATRIX;

210:   ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j);
212:   return 0;
213: }
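      /*
        The symbolic phase below builds the nonzero pattern of AP_loc = Ad*P_loc + Ao*P_oth one
        row at a time with a condensed sorted linked list.  A worked example with illustrative
        numbers: if row i of Ad has columns {1,4}, row 1 of P_loc has columns {0,2} and row 4 has
        columns {2,5}, the merged AP row is {0,2,5} and apnz = 3.  The FreeSpace list starts at
        fill*(nnz(A)+nnz(P)) entries and roughly doubles whenever it runs out; nspacedouble counts
        how often that happens and is reported through PetscInfo() at the end of step (1).
      */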

215: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
216: {
217:   Mat_APMPI               *ptap;
218:   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
219:   MPI_Comm                 comm;
220:   PetscMPIInt              size, rank;
221:   Mat                      P_loc, P_oth;
222:   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
223:   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
224:   PetscInt                *lnk, i, k, pnz, row, nsend;
225:   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
226:   PETSC_UNUSED PetscMPIInt icompleted = 0;
227:   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
228:   const PetscInt          *owners;
229:   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
230:   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
231:   MPI_Request             *swaits, *rwaits;
232:   MPI_Status              *sstatus, rstatus;
233:   PetscLayout              rowmap;
234:   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
235:   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
236:   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
237:   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
238:   PetscScalar             *apv;
239:   PetscTable               ta;
240:   MatType                  mtype;
241:   const char              *prefix;
242: #if defined(PETSC_USE_INFO)
243:   PetscReal apfill;
244: #endif

246:   MatCheckProduct(Cmpi, 4);
248:   PetscObjectGetComm((PetscObject)A, &comm);
249:   MPI_Comm_size(comm, &size);
250:   MPI_Comm_rank(comm, &rank);

252:   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

254:   /* create symbolic parallel matrix Cmpi */
255:   MatGetType(A, &mtype);
256:   MatSetType(Cmpi, mtype);

258:   /* create struct Mat_APMPI and attach it to C later */
259:   PetscNew(&ptap);
260:   ptap->reuse   = MAT_INITIAL_MATRIX;
261:   ptap->algType = 0;

263:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
264:   MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth);
265:   /* get P_loc by taking all local rows of P */
266:   MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc);

268:   ptap->P_loc = P_loc;
269:   ptap->P_oth = P_oth;

271:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
272:   /* --------------------------------- */
273:   MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd);
274:   MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro);

276:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277:   /* ----------------------------------------------------------------- */
278:   p_loc = (Mat_SeqAIJ *)P_loc->data;
279:   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

281:   /* create a table of the nonzero columns of P (used to bound the condensed linked list created below) */
282:   PetscTableCreate(pn, pN, &ta); /* for computing AP_loc and Cmpi */
283:   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
284:   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285:   PetscTableGetCount(ta, &Crmax); /* Crmax = nnz(sum of Prows) */

287:   PetscLLCondensedCreate_Scalable(Crmax, &lnk);

289:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
290:   if (ao) {
291:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space);
292:   } else {
293:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space);
294:   }
295:   current_space = free_space;
296:   nspacedouble  = 0;

298:   PetscMalloc1(am + 1, &api);
299:   api[0] = 0;
300:   for (i = 0; i < am; i++) {
301:     /* diagonal portion: Ad[i,:]*P */
302:     ai  = ad->i;
303:     pi  = p_loc->i;
304:     nzi = ai[i + 1] - ai[i];
305:     aj  = ad->j + ai[i];
306:     for (j = 0; j < nzi; j++) {
307:       row  = aj[j];
308:       pnz  = pi[row + 1] - pi[row];
309:       Jptr = p_loc->j + pi[row];
310:       /* add non-zero cols of P into the sorted linked list lnk */
311:       PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk);
312:     }
313:     /* off-diagonal portion: Ao[i,:]*P */
314:     if (ao) {
315:       ai  = ao->i;
316:       pi  = p_oth->i;
317:       nzi = ai[i + 1] - ai[i];
318:       aj  = ao->j + ai[i];
319:       for (j = 0; j < nzi; j++) {
320:         row  = aj[j];
321:         pnz  = pi[row + 1] - pi[row];
322:         Jptr = p_oth->j + pi[row];
323:         PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk);
324:       }
325:     }
326:     apnz       = lnk[0];
327:     api[i + 1] = api[i] + apnz;

329:     /* if free space is not available, double the total space in the list */
330:     if (current_space->local_remaining < apnz) {
331:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space);
332:       nspacedouble++;
333:     }

335:     /* Copy data into free space, then initialize lnk */
336:     PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk);

338:     current_space->array += apnz;
339:     current_space->local_used += apnz;
340:     current_space->local_remaining -= apnz;
341:   }
342:   /* Allocate space for apj and apv, initialize apj, and */
343:   /* destroy list of free space and other temporary array(s) */
344:   PetscCalloc2(api[am], &apj, api[am], &apv);
345:   PetscFreeSpaceContiguous(&free_space, apj);
346:   PetscLLCondensedDestroy_Scalable(lnk);

348:   /* Create AP_loc for reuse */
349:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc);
350:   MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);

352: #if defined(PETSC_USE_INFO)
353:   if (ao) {
354:     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
355:   } else {
356:     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
357:   }
358:   ptap->AP_loc->info.mallocs           = nspacedouble;
359:   ptap->AP_loc->info.fill_ratio_given  = fill;
360:   ptap->AP_loc->info.fill_ratio_needed = apfill;

362:   if (api[am]) {
363:     PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill);
 364:     PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill);
365:   } else {
 366:     PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty\n");
367:   }
368: #endif

370:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
371:   /* -------------------------------------- */
372:   MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth);
373:   MatGetOptionsPrefix(A, &prefix);
374:   MatSetOptionsPrefix(ptap->C_oth, prefix);
375:   MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_");

377:   MatProductSetType(ptap->C_oth, MATPRODUCT_AB);
378:   MatProductSetAlgorithm(ptap->C_oth, "sorted");
379:   MatProductSetFill(ptap->C_oth, fill);
380:   MatProductSetFromOptions(ptap->C_oth);
381:   MatProductSymbolic(ptap->C_oth);

383:   /* (3) send coj of C_oth to other processors  */
384:   /* ------------------------------------------ */
385:   /* determine row ownership */
386:   PetscLayoutCreate(comm, &rowmap);
387:   PetscLayoutSetLocalSize(rowmap, pn);
388:   PetscLayoutSetBlockSize(rowmap, 1);
389:   PetscLayoutSetUp(rowmap);
390:   PetscLayoutGetRanges(rowmap, &owners);

 392:   /* determine the number of messages to send and their lengths */
393:   PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co);
394:   PetscArrayzero(len_s, size);
395:   PetscArrayzero(len_si, size);

397:   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
398:   coi   = c_oth->i;
399:   coj   = c_oth->j;
400:   con   = ptap->C_oth->rmap->n;
401:   proc  = 0;
402:   ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj);
403:   for (i = 0; i < con; i++) {
404:     while (prmap[i] >= owners[proc + 1]) proc++;
405:     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
406:     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
407:   }

409:   len          = 0; /* max length of buf_si[], see (4) */
410:   owners_co[0] = 0;
411:   nsend        = 0;
412:   for (proc = 0; proc < size; proc++) {
413:     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
414:     if (len_s[proc]) {
415:       nsend++;
416:       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
417:       len += len_si[proc];
418:     }
419:   }

421:   /* determine the number and length of messages to receive for coi and coj  */
422:   PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv);
423:   PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri);

425:   /* post the Irecv and Isend of coj */
426:   PetscCommGetNewTag(comm, &tagj);
427:   PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits);
428:   PetscMalloc1(nsend + 1, &swaits);
429:   for (proc = 0, k = 0; proc < size; proc++) {
430:     if (!len_s[proc]) continue;
431:     i = owners_co[proc];
432:     MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k);
433:     k++;
434:   }

436:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
437:   /* ---------------------------------------- */
438:   MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc);
439:   MatProductSetType(ptap->C_loc, MATPRODUCT_AB);
440:   MatProductSetAlgorithm(ptap->C_loc, "default");
441:   MatProductSetFill(ptap->C_loc, fill);

443:   MatSetOptionsPrefix(ptap->C_loc, prefix);
444:   MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_");

446:   MatProductSetFromOptions(ptap->C_loc);
447:   MatProductSymbolic(ptap->C_loc);

449:   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
450:   ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j);

452:   /* receives of coj are complete */
453:   for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
454:   PetscFree(rwaits);
455:   if (nsend) MPI_Waitall(nsend, swaits, sstatus);

457:   /* add received column indices into ta to update Crmax */
458:   for (k = 0; k < nrecv; k++) { /* k-th received message */
459:     Jptr = buf_rj[k];
460:     for (j = 0; j < len_r[k]; j++) PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES);
461:   }
462:   PetscTableGetCount(ta, &Crmax);
463:   PetscTableDestroy(&ta);

465:   /* (4) send and recv coi */
466:   /*-----------------------*/
467:   PetscCommGetNewTag(comm, &tagi);
468:   PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits);
469:   PetscMalloc1(len + 1, &buf_s);
470:   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
471:   for (proc = 0, k = 0; proc < size; proc++) {
472:     if (!len_s[proc]) continue;
473:     /* form outgoing message for i-structure:
474:          buf_si[0]:                 nrows to be sent
475:                [1:nrows]:           row index (global)
476:                [nrows+1:2*nrows+1]: i-structure index
477:     */
478:     /*-------------------------------------------*/
479:     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
480:     buf_si_i    = buf_si + nrows + 1;
481:     buf_si[0]   = nrows;
482:     buf_si_i[0] = 0;
483:     nrows       = 0;
484:     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
485:       nzi                 = coi[i + 1] - coi[i];
486:       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
487:       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
488:       nrows++;
489:     }
490:     MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k);
491:     k++;
492:     buf_si += len_si[proc];
493:   }
494:   for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
495:   PetscFree(rwaits);
496:   if (nsend) MPI_Waitall(nsend, swaits, sstatus);

498:   PetscFree4(len_s, len_si, sstatus, owners_co);
499:   PetscFree(len_ri);
500:   PetscFree(swaits);
501:   PetscFree(buf_s);

503:   /* (5) compute the local portion of Cmpi      */
504:   /* ------------------------------------------ */
505:   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
506:   PetscFreeSpaceGet(Crmax, &free_space);
507:   current_space = free_space;

509:   PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci);
510:   for (k = 0; k < nrecv; k++) {
511:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
512:     nrows       = *buf_ri_k[k];
513:     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
514:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
515:   }

517:   MatPreallocateBegin(comm, pn, pn, dnz, onz);
518:   PetscLLCondensedCreate_Scalable(Crmax, &lnk);
519:   for (i = 0; i < pn; i++) {
520:     /* add C_loc into Cmpi */
521:     nzi  = c_loc->i[i + 1] - c_loc->i[i];
522:     Jptr = c_loc->j + c_loc->i[i];
523:     PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk);

525:     /* add received col data into lnk */
526:     for (k = 0; k < nrecv; k++) { /* k-th received message */
527:       if (i == *nextrow[k]) {     /* i-th row */
528:         nzi  = *(nextci[k] + 1) - *nextci[k];
529:         Jptr = buf_rj[k] + *nextci[k];
530:         PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk);
531:         nextrow[k]++;
532:         nextci[k]++;
533:       }
534:     }
535:     nzi = lnk[0];

537:     /* copy data into free space, then initialize lnk */
538:     PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk);
539:     MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz);
540:   }
541:   PetscFree3(buf_ri_k, nextrow, nextci);
542:   PetscLLCondensedDestroy_Scalable(lnk);
543:   PetscFreeSpaceDestroy(free_space);

545:   /* local sizes and preallocation */
546:   MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
547:   if (P->cmap->bs > 0) {
548:     PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs);
549:     PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs);
550:   }
551:   MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
552:   MatPreallocateEnd(dnz, onz);
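        /* The dnz[]/onz[] arrays filled by the MatPreallocate machinery above count, for each
           locally owned row of Cmpi, the nonzeros that fall in the diagonal block (columns owned
           by this rank) and in the off-diagonal block respectively; MatMPIAIJSetPreallocation()
           consumes them so that the numeric phase can insert values without further mallocs. */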

554:   /* free communication buffers and the temporary row layout */
555:   PetscFree(id_r);
556:   PetscFree(len_r);
557:   PetscFree(buf_ri[0]);
558:   PetscFree(buf_ri);
559:   PetscFree(buf_rj[0]);
560:   PetscFree(buf_rj);
561:   PetscLayoutDestroy(&rowmap);

563:   nout = 0;
564:   ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j);
566:   ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j);

569:   /* attach the supporting struct to Cmpi for reuse */
570:   Cmpi->product->data    = ptap;
571:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
572:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

574:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
575:   Cmpi->assembled        = PETSC_FALSE;
576:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
577:   return 0;
578: }
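      /*
        The MPIXAIJ helpers and drivers below carry an extra dof argument so that the same code
        can serve multi-component (dof > 1) products built from a scalar P: row i of A maps to
        row ii = i/dof of P with component offset = i%dof, and a P column pcol contributes the
        product column pcol*dof + offset (plus pcstart when it belongs to the diagonal block).
        Illustrative arithmetic: dof = 3, pcol = 7, offset = 2 gives column 7*3 + 2 = 23.
      */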

580: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
581: {
582:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
583:   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
584:   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
585:   PetscInt    pcstart, pcend, column, offset;

587:   pcstart = P->cmap->rstart;
588:   pcstart *= dof;
589:   pcend = P->cmap->rend;
590:   pcend *= dof;
591:   /* diagonal portion: Ad[i,:]*P */
592:   ai  = ad->i;
593:   nzi = ai[i + 1] - ai[i];
594:   aj  = ad->j + ai[i];
595:   for (j = 0; j < nzi; j++) {
596:     row    = aj[j];
597:     offset = row % dof;
598:     row /= dof;
599:     nzpi = pd->i[row + 1] - pd->i[row];
600:     pj   = pd->j + pd->i[row];
601:     for (k = 0; k < nzpi; k++) PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart);
602:   }
603:   /* off-diagonal portion of P: Ad[i,:]*Po */
604:   for (j = 0; j < nzi; j++) {
605:     row    = aj[j];
606:     offset = row % dof;
607:     row /= dof;
608:     nzpi = po->i[row + 1] - po->i[row];
609:     pj   = po->j + po->i[row];
610:     for (k = 0; k < nzpi; k++) PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset);
611:   }

613:   /* off diagonal part: Ao[i, :]*P_oth */
614:   if (ao) {
615:     ai  = ao->i;
616:     pi  = p_oth->i;
617:     nzi = ai[i + 1] - ai[i];
618:     aj  = ao->j + ai[i];
619:     for (j = 0; j < nzi; j++) {
620:       row       = aj[j];
621:       offset    = a->garray[row] % dof;
622:       row       = map[row];
623:       pnz       = pi[row + 1] - pi[row];
624:       p_othcols = p_oth->j + pi[row];
625:       for (col = 0; col < pnz; col++) {
626:         column = p_othcols[col] * dof + offset;
627:         if (column >= pcstart && column < pcend) {
628:           PetscHSetIAdd(dht, column);
629:         } else {
630:           PetscHSetIAdd(oht, column);
631:         }
632:       }
633:     }
634:   } /* end if (ao) */
635:   return 0;
636: }

638: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
639: {
640:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
641:   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
642:   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
643:   PetscScalar ra, *aa, *pa;

645:   pcstart = P->cmap->rstart;
646:   pcstart *= dof;

648:   /* diagonal portion: Ad[i,:]*P */
649:   ai  = ad->i;
650:   nzi = ai[i + 1] - ai[i];
651:   aj  = ad->j + ai[i];
652:   aa  = ad->a + ai[i];
653:   for (j = 0; j < nzi; j++) {
654:     ra     = aa[j];
655:     row    = aj[j];
656:     offset = row % dof;
657:     row /= dof;
658:     nzpi = pd->i[row + 1] - pd->i[row];
659:     pj   = pd->j + pd->i[row];
660:     pa   = pd->a + pd->i[row];
661:     for (k = 0; k < nzpi; k++) PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]);
662:     PetscLogFlops(2.0 * nzpi);
663:   }
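        /* off-diagonal portion of P: Ad[i,:]*Po */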
664:   for (j = 0; j < nzi; j++) {
665:     ra     = aa[j];
666:     row    = aj[j];
667:     offset = row % dof;
668:     row /= dof;
669:     nzpi = po->i[row + 1] - po->i[row];
670:     pj   = po->j + po->i[row];
671:     pa   = po->a + po->i[row];
672:     for (k = 0; k < nzpi; k++) PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]);
673:     PetscLogFlops(2.0 * nzpi);
674:   }

676:   /* off diagonal part: Ao[i, :]*P_oth */
677:   if (ao) {
678:     ai  = ao->i;
679:     pi  = p_oth->i;
680:     nzi = ai[i + 1] - ai[i];
681:     aj  = ao->j + ai[i];
682:     aa  = ao->a + ai[i];
683:     for (j = 0; j < nzi; j++) {
684:       row       = aj[j];
685:       offset    = a->garray[row] % dof;
686:       row       = map[row];
687:       ra        = aa[j];
688:       pnz       = pi[row + 1] - pi[row];
689:       p_othcols = p_oth->j + pi[row];
690:       pa        = p_oth->a + pi[row];
691:       for (col = 0; col < pnz; col++) PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]);
692:       PetscLogFlops(2.0 * pnz);
693:     }
694:   } /* end if (ao) */

696:   return 0;
697: }

699: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
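      /*
        The "allatonce" algorithms below never form AP explicitly.  For each local row i of A the
        sparse row AP(i,:) is accumulated in a hash map (hmap) and immediately scattered through
        P^T: the Pd(i,:) entries send pda[j]*AP(i,:) straight into C with MatSetValues(ADD_VALUES),
        while the Po(i,:) entries send poa[j]*AP(i,:) toward rows owned by other ranks, staged in
        the c_rmtj/c_rmta buffers and delivered by a PetscSF reduce into the c_othj/c_otha receive
        buffers laid out by the symbolic phase.
      */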

701: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
702: {
703:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
704:   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
705:   Mat_APMPI      *ptap;
706:   PetscHMapIV     hmap;
707:   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
708:   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
709:   PetscInt        offset, ii, pocol;
710:   const PetscInt *mappingindices;
711:   IS              map;

713:   MatCheckProduct(C, 4);
714:   ptap = (Mat_APMPI *)C->product->data;

718:   MatZeroEntries(C);

720:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
721:   /*-----------------------------------------------------*/
722:   if (ptap->reuse == MAT_REUSE_MATRIX) {
723:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
724:     MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth);
725:   }
726:   PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);

728:   MatGetLocalSize(p->B, NULL, &pon);
729:   pon *= dof;
730:   PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta);
731:   MatGetLocalSize(A, &am, NULL);
732:   cmaxr = 0;
733:   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
734:   PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc);
735:   PetscHMapIVCreate(&hmap);
736:   PetscHMapIVResize(hmap, cmaxr);
737:   ISGetIndices(map, &mappingindices);
738:   for (i = 0; i < am && pon; i++) {
739:     PetscHMapIVClear(hmap);
740:     offset = i % dof;
741:     ii     = i / dof;
742:     nzi    = po->i[ii + 1] - po->i[ii];
743:     if (!nzi) continue;
744:     MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap);
745:     voff = 0;
746:     PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues);
747:     if (!voff) continue;

749:     /* Form C(ii, :) */
750:     poj = po->j + po->i[ii];
751:     poa = po->a + po->i[ii];
752:     for (j = 0; j < nzi; j++) {
753:       pocol   = poj[j] * dof + offset;
754:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
755:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
756:       for (jj = 0; jj < voff; jj++) {
757:         apvaluestmp[jj] = apvalues[jj] * poa[j];
758:         /* If the row is empty */
759:         if (!c_rmtc[pocol]) {
760:           c_rmtjj[jj] = apindices[jj];
761:           c_rmtaa[jj] = apvaluestmp[jj];
762:           c_rmtc[pocol]++;
763:         } else {
764:           PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc);
765:           if (loc >= 0) { /* hit */
766:             c_rmtaa[loc] += apvaluestmp[jj];
767:             PetscLogFlops(1.0);
768:           } else { /* new element */
769:             loc = -(loc + 1);
770:             /* Shift the tail of the row one slot to the right to make room */
771:             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
772:               c_rmtjj[kk] = c_rmtjj[kk - 1];
773:               c_rmtaa[kk] = c_rmtaa[kk - 1];
774:             } /* End kk */
775:             c_rmtjj[loc] = apindices[jj];
776:             c_rmtaa[loc] = apvaluestmp[jj];
777:             c_rmtc[pocol]++;
778:           }
779:         }
780:       } /* End jj */
781:       PetscLogFlops(voff);
782:     }   /* End j */
783:   }     /* End i */
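        /* The staging above keeps each remote row c_rmtjj[] sorted: PetscFindInt does a binary
           search and, when the key is absent, returns -(insertion_point+1), so loc = -(loc+1)
           recovers the slot and the tail of the row is shifted by one (illustrative example:
           searching 5 in {2,4,8} yields -3, i.e. insert at position 2). */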

785:   PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc);
786:   PetscHMapIVDestroy(&hmap);

788:   MatGetLocalSize(P, NULL, &pn);
789:   pn *= dof;
790:   PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha);

792:   PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
793:   PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
794:   MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
795:   pcstart = pcstart * dof;
796:   pcend   = pcend * dof;
797:   cd      = (Mat_SeqAIJ *)(c->A)->data;
798:   co      = (Mat_SeqAIJ *)(c->B)->data;

800:   cmaxr = 0;
801:   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
802:   PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ);
803:   PetscHMapIVCreate(&hmap);
804:   PetscHMapIVResize(hmap, cmaxr);
805:   for (i = 0; i < am && pn; i++) {
806:     PetscHMapIVClear(hmap);
807:     offset = i % dof;
808:     ii     = i / dof;
809:     nzi    = pd->i[ii + 1] - pd->i[ii];
810:     if (!nzi) continue;
811:     MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap);
812:     voff = 0;
813:     PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues);
814:     if (!voff) continue;
815:     /* Form C(ii, :) */
816:     pdj = pd->j + pd->i[ii];
817:     pda = pd->a + pd->i[ii];
818:     for (j = 0; j < nzi; j++) {
819:       row = pcstart + pdj[j] * dof + offset;
820:       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
821:       PetscLogFlops(voff);
822:       MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES);
823:     }
824:   }
825:   ISRestoreIndices(map, &mappingindices);
826:   MatGetOwnershipRangeColumn(C, &ccstart, &ccend);
827:   PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ);
828:   PetscHMapIVDestroy(&hmap);
829:   PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
830:   PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
831:   PetscFree2(c_rmtj, c_rmta);

833:   /* Add contributions from remote */
834:   for (i = 0; i < pn; i++) {
835:     row = i + pcstart;
836:     MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES);
837:   }
838:   PetscFree2(c_othj, c_otha);

840:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
841:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

843:   ptap->reuse = MAT_REUSE_MATRIX;
844:   return 0;
845: }
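      /*
        In the variant above (called "overlapping" in the -matptap_allatonce_via option list) the
        PetscSFReduceBegin calls are posted before the local Pd pass, so the off-process
        communication can proceed while the local part of C is being computed; PetscSFReduceEnd is
        matched only afterwards.  The merged variant further below forms the remote and local
        contributions in a single sweep over the rows of A instead.
      */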

847: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
848: {

850:   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C);
851:   return 0;
852: }

854: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
855: {
856:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
857:   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
858:   Mat_APMPI      *ptap;
859:   PetscHMapIV     hmap;
860:   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
861:   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
862:   PetscInt        offset, ii, pocol;
863:   const PetscInt *mappingindices;
864:   IS              map;

866:   MatCheckProduct(C, 4);
867:   ptap = (Mat_APMPI *)C->product->data;

871:   MatZeroEntries(C);

873:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
874:   /*-----------------------------------------------------*/
875:   if (ptap->reuse == MAT_REUSE_MATRIX) {
876:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
877:     MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth);
878:   }
879:   PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);
880:   MatGetLocalSize(p->B, NULL, &pon);
881:   pon *= dof;
882:   MatGetLocalSize(P, NULL, &pn);
883:   pn *= dof;

885:   PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta);
886:   MatGetLocalSize(A, &am, NULL);
887:   MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
888:   pcstart *= dof;
889:   pcend *= dof;
890:   cmaxr = 0;
891:   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
892:   cd = (Mat_SeqAIJ *)(c->A)->data;
893:   co = (Mat_SeqAIJ *)(c->B)->data;
894:   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
895:   PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc);
896:   PetscHMapIVCreate(&hmap);
897:   PetscHMapIVResize(hmap, cmaxr);
898:   ISGetIndices(map, &mappingindices);
899:   for (i = 0; i < am && (pon || pn); i++) {
900:     PetscHMapIVClear(hmap);
901:     offset = i % dof;
902:     ii     = i / dof;
903:     nzi    = po->i[ii + 1] - po->i[ii];
904:     dnzi   = pd->i[ii + 1] - pd->i[ii];
905:     if (!nzi && !dnzi) continue;
906:     MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap);
907:     voff = 0;
908:     PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues);
909:     if (!voff) continue;

911:     /* Form remote C(ii, :) */
912:     poj = po->j + po->i[ii];
913:     poa = po->a + po->i[ii];
914:     for (j = 0; j < nzi; j++) {
915:       pocol   = poj[j] * dof + offset;
916:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
917:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
918:       for (jj = 0; jj < voff; jj++) {
919:         apvaluestmp[jj] = apvalues[jj] * poa[j];
920:         /* If the row is empty */
921:         if (!c_rmtc[pocol]) {
922:           c_rmtjj[jj] = apindices[jj];
923:           c_rmtaa[jj] = apvaluestmp[jj];
924:           c_rmtc[pocol]++;
925:         } else {
926:           PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc);
927:           if (loc >= 0) { /* hit */
928:             c_rmtaa[loc] += apvaluestmp[jj];
929:             PetscLogFlops(1.0);
930:           } else { /* new element */
931:             loc = -(loc + 1);
932:             /* Shift the tail of the row one slot to the right to make room */
933:             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
934:               c_rmtjj[kk] = c_rmtjj[kk - 1];
935:               c_rmtaa[kk] = c_rmtaa[kk - 1];
936:             } /* End kk */
937:             c_rmtjj[loc] = apindices[jj];
938:             c_rmtaa[loc] = apvaluestmp[jj];
939:             c_rmtc[pocol]++;
940:           }
941:         }
942:       } /* End jj */
943:       PetscLogFlops(voff);
944:     } /* End j */

946:     /* Form local C(ii, :) */
947:     pdj = pd->j + pd->i[ii];
948:     pda = pd->a + pd->i[ii];
949:     for (j = 0; j < dnzi; j++) {
950:       row = pcstart + pdj[j] * dof + offset;
951:       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
952:       PetscLogFlops(voff);
953:       MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES);
954:     } /* End j */
955:   }   /* End i */

957:   ISRestoreIndices(map, &mappingindices);
958:   PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc);
959:   PetscHMapIVDestroy(&hmap);
960:   PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha);

962:   PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
963:   PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
964:   PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
965:   PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
966:   PetscFree2(c_rmtj, c_rmta);

968:   /* Add contributions from remote */
969:   for (i = 0; i < pn; i++) {
970:     row = i + pcstart;
971:     MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES);
972:   }
973:   PetscFree2(c_othj, c_otha);

975:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
976:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

978:   ptap->reuse = MAT_REUSE_MATRIX;
979:   return 0;
980: }
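      /*
        Compared with the overlapping variant, the merged variant above calls
        MatPtAPNumericComputeOneRowOfAP_private() only once per row of A and reuses the resulting
        AP(i,:) for both the remote (Po) and local (Pd) contributions, at the cost of not
        overlapping the PetscSF reduction with the local computation.
      */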

982: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
983: {

985:   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C);
986:   return 0;
987: }

989: /* TODO: move algorithm selection to MatProductSetFromOptions */
990: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
991: {
992:   Mat_APMPI      *ptap;
993:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
994:   MPI_Comm        comm;
995:   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
996:   MatType         mtype;
997:   PetscSF         sf;
998:   PetscSFNode    *iremote;
999:   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1000:   const PetscInt *rootdegrees;
1001:   PetscHSetI      ht, oht, *hta, *hto;
1002:   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1003:   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1004:   PetscInt        nalg = 2, alg = 0, offset, ii;
1005:   PetscMPIInt     owner;
1006:   const PetscInt *mappingindices;
1007:   PetscBool       flg;
1008:   const char     *algTypes[2] = {"overlapping", "merged"};
1009:   IS              map;

1011:   MatCheckProduct(Cmpi, 5);
1013:   PetscObjectGetComm((PetscObject)A, &comm);

1015:   /* Create symbolic parallel matrix Cmpi */
1016:   MatGetLocalSize(P, NULL, &pn);
1017:   pn *= dof;
1018:   MatGetType(A, &mtype);
1019:   MatSetType(Cmpi, mtype);
1020:   MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);

1022:   PetscNew(&ptap);
1023:   ptap->reuse   = MAT_INITIAL_MATRIX;
1024:   ptap->algType = 2;

1026:   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1027:   MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth);
1028:   PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);
1029:   /* This equals the number of off-diagonal columns in P */
1030:   MatGetLocalSize(p->B, NULL, &pon);
1031:   pon *= dof;
1032:   /* offsets */
1033:   PetscMalloc1(pon + 1, &ptap->c_rmti);
1034:   /* The number of columns we will send to remote ranks */
1035:   PetscMalloc1(pon, &c_rmtc);
1036:   PetscMalloc1(pon, &hta);
1037:   for (i = 0; i < pon; i++) PetscHSetICreate(&hta[i]);
1038:   MatGetLocalSize(A, &am, NULL);
1039:   MatGetOwnershipRange(A, &arstart, &arend);
1040:   /* Create hash table to merge all columns for C(i, :) */
1041:   PetscHSetICreate(&ht);

1043:   ISGetIndices(map, &mappingindices);
1044:   ptap->c_rmti[0] = 0;
1045:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1046:   for (i = 0; i < am && pon; i++) {
1047:     /* Form one row of AP */
1048:     PetscHSetIClear(ht);
1049:     offset = i % dof;
1050:     ii     = i / dof;
1051:     /* If the off-diagonal part is empty, no calculation is needed */
1052:     nzi = po->i[ii + 1] - po->i[ii];
1053:     if (!nzi) continue;

1055:     MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht);
1056:     PetscHSetIGetSize(ht, &htsize);
1057:     /* If AP is empty, just continue */
1058:     if (!htsize) continue;
1059:     /* Form C(ii, :) */
1060:     poj = po->j + po->i[ii];
1061:     for (j = 0; j < nzi; j++) PetscHSetIUpdate(hta[poj[j] * dof + offset], ht);
1062:   }

1064:   for (i = 0; i < pon; i++) {
1065:     PetscHSetIGetSize(hta[i], &htsize);
1066:     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1067:     c_rmtc[i]           = htsize;
1068:   }

1070:   PetscMalloc1(ptap->c_rmti[pon], &c_rmtj);

1072:   for (i = 0; i < pon; i++) {
1073:     off = 0;
1074:     PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]);
1075:     PetscHSetIDestroy(&hta[i]);
1076:   }
1077:   PetscFree(hta);

1079:   PetscMalloc1(pon, &iremote);
1080:   for (i = 0; i < pon; i++) {
1081:     owner  = 0;
1082:     lidx   = 0;
1083:     offset = i % dof;
1084:     ii     = i / dof;
1085:     PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx);
1086:     iremote[i].index = lidx * dof + offset;
1087:     iremote[i].rank  = owner;
1088:   }

1090:   PetscSFCreate(comm, &sf);
1091:   PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1092:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1093:   PetscSFSetRankOrder(sf, PETSC_TRUE);
1094:   PetscSFSetFromOptions(sf);
1095:   PetscSFSetUp(sf);
1096:   /* How many neighbors have contributions to my rows? */
1097:   PetscSFComputeDegreeBegin(sf, &rootdegrees);
1098:   PetscSFComputeDegreeEnd(sf, &rootdegrees);
1099:   rootspacesize = 0;
1100:   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1101:   PetscMalloc1(rootspacesize, &rootspace);
1102:   PetscMalloc1(rootspacesize + 1, &rootspaceoffsets);
1103:   /* Gather from the leaves the number of columns
1104:    * that other ranks contribute to my rows
1105:    */
1106:   PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace);
1107:   PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace);
1108:   PetscFree(c_rmtc);
1109:   PetscCalloc1(pn + 1, &ptap->c_othi);
1110:   /* Count the number of columns received for each row */
1111:   ptap->c_othi[0]     = 0;
1112:   rootspacesize       = 0;
1113:   rootspaceoffsets[0] = 0;
1114:   for (i = 0; i < pn; i++) {
1115:     rcvncols = 0;
1116:     for (j = 0; j < rootdegrees[i]; j++) {
1117:       rcvncols += rootspace[rootspacesize];
1118:       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1119:       rootspacesize++;
1120:     }
1121:     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1122:   }
1123:   PetscFree(rootspace);

1125:   PetscMalloc1(pon, &c_rmtoffsets);
1126:   PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1127:   PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1128:   PetscSFDestroy(&sf);
1129:   PetscFree(rootspaceoffsets);
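        /* At this point each off-diagonal P column i (a future remote row contribution) knows, via
           c_rmtoffsets[i], where its staged column indices start inside the receive buffer of the
           owning rank.  The second star forest created below therefore has one leaf per staged
           nonzero and one root per slot of c_othj, so a single PetscSFReduce with MPI_REPLACE acts
           as a one-to-one scatter of the column indices. */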

1131:   PetscCalloc1(ptap->c_rmti[pon], &iremote);
1132:   nleaves = 0;
1133:   for (i = 0; i < pon; i++) {
1134:     owner = 0;
1135:     ii    = i / dof;
1136:     PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL);
1137:     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1138:     for (j = 0; j < sendncols; j++) {
1139:       iremote[nleaves].rank    = owner;
1140:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1141:     }
1142:   }
1143:   PetscFree(c_rmtoffsets);
1144:   PetscCalloc1(ptap->c_othi[pn], &c_othj);

1146:   PetscSFCreate(comm, &ptap->sf);
1147:   PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1148:   PetscSFSetFromOptions(ptap->sf);
1149:   /* One to one map */
1150:   PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);

1152:   PetscMalloc2(pn, &dnz, pn, &onz);
1153:   PetscHSetICreate(&oht);
1154:   MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
1155:   pcstart *= dof;
1156:   pcend *= dof;
1157:   PetscMalloc2(pn, &hta, pn, &hto);
1158:   for (i = 0; i < pn; i++) {
1159:     PetscHSetICreate(&hta[i]);
1160:     PetscHSetICreate(&hto[i]);
1161:   }
1162:   /* Work on local part */
1163:   /* 4) Pass 1: Estimate memory for C_loc */
1164:   for (i = 0; i < am && pn; i++) {
1165:     PetscHSetIClear(ht);
1166:     PetscHSetIClear(oht);
1167:     offset = i % dof;
1168:     ii     = i / dof;
1169:     nzi    = pd->i[ii + 1] - pd->i[ii];
1170:     if (!nzi) continue;

1172:     MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht);
1173:     PetscHSetIGetSize(ht, &htsize);
1174:     PetscHSetIGetSize(oht, &htosize);
1175:     if (!(htsize + htosize)) continue;
1176:     /* Form C(ii, :) */
1177:     pdj = pd->j + pd->i[ii];
1178:     for (j = 0; j < nzi; j++) {
1179:       PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht);
1180:       PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht);
1181:     }
1182:   }

1184:   ISRestoreIndices(map, &mappingindices);

1186:   PetscHSetIDestroy(&ht);
1187:   PetscHSetIDestroy(&oht);

1189:   /* Get remote data */
1190:   PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1191:   PetscFree(c_rmtj);

1193:   for (i = 0; i < pn; i++) {
1194:     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1195:     rdj = c_othj + ptap->c_othi[i];
1196:     for (j = 0; j < nzi; j++) {
1197:       col = rdj[j];
1198:       /* diag part */
1199:       if (col >= pcstart && col < pcend) {
1200:         PetscHSetIAdd(hta[i], col);
1201:       } else { /* off diag */
1202:         PetscHSetIAdd(hto[i], col);
1203:       }
1204:     }
1205:     PetscHSetIGetSize(hta[i], &htsize);
1206:     dnz[i] = htsize;
1207:     PetscHSetIDestroy(&hta[i]);
1208:     PetscHSetIGetSize(hto[i], &htsize);
1209:     onz[i] = htsize;
1210:     PetscHSetIDestroy(&hto[i]);
1211:   }

1213:   PetscFree2(hta, hto);
1214:   PetscFree(c_othj);

1216:   /* local sizes and preallocation */
1217:   MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1218:   MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs);
1219:   MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
1220:   MatSetUp(Cmpi);
1221:   PetscFree2(dnz, onz);

1223:   /* attach the supporting struct to Cmpi for reuse */
1224:   Cmpi->product->data    = ptap;
1225:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1226:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1228:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1229:   Cmpi->assembled = PETSC_FALSE;
1230:   /* pick an algorithm */
1231:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1232:   alg = 0;
1233:   PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg);
1234:   PetscOptionsEnd();
1235:   switch (alg) {
1236:   case 0:
1237:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1238:     break;
1239:   case 1:
1240:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1241:     break;
1242:   default:
1243:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1244:   }
1245:   return 0;
1246: }
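      /*
        The numeric kernel paired with this symbolic phase can be switched at runtime through the
        option registered above, e.g.

          -matptap_allatonce_via merged

        which selects MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged instead of the default
        overlapping kernel.
      */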

1248: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1249: {
1250:   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C);
1251:   return 0;
1252: }

1254: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1255: {
1256:   Mat_APMPI      *ptap;
1257:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1258:   MPI_Comm        comm;
1259:   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1260:   MatType         mtype;
1261:   PetscSF         sf;
1262:   PetscSFNode    *iremote;
1263:   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1264:   const PetscInt *rootdegrees;
1265:   PetscHSetI      ht, oht, *hta, *hto, *htd;
1266:   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1267:   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1268:   PetscInt        nalg = 2, alg = 0, offset, ii;
1269:   PetscMPIInt     owner;
1270:   PetscBool       flg;
1271:   const char     *algTypes[2] = {"merged", "overlapping"};
1272:   const PetscInt *mappingindices;
1273:   IS              map;

1275:   MatCheckProduct(Cmpi, 5);
1277:   PetscObjectGetComm((PetscObject)A, &comm);

1279:   /* Create symbolic parallel matrix Cmpi */
1280:   MatGetLocalSize(P, NULL, &pn);
1281:   pn *= dof;
1282:   MatGetType(A, &mtype);
1283:   MatSetType(Cmpi, mtype);
1284:   MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);

1286:   PetscNew(&ptap);
1287:   ptap->reuse   = MAT_INITIAL_MATRIX;
1288:   ptap->algType = 3;

1290:   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1291:   MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth);
1292:   PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);

1294:   /* This equals the number of off-diagonal columns in P */
1295:   MatGetLocalSize(p->B, NULL, &pon);
1296:   pon *= dof;
1297:   /* offsets */
1298:   PetscMalloc1(pon + 1, &ptap->c_rmti);
1299:   /* The number of columns we will send to remote ranks */
1300:   PetscMalloc1(pon, &c_rmtc);
1301:   PetscMalloc1(pon, &hta);
1302:   for (i = 0; i < pon; i++) PetscHSetICreate(&hta[i]);
1303:   MatGetLocalSize(A, &am, NULL);
1304:   MatGetOwnershipRange(A, &arstart, &arend);
1305:   /* Create hash table to merge all columns for C(i, :) */
1306:   PetscHSetICreate(&ht);
1307:   PetscHSetICreate(&oht);
1308:   PetscMalloc2(pn, &htd, pn, &hto);
1309:   for (i = 0; i < pn; i++) {
1310:     PetscHSetICreate(&htd[i]);
1311:     PetscHSetICreate(&hto[i]);
1312:   }

1314:   ISGetIndices(map, &mappingindices);
1315:   ptap->c_rmti[0] = 0;
1316:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1317:   for (i = 0; i < am && (pon || pn); i++) {
1318:     /* Form one row of AP */
1319:     PetscHSetIClear(ht);
1320:     PetscHSetIClear(oht);
1321:     offset = i % dof;
1322:     ii     = i / dof;
1323:     /* If both the diagonal and off-diagonal parts are empty, no calculation is needed */
1324:     nzi  = po->i[ii + 1] - po->i[ii];
1325:     dnzi = pd->i[ii + 1] - pd->i[ii];
1326:     if (!nzi && !dnzi) continue;

1328:     MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht);
1329:     PetscHSetIGetSize(ht, &htsize);
1330:     PetscHSetIGetSize(oht, &htosize);
1331:     /* If AP is empty, just continue */
1332:     if (!(htsize + htosize)) continue;

1334:     /* Form remote C(ii, :) */
1335:     poj = po->j + po->i[ii];
1336:     for (j = 0; j < nzi; j++) {
1337:       PetscHSetIUpdate(hta[poj[j] * dof + offset], ht);
1338:       PetscHSetIUpdate(hta[poj[j] * dof + offset], oht);
1339:     }

1341:     /* Form local C(ii, :) */
1342:     pdj = pd->j + pd->i[ii];
1343:     for (j = 0; j < dnzi; j++) {
1344:       PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht);
1345:       PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht);
1346:     }
1347:   }
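        /* Unlike the non-merged symbolic phase, the single pass above fills both the remote hash
           sets hta[] (through the Po columns) and the local diagonal/off-diagonal hash sets
           htd[]/hto[] (through the Pd columns) from one evaluation of the AP row, so the rows of A
           are traversed only once. */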

1349:   ISRestoreIndices(map, &mappingindices);

1351:   PetscHSetIDestroy(&ht);
1352:   PetscHSetIDestroy(&oht);

1354:   for (i = 0; i < pon; i++) {
1355:     PetscHSetIGetSize(hta[i], &htsize);
1356:     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1357:     c_rmtc[i]           = htsize;
1358:   }

1360:   PetscMalloc1(ptap->c_rmti[pon], &c_rmtj);

1362:   for (i = 0; i < pon; i++) {
1363:     off = 0;
1364:     PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]);
1365:     PetscHSetIDestroy(&hta[i]);
1366:   }
1367:   PetscFree(hta);

1369:   PetscMalloc1(pon, &iremote);
1370:   for (i = 0; i < pon; i++) {
1371:     owner  = 0;
1372:     lidx   = 0;
1373:     offset = i % dof;
1374:     ii     = i / dof;
1375:     PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx);
1376:     iremote[i].index = lidx * dof + offset;
1377:     iremote[i].rank  = owner;
1378:   }

1380:   PetscSFCreate(comm, &sf);
1381:   PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1382:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1383:   PetscSFSetRankOrder(sf, PETSC_TRUE);
1384:   PetscSFSetFromOptions(sf);
1385:   PetscSFSetUp(sf);
1386:   /* How many neighbors have contributions to my rows? */
1387:   PetscSFComputeDegreeBegin(sf, &rootdegrees);
1388:   PetscSFComputeDegreeEnd(sf, &rootdegrees);
1389:   rootspacesize = 0;
1390:   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1391:   PetscMalloc1(rootspacesize, &rootspace);
1392:   PetscMalloc1(rootspacesize + 1, &rootspaceoffsets);
1393:   /* Gather from the leaves the number of columns
1394:    * that other ranks contribute to my rows
1395:    */
1396:   PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace);
1397:   PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace);
1398:   PetscFree(c_rmtc);
1399:   PetscMalloc1(pn + 1, &ptap->c_othi);
1400:   /* Count the number of columns received for each row */
1401:   ptap->c_othi[0]     = 0;
1402:   rootspacesize       = 0;
1403:   rootspaceoffsets[0] = 0;
1404:   for (i = 0; i < pn; i++) {
1405:     rcvncols = 0;
1406:     for (j = 0; j < rootdegrees[i]; j++) {
1407:       rcvncols += rootspace[rootspacesize];
1408:       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1409:       rootspacesize++;
1410:     }
1411:     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1412:   }
1413:   PetscFree(rootspace);

1415:   PetscMalloc1(pon, &c_rmtoffsets);
1416:   PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1417:   PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1418:   PetscSFDestroy(&sf);
1419:   PetscFree(rootspaceoffsets);

1421:   PetscCalloc1(ptap->c_rmti[pon], &iremote);
1422:   nleaves = 0;
1423:   for (i = 0; i < pon; i++) {
1424:     owner = 0;
1425:     ii    = i / dof;
1426:     PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL);
1427:     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1428:     for (j = 0; j < sendncols; j++) {
1429:       iremote[nleaves].rank    = owner;
1430:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1431:     }
1432:   }
1433:   PetscFree(c_rmtoffsets);
1434:   PetscCalloc1(ptap->c_othi[pn], &c_othj);

1436:   PetscSFCreate(comm, &ptap->sf);
1437:   PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1438:   PetscSFSetFromOptions(ptap->sf);
1439:   /* One to one map */
1440:   PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1441:   /* Get remote data */
1442:   PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1443:   PetscFree(c_rmtj);
1444:   PetscMalloc2(pn, &dnz, pn, &onz);
1445:   MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
1446:   pcstart *= dof;
1447:   pcend *= dof;
1448:   for (i = 0; i < pn; i++) {
1449:     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1450:     rdj = c_othj + ptap->c_othi[i];
1451:     for (j = 0; j < nzi; j++) {
1452:       col = rdj[j];
1453:       /* diag part */
1454:       if (col >= pcstart && col < pcend) {
1455:         PetscHSetIAdd(htd[i], col);
1456:       } else { /* off diag */
1457:         PetscHSetIAdd(hto[i], col);
1458:       }
1459:     }
1460:     PetscHSetIGetSize(htd[i], &htsize);
1461:     dnz[i] = htsize;
1462:     PetscHSetIDestroy(&htd[i]);
1463:     PetscHSetIGetSize(hto[i], &htsize);
1464:     onz[i] = htsize;
1465:     PetscHSetIDestroy(&hto[i]);
1466:   }

1468:   PetscFree2(htd, hto);
1469:   PetscFree(c_othj);

1471:   /* local sizes and preallocation */
1472:   MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1473:   MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs);
1474:   MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
1475:   PetscFree2(dnz, onz);

1477:   /* attach the supporting struct to Cmpi for reuse */
1478:   Cmpi->product->data    = ptap;
1479:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1480:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1482:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1483:   Cmpi->assembled = PETSC_FALSE;
1484:   /* pick an algorithm */
1485:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1486:   alg = 0;
1487:   PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg);
1488:   PetscOptionsEnd();
1489:   switch (alg) {
1490:   case 0:
1491:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1492:     break;
1493:   case 1:
1494:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1495:     break;
1496:   default:
1497:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numeric algorithm");
1498:   }
1499:   return 0;
1500: }
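
/*
  A minimal sketch (not part of the PETSc API surface defined here): one way a caller could
  reach the all-at-once PtAP path whose symbolic phase is set up above, using the same
  MatProduct calls this file itself relies on. It assumes A and P are assembled MPIAIJ
  matrices with compatible layouts; the helper name FormPtAPAllAtOnce is hypothetical.
  The numeric kernel (merged or non-merged) can still be switched at runtime through
  -matptap_allatonce_via, as parsed above.
*/
static PetscErrorCode FormPtAPAllAtOnce(Mat A, Mat P, Mat *C)
{
  MatProductCreate(A, P, NULL, C);         /* product container holding A and P */
  MatProductSetType(*C, MATPRODUCT_PtAP);  /* request C = P^T*A*P */
  MatProductSetAlgorithm(*C, "allatonce"); /* or "allatonce_merged", "scalable", "nonscalable" */
  MatProductSetFill(*C, PETSC_DEFAULT);
  MatProductSetFromOptions(*C);            /* allow options-database overrides */
  MatProductSymbolic(*C);                  /* dispatches into the symbolic routines in this file */
  MatProductNumeric(*C);                   /* fills in the numerical values */
  return 0;
}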

1502: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1503: {
1504:   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C);
1505:   return 0;
1506: }

1508: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1509: {
1510:   Mat_APMPI               *ptap;
1511:   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1512:   MPI_Comm                 comm;
1513:   PetscMPIInt              size, rank;
1514:   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1515:   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1516:   PetscInt                *lnk, i, k, pnz, row, nsend;
1517:   PetscBT                  lnkbt;
1518:   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1519:   PETSC_UNUSED PetscMPIInt icompleted = 0;
1520:   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1521:   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1522:   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1523:   MPI_Request             *swaits, *rwaits;
1524:   MPI_Status              *sstatus, rstatus;
1525:   PetscLayout              rowmap;
1526:   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1527:   PetscMPIInt             *len_r, *id_r;          /* ranks and lengths of the messages to be received */
1528:   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1529:   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1530:   PetscScalar             *apv;
1531:   PetscTable               ta;
1532:   MatType                  mtype;
1533:   const char              *prefix;
1534: #if defined(PETSC_USE_INFO)
1535:   PetscReal apfill;
1536: #endif

1538:   MatCheckProduct(Cmpi, 4);
1540:   PetscObjectGetComm((PetscObject)A, &comm);
1541:   MPI_Comm_size(comm, &size);
1542:   MPI_Comm_rank(comm, &rank);

1544:   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

1546:   /* create symbolic parallel matrix Cmpi */
1547:   MatGetType(A, &mtype);
1548:   MatSetType(Cmpi, mtype);

1550:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1551:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

1553:   /* create struct Mat_APMPI and attach it to C later */
1554:   PetscNew(&ptap);
1555:   ptap->reuse   = MAT_INITIAL_MATRIX;
1556:   ptap->algType = 1;

1558:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1559:   MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth);
1560:   /* get P_loc by taking all local rows of P */
1561:   MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc);

1563:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1564:   /* --------------------------------- */
1565:   MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd);
1566:   MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro);

1568:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1569:   /* ----------------------------------------------------------------- */
1570:   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1571:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

1573:   /* create and initialize a linked list */
1574:   PetscTableCreate(pn, pN, &ta); /* for computing AP_loc and Cmpi */
1575:   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1576:   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1577:   PetscTableGetCount(ta, &Crmax); /* Crmax = nnz(sum of Prows) */
1578:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

1580:   PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt);

1582:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1583:   if (ao) {
1584:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space);
1585:   } else {
1586:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space);
1587:   }
1588:   current_space = free_space;
1589:   nspacedouble  = 0;

1591:   PetscMalloc1(am + 1, &api);
1592:   api[0] = 0;
1593:   for (i = 0; i < am; i++) {
1594:     /* diagonal portion: Ad[i,:]*P */
1595:     ai  = ad->i;
1596:     pi  = p_loc->i;
1597:     nzi = ai[i + 1] - ai[i];
1598:     aj  = ad->j + ai[i];
1599:     for (j = 0; j < nzi; j++) {
1600:       row  = aj[j];
1601:       pnz  = pi[row + 1] - pi[row];
1602:       Jptr = p_loc->j + pi[row];
1603:       /* add non-zero cols of P into the sorted linked list lnk */
1604:       PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt);
1605:     }
1606:     /* off-diagonal portion: Ao[i,:]*P */
1607:     if (ao) {
1608:       ai  = ao->i;
1609:       pi  = p_oth->i;
1610:       nzi = ai[i + 1] - ai[i];
1611:       aj  = ao->j + ai[i];
1612:       for (j = 0; j < nzi; j++) {
1613:         row  = aj[j];
1614:         pnz  = pi[row + 1] - pi[row];
1615:         Jptr = p_oth->j + pi[row];
1616:         PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt);
1617:       }
1618:     }
1619:     apnz       = lnk[0];
1620:     api[i + 1] = api[i] + apnz;
1621:     if (ap_rmax < apnz) ap_rmax = apnz;

1623:     /* if free space is not available, double the total space in the list */
1624:     if (current_space->local_remaining < apnz) {
1625:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space);
1626:       nspacedouble++;
1627:     }

1629:     /* Copy data into free space, then initialize lnk */
1630:     PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt);

1632:     current_space->array += apnz;
1633:     current_space->local_used += apnz;
1634:     current_space->local_remaining -= apnz;
1635:   }
1636:   /* Allocate space for apj and apv, initialize apj, and */
1637:   /* destroy list of free space and other temporary array(s) */
1638:   PetscMalloc2(api[am], &apj, api[am], &apv);
1639:   PetscFreeSpaceContiguous(&free_space, apj);
1640:   PetscLLDestroy(lnk, lnkbt);

1642:   /* Create AP_loc for reuse */
1643:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc);
1644:   MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name);
1645: #if defined(PETSC_USE_INFO)
1646:   if (ao) {
1647:     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1648:   } else {
1649:     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1650:   }
1651:   ptap->AP_loc->info.mallocs           = nspacedouble;
1652:   ptap->AP_loc->info.fill_ratio_given  = fill;
1653:   ptap->AP_loc->info.fill_ratio_needed = apfill;

1655:   if (api[am]) {
1656:     PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill);
1657:     PetscInfo(ptap->AP_loc, "Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill);
1658:   } else {
1659:     PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n");
1660:   }
1661: #endif

1663:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1664:   /* ------------------------------------ */
1665:   MatGetOptionsPrefix(A, &prefix);
1666:   MatSetOptionsPrefix(ptap->Ro, prefix);
1667:   MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_");
1668:   MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth);
1669:   MatGetOptionsPrefix(Cmpi, &prefix);
1670:   MatSetOptionsPrefix(ptap->C_oth, prefix);
1671:   MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_");
1672:   MatProductSetType(ptap->C_oth, MATPRODUCT_AB);
1673:   MatProductSetAlgorithm(ptap->C_oth, "default");
1674:   MatProductSetFill(ptap->C_oth, fill);
1675:   MatProductSetFromOptions(ptap->C_oth);
1676:   MatProductSymbolic(ptap->C_oth);

1678:   /* (3) send coj of C_oth to other processors  */
1679:   /* ------------------------------------------ */
1680:   /* determine row ownership */
1681:   PetscLayoutCreate(comm, &rowmap);
1682:   rowmap->n  = pn;
1683:   rowmap->bs = 1;
1684:   PetscLayoutSetUp(rowmap);
1685:   owners = rowmap->range;

1687:   /* determine the number of messages to send, their lengths */
1688:   PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co);
1689:   PetscArrayzero(len_s, size);
1690:   PetscArrayzero(len_si, size);

1692:   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1693:   coi   = c_oth->i;
1694:   coj   = c_oth->j;
1695:   con   = ptap->C_oth->rmap->n;
1696:   proc  = 0;
1697:   for (i = 0; i < con; i++) {
1698:     while (prmap[i] >= owners[proc + 1]) proc++;
1699:     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1700:     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1701:   }

1703:   len          = 0; /* max length of buf_si[], see (4) */
1704:   owners_co[0] = 0;
1705:   nsend        = 0;
1706:   for (proc = 0; proc < size; proc++) {
1707:     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1708:     if (len_s[proc]) {
1709:       nsend++;
1710:       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1711:       len += len_si[proc];
1712:     }
1713:   }

1715:   /* determine the number and length of messages to receive for coi and coj  */
1716:   PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv);
1717:   PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri);

1719:   /* post the Irecv and Isend of coj */
1720:   PetscCommGetNewTag(comm, &tagj);
1721:   PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits);
1722:   PetscMalloc1(nsend + 1, &swaits);
1723:   for (proc = 0, k = 0; proc < size; proc++) {
1724:     if (!len_s[proc]) continue;
1725:     i = owners_co[proc];
1726:     MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k);
1727:     k++;
1728:   }

1730:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1731:   /* ---------------------------------------- */
1732:   MatSetOptionsPrefix(ptap->Rd, prefix);
1733:   MatAppendOptionsPrefix(ptap->Rd, "inner_diag_");
1734:   MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc);
1735:   MatGetOptionsPrefix(Cmpi, &prefix);
1736:   MatSetOptionsPrefix(ptap->C_loc, prefix);
1737:   MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_");
1738:   MatProductSetType(ptap->C_loc, MATPRODUCT_AB);
1739:   MatProductSetAlgorithm(ptap->C_loc, "default");
1740:   MatProductSetFill(ptap->C_loc, fill);
1741:   MatProductSetFromOptions(ptap->C_loc);
1742:   MatProductSymbolic(ptap->C_loc);

1744:   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

1746:   /* receives coj are complete */
1747:   for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
1748:   PetscFree(rwaits);
1749:   if (nsend) MPI_Waitall(nsend, swaits, sstatus);

1751:   /* add received column indices into ta to update Crmax */
1752:   for (k = 0; k < nrecv; k++) { /* k-th received message */
1753:     Jptr = buf_rj[k];
1754:     for (j = 0; j < len_r[k]; j++) PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES);
1755:   }
1756:   PetscTableGetCount(ta, &Crmax);
1757:   PetscTableDestroy(&ta);

1759:   /* (4) send and recv coi */
1760:   /*-----------------------*/
1761:   PetscCommGetNewTag(comm, &tagi);
1762:   PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits);
1763:   PetscMalloc1(len + 1, &buf_s);
1764:   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1765:   for (proc = 0, k = 0; proc < size; proc++) {
1766:     if (!len_s[proc]) continue;
1767:     /* form outgoing message for i-structure:
1768:          buf_si[0]:                 nrows to be sent
1769:                [1:nrows]:           row index (global)
1770:                [nrows+1:2*nrows+1]: i-structure index
1771:     */
1772:     /*-------------------------------------------*/
1773:     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1774:     buf_si_i    = buf_si + nrows + 1;
1775:     buf_si[0]   = nrows;
1776:     buf_si_i[0] = 0;
1777:     nrows       = 0;
1778:     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1779:       nzi                 = coi[i + 1] - coi[i];
1780:       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
1781:       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
1782:       nrows++;
1783:     }
1784:     MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k);
1785:     k++;
1786:     buf_si += len_si[proc];
1787:   }
1788:   for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
1789:   PetscFree(rwaits);
1790:   if (nsend) MPI_Waitall(nsend, swaits, sstatus);

1792:   PetscFree4(len_s, len_si, sstatus, owners_co);
1793:   PetscFree(len_ri);
1794:   PetscFree(swaits);
1795:   PetscFree(buf_s);

1797:   /* (5) compute the local portion of Cmpi      */
1798:   /* ------------------------------------------ */
1799:   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1800:   PetscFreeSpaceGet(Crmax, &free_space);
1801:   current_space = free_space;

1803:   PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci);
1804:   for (k = 0; k < nrecv; k++) {
1805:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1806:     nrows       = *buf_ri_k[k];
1807:     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
1808:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1809:   }

1811:   MatPreallocateBegin(comm, pn, pn, dnz, onz);
1812:   PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt);
1813:   for (i = 0; i < pn; i++) {
1814:     /* add C_loc into Cmpi */
1815:     nzi  = c_loc->i[i + 1] - c_loc->i[i];
1816:     Jptr = c_loc->j + c_loc->i[i];
1817:     PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt);

1819:     /* add received col data into lnk */
1820:     for (k = 0; k < nrecv; k++) { /* k-th received message */
1821:       if (i == *nextrow[k]) {     /* i-th row */
1822:         nzi  = *(nextci[k] + 1) - *nextci[k];
1823:         Jptr = buf_rj[k] + *nextci[k];
1824:         PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt);
1825:         nextrow[k]++;
1826:         nextci[k]++;
1827:       }
1828:     }
1829:     nzi = lnk[0];

1831:     /* copy data into free space, then initialize lnk */
1832:     PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt);
1833:     MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz);
1834:   }
1835:   PetscFree3(buf_ri_k, nextrow, nextci);
1836:   PetscLLDestroy(lnk, lnkbt);
1837:   PetscFreeSpaceDestroy(free_space);

1839:   /* local sizes and preallocation */
1840:   MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1841:   if (P->cmap->bs > 0) {
1842:     PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs);
1843:     PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs);
1844:   }
1845:   MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
1846:   MatPreallocateEnd(dnz, onz);

1848:   /* members in merge */
1849:   PetscFree(id_r);
1850:   PetscFree(len_r);
1851:   PetscFree(buf_ri[0]);
1852:   PetscFree(buf_ri);
1853:   PetscFree(buf_rj[0]);
1854:   PetscFree(buf_rj);
1855:   PetscLayoutDestroy(&rowmap);

1857:   PetscCalloc1(pN, &ptap->apa);

1859:   /* attach the supporting struct to Cmpi for reuse */
1860:   Cmpi->product->data    = ptap;
1861:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1862:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1864:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1865:   Cmpi->assembled = PETSC_FALSE;
1866:   return 0;
1867: }
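
/*
  A self-contained sketch (not PETSc code) of the row-merge that the condensed linked list
  (PetscLLCondensedAddSorted/PetscLLCondensedClean) performs in the symbolic loop above: for
  one row i of AP = A*P it counts the distinct columns obtained by merging the sparsity
  patterns of the rows of P selected by the nonzeros of A(i,:). Only one CSR pair is handled,
  mirroring the diagonal portion Ad[i,:]*P_loc; the real code additionally merges Ao[i,:]*P_oth
  and keeps the merged columns sorted.
*/
static PetscInt SymbolicAPRowCount(PetscInt i, const PetscInt *ai, const PetscInt *aj, const PetscInt *pi, const PetscInt *pj, PetscBool *marked /* length = number of columns of P, all PETSC_FALSE on entry */)
{
  PetscInt j, k, col, nnz = 0;

  for (j = ai[i]; j < ai[i + 1]; j++) { /* each nonzero A(i,aj[j]) selects row aj[j] of P */
    for (k = pi[aj[j]]; k < pi[aj[j] + 1]; k++) {
      col = pj[k];
      if (!marked[col]) { /* first occurrence of this column in row i of AP */
        marked[col] = PETSC_TRUE;
        nnz++;
      }
    }
  }
  /* reset the work array so it can be reused for the next row */
  for (j = ai[i]; j < ai[i + 1]; j++) {
    for (k = pi[aj[j]]; k < pi[aj[j] + 1]; k++) marked[pj[k]] = PETSC_FALSE;
  }
  return nnz;
}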

1869: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1870: {
1871:   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1872:   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
1873:   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1874:   Mat_APMPI         *ptap;
1875:   Mat                AP_loc, C_loc, C_oth;
1876:   PetscInt           i, rstart, rend, cm, ncols, row;
1877:   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
1878:   PetscScalar       *apa;
1879:   const PetscInt    *cols;
1880:   const PetscScalar *vals;

1882:   MatCheckProduct(C, 3);
1883:   ptap = (Mat_APMPI *)C->product->data;

1887:   MatZeroEntries(C);
1888:   /* 1) get R = Pd^T,Ro = Po^T */
1889:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1890:     MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd);
1891:     MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro);
1892:   }

1894:   /* 2) get AP_loc */
1895:   AP_loc = ptap->AP_loc;
1896:   ap     = (Mat_SeqAIJ *)AP_loc->data;

1898:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1899:   /*-----------------------------------------------------*/
1900:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1901:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1902:     MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth);
1903:     MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc);
1904:   }

1906:   /* 2-2) compute numeric A_loc*P - dominating part */
1907:   /* ---------------------------------------------- */
1908:   /* get data from symbolic products */
1909:   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1910:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1911:   apa = ptap->apa;
1912:   api = ap->i;
1913:   apj = ap->j;
1914:   for (i = 0; i < am; i++) {
1915:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1916:     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1917:     apnz = api[i + 1] - api[i];
1918:     for (j = 0; j < apnz; j++) {
1919:       col                 = apj[j + api[i]];
1920:       ap->a[j + ap->i[i]] = apa[col];
1921:       apa[col]            = 0.0;
1922:     }
1923:   }
1924:   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1925:   PetscObjectStateIncrease((PetscObject)AP_loc);

1927:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1928:   MatProductNumeric(ptap->C_loc);
1929:   MatProductNumeric(ptap->C_oth);
1930:   C_loc = ptap->C_loc;
1931:   C_oth = ptap->C_oth;

1933:   /* add C_loc and C_oth to C */
1934:   MatGetOwnershipRange(C, &rstart, &rend);

1936:   /* C_loc -> C */
1937:   cm    = C_loc->rmap->N;
1938:   c_seq = (Mat_SeqAIJ *)C_loc->data;
1939:   cols  = c_seq->j;
1940:   vals  = c_seq->a;

1942:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat() can only be used when C->was_assembled is PETSC_FALSE and */
1943:   /* when there are no off-process parts. */
1944:   /* If was_assembled is true, the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat() */
1945:   /* is no longer valid. The more general MatSetValues_MPIAIJ() must then be used, where the column index is looked up from */
1946:   /* a table and additional bookkeeping is performed. */
1947:   if (C->assembled) {
1948:     C->was_assembled = PETSC_TRUE;
1949:     C->assembled     = PETSC_FALSE;
1950:   }
1951:   if (C->was_assembled) {
1952:     for (i = 0; i < cm; i++) {
1953:       ncols = c_seq->i[i + 1] - c_seq->i[i];
1954:       row   = rstart + i;
1955:       MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES);
1956:       cols += ncols;
1957:       vals += ncols;
1958:     }
1959:   } else {
1960:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a);
1961:   }

1963:   /* Co -> C, off-processor part */
1964:   cm    = C_oth->rmap->N;
1965:   c_seq = (Mat_SeqAIJ *)C_oth->data;
1966:   cols  = c_seq->j;
1967:   vals  = c_seq->a;
1968:   for (i = 0; i < cm; i++) {
1969:     ncols = c_seq->i[i + 1] - c_seq->i[i];
1970:     row   = p->garray[i];
1971:     MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES);
1972:     cols += ncols;
1973:     vals += ncols;
1974:   }

1976:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
1977:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

1979:   ptap->reuse = MAT_REUSE_MATRIX;
1980:   return 0;
1981: }
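
/*
  A sketch (not the AProw_nonscalable() macro itself) of the dense-accumulator step used in the
  numeric loop above for the diagonal contribution AP(i,:) += Ad(i,:)*P_loc; the off-diagonal
  contribution Ao(i,:)*P_oth is handled the same way. apa is a dense work array over all global
  columns of P, assumed zero on entry; the result is gathered into the precomputed pattern
  (api, apj) with values apv, and apa is zeroed again, exactly as the compression loop above does.
*/
static void DenseAccumulateAPRow(PetscInt i, const PetscInt *adi, const PetscInt *adj, const PetscScalar *ada, const PetscInt *pi, const PetscInt *pj, const PetscScalar *pa, const PetscInt *api, const PetscInt *apj, PetscScalar *apv, PetscScalar *apa)
{
  PetscInt j, k, row, col;

  for (j = adi[i]; j < adi[i + 1]; j++) { /* sparse axpy: apa += A(i,row) * P(row,:) */
    row = adj[j];
    for (k = pi[row]; k < pi[row + 1]; k++) apa[pj[k]] += ada[j] * pa[k];
  }
  for (j = api[i]; j < api[i + 1]; j++) { /* compress the dense row into AP's CSR values, resetting apa */
    col      = apj[j];
    apv[j]   = apa[col];
    apa[col] = 0.0;
  }
}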

1983: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1984: {
1985:   Mat_Product        *product = C->product;
1986:   Mat                 A = product->A, P = product->B;
1987:   MatProductAlgorithm alg  = product->alg;
1988:   PetscReal           fill = product->fill;
1989:   PetscBool           flg;

1991:   /* scalable: do R=P^T locally, then C=R*A*P */
1992:   PetscStrcmp(alg, "scalable", &flg);
1993:   if (flg) {
1994:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C);
1995:     C->ops->productnumeric = MatProductNumeric_PtAP;
1996:     goto next;
1997:   }

1999:   /* nonscalable: do R=P^T locally, then C=R*A*P */
2000:   PetscStrcmp(alg, "nonscalable", &flg);
2001:   if (flg) {
2002:     MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C);
2003:     goto next;
2004:   }

2006:   /* allatonce */
2007:   PetscStrcmp(alg, "allatonce", &flg);
2008:   if (flg) {
2009:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C);
2010:     goto next;
2011:   }

2013:   /* allatonce_merged */
2014:   PetscStrcmp(alg, "allatonce_merged", &flg);
2015:   if (flg) {
2016:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C);
2017:     goto next;
2018:   }

2020:   /* backend general code */
2021:   PetscStrcmp(alg, "backend", &flg);
2022:   if (flg) {
2023:     MatProductSymbolic_MPIAIJBACKEND(C);
2024:     return 0;
2025:   }

2027:   /* hypre */
2028: #if defined(PETSC_HAVE_HYPRE)
2029:   PetscStrcmp(alg, "hypre", &flg);
2030:   if (flg) {
2031:     MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C);
2032:     C->ops->productnumeric = MatProductNumeric_PtAP;
2033:     return 0;
2034:   }
2035: #endif
2036:   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

2038: next:
2039:   C->ops->productnumeric = MatProductNumeric_PtAP;
2040:   return 0;
2041: }
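
/*
  A minimal usage sketch, assuming A and P are assembled MPIAIJ matrices with compatible
  layouts: the classic MatPtAP() entry point exercises the dispatcher above, and a second call
  with MAT_REUSE_MATRIX takes the numeric-only path that reuses the Mat_APMPI data attached to
  C during the first (symbolic + numeric) call. The helper name PtAPInitialThenReuse is
  hypothetical.
*/
static PetscErrorCode PtAPInitialThenReuse(Mat A, Mat P, Mat *C)
{
  /* first call: symbolic + numeric, creates C and its supporting structure */
  MatPtAP(A, P, MAT_INITIAL_MATRIX, PETSC_DEFAULT, C);
  /* ... update the numerical values of A, keeping its nonzero pattern ... */
  /* later calls: numeric phase only */
  MatPtAP(A, P, MAT_REUSE_MATRIX, PETSC_DEFAULT, C);
  return 0;
}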