Actual source code: matmatmult.c


  2: /*
  3:   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
  4:           C = A * B
  5: */

  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <../src/mat/utils/freespace.h>
  9: #include <petscbt.h>
 10: #include <petsc/private/isimpl.h>
 11: #include <../src/mat/impls/dense/seq/dense.h>

 13: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
 14: {
 15:   if (C->ops->matmultnumeric) (*C->ops->matmultnumeric)(A, B, C);
 16:   else MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C);
 17:   return 0;
 18: }

 20: /* Modified from MatCreateSeqAIJWithArrays() */
 21: PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
 22: {
 23:   PetscInt    ii;
 24:   Mat_SeqAIJ *aij;
 25:   PetscBool   isseqaij, osingle, ofree_a, ofree_ij;

 28:   MatSetSizes(mat, m, n, m, n);

 30:   if (!mtype) {
 31:     PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij);
 32:     if (!isseqaij) MatSetType(mat, MATSEQAIJ);
 33:   } else {
 34:     MatSetType(mat, mtype);
 35:   }

 37:   aij      = (Mat_SeqAIJ *)(mat)->data;
 38:   osingle  = aij->singlemalloc;
 39:   ofree_a  = aij->free_a;
 40:   ofree_ij = aij->free_ij;
 41:   /* changes the free flags */
 42:   MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL);

 44:   PetscFree(aij->ilen);
 45:   PetscFree(aij->imax);
 46:   PetscMalloc1(m, &aij->imax);
 47:   PetscMalloc1(m, &aij->ilen);
 48:   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
 49:     const PetscInt rnz = i[ii + 1] - i[ii];
 50:     aij->nonzerorowcnt += !!rnz;
 51:     aij->rmax     = PetscMax(aij->rmax, rnz);
 52:     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
 53:   }
 54:   aij->maxnz = i[m];
 55:   aij->nz    = i[m];

 57:   if (osingle) {
 58:     PetscFree3(aij->a, aij->j, aij->i);
 59:   } else {
 60:     if (ofree_a) PetscFree(aij->a);
 61:     if (ofree_ij) PetscFree(aij->j);
 62:     if (ofree_ij) PetscFree(aij->i);
 63:   }
 64:   aij->i     = i;
 65:   aij->j     = j;
 66:   aij->a     = a;
 67:   aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
 68:   /* default to not retain ownership */
 69:   aij->singlemalloc = PETSC_FALSE;
 70:   aij->free_a       = PETSC_FALSE;
 71:   aij->free_ij      = PETSC_FALSE;
 72:   MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6);
 73:   return 0;
 74: }

 76: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
 77: {
 78:   Mat_Product        *product = C->product;
 79:   MatProductAlgorithm alg;
 80:   PetscBool           flg;

 82:   if (product) {
 83:     alg = product->alg;
 84:   } else {
 85:     alg = "sorted";
 86:   }
 87:   /* sorted */
 88:   PetscStrcmp(alg, "sorted", &flg);
 89:   if (flg) {
 90:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C);
 91:     return 0;
 92:   }

 94:   /* scalable */
 95:   PetscStrcmp(alg, "scalable", &flg);
 96:   if (flg) {
 97:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C);
 98:     return 0;
 99:   }

101:   /* scalable_fast */
102:   PetscStrcmp(alg, "scalable_fast", &flg);
103:   if (flg) {
104:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C);
105:     return 0;
106:   }

108:   /* heap */
109:   PetscStrcmp(alg, "heap", &flg);
110:   if (flg) {
111:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C);
112:     return 0;
113:   }

115:   /* btheap */
116:   PetscStrcmp(alg, "btheap", &flg);
117:   if (flg) {
118:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C);
119:     return 0;
120:   }

122:   /* llcondensed */
123:   PetscStrcmp(alg, "llcondensed", &flg);
124:   if (flg) {
125:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C);
126:     return 0;
127:   }

129:   /* rowmerge */
130:   PetscStrcmp(alg, "rowmerge", &flg);
131:   if (flg) {
132:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C);
133:     return 0;
134:   }

136: #if defined(PETSC_HAVE_HYPRE)
137:   PetscStrcmp(alg, "hypre", &flg);
138:   if (flg) {
139:     MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C);
140:     return 0;
141:   }
142: #endif

144:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
145: }

147: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
148: {
149:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
150:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
151:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
152:   PetscReal          afill;
153:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
154:   PetscTable         ta;
155:   PetscBT            lnkbt;
156:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

158:   /* Get ci and cj */
159:   /*---------------*/
160:   /* Allocate ci array, arrays for fill computation and */
161:   /* free space for accumulating nonzero column info */
162:   PetscMalloc1(am + 2, &ci);
163:   ci[0] = 0;

165:   /* create and initialize a linked list */
166:   PetscTableCreate(bn, bn, &ta);
167:   MatRowMergeMax_SeqAIJ(b, bm, ta);
168:   PetscTableGetCount(ta, &Crmax);
169:   PetscTableDestroy(&ta);

171:   PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt);

173:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
174:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space);

176:   current_space = free_space;

178:   /* Determine ci and cj */
179:   for (i = 0; i < am; i++) {
180:     anzi = ai[i + 1] - ai[i];
181:     aj   = a->j + ai[i];
182:     for (j = 0; j < anzi; j++) {
183:       brow = aj[j];
184:       bnzj = bi[brow + 1] - bi[brow];
185:       bj   = b->j + bi[brow];
186:       /* add non-zero cols of B into the sorted linked list lnk */
187:       PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt);
188:     }
189:     /* add possible missing diagonal entry */
190:     if (C->force_diagonals) PetscLLCondensedAddSorted(1, &i, lnk, lnkbt);
191:     cnzi = lnk[0];

193:     /* If free space is not available, make more free space */
194:     /* Double the amount of total space in the list */
195:     if (current_space->local_remaining < cnzi) {
196:       PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space);
197:       ndouble++;
198:     }

200:     /* Copy data into free space, then initialize lnk */
201:     PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt);

203:     current_space->array += cnzi;
204:     current_space->local_used += cnzi;
205:     current_space->local_remaining -= cnzi;

207:     ci[i + 1] = ci[i] + cnzi;
208:   }

210:   /* Column indices are in the list of free space */
211:   /* Allocate space for cj, initialize cj, and */
212:   /* destroy list of free space and other temporary array(s) */
213:   PetscMalloc1(ci[am] + 1, &cj);
214:   PetscFreeSpaceContiguous(&free_space, cj);
215:   PetscLLCondensedDestroy(lnk, lnkbt);

217:   /* put together the new symbolic matrix */
218:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C);
219:   MatSetBlockSizesFromMats(C, A, B);

221:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
222:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
223:   c          = (Mat_SeqAIJ *)(C->data);
224:   c->free_a  = PETSC_FALSE;
225:   c->free_ij = PETSC_TRUE;
226:   c->nonew   = 0;

228:   /* fast, needs non-scalable O(bn) array 'abdense' */
229:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

231:   /* set MatInfo */
232:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
233:   if (afill < 1.0) afill = 1.0;
234:   C->info.mallocs           = ndouble;
235:   C->info.fill_ratio_given  = fill;
236:   C->info.fill_ratio_needed = afill;

238: #if defined(PETSC_USE_INFO)
239:   if (ci[am]) {
240:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
241:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
242:   } else {
243:     PetscInfo(C, "Empty matrix product\n");
244:   }
245: #endif
246:   return 0;
247: }

249: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
250: {
251:   PetscLogDouble     flops = 0.0;
252:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
253:   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
254:   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
255:   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
256:   PetscInt           am = A->rmap->n, cm = C->rmap->n;
257:   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
258:   PetscScalar       *ca, valtmp;
259:   PetscScalar       *ab_dense;
260:   PetscContainer     cab_dense;
261:   const PetscScalar *aa, *ba, *baj;

263:   MatSeqAIJGetArrayRead(A, &aa);
264:   MatSeqAIJGetArrayRead(B, &ba);
265:   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
266:     PetscMalloc1(ci[cm] + 1, &ca);
267:     c->a      = ca;
268:     c->free_a = PETSC_TRUE;
269:   } else ca = c->a;

271:   /* TODO this should be done in the symbolic phase */
272:   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
273:      that is hard to eradicate) */
274:   PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense);
275:   if (!cab_dense) {
276:     PetscMalloc1(B->cmap->N, &ab_dense);
277:     PetscContainerCreate(PETSC_COMM_SELF, &cab_dense);
278:     PetscContainerSetPointer(cab_dense, ab_dense);
279:     PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault);
280:     PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense);
281:     PetscObjectDereference((PetscObject)cab_dense);
282:   }
283:   PetscContainerGetPointer(cab_dense, (void **)&ab_dense);
284:   PetscArrayzero(ab_dense, B->cmap->N);

286:   /* clean old values in C */
287:   PetscArrayzero(ca, ci[cm]);
288:   /* Traverse A row-wise. */
289:   /* Build the ith row in C by summing over nonzero columns in A, */
290:   /* the rows of B corresponding to nonzeros of A. */
291:   for (i = 0; i < am; i++) {
292:     anzi = ai[i + 1] - ai[i];
293:     for (j = 0; j < anzi; j++) {
294:       brow = aj[j];
295:       bnzi = bi[brow + 1] - bi[brow];
296:       bjj  = bj + bi[brow];
297:       baj  = ba + bi[brow];
298:       /* perform dense axpy */
299:       valtmp = aa[j];
300:       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
301:       flops += 2 * bnzi;
302:     }
303:     aj += anzi;
304:     aa += anzi;

306:     cnzi = ci[i + 1] - ci[i];
307:     for (k = 0; k < cnzi; k++) {
308:       ca[k] += ab_dense[cj[k]];
309:       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
310:     }
311:     flops += cnzi;
312:     cj += cnzi;
313:     ca += cnzi;
314:   }
315: #if defined(PETSC_HAVE_DEVICE)
316:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
317: #endif
318:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
319:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
320:   PetscLogFlops(flops);
321:   MatSeqAIJRestoreArrayRead(A, &aa);
322:   MatSeqAIJRestoreArrayRead(B, &ba);
323:   return 0;
324: }

326: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
327: {
328:   PetscLogDouble     flops = 0.0;
329:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
330:   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
331:   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
332:   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
333:   PetscInt           am = A->rmap->N, cm = C->rmap->N;
334:   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
335:   PetscScalar       *ca = c->a, valtmp;
336:   const PetscScalar *aa, *ba, *baj;
337:   PetscInt           nextb;

339:   MatSeqAIJGetArrayRead(A, &aa);
340:   MatSeqAIJGetArrayRead(B, &ba);
341:   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
342:     PetscMalloc1(ci[cm] + 1, &ca);
343:     c->a      = ca;
344:     c->free_a = PETSC_TRUE;
345:   }

347:   /* clean old values in C */
348:   PetscArrayzero(ca, ci[cm]);
349:   /* Traverse A row-wise. */
350:   /* Build the ith row in C by summing over nonzero columns in A, */
351:   /* the rows of B corresponding to nonzeros of A. */
352:   for (i = 0; i < am; i++) {
353:     anzi = ai[i + 1] - ai[i];
354:     cnzi = ci[i + 1] - ci[i];
355:     for (j = 0; j < anzi; j++) {
356:       brow = aj[j];
357:       bnzi = bi[brow + 1] - bi[brow];
358:       bjj  = bj + bi[brow];
359:       baj  = ba + bi[brow];
360:       /* perform sparse axpy */
361:       valtmp = aa[j];
362:       nextb  = 0;
363:       for (k = 0; nextb < bnzi; k++) {
364:         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
365:           ca[k] += valtmp * baj[nextb++];
366:         }
367:       }
368:       flops += 2 * bnzi;
369:     }
370:     aj += anzi;
371:     aa += anzi;
372:     cj += cnzi;
373:     ca += cnzi;
374:   }
375: #if defined(PETSC_HAVE_DEVICE)
376:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
377: #endif
378:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
379:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
380:   PetscLogFlops(flops);
381:   MatSeqAIJRestoreArrayRead(A, &aa);
382:   MatSeqAIJRestoreArrayRead(B, &ba);
383:   return 0;
384: }

386: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
387: {
388:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
389:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
390:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
391:   MatScalar         *ca;
392:   PetscReal          afill;
393:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
394:   PetscTable         ta;
395:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

397:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
398:   /*-----------------------------------------------------------------------------------------*/
399:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
400:   PetscMalloc1(am + 2, &ci);
401:   ci[0] = 0;

403:   /* create and initialize a linked list */
404:   PetscTableCreate(bn, bn, &ta);
405:   MatRowMergeMax_SeqAIJ(b, bm, ta);
406:   PetscTableGetCount(ta, &Crmax);
407:   PetscTableDestroy(&ta);

409:   PetscLLCondensedCreate_fast(Crmax, &lnk);

411:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
412:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space);
413:   current_space = free_space;

415:   /* Determine ci and cj */
416:   for (i = 0; i < am; i++) {
417:     anzi = ai[i + 1] - ai[i];
418:     aj   = a->j + ai[i];
419:     for (j = 0; j < anzi; j++) {
420:       brow = aj[j];
421:       bnzj = bi[brow + 1] - bi[brow];
422:       bj   = b->j + bi[brow];
423:       /* add non-zero cols of B into the sorted linked list lnk */
424:       PetscLLCondensedAddSorted_fast(bnzj, bj, lnk);
425:     }
426:     /* add possible missing diagonal entry */
427:     if (C->force_diagonals) PetscLLCondensedAddSorted_fast(1, &i, lnk);
428:     cnzi = lnk[1];

430:     /* If free space is not available, make more free space */
431:     /* Double the amount of total space in the list */
432:     if (current_space->local_remaining < cnzi) {
433:       PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space);
434:       ndouble++;
435:     }

437:     /* Copy data into free space, then initialize lnk */
438:     PetscLLCondensedClean_fast(cnzi, current_space->array, lnk);

440:     current_space->array += cnzi;
441:     current_space->local_used += cnzi;
442:     current_space->local_remaining -= cnzi;

444:     ci[i + 1] = ci[i] + cnzi;
445:   }

447:   /* Column indices are in the list of free space */
448:   /* Allocate space for cj, initialize cj, and */
449:   /* destroy list of free space and other temporary array(s) */
450:   PetscMalloc1(ci[am] + 1, &cj);
451:   PetscFreeSpaceContiguous(&free_space, cj);
452:   PetscLLCondensedDestroy_fast(lnk);

454:   /* Allocate space for ca */
455:   PetscCalloc1(ci[am] + 1, &ca);

457:   /* put together the new symbolic matrix */
458:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C);
459:   MatSetBlockSizesFromMats(C, A, B);

461:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
462:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
463:   c          = (Mat_SeqAIJ *)(C->data);
464:   c->free_a  = PETSC_TRUE;
465:   c->free_ij = PETSC_TRUE;
466:   c->nonew   = 0;

468:   /* slower, less memory */
469:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

471:   /* set MatInfo */
472:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
473:   if (afill < 1.0) afill = 1.0;
474:   C->info.mallocs           = ndouble;
475:   C->info.fill_ratio_given  = fill;
476:   C->info.fill_ratio_needed = afill;

478: #if defined(PETSC_USE_INFO)
479:   if (ci[am]) {
480:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
481:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
482:   } else {
483:     PetscInfo(C, "Empty matrix product\n");
484:   }
485: #endif
486:   return 0;
487: }

489: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
490: {
491:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
492:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
493:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
494:   MatScalar         *ca;
495:   PetscReal          afill;
496:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
497:   PetscTable         ta;
498:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

500:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
501:   /*---------------------------------------------------------------------------------------------*/
502:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
503:   PetscMalloc1(am + 2, &ci);
504:   ci[0] = 0;

506:   /* create and initialize a linked list */
507:   PetscTableCreate(bn, bn, &ta);
508:   MatRowMergeMax_SeqAIJ(b, bm, ta);
509:   PetscTableGetCount(ta, &Crmax);
510:   PetscTableDestroy(&ta);
511:   PetscLLCondensedCreate_Scalable(Crmax, &lnk);

513:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
514:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space);
515:   current_space = free_space;

517:   /* Determine ci and cj */
518:   for (i = 0; i < am; i++) {
519:     anzi = ai[i + 1] - ai[i];
520:     aj   = a->j + ai[i];
521:     for (j = 0; j < anzi; j++) {
522:       brow = aj[j];
523:       bnzj = bi[brow + 1] - bi[brow];
524:       bj   = b->j + bi[brow];
525:       /* add non-zero cols of B into the sorted linked list lnk */
526:       PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk);
527:     }
528:     /* add possible missing diagonal entry */
529:     if (C->force_diagonals) PetscLLCondensedAddSorted_Scalable(1, &i, lnk);

531:     cnzi = lnk[0];

533:     /* If free space is not available, make more free space */
534:     /* Double the amount of total space in the list */
535:     if (current_space->local_remaining < cnzi) {
536:       PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space);
537:       ndouble++;
538:     }

540:     /* Copy data into free space, then initialize lnk */
541:     PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk);

543:     current_space->array += cnzi;
544:     current_space->local_used += cnzi;
545:     current_space->local_remaining -= cnzi;

547:     ci[i + 1] = ci[i] + cnzi;
548:   }

550:   /* Column indices are in the list of free space */
551:   /* Allocate space for cj, initialize cj, and */
552:   /* destroy list of free space and other temporary array(s) */
553:   PetscMalloc1(ci[am] + 1, &cj);
554:   PetscFreeSpaceContiguous(&free_space, cj);
555:   PetscLLCondensedDestroy_Scalable(lnk);

557:   /* Allocate space for ca */
558:   /*-----------------------*/
559:   PetscCalloc1(ci[am] + 1, &ca);

561:   /* put together the new symbolic matrix */
562:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C);
563:   MatSetBlockSizesFromMats(C, A, B);

565:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
566:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
567:   c          = (Mat_SeqAIJ *)(C->data);
568:   c->free_a  = PETSC_TRUE;
569:   c->free_ij = PETSC_TRUE;
570:   c->nonew   = 0;

572:   /* slower, less memory */
573:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

575:   /* set MatInfo */
576:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
577:   if (afill < 1.0) afill = 1.0;
578:   C->info.mallocs           = ndouble;
579:   C->info.fill_ratio_given  = fill;
580:   C->info.fill_ratio_needed = afill;

582: #if defined(PETSC_USE_INFO)
583:   if (ci[am]) {
584:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
585:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
586:   } else {
587:     PetscInfo(C, "Empty matrix product\n");
588:   }
589: #endif
590:   return 0;
591: }

593: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
594: {
595:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
596:   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
597:   PetscInt          *ci, *cj, *bb;
598:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
599:   PetscReal          afill;
600:   PetscInt           i, j, col, ndouble = 0;
601:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
602:   PetscHeap          h;

604:   /* Get ci and cj - by merging sorted rows using a heap */
605:   /*---------------------------------------------------------------------------------------------*/
606:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
607:   PetscMalloc1(am + 2, &ci);
608:   ci[0] = 0;

610:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
611:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space);
612:   current_space = free_space;

614:   PetscHeapCreate(a->rmax, &h);
615:   PetscMalloc1(a->rmax, &bb);

617:   /* Determine ci and cj */
618:   for (i = 0; i < am; i++) {
619:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
620:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
621:     ci[i + 1]            = ci[i];
622:     /* Populate the min heap */
623:     for (j = 0; j < anzi; j++) {
624:       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
625:       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
626:         PetscHeapAdd(h, j, bj[bb[j]++]);
627:       }
628:     }
629:     /* Pick off the min element, adding it to free space */
630:     PetscHeapPop(h, &j, &col);
631:     while (j >= 0) {
632:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
633:         PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space);
634:         ndouble++;
635:       }
636:       *(current_space->array++) = col;
637:       current_space->local_used++;
638:       current_space->local_remaining--;
639:       ci[i + 1]++;

641:       /* stash if anything else remains in this row of B */
642:       if (bb[j] < bi[acol[j] + 1]) PetscHeapStash(h, j, bj[bb[j]++]);
643:       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
644:         PetscInt j2, col2;
645:         PetscHeapPeek(h, &j2, &col2);
646:         if (col2 != col) break;
647:         PetscHeapPop(h, &j2, &col2);
648:         if (bb[j2] < bi[acol[j2] + 1]) PetscHeapStash(h, j2, bj[bb[j2]++]);
649:       }
650:       /* Put any stashed elements back into the min heap */
651:       PetscHeapUnstash(h);
652:       PetscHeapPop(h, &j, &col);
653:     }
654:   }
655:   PetscFree(bb);
656:   PetscHeapDestroy(&h);

658:   /* Column indices are in the list of free space */
659:   /* Allocate space for cj, initialize cj, and */
660:   /* destroy list of free space and other temporary array(s) */
661:   PetscMalloc1(ci[am], &cj);
662:   PetscFreeSpaceContiguous(&free_space, cj);

664:   /* put together the new symbolic matrix */
665:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C);
666:   MatSetBlockSizesFromMats(C, A, B);

668:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
669:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
670:   c          = (Mat_SeqAIJ *)(C->data);
671:   c->free_a  = PETSC_TRUE;
672:   c->free_ij = PETSC_TRUE;
673:   c->nonew   = 0;

675:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

677:   /* set MatInfo */
678:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
679:   if (afill < 1.0) afill = 1.0;
680:   C->info.mallocs           = ndouble;
681:   C->info.fill_ratio_given  = fill;
682:   C->info.fill_ratio_needed = afill;

684: #if defined(PETSC_USE_INFO)
685:   if (ci[am]) {
686:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
687:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
688:   } else {
689:     PetscInfo(C, "Empty matrix product\n");
690:   }
691: #endif
692:   return 0;
693: }

695: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
696: {
697:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
698:   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
699:   PetscInt          *ci, *cj, *bb;
700:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
701:   PetscReal          afill;
702:   PetscInt           i, j, col, ndouble = 0;
703:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
704:   PetscHeap          h;
705:   PetscBT            bt;

707:   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
708:   /*---------------------------------------------------------------------------------------------*/
709:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
710:   PetscMalloc1(am + 2, &ci);
711:   ci[0] = 0;

713:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
714:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space);

716:   current_space = free_space;

718:   PetscHeapCreate(a->rmax, &h);
719:   PetscMalloc1(a->rmax, &bb);
720:   PetscBTCreate(bn, &bt);

722:   /* Determine ci and cj */
723:   for (i = 0; i < am; i++) {
724:     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
725:     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
726:     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
727:     ci[i + 1]            = ci[i];
728:     /* Populate the min heap */
729:     for (j = 0; j < anzi; j++) {
730:       PetscInt brow = acol[j];
731:       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
732:         PetscInt bcol = bj[bb[j]];
733:         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
734:           PetscHeapAdd(h, j, bcol);
735:           bb[j]++;
736:           break;
737:         }
738:       }
739:     }
740:     /* Pick off the min element, adding it to free space */
741:     PetscHeapPop(h, &j, &col);
742:     while (j >= 0) {
743:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
744:         fptr = NULL;                            /* need PetscBTMemzero */
745:         PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space);
746:         ndouble++;
747:       }
748:       *(current_space->array++) = col;
749:       current_space->local_used++;
750:       current_space->local_remaining--;
751:       ci[i + 1]++;

753:       /* stash if anything else remains in this row of B */
754:       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
755:         PetscInt bcol = bj[bb[j]];
756:         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
757:           PetscHeapAdd(h, j, bcol);
758:           bb[j]++;
759:           break;
760:         }
761:       }
762:       PetscHeapPop(h, &j, &col);
763:     }
764:     if (fptr) { /* Clear the bits for this row */
765:       for (; fptr < current_space->array; fptr++) PetscBTClear(bt, *fptr);
766:     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
767:       PetscBTMemzero(bn, bt);
768:     }
769:   }
770:   PetscFree(bb);
771:   PetscHeapDestroy(&h);
772:   PetscBTDestroy(&bt);

774:   /* Column indices are in the list of free space */
775:   /* Allocate space for cj, initialize cj, and */
776:   /* destroy list of free space and other temporary array(s) */
777:   PetscMalloc1(ci[am], &cj);
778:   PetscFreeSpaceContiguous(&free_space, cj);

780:   /* put together the new symbolic matrix */
781:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C);
782:   MatSetBlockSizesFromMats(C, A, B);

784:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
785:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
786:   c          = (Mat_SeqAIJ *)(C->data);
787:   c->free_a  = PETSC_TRUE;
788:   c->free_ij = PETSC_TRUE;
789:   c->nonew   = 0;

791:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

793:   /* set MatInfo */
794:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
795:   if (afill < 1.0) afill = 1.0;
796:   C->info.mallocs           = ndouble;
797:   C->info.fill_ratio_given  = fill;
798:   C->info.fill_ratio_needed = afill;

800: #if defined(PETSC_USE_INFO)
801:   if (ci[am]) {
802:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
803:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
804:   } else {
805:     PetscInfo(C, "Empty matrix product\n");
806:   }
807: #endif
808:   return 0;
809: }

811: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
812: {
813:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
814:   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
815:   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
816:   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
817:   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
818:   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
819:   const PetscInt *brow_ptr[8], *brow_end[8];
820:   PetscInt        window[8];
821:   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
822:   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
823:   PetscReal       afill;
824:   PetscInt       *workj_L1, *workj_L2, *workj_L3;
825:   PetscInt        L1_nnz, L2_nnz;

827:   /* Step 1: Get upper bound on memory required for allocation.
828:              Because of the way virtual memory works,
829:              only the memory pages that are actually needed will be physically allocated. */
830:   PetscMalloc1(am + 1, &ci);
831:   for (i = 0; i < am; i++) {
832:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
833:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
834:     a_rownnz             = 0;
835:     for (k = 0; k < anzi; ++k) {
836:       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
837:       if (a_rownnz > bn) {
838:         a_rownnz = bn;
839:         break;
840:       }
841:     }
842:     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
843:   }
844:   /* temporary work areas for merging rows */
845:   PetscMalloc1(a_maxrownnz * 8, &workj_L1);
846:   PetscMalloc1(a_maxrownnz * 8, &workj_L2);
847:   PetscMalloc1(a_maxrownnz, &workj_L3);

849:   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
850:   c_maxmem = 8 * (ai[am] + bi[bm]);
851:   /* Step 2: Populate pattern for C */
852:   PetscMalloc1(c_maxmem, &cj);

854:   ci_nnz      = 0;
855:   ci[0]       = 0;
856:   worki_L1[0] = 0;
857:   worki_L2[0] = 0;
858:   for (i = 0; i < am; i++) {
859:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
860:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
861:     rowsleft             = anzi;
862:     inputcol_L1          = acol;
863:     L2_nnz               = 0;
864:     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
865:     worki_L2[1]          = 0;
866:     outputi_nnz          = 0;

868:     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
869:     while (ci_nnz + a_maxrownnz > c_maxmem) {
870:       c_maxmem *= 2;
871:       ndouble++;
872:       PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj);
873:     }

875:     while (rowsleft) {
876:       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
877:       L1_nrows    = 0;
878:       L1_nnz      = 0;
879:       inputcol    = inputcol_L1;
880:       inputi      = bi;
881:       inputj      = bj;

883:       /* The following macro is used to specialize for small rows in A.
884:          This helps with compiler unrolling, improving performance substantially.
885:           Input:  inputj   inputi  inputcol  bn
886:           Output: outputj  outputi_nnz                       */
887: #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
888:   window_min  = bn; \
889:   outputi_nnz = 0; \
890:   for (k = 0; k < ANNZ; ++k) { \
891:     brow_ptr[k] = inputj + inputi[inputcol[k]]; \
892:     brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
893:     window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
894:     window_min  = PetscMin(window[k], window_min); \
895:   } \
896:   while (window_min < bn) { \
897:     outputj[outputi_nnz++] = window_min; \
898:     /* advance front and compute new minimum */ \
899:     old_window_min = window_min; \
900:     window_min     = bn; \
901:     for (k = 0; k < ANNZ; ++k) { \
902:       if (window[k] == old_window_min) { \
903:         brow_ptr[k]++; \
904:         window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
905:       } \
906:       window_min = PetscMin(window[k], window_min); \
907:     } \
908:   }

910:       /************** L E V E L  1 ***************/
911:       /* Merge up to 8 rows of B to L1 work array*/
912:       while (L1_rowsleft) {
913:         outputi_nnz = 0;
914:         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
915:         else outputj = cj + ci_nnz;                /* Merge directly to C */

917:         switch (L1_rowsleft) {
918:         case 1:
919:           brow_ptr[0] = inputj + inputi[inputcol[0]];
920:           brow_end[0] = inputj + inputi[inputcol[0] + 1];
921:           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
922:           inputcol += L1_rowsleft;
923:           rowsleft -= L1_rowsleft;
924:           L1_rowsleft = 0;
925:           break;
926:         case 2:
927:           MatMatMultSymbolic_RowMergeMacro(2);
928:           inputcol += L1_rowsleft;
929:           rowsleft -= L1_rowsleft;
930:           L1_rowsleft = 0;
931:           break;
932:         case 3:
933:           MatMatMultSymbolic_RowMergeMacro(3);
934:           inputcol += L1_rowsleft;
935:           rowsleft -= L1_rowsleft;
936:           L1_rowsleft = 0;
937:           break;
938:         case 4:
939:           MatMatMultSymbolic_RowMergeMacro(4);
940:           inputcol += L1_rowsleft;
941:           rowsleft -= L1_rowsleft;
942:           L1_rowsleft = 0;
943:           break;
944:         case 5:
945:           MatMatMultSymbolic_RowMergeMacro(5);
946:           inputcol += L1_rowsleft;
947:           rowsleft -= L1_rowsleft;
948:           L1_rowsleft = 0;
949:           break;
950:         case 6:
951:           MatMatMultSymbolic_RowMergeMacro(6);
952:           inputcol += L1_rowsleft;
953:           rowsleft -= L1_rowsleft;
954:           L1_rowsleft = 0;
955:           break;
956:         case 7:
957:           MatMatMultSymbolic_RowMergeMacro(7);
958:           inputcol += L1_rowsleft;
959:           rowsleft -= L1_rowsleft;
960:           L1_rowsleft = 0;
961:           break;
962:         default:
963:           MatMatMultSymbolic_RowMergeMacro(8);
964:           inputcol += 8;
965:           rowsleft -= 8;
966:           L1_rowsleft -= 8;
967:           break;
968:         }
969:         inputcol_L1 = inputcol;
970:         L1_nnz += outputi_nnz;
971:         worki_L1[++L1_nrows] = L1_nnz;
972:       }

974:       /********************** L E V E L  2 ************************/
975:       /* Merge from L1 work array to either C or to L2 work array */
976:       if (anzi > 8) {
977:         inputi      = worki_L1;
978:         inputj      = workj_L1;
979:         inputcol    = workcol;
980:         outputi_nnz = 0;

982:         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
983:         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */

985:         switch (L1_nrows) {
986:         case 1:
987:           brow_ptr[0] = inputj + inputi[inputcol[0]];
988:           brow_end[0] = inputj + inputi[inputcol[0] + 1];
989:           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
990:           break;
991:         case 2:
992:           MatMatMultSymbolic_RowMergeMacro(2);
993:           break;
994:         case 3:
995:           MatMatMultSymbolic_RowMergeMacro(3);
996:           break;
997:         case 4:
998:           MatMatMultSymbolic_RowMergeMacro(4);
999:           break;
1000:         case 5:
1001:           MatMatMultSymbolic_RowMergeMacro(5);
1002:           break;
1003:         case 6:
1004:           MatMatMultSymbolic_RowMergeMacro(6);
1005:           break;
1006:         case 7:
1007:           MatMatMultSymbolic_RowMergeMacro(7);
1008:           break;
1009:         case 8:
1010:           MatMatMultSymbolic_RowMergeMacro(8);
1011:           break;
1012:         default:
1013:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1014:         }
1015:         L2_nnz += outputi_nnz;
1016:         worki_L2[++L2_nrows] = L2_nnz;

1018:         /************************ L E V E L  3 **********************/
1019:         /* Merge from L2 work array to either C or to L2 work array */
1020:         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1021:           inputi      = worki_L2;
1022:           inputj      = workj_L2;
1023:           inputcol    = workcol;
1024:           outputi_nnz = 0;
1025:           if (rowsleft) outputj = workj_L3;
1026:           else outputj = cj + ci_nnz;
1027:           switch (L2_nrows) {
1028:           case 1:
1029:             brow_ptr[0] = inputj + inputi[inputcol[0]];
1030:             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1031:             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1032:             break;
1033:           case 2:
1034:             MatMatMultSymbolic_RowMergeMacro(2);
1035:             break;
1036:           case 3:
1037:             MatMatMultSymbolic_RowMergeMacro(3);
1038:             break;
1039:           case 4:
1040:             MatMatMultSymbolic_RowMergeMacro(4);
1041:             break;
1042:           case 5:
1043:             MatMatMultSymbolic_RowMergeMacro(5);
1044:             break;
1045:           case 6:
1046:             MatMatMultSymbolic_RowMergeMacro(6);
1047:             break;
1048:           case 7:
1049:             MatMatMultSymbolic_RowMergeMacro(7);
1050:             break;
1051:           case 8:
1052:             MatMatMultSymbolic_RowMergeMacro(8);
1053:             break;
1054:           default:
1055:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1056:           }
1057:           L2_nrows    = 1;
1058:           L2_nnz      = outputi_nnz;
1059:           worki_L2[1] = outputi_nnz;
1060:           /* Copy to workj_L2 */
1061:           if (rowsleft) {
1062:             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1063:           }
1064:         }
1065:       }
1066:     } /* while (rowsleft) */
1067: #undef MatMatMultSymbolic_RowMergeMacro

1069:     /* terminate current row */
1070:     ci_nnz += outputi_nnz;
1071:     ci[i + 1] = ci_nnz;
1072:   }

1074:   /* Step 3: Create the new symbolic matrix */
1075:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C);
1076:   MatSetBlockSizesFromMats(C, A, B);

1078:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1079:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1080:   c          = (Mat_SeqAIJ *)(C->data);
1081:   c->free_a  = PETSC_TRUE;
1082:   c->free_ij = PETSC_TRUE;
1083:   c->nonew   = 0;

1085:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1087:   /* set MatInfo */
1088:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1089:   if (afill < 1.0) afill = 1.0;
1090:   C->info.mallocs           = ndouble;
1091:   C->info.fill_ratio_given  = fill;
1092:   C->info.fill_ratio_needed = afill;

1094: #if defined(PETSC_USE_INFO)
1095:   if (ci[am]) {
1096:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
1097:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
1098:   } else {
1099:     PetscInfo(C, "Empty matrix product\n");
1100:   }
1101: #endif

1103:   /* Step 4: Free temporary work areas */
1104:   PetscFree(workj_L1);
1105:   PetscFree(workj_L2);
1106:   PetscFree(workj_L3);
1107:   return 0;
1108: }

1110: /* concatenate unique entries and then sort */
1111: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1112: {
1113:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1114:   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1115:   PetscInt       *ci, *cj, bcol;
1116:   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1117:   PetscReal       afill;
1118:   PetscInt        i, j, ndouble = 0;
1119:   PetscSegBuffer  seg, segrow;
1120:   char           *seen;

1122:   PetscMalloc1(am + 1, &ci);
1123:   ci[0] = 0;

1125:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1126:   PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg);
1127:   PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow);
1128:   PetscCalloc1(bn, &seen);

1130:   /* Determine ci and cj */
1131:   for (i = 0; i < am; i++) {
1132:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1133:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
1134:     PetscInt packlen     = 0, *PETSC_RESTRICT crow;

1136:     /* Pack segrow */
1137:     for (j = 0; j < anzi; j++) {
1138:       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1139:       for (k = bjstart; k < bjend; k++) {
1140:         bcol = bj[k];
1141:         if (!seen[bcol]) { /* new entry */
1142:           PetscInt *PETSC_RESTRICT slot;
1143:           PetscSegBufferGetInts(segrow, 1, &slot);
1144:           *slot      = bcol;
1145:           seen[bcol] = 1;
1146:           packlen++;
1147:         }
1148:       }
1149:     }

1151:     /* Check i-th diagonal entry */
1152:     if (C->force_diagonals && !seen[i]) {
1153:       PetscInt *PETSC_RESTRICT slot;
1154:       PetscSegBufferGetInts(segrow, 1, &slot);
1155:       *slot   = i;
1156:       seen[i] = 1;
1157:       packlen++;
1158:     }

1160:     PetscSegBufferGetInts(seg, packlen, &crow);
1161:     PetscSegBufferExtractTo(segrow, crow);
1162:     PetscSortInt(packlen, crow);
1163:     ci[i + 1] = ci[i] + packlen;
1164:     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1165:   }
1166:   PetscSegBufferDestroy(&segrow);
1167:   PetscFree(seen);

1169:   /* Column indices are in the segmented buffer */
1170:   PetscSegBufferExtractAlloc(seg, &cj);
1171:   PetscSegBufferDestroy(&seg);

1173:   /* put together the new symbolic matrix */
1174:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C);
1175:   MatSetBlockSizesFromMats(C, A, B);

1177:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1178:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1179:   c          = (Mat_SeqAIJ *)(C->data);
1180:   c->free_a  = PETSC_TRUE;
1181:   c->free_ij = PETSC_TRUE;
1182:   c->nonew   = 0;

1184:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1186:   /* set MatInfo */
1187:   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1188:   if (afill < 1.0) afill = 1.0;
1189:   C->info.mallocs           = ndouble;
1190:   C->info.fill_ratio_given  = fill;
1191:   C->info.fill_ratio_needed = afill;

1193: #if defined(PETSC_USE_INFO)
1194:   if (ci[am]) {
1195:     PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill);
1196:     PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill);
1197:   } else {
1198:     PetscInfo(C, "Empty matrix product\n");
1199:   }
1200: #endif
1201:   return 0;
1202: }

1204: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1205: {
1206:   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;

1208:   MatTransposeColoringDestroy(&abt->matcoloring);
1209:   MatDestroy(&abt->Bt_den);
1210:   MatDestroy(&abt->ABt_den);
1211:   PetscFree(abt);
1212:   return 0;
1213: }

1215: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1216: {
1217:   Mat                  Bt;
1218:   Mat_MatMatTransMult *abt;
1219:   Mat_Product         *product = C->product;
1220:   char                *alg;


1225:   /* create symbolic Bt */
1226:   MatTransposeSymbolic(B, &Bt);
1227:   MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs));
1228:   MatSetType(Bt, ((PetscObject)A)->type_name);

1230:   /* get symbolic C=A*Bt */
1231:   PetscStrallocpy(product->alg, &alg);
1232:   MatProductSetAlgorithm(C, "sorted"); /* set algorithm for C = A*Bt */
1233:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C);
1234:   MatProductSetAlgorithm(C, alg); /* resume original algorithm for ABt product */
1235:   PetscFree(alg);

1237:   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1238:   PetscNew(&abt);

1240:   product->data    = abt;
1241:   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;

1243:   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;

1245:   abt->usecoloring = PETSC_FALSE;
1246:   PetscStrcmp(product->alg, "color", &abt->usecoloring);
1247:   if (abt->usecoloring) {
1248:     /* Create MatTransposeColoring from symbolic C=A*B^T */
1249:     MatTransposeColoring matcoloring;
1250:     MatColoring          coloring;
1251:     ISColoring           iscoloring;
1252:     Mat                  Bt_dense, C_dense;

1254:     /* inode causes memory problem */
1255:     MatSetOption(C, MAT_USE_INODES, PETSC_FALSE);

1257:     MatColoringCreate(C, &coloring);
1258:     MatColoringSetDistance(coloring, 2);
1259:     MatColoringSetType(coloring, MATCOLORINGSL);
1260:     MatColoringSetFromOptions(coloring);
1261:     MatColoringApply(coloring, &iscoloring);
1262:     MatColoringDestroy(&coloring);
1263:     MatTransposeColoringCreate(C, iscoloring, &matcoloring);

1265:     abt->matcoloring = matcoloring;

1267:     ISColoringDestroy(&iscoloring);

1269:     /* Create Bt_dense and C_dense = A*Bt_dense */
1270:     MatCreate(PETSC_COMM_SELF, &Bt_dense);
1271:     MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors);
1272:     MatSetType(Bt_dense, MATSEQDENSE);
1273:     MatSeqDenseSetPreallocation(Bt_dense, NULL);

1275:     Bt_dense->assembled = PETSC_TRUE;
1276:     abt->Bt_den         = Bt_dense;

1278:     MatCreate(PETSC_COMM_SELF, &C_dense);
1279:     MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors);
1280:     MatSetType(C_dense, MATSEQDENSE);
1281:     MatSeqDenseSetPreallocation(C_dense, NULL);

1283:     Bt_dense->assembled = PETSC_TRUE;
1284:     abt->ABt_den        = C_dense;

1286: #if defined(PETSC_USE_INFO)
1287:     {
1288:       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1289:       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1290:                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1291:     }
1292: #endif
1293:   }
1294:   /* clean up */
1295:   MatDestroy(&Bt);
1296:   return 0;
1297: }

1299: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1300: {
1301:   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1302:   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1303:   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1304:   PetscLogDouble       flops = 0.0;
1305:   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1306:   Mat_MatMatTransMult *abt;
1307:   Mat_Product         *product = C->product;

1310:   abt = (Mat_MatMatTransMult *)product->data;
1312:   /* clear old values in C */
1313:   if (!c->a) {
1314:     PetscCalloc1(ci[cm] + 1, &ca);
1315:     c->a      = ca;
1316:     c->free_a = PETSC_TRUE;
1317:   } else {
1318:     ca = c->a;
1319:     PetscArrayzero(ca, ci[cm] + 1);
1320:   }

1322:   if (abt->usecoloring) {
1323:     MatTransposeColoring matcoloring = abt->matcoloring;
1324:     Mat                  Bt_dense, C_dense = abt->ABt_den;

1326:     /* Get Bt_dense by Apply MatTransposeColoring to B */
1327:     Bt_dense = abt->Bt_den;
1328:     MatTransColoringApplySpToDen(matcoloring, B, Bt_dense);

1330:     /* C_dense = A*Bt_dense */
1331:     MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense);

1333:     /* Recover C from C_dense */
1334:     MatTransColoringApplyDenToSp(matcoloring, C_dense, C);
1335:     return 0;
1336:   }

1338:   for (i = 0; i < cm; i++) {
1339:     anzi = ai[i + 1] - ai[i];
1340:     acol = aj + ai[i];
1341:     aval = aa + ai[i];
1342:     cnzi = ci[i + 1] - ci[i];
1343:     ccol = cj + ci[i];
1344:     cval = ca + ci[i];
1345:     for (j = 0; j < cnzi; j++) {
1346:       brow = ccol[j];
1347:       bnzj = bi[brow + 1] - bi[brow];
1348:       bcol = bj + bi[brow];
1349:       bval = ba + bi[brow];

1351:       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1352:       nexta = 0;
1353:       nextb = 0;
1354:       while (nexta < anzi && nextb < bnzj) {
1355:         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1356:         if (nexta == anzi) break;
1357:         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1358:         if (nextb == bnzj) break;
1359:         if (acol[nexta] == bcol[nextb]) {
1360:           cval[j] += aval[nexta] * bval[nextb];
1361:           nexta++;
1362:           nextb++;
1363:           flops += 2;
1364:         }
1365:       }
1366:     }
1367:   }
1368:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
1369:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
1370:   PetscLogFlops(flops);
1371:   return 0;
1372: }

1374: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1375: {
1376:   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;

1378:   MatDestroy(&atb->At);
1379:   if (atb->destroy) (*atb->destroy)(atb->data);
1380:   PetscFree(atb);
1381:   return 0;
1382: }

1384: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1385: {
1386:   Mat          At      = NULL;
1387:   Mat_Product *product = C->product;
1388:   PetscBool    flg, def, square;

1390:   MatCheckProduct(C, 4);
1391:   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1392:   /* outerproduct */
1393:   PetscStrcmp(product->alg, "outerproduct", &flg);
1394:   if (flg) {
1395:     /* create symbolic At */
1396:     if (!square) {
1397:       MatTransposeSymbolic(A, &At);
1398:       MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs));
1399:       MatSetType(At, ((PetscObject)A)->type_name);
1400:     }
1401:     /* get symbolic C=At*B */
1402:     MatProductSetAlgorithm(C, "sorted");
1403:     MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C);

1405:     /* clean up */
1406:     if (!square) MatDestroy(&At);

1408:     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1409:     MatProductSetAlgorithm(C, "outerproduct");
1410:     return 0;
1411:   }

1413:   /* matmatmult */
1414:   PetscStrcmp(product->alg, "default", &def);
1415:   PetscStrcmp(product->alg, "at*b", &flg);
1416:   if (flg || def) {
1417:     Mat_MatTransMatMult *atb;

1420:     PetscNew(&atb);
1421:     if (!square) MatTranspose(A, MAT_INITIAL_MATRIX, &At);
1422:     MatProductSetAlgorithm(C, "sorted");
1423:     MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C);
1424:     MatProductSetAlgorithm(C, "at*b");
1425:     product->data    = atb;
1426:     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1427:     atb->At          = At;

1429:     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1430:     return 0;
1431:   }

1433:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1434: }

1436: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1437: {
1438:   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1439:   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1440:   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1441:   PetscLogDouble flops = 0.0;
1442:   MatScalar     *aa    = a->a, *ba, *ca, *caj;

1444:   if (!c->a) {
1445:     PetscCalloc1(ci[cm] + 1, &ca);

1447:     c->a      = ca;
1448:     c->free_a = PETSC_TRUE;
1449:   } else {
1450:     ca = c->a;
1451:     PetscArrayzero(ca, ci[cm]);
1452:   }

1454:   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1455:   for (i = 0; i < am; i++) {
1456:     bj   = b->j + bi[i];
1457:     ba   = b->a + bi[i];
1458:     bnzi = bi[i + 1] - bi[i];
1459:     anzi = ai[i + 1] - ai[i];
1460:     for (j = 0; j < anzi; j++) {
1461:       nextb = 0;
1462:       crow  = *aj++;
1463:       cjj   = cj + ci[crow];
1464:       caj   = ca + ci[crow];
1465:       /* perform sparse axpy operation.  Note cjj includes bj. */
1466:       for (k = 0; nextb < bnzi; k++) {
1467:         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1468:           caj[k] += (*aa) * (*(ba + nextb));
1469:           nextb++;
1470:         }
1471:       }
1472:       flops += 2 * bnzi;
1473:       aa++;
1474:     }
1475:   }

1477:   /* Assemble the final matrix and clean up */
1478:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
1479:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
1480:   PetscLogFlops(flops);
1481:   return 0;
1482: }

1484: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1485: {
1486:   MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
1487:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1488:   return 0;
1489: }

1491: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1492: {
1493:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
1494:   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1495:   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1496:   const PetscInt    *aj;
1497:   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1498:   PetscInt           clda;
1499:   PetscInt           am4, bm4, col, i, j, n;

1501:   if (!cm || !cn) return 0;
1502:   MatSeqAIJGetArrayRead(A, &av);
1503:   if (add) {
1504:     MatDenseGetArray(C, &c);
1505:   } else {
1506:     MatDenseGetArrayWrite(C, &c);
1507:   }
1508:   MatDenseGetArrayRead(B, &b);
1509:   MatDenseGetLDA(B, &bm);
1510:   MatDenseGetLDA(C, &clda);
1511:   am4 = 4 * clda;
1512:   bm4 = 4 * bm;
1513:   b1  = b;
1514:   b2  = b1 + bm;
1515:   b3  = b2 + bm;
1516:   b4  = b3 + bm;
1517:   c1  = c;
1518:   c2  = c1 + clda;
1519:   c3  = c2 + clda;
1520:   c4  = c3 + clda;
1521:   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1522:     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1523:       r1 = r2 = r3 = r4 = 0.0;
1524:       n                 = a->i[i + 1] - a->i[i];
1525:       aj                = a->j + a->i[i];
1526:       aa                = av + a->i[i];
1527:       for (j = 0; j < n; j++) {
1528:         const PetscScalar aatmp = aa[j];
1529:         const PetscInt    ajtmp = aj[j];
1530:         r1 += aatmp * b1[ajtmp];
1531:         r2 += aatmp * b2[ajtmp];
1532:         r3 += aatmp * b3[ajtmp];
1533:         r4 += aatmp * b4[ajtmp];
1534:       }
1535:       if (add) {
1536:         c1[i] += r1;
1537:         c2[i] += r2;
1538:         c3[i] += r3;
1539:         c4[i] += r4;
1540:       } else {
1541:         c1[i] = r1;
1542:         c2[i] = r2;
1543:         c3[i] = r3;
1544:         c4[i] = r4;
1545:       }
1546:     }
1547:     b1 += bm4;
1548:     b2 += bm4;
1549:     b3 += bm4;
1550:     b4 += bm4;
1551:     c1 += am4;
1552:     c2 += am4;
1553:     c3 += am4;
1554:     c4 += am4;
1555:   }
1556:   /* process remaining columns */
1557:   if (col != cn) {
1558:     PetscInt rc = cn - col;

1560:     if (rc == 1) {
1561:       for (i = 0; i < am; i++) {
1562:         r1 = 0.0;
1563:         n  = a->i[i + 1] - a->i[i];
1564:         aj = a->j + a->i[i];
1565:         aa = av + a->i[i];
1566:         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1567:         if (add) c1[i] += r1;
1568:         else c1[i] = r1;
1569:       }
1570:     } else if (rc == 2) {
1571:       for (i = 0; i < am; i++) {
1572:         r1 = r2 = 0.0;
1573:         n       = a->i[i + 1] - a->i[i];
1574:         aj      = a->j + a->i[i];
1575:         aa      = av + a->i[i];
1576:         for (j = 0; j < n; j++) {
1577:           const PetscScalar aatmp = aa[j];
1578:           const PetscInt    ajtmp = aj[j];
1579:           r1 += aatmp * b1[ajtmp];
1580:           r2 += aatmp * b2[ajtmp];
1581:         }
1582:         if (add) {
1583:           c1[i] += r1;
1584:           c2[i] += r2;
1585:         } else {
1586:           c1[i] = r1;
1587:           c2[i] = r2;
1588:         }
1589:       }
1590:     } else {
1591:       for (i = 0; i < am; i++) {
1592:         r1 = r2 = r3 = 0.0;
1593:         n            = a->i[i + 1] - a->i[i];
1594:         aj           = a->j + a->i[i];
1595:         aa           = av + a->i[i];
1596:         for (j = 0; j < n; j++) {
1597:           const PetscScalar aatmp = aa[j];
1598:           const PetscInt    ajtmp = aj[j];
1599:           r1 += aatmp * b1[ajtmp];
1600:           r2 += aatmp * b2[ajtmp];
1601:           r3 += aatmp * b3[ajtmp];
1602:         }
1603:         if (add) {
1604:           c1[i] += r1;
1605:           c2[i] += r2;
1606:           c3[i] += r3;
1607:         } else {
1608:           c1[i] = r1;
1609:           c2[i] = r2;
1610:           c3[i] = r3;
1611:         }
1612:       }
1613:     }
1614:   }
1615:   PetscLogFlops(cn * (2.0 * a->nz));
1616:   if (add) {
1617:     MatDenseRestoreArray(C, &c);
1618:   } else {
1619:     MatDenseRestoreArrayWrite(C, &c);
1620:   }
1621:   MatDenseRestoreArrayRead(B, &b);
1622:   MatSeqAIJRestoreArrayRead(A, &av);
1623:   return 0;
1624: }

1626: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1627: {

1632:   MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE);
1633:   return 0;
1634: }

1636: /* ------------------------------------------------------- */
1637: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1638: {
1639:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1640:   C->ops->productsymbolic = MatProductSymbolic_AB;
1641:   return 0;
1642: }

1644: PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);

1646: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1647: {
1648:   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1649:   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1650:   return 0;
1651: }

1653: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1654: {
1655:   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1656:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1657:   return 0;
1658: }

1660: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1661: {
1662:   Mat_Product *product = C->product;

1664:   switch (product->type) {
1665:   case MATPRODUCT_AB:
1666:     MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);
1667:     break;
1668:   case MATPRODUCT_AtB:
1669:     MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);
1670:     break;
1671:   case MATPRODUCT_ABt:
1672:     MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);
1673:     break;
1674:   default:
1675:     break;
1676:   }
1677:   return 0;
1678: }
1679: /* ------------------------------------------------------- */
1680: static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1681: {
1682:   Mat_Product *product = C->product;
1683:   Mat          A       = product->A;
1684:   PetscBool    baij;

1686:   PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij);
1687:   if (!baij) { /* A is seqsbaij */
1688:     PetscBool sbaij;
1689:     PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij);

1692:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1693:   } else { /* A is seqbaij */
1694:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1695:   }

1697:   C->ops->productsymbolic = MatProductSymbolic_AB;
1698:   return 0;
1699: }

1701: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1702: {
1703:   Mat_Product *product = C->product;

1705:   MatCheckProduct(C, 1);
1707:   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);
1708:   return 0;
1709: }

1711: /* ------------------------------------------------------- */
1712: static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1713: {
1714:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1715:   C->ops->productsymbolic = MatProductSymbolic_AB;
1716:   return 0;
1717: }

1719: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1720: {
1721:   Mat_Product *product = C->product;

1723:   if (product->type == MATPRODUCT_AB) MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);
1724:   return 0;
1725: }
1726: /* ------------------------------------------------------- */

1728: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1729: {
1730:   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
1731:   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1732:   PetscInt     *bi = b->i, *bj = b->j;
1733:   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1734:   MatScalar    *btval, *btval_den, *ba = b->a;
1735:   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;

1737:   btval_den = btdense->v;
1738:   PetscArrayzero(btval_den, m * n);
1739:   for (k = 0; k < ncolors; k++) {
1740:     ncolumns = coloring->ncolumns[k];
1741:     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1742:       col   = *(columns + colorforcol[k] + l);
1743:       btcol = bj + bi[col];
1744:       btval = ba + bi[col];
1745:       anz   = bi[col + 1] - bi[col];
1746:       for (j = 0; j < anz; j++) {
1747:         brow            = btcol[j];
1748:         btval_den[brow] = btval[j];
1749:       }
1750:     }
1751:     btval_den += m;
1752:   }
1753:   return 0;
1754: }

1756: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1757: {
1758:   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
1759:   const PetscScalar *ca_den, *ca_den_ptr;
1760:   PetscScalar       *ca = csp->a;
1761:   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1762:   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1763:   PetscInt           nrows, *row, *idx;
1764:   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;

1766:   MatDenseGetArrayRead(Cden, &ca_den);

1768:   if (brows > 0) {
1769:     PetscInt *lstart, row_end, row_start;
1770:     lstart = matcoloring->lstart;
1771:     PetscArrayzero(lstart, ncolors);

1773:     row_end = brows;
1774:     if (row_end > m) row_end = m;
1775:     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1776:       ca_den_ptr = ca_den;
1777:       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1778:         nrows = matcoloring->nrows[k];
1779:         row   = rows + colorforrow[k];
1780:         idx   = den2sp + colorforrow[k];
1781:         for (l = lstart[k]; l < nrows; l++) {
1782:           if (row[l] >= row_end) {
1783:             lstart[k] = l;
1784:             break;
1785:           } else {
1786:             ca[idx[l]] = ca_den_ptr[row[l]];
1787:           }
1788:         }
1789:         ca_den_ptr += m;
1790:       }
1791:       row_end += brows;
1792:       if (row_end > m) row_end = m;
1793:     }
1794:   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1795:     ca_den_ptr = ca_den;
1796:     for (k = 0; k < ncolors; k++) {
1797:       nrows = matcoloring->nrows[k];
1798:       row   = rows + colorforrow[k];
1799:       idx   = den2sp + colorforrow[k];
1800:       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1801:       ca_den_ptr += m;
1802:     }
1803:   }

1805:   MatDenseRestoreArrayRead(Cden, &ca_den);
1806: #if defined(PETSC_USE_INFO)
1807:   if (matcoloring->brows > 0) {
1808:     PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows);
1809:   } else {
1810:     PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");
1811:   }
1812: #endif
1813:   return 0;
1814: }

1816: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1817: {
1818:   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1819:   const PetscInt *is, *ci, *cj, *row_idx;
1820:   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1821:   IS             *isa;
1822:   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1823:   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1824:   PetscInt       *colorforcol, *columns, *columns_i, brows;
1825:   PetscBool       flg;

1827:   ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa);

1829:   /* bs >1 is not being tested yet! */
1830:   Nbs       = mat->cmap->N / bs;
1831:   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1832:   c->N      = Nbs;
1833:   c->m      = c->M;
1834:   c->rstart = 0;
1835:   c->brows  = 100;

1837:   c->ncolors = nis;
1838:   PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow);
1839:   PetscMalloc1(csp->nz + 1, &rows);
1840:   PetscMalloc1(csp->nz + 1, &den2sp);

1842:   brows = c->brows;
1843:   PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg);
1844:   if (flg) c->brows = brows;
1845:   if (brows > 0) PetscMalloc1(nis + 1, &c->lstart);

1847:   colorforrow[0] = 0;
1848:   rows_i         = rows;
1849:   den2sp_i       = den2sp;

1851:   PetscMalloc1(nis + 1, &colorforcol);
1852:   PetscMalloc1(Nbs + 1, &columns);

1854:   colorforcol[0] = 0;
1855:   columns_i      = columns;

1857:   /* get column-wise storage of mat */
1858:   MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL);

1860:   cm = c->m;
1861:   PetscMalloc1(cm + 1, &rowhit);
1862:   PetscMalloc1(cm + 1, &idxhit);
1863:   for (i = 0; i < nis; i++) { /* loop over color */
1864:     ISGetLocalSize(isa[i], &n);
1865:     ISGetIndices(isa[i], &is);

1867:     c->ncolumns[i] = n;
1868:     if (n) PetscArraycpy(columns_i, is, n);
1869:     colorforcol[i + 1] = colorforcol[i] + n;
1870:     columns_i += n;

1872:     /* fast, crude version requires O(N*N) work */
1873:     PetscArrayzero(rowhit, cm);

1875:     for (j = 0; j < n; j++) { /* loop over columns*/
1876:       col     = is[j];
1877:       row_idx = cj + ci[col];
1878:       m       = ci[col + 1] - ci[col];
1879:       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1880:         idxhit[*row_idx]   = spidx[ci[col] + k];
1881:         rowhit[*row_idx++] = col + 1;
1882:       }
1883:     }
1884:     /* count the number of hits */
1885:     nrows = 0;
1886:     for (j = 0; j < cm; j++) {
1887:       if (rowhit[j]) nrows++;
1888:     }
1889:     c->nrows[i]        = nrows;
1890:     colorforrow[i + 1] = colorforrow[i] + nrows;

1892:     nrows = 0;
1893:     for (j = 0; j < cm; j++) { /* loop over rows */
1894:       if (rowhit[j]) {
1895:         rows_i[nrows]   = j;
1896:         den2sp_i[nrows] = idxhit[j];
1897:         nrows++;
1898:       }
1899:     }
1900:     den2sp_i += nrows;

1902:     ISRestoreIndices(isa[i], &is);
1903:     rows_i += nrows;
1904:   }
1905:   MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL);
1906:   PetscFree(rowhit);
1907:   ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa);

1910:   c->colorforrow = colorforrow;
1911:   c->rows        = rows;
1912:   c->den2sp      = den2sp;
1913:   c->colorforcol = colorforcol;
1914:   c->columns     = columns;

1916:   PetscFree(idxhit);
1917:   return 0;
1918: }

1920: /* --------------------------------------------------------------- */
1921: static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1922: {
1923:   Mat_Product *product = C->product;
1924:   Mat          A = product->A, B = product->B;

1926:   if (C->ops->mattransposemultnumeric) {
1927:     /* Alg: "outerproduct" */
1928:     (*C->ops->mattransposemultnumeric)(A, B, C);
1929:   } else {
1930:     /* Alg: "matmatmult" -- C = At*B */
1931:     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;

1934:     if (atb->At) {
1935:       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1936:          user may have called MatProductReplaceMats() to get this A=product->A */
1937:       MatTransposeSetPrecursor(A, atb->At);
1938:       MatTranspose(A, MAT_REUSE_MATRIX, &atb->At);
1939:     }
1940:     MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C);
1941:   }
1942:   return 0;
1943: }

1945: static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1946: {
1947:   Mat_Product *product = C->product;
1948:   Mat          A = product->A, B = product->B;
1949:   PetscReal    fill = product->fill;

1951:   MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C);

1953:   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1954:   return 0;
1955: }

1957: /* --------------------------------------------------------------- */
1958: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1959: {
1960:   Mat_Product *product = C->product;
1961:   PetscInt     alg     = 0; /* default algorithm */
1962:   PetscBool    flg     = PETSC_FALSE;
1963: #if !defined(PETSC_HAVE_HYPRE)
1964:   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1965:   PetscInt    nalg        = 7;
1966: #else
1967:   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1968:   PetscInt    nalg        = 8;
1969: #endif

1971:   /* Set default algorithm */
1972:   PetscStrcmp(C->product->alg, "default", &flg);
1973:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

1975:   /* Get runtime option */
1976:   if (product->api_user) {
1977:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
1978:     PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg);
1979:     PetscOptionsEnd();
1980:   } else {
1981:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
1982:     PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg);
1983:     PetscOptionsEnd();
1984:   }
1985:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

1987:   C->ops->productsymbolic = MatProductSymbolic_AB;
1988:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1989:   return 0;
1990: }

1992: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1993: {
1994:   Mat_Product *product     = C->product;
1995:   PetscInt     alg         = 0; /* default algorithm */
1996:   PetscBool    flg         = PETSC_FALSE;
1997:   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
1998:   PetscInt     nalg        = 3;

2000:   /* Get runtime option */
2001:   if (product->api_user) {
2002:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2003:     PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg);
2004:     PetscOptionsEnd();
2005:   } else {
2006:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2007:     PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg);
2008:     PetscOptionsEnd();
2009:   }
2010:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2012:   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2013:   return 0;
2014: }

2016: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2017: {
2018:   Mat_Product *product     = C->product;
2019:   PetscInt     alg         = 0; /* default algorithm */
2020:   PetscBool    flg         = PETSC_FALSE;
2021:   const char  *algTypes[2] = {"default", "color"};
2022:   PetscInt     nalg        = 2;

2024:   /* Set default algorithm */
2025:   PetscStrcmp(C->product->alg, "default", &flg);
2026:   if (!flg) {
2027:     alg = 1;
2028:     MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);
2029:   }

2031:   /* Get runtime option */
2032:   if (product->api_user) {
2033:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2034:     PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg);
2035:     PetscOptionsEnd();
2036:   } else {
2037:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2038:     PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg);
2039:     PetscOptionsEnd();
2040:   }
2041:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2043:   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2044:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2045:   return 0;
2046: }

2048: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2049: {
2050:   Mat_Product *product = C->product;
2051:   PetscBool    flg     = PETSC_FALSE;
2052:   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
2053: #if !defined(PETSC_HAVE_HYPRE)
2054:   const char *algTypes[2] = {"scalable", "rap"};
2055:   PetscInt    nalg        = 2;
2056: #else
2057:   const char *algTypes[3] = {"scalable", "rap", "hypre"};
2058:   PetscInt    nalg        = 3;
2059: #endif

2061:   /* Set default algorithm */
2062:   PetscStrcmp(product->alg, "default", &flg);
2063:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2065:   /* Get runtime option */
2066:   if (product->api_user) {
2067:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2068:     PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg);
2069:     PetscOptionsEnd();
2070:   } else {
2071:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2072:     PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg);
2073:     PetscOptionsEnd();
2074:   }
2075:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2077:   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2078:   return 0;
2079: }

2081: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2082: {
2083:   Mat_Product *product     = C->product;
2084:   PetscBool    flg         = PETSC_FALSE;
2085:   PetscInt     alg         = 0; /* default algorithm */
2086:   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2087:   PetscInt     nalg        = 3;

2089:   /* Set default algorithm */
2090:   PetscStrcmp(product->alg, "default", &flg);
2091:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2093:   /* Get runtime option */
2094:   if (product->api_user) {
2095:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2096:     PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg);
2097:     PetscOptionsEnd();
2098:   } else {
2099:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2100:     PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg);
2101:     PetscOptionsEnd();
2102:   }
2103:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2105:   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2106:   return 0;
2107: }

2109: /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2110: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2111: {
2112:   Mat_Product *product     = C->product;
2113:   PetscInt     alg         = 0; /* default algorithm */
2114:   PetscBool    flg         = PETSC_FALSE;
2115:   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2116:   PetscInt     nalg        = 7;

2118:   /* Set default algorithm */
2119:   PetscStrcmp(product->alg, "default", &flg);
2120:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2122:   /* Get runtime option */
2123:   if (product->api_user) {
2124:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2125:     PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg);
2126:     PetscOptionsEnd();
2127:   } else {
2128:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2129:     PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg);
2130:     PetscOptionsEnd();
2131:   }
2132:   if (flg) MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]);

2134:   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2135:   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2136:   return 0;
2137: }

2139: PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2140: {
2141:   Mat_Product *product = C->product;

2143:   switch (product->type) {
2144:   case MATPRODUCT_AB:
2145:     MatProductSetFromOptions_SeqAIJ_AB(C);
2146:     break;
2147:   case MATPRODUCT_AtB:
2148:     MatProductSetFromOptions_SeqAIJ_AtB(C);
2149:     break;
2150:   case MATPRODUCT_ABt:
2151:     MatProductSetFromOptions_SeqAIJ_ABt(C);
2152:     break;
2153:   case MATPRODUCT_PtAP:
2154:     MatProductSetFromOptions_SeqAIJ_PtAP(C);
2155:     break;
2156:   case MATPRODUCT_RARt:
2157:     MatProductSetFromOptions_SeqAIJ_RARt(C);
2158:     break;
2159:   case MATPRODUCT_ABC:
2160:     MatProductSetFromOptions_SeqAIJ_ABC(C);
2161:     break;
2162:   default:
2163:     break;
2164:   }
2165:   return 0;
2166: }