Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8:   #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
 14:   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
 15:   const PetscInt *idx;
 16:   PetscInt        start, end, *ai, *aj, bs;
 17:   PetscBT         table;

 19:   m  = a->mbs;
 20:   ai = a->i;
 21:   aj = a->j;
 22:   bs = A->rmap->bs;


 26:   PetscBTCreate(m, &table);
 27:   PetscMalloc1(m + 1, &nidx);

 29:   for (i = 0; i < is_max; i++) {
 30:     /* Initialise the two local arrays */
 31:     isz = 0;
 32:     PetscBTMemzero(m, table);

 34:     /* Extract the indices, assume there can be duplicate entries */
 35:     ISGetIndices(is[i], &idx);
 36:     ISGetLocalSize(is[i], &n);

 38:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 39:     for (j = 0; j < n; ++j) {
 40:       ival = idx[j] / bs; /* convert the indices into block indices */
 42:       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
 43:     }
 44:     ISRestoreIndices(is[i], &idx);
 45:     ISDestroy(&is[i]);

 47:     k = 0;
 48:     for (j = 0; j < ov; j++) { /* for each overlap*/
 49:       n = isz;
 50:       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
 51:         row   = nidx[k];
 52:         start = ai[row];
 53:         end   = ai[row + 1];
 54:         for (l = start; l < end; l++) {
 55:           val = aj[l];
 56:           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
 57:         }
 58:       }
 59:     }
 60:     ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i);
 61:   }
 62:   PetscBTDestroy(&table);
 63:   PetscFree(nidx);
 64:   return 0;
 65: }

 67: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
 68: {
 69:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
 70:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
 71:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
 72:   const PetscInt *irow, *icol;
 73:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
 74:   PetscInt       *aj = a->j, *ai = a->i;
 75:   MatScalar      *mat_a;
 76:   Mat             C;
 77:   PetscBool       flag;

 79:   ISGetIndices(isrow, &irow);
 80:   ISGetIndices(iscol, &icol);
 81:   ISGetLocalSize(isrow, &nrows);
 82:   ISGetLocalSize(iscol, &ncols);

 84:   PetscCalloc1(1 + oldcols, &smap);
 85:   ssmap = smap;
 86:   PetscMalloc1(1 + nrows, &lens);
 87:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
 88:   /* determine lens of each row */
 89:   for (i = 0; i < nrows; i++) {
 90:     kstart  = ai[irow[i]];
 91:     kend    = kstart + a->ilen[irow[i]];
 92:     lens[i] = 0;
 93:     for (k = kstart; k < kend; k++) {
 94:       if (ssmap[aj[k]]) lens[i]++;
 95:     }
 96:   }
 97:   /* Create and fill new matrix */
 98:   if (scall == MAT_REUSE_MATRIX) {
 99:     c = (Mat_SeqBAIJ *)((*B)->data);

102:     PetscArraycmp(c->ilen, lens, c->mbs, &flag);
104:     PetscArrayzero(c->ilen, c->mbs);
105:     C = *B;
106:   } else {
107:     MatCreate(PetscObjectComm((PetscObject)A), &C);
108:     MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
109:     MatSetType(C, ((PetscObject)A)->type_name);
110:     MatSeqBAIJSetPreallocation(C, bs, 0, lens);
111:   }
112:   c = (Mat_SeqBAIJ *)(C->data);
113:   for (i = 0; i < nrows; i++) {
114:     row      = irow[i];
115:     kstart   = ai[row];
116:     kend     = kstart + a->ilen[row];
117:     mat_i    = c->i[i];
118:     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
119:     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
120:     mat_ilen = c->ilen + i;
121:     for (k = kstart; k < kend; k++) {
122:       if ((tcol = ssmap[a->j[k]])) {
123:         *mat_j++ = tcol - 1;
124:         PetscArraycpy(mat_a, a->a + k * bs2, bs2);
125:         mat_a += bs2;
126:         (*mat_ilen)++;
127:       }
128:     }
129:   }
130:   /* sort */
131:   if (c->j && c->a) {
132:     MatScalar *work;
133:     PetscMalloc1(bs2, &work);
134:     for (i = 0; i < nrows; i++) {
135:       PetscInt ilen;
136:       mat_i = c->i[i];
137:       mat_j = c->j + mat_i;
138:       mat_a = c->a + mat_i * bs2;
139:       ilen  = c->ilen[i];
140:       PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
141:     }
142:     PetscFree(work);
143:   }

145:   /* Free work space */
146:   ISRestoreIndices(iscol, &icol);
147:   PetscFree(smap);
148:   PetscFree(lens);
149:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
150:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

152:   ISRestoreIndices(isrow, &irow);
153:   *B = C;
154:   return 0;
155: }

157: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
158: {
159:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
160:   IS              is1, is2;
161:   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
162:   const PetscInt *irow, *icol;

164:   ISGetIndices(isrow, &irow);
165:   ISGetIndices(iscol, &icol);
166:   ISGetLocalSize(isrow, &nrows);
167:   ISGetLocalSize(iscol, &ncols);

169:   /* Verify if the indices correspond to each element in a block
170:    and form the IS with compressed IS */
171:   maxmnbs = PetscMax(a->mbs, a->nbs);
172:   PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
173:   PetscArrayzero(vary, a->mbs);
174:   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
176:   count = 0;
177:   for (i = 0; i < nrows; i++) {
178:     j = irow[i] / bs;
179:     if ((vary[j]--) == bs) iary[count++] = j;
180:   }
181:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);

183:   PetscArrayzero(vary, a->nbs);
184:   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
186:   count = 0;
187:   for (i = 0; i < ncols; i++) {
188:     j = icol[i] / bs;
189:     if ((vary[j]--) == bs) iary[count++] = j;
190:   }
191:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
192:   ISRestoreIndices(isrow, &irow);
193:   ISRestoreIndices(iscol, &icol);
194:   PetscFree2(vary, iary);

196:   MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B);
197:   ISDestroy(&is1);
198:   ISDestroy(&is2);
199:   return 0;
200: }

202: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
203: {
204:   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
205:   Mat_SubSppt *submatj = c->submatis1;

207:   (*submatj->destroy)(C);
208:   MatDestroySubMatrix_Private(submatj);
209:   return 0;
210: }

212: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
213: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
214: {
215:   PetscInt     i;
216:   Mat          C;
217:   Mat_SeqBAIJ *c;
218:   Mat_SubSppt *submatj;

220:   for (i = 0; i < n; i++) {
221:     C       = (*mat)[i];
222:     c       = (Mat_SeqBAIJ *)C->data;
223:     submatj = c->submatis1;
224:     if (submatj) {
225:       if (--((PetscObject)C)->refct <= 0) {
226:         PetscFree(C->factorprefix);
227:         (*submatj->destroy)(C);
228:         MatDestroySubMatrix_Private(submatj);
229:         PetscFree(C->defaultvectype);
230:         PetscFree(C->defaultrandtype);
231:         PetscLayoutDestroy(&C->rmap);
232:         PetscLayoutDestroy(&C->cmap);
233:         PetscHeaderDestroy(&C);
234:       }
235:     } else {
236:       MatDestroy(&C);
237:     }
238:   }

240:   /* Destroy Dummy submatrices created for reuse */
241:   MatDestroySubMatrices_Dummy(n, mat);

243:   PetscFree(*mat);
244:   return 0;
245: }

247: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
248: {
249:   PetscInt i;

251:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);

253:   for (i = 0; i < n; i++) MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
254:   return 0;
255: }

257: /* -------------------------------------------------------*/
258: /* Should check that shapes of vectors and matrices match */
259: /* -------------------------------------------------------*/

261: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
262: {
263:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
264:   PetscScalar       *z, sum;
265:   const PetscScalar *x;
266:   const MatScalar   *v;
267:   PetscInt           mbs, i, n;
268:   const PetscInt    *idx, *ii, *ridx = NULL;
269:   PetscBool          usecprow = a->compressedrow.use;

271:   VecGetArrayRead(xx, &x);
272:   VecGetArrayWrite(zz, &z);

274:   if (usecprow) {
275:     mbs  = a->compressedrow.nrows;
276:     ii   = a->compressedrow.i;
277:     ridx = a->compressedrow.rindex;
278:     PetscArrayzero(z, a->mbs);
279:   } else {
280:     mbs = a->mbs;
281:     ii  = a->i;
282:   }

284:   for (i = 0; i < mbs; i++) {
285:     n   = ii[1] - ii[0];
286:     v   = a->a + ii[0];
287:     idx = a->j + ii[0];
288:     ii++;
289:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
290:     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
291:     sum = 0.0;
292:     PetscSparseDensePlusDot(sum, x, v, idx, n);
293:     if (usecprow) {
294:       z[ridx[i]] = sum;
295:     } else {
296:       z[i] = sum;
297:     }
298:   }
299:   VecRestoreArrayRead(xx, &x);
300:   VecRestoreArrayWrite(zz, &z);
301:   PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt);
302:   return 0;
303: }

305: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
306: {
307:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
308:   PetscScalar       *z = NULL, sum1, sum2, *zarray;
309:   const PetscScalar *x, *xb;
310:   PetscScalar        x1, x2;
311:   const MatScalar   *v;
312:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
313:   PetscBool          usecprow = a->compressedrow.use;

315:   VecGetArrayRead(xx, &x);
316:   VecGetArrayWrite(zz, &zarray);

318:   idx = a->j;
319:   v   = a->a;
320:   if (usecprow) {
321:     mbs  = a->compressedrow.nrows;
322:     ii   = a->compressedrow.i;
323:     ridx = a->compressedrow.rindex;
324:     PetscArrayzero(zarray, 2 * a->mbs);
325:   } else {
326:     mbs = a->mbs;
327:     ii  = a->i;
328:     z   = zarray;
329:   }

331:   for (i = 0; i < mbs; i++) {
332:     n = ii[1] - ii[0];
333:     ii++;
334:     sum1 = 0.0;
335:     sum2 = 0.0;
336:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
337:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
338:     for (j = 0; j < n; j++) {
339:       xb = x + 2 * (*idx++);
340:       x1 = xb[0];
341:       x2 = xb[1];
342:       sum1 += v[0] * x1 + v[2] * x2;
343:       sum2 += v[1] * x1 + v[3] * x2;
344:       v += 4;
345:     }
346:     if (usecprow) z = zarray + 2 * ridx[i];
347:     z[0] = sum1;
348:     z[1] = sum2;
349:     if (!usecprow) z += 2;
350:   }
351:   VecRestoreArrayRead(xx, &x);
352:   VecRestoreArrayWrite(zz, &zarray);
353:   PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt);
354:   return 0;
355: }

357: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
358: {
359:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
360:   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
361:   const PetscScalar *x, *xb;
362:   const MatScalar   *v;
363:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
364:   PetscBool          usecprow = a->compressedrow.use;

366: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
367:   #pragma disjoint(*v, *z, *xb)
368: #endif

370:   VecGetArrayRead(xx, &x);
371:   VecGetArrayWrite(zz, &zarray);

373:   idx = a->j;
374:   v   = a->a;
375:   if (usecprow) {
376:     mbs  = a->compressedrow.nrows;
377:     ii   = a->compressedrow.i;
378:     ridx = a->compressedrow.rindex;
379:     PetscArrayzero(zarray, 3 * a->mbs);
380:   } else {
381:     mbs = a->mbs;
382:     ii  = a->i;
383:     z   = zarray;
384:   }

386:   for (i = 0; i < mbs; i++) {
387:     n = ii[1] - ii[0];
388:     ii++;
389:     sum1 = 0.0;
390:     sum2 = 0.0;
391:     sum3 = 0.0;
392:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
393:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
394:     for (j = 0; j < n; j++) {
395:       xb = x + 3 * (*idx++);
396:       x1 = xb[0];
397:       x2 = xb[1];
398:       x3 = xb[2];

400:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
401:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
402:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
403:       v += 9;
404:     }
405:     if (usecprow) z = zarray + 3 * ridx[i];
406:     z[0] = sum1;
407:     z[1] = sum2;
408:     z[2] = sum3;
409:     if (!usecprow) z += 3;
410:   }
411:   VecRestoreArrayRead(xx, &x);
412:   VecRestoreArrayWrite(zz, &zarray);
413:   PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt);
414:   return 0;
415: }

417: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
418: {
419:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
420:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
421:   const PetscScalar *x, *xb;
422:   const MatScalar   *v;
423:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
424:   PetscBool          usecprow = a->compressedrow.use;

426:   VecGetArrayRead(xx, &x);
427:   VecGetArrayWrite(zz, &zarray);

429:   idx = a->j;
430:   v   = a->a;
431:   if (usecprow) {
432:     mbs  = a->compressedrow.nrows;
433:     ii   = a->compressedrow.i;
434:     ridx = a->compressedrow.rindex;
435:     PetscArrayzero(zarray, 4 * a->mbs);
436:   } else {
437:     mbs = a->mbs;
438:     ii  = a->i;
439:     z   = zarray;
440:   }

442:   for (i = 0; i < mbs; i++) {
443:     n = ii[1] - ii[0];
444:     ii++;
445:     sum1 = 0.0;
446:     sum2 = 0.0;
447:     sum3 = 0.0;
448:     sum4 = 0.0;

450:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
451:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
452:     for (j = 0; j < n; j++) {
453:       xb = x + 4 * (*idx++);
454:       x1 = xb[0];
455:       x2 = xb[1];
456:       x3 = xb[2];
457:       x4 = xb[3];
458:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
459:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
460:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
461:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
462:       v += 16;
463:     }
464:     if (usecprow) z = zarray + 4 * ridx[i];
465:     z[0] = sum1;
466:     z[1] = sum2;
467:     z[2] = sum3;
468:     z[3] = sum4;
469:     if (!usecprow) z += 4;
470:   }
471:   VecRestoreArrayRead(xx, &x);
472:   VecRestoreArrayWrite(zz, &zarray);
473:   PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt);
474:   return 0;
475: }

477: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
478: {
479:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
480:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
481:   const PetscScalar *xb, *x;
482:   const MatScalar   *v;
483:   const PetscInt    *idx, *ii, *ridx = NULL;
484:   PetscInt           mbs, i, j, n;
485:   PetscBool          usecprow = a->compressedrow.use;

487:   VecGetArrayRead(xx, &x);
488:   VecGetArrayWrite(zz, &zarray);

490:   idx = a->j;
491:   v   = a->a;
492:   if (usecprow) {
493:     mbs  = a->compressedrow.nrows;
494:     ii   = a->compressedrow.i;
495:     ridx = a->compressedrow.rindex;
496:     PetscArrayzero(zarray, 5 * a->mbs);
497:   } else {
498:     mbs = a->mbs;
499:     ii  = a->i;
500:     z   = zarray;
501:   }

503:   for (i = 0; i < mbs; i++) {
504:     n = ii[1] - ii[0];
505:     ii++;
506:     sum1 = 0.0;
507:     sum2 = 0.0;
508:     sum3 = 0.0;
509:     sum4 = 0.0;
510:     sum5 = 0.0;
511:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
512:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
513:     for (j = 0; j < n; j++) {
514:       xb = x + 5 * (*idx++);
515:       x1 = xb[0];
516:       x2 = xb[1];
517:       x3 = xb[2];
518:       x4 = xb[3];
519:       x5 = xb[4];
520:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
521:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
522:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
523:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
524:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
525:       v += 25;
526:     }
527:     if (usecprow) z = zarray + 5 * ridx[i];
528:     z[0] = sum1;
529:     z[1] = sum2;
530:     z[2] = sum3;
531:     z[3] = sum4;
532:     z[4] = sum5;
533:     if (!usecprow) z += 5;
534:   }
535:   VecRestoreArrayRead(xx, &x);
536:   VecRestoreArrayWrite(zz, &zarray);
537:   PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt);
538:   return 0;
539: }

541: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
542: {
543:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
544:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
545:   const PetscScalar *x, *xb;
546:   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
547:   const MatScalar   *v;
548:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
549:   PetscBool          usecprow = a->compressedrow.use;

551:   VecGetArrayRead(xx, &x);
552:   VecGetArrayWrite(zz, &zarray);

554:   idx = a->j;
555:   v   = a->a;
556:   if (usecprow) {
557:     mbs  = a->compressedrow.nrows;
558:     ii   = a->compressedrow.i;
559:     ridx = a->compressedrow.rindex;
560:     PetscArrayzero(zarray, 6 * a->mbs);
561:   } else {
562:     mbs = a->mbs;
563:     ii  = a->i;
564:     z   = zarray;
565:   }

567:   for (i = 0; i < mbs; i++) {
568:     n = ii[1] - ii[0];
569:     ii++;
570:     sum1 = 0.0;
571:     sum2 = 0.0;
572:     sum3 = 0.0;
573:     sum4 = 0.0;
574:     sum5 = 0.0;
575:     sum6 = 0.0;

577:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
578:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
579:     for (j = 0; j < n; j++) {
580:       xb = x + 6 * (*idx++);
581:       x1 = xb[0];
582:       x2 = xb[1];
583:       x3 = xb[2];
584:       x4 = xb[3];
585:       x5 = xb[4];
586:       x6 = xb[5];
587:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
588:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
589:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
590:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
591:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
592:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
593:       v += 36;
594:     }
595:     if (usecprow) z = zarray + 6 * ridx[i];
596:     z[0] = sum1;
597:     z[1] = sum2;
598:     z[2] = sum3;
599:     z[3] = sum4;
600:     z[4] = sum5;
601:     z[5] = sum6;
602:     if (!usecprow) z += 6;
603:   }

605:   VecRestoreArrayRead(xx, &x);
606:   VecRestoreArrayWrite(zz, &zarray);
607:   PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt);
608:   return 0;
609: }

611: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
612: {
613:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
614:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
615:   const PetscScalar *x, *xb;
616:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
617:   const MatScalar   *v;
618:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
619:   PetscBool          usecprow = a->compressedrow.use;

621:   VecGetArrayRead(xx, &x);
622:   VecGetArrayWrite(zz, &zarray);

624:   idx = a->j;
625:   v   = a->a;
626:   if (usecprow) {
627:     mbs  = a->compressedrow.nrows;
628:     ii   = a->compressedrow.i;
629:     ridx = a->compressedrow.rindex;
630:     PetscArrayzero(zarray, 7 * a->mbs);
631:   } else {
632:     mbs = a->mbs;
633:     ii  = a->i;
634:     z   = zarray;
635:   }

637:   for (i = 0; i < mbs; i++) {
638:     n = ii[1] - ii[0];
639:     ii++;
640:     sum1 = 0.0;
641:     sum2 = 0.0;
642:     sum3 = 0.0;
643:     sum4 = 0.0;
644:     sum5 = 0.0;
645:     sum6 = 0.0;
646:     sum7 = 0.0;

648:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
649:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
650:     for (j = 0; j < n; j++) {
651:       xb = x + 7 * (*idx++);
652:       x1 = xb[0];
653:       x2 = xb[1];
654:       x3 = xb[2];
655:       x4 = xb[3];
656:       x5 = xb[4];
657:       x6 = xb[5];
658:       x7 = xb[6];
659:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
660:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
661:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
662:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
663:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
664:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
665:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
666:       v += 49;
667:     }
668:     if (usecprow) z = zarray + 7 * ridx[i];
669:     z[0] = sum1;
670:     z[1] = sum2;
671:     z[2] = sum3;
672:     z[3] = sum4;
673:     z[4] = sum5;
674:     z[5] = sum6;
675:     z[6] = sum7;
676:     if (!usecprow) z += 7;
677:   }

679:   VecRestoreArrayRead(xx, &x);
680:   VecRestoreArrayWrite(zz, &zarray);
681:   PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt);
682:   return 0;
683: }

685: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
686: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
687: {
688:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
689:   PetscScalar       *z = NULL, *work, *workt, *zarray;
690:   const PetscScalar *x, *xb;
691:   const MatScalar   *v;
692:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
693:   const PetscInt    *idx, *ii, *ridx = NULL;
694:   PetscInt           k;
695:   PetscBool          usecprow = a->compressedrow.use;

697:   __m256d a0, a1, a2, a3, a4, a5;
698:   __m256d w0, w1, w2, w3;
699:   __m256d z0, z1, z2;
700:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

702:   VecGetArrayRead(xx, &x);
703:   VecGetArrayWrite(zz, &zarray);

705:   idx = a->j;
706:   v   = a->a;
707:   if (usecprow) {
708:     mbs  = a->compressedrow.nrows;
709:     ii   = a->compressedrow.i;
710:     ridx = a->compressedrow.rindex;
711:     PetscArrayzero(zarray, bs * a->mbs);
712:   } else {
713:     mbs = a->mbs;
714:     ii  = a->i;
715:     z   = zarray;
716:   }

718:   if (!a->mult_work) {
719:     k = PetscMax(A->rmap->n, A->cmap->n);
720:     PetscMalloc1(k + 1, &a->mult_work);
721:   }

723:   work = a->mult_work;
724:   for (i = 0; i < mbs; i++) {
725:     n = ii[1] - ii[0];
726:     ii++;
727:     workt = work;
728:     for (j = 0; j < n; j++) {
729:       xb = x + bs * (*idx++);
730:       for (k = 0; k < bs; k++) workt[k] = xb[k];
731:       workt += bs;
732:     }
733:     if (usecprow) z = zarray + bs * ridx[i];

735:     z0 = _mm256_setzero_pd();
736:     z1 = _mm256_setzero_pd();
737:     z2 = _mm256_setzero_pd();

739:     for (j = 0; j < n; j++) {
740:       /* first column of a */
741:       w0 = _mm256_set1_pd(work[j * 9]);
742:       a0 = _mm256_loadu_pd(&v[j * 81]);
743:       z0 = _mm256_fmadd_pd(a0, w0, z0);
744:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
745:       z1 = _mm256_fmadd_pd(a1, w0, z1);
746:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
747:       z2 = _mm256_fmadd_pd(a2, w0, z2);

749:       /* second column of a */
750:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
751:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
752:       z0 = _mm256_fmadd_pd(a0, w1, z0);
753:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
754:       z1 = _mm256_fmadd_pd(a1, w1, z1);
755:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
756:       z2 = _mm256_fmadd_pd(a2, w1, z2);

758:       /* third column of a */
759:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
760:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
761:       z0 = _mm256_fmadd_pd(a3, w2, z0);
762:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
763:       z1 = _mm256_fmadd_pd(a4, w2, z1);
764:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
765:       z2 = _mm256_fmadd_pd(a5, w2, z2);

767:       /* fourth column of a */
768:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
769:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
770:       z0 = _mm256_fmadd_pd(a0, w3, z0);
771:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
772:       z1 = _mm256_fmadd_pd(a1, w3, z1);
773:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
774:       z2 = _mm256_fmadd_pd(a2, w3, z2);

776:       /* fifth column of a */
777:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
778:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
779:       z0 = _mm256_fmadd_pd(a3, w0, z0);
780:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
781:       z1 = _mm256_fmadd_pd(a4, w0, z1);
782:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
783:       z2 = _mm256_fmadd_pd(a5, w0, z2);

785:       /* sixth column of a */
786:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
787:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
788:       z0 = _mm256_fmadd_pd(a0, w1, z0);
789:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
790:       z1 = _mm256_fmadd_pd(a1, w1, z1);
791:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
792:       z2 = _mm256_fmadd_pd(a2, w1, z2);

794:       /* seventh column of a */
795:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
796:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
797:       z0 = _mm256_fmadd_pd(a0, w2, z0);
798:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
799:       z1 = _mm256_fmadd_pd(a1, w2, z1);
800:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
801:       z2 = _mm256_fmadd_pd(a2, w2, z2);

803:       /* eighth column of a */
804:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
805:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
806:       z0 = _mm256_fmadd_pd(a3, w3, z0);
807:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
808:       z1 = _mm256_fmadd_pd(a4, w3, z1);
809:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
810:       z2 = _mm256_fmadd_pd(a5, w3, z2);

812:       /* ninth column of a */
813:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
814:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
815:       z0 = _mm256_fmadd_pd(a0, w0, z0);
816:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
817:       z1 = _mm256_fmadd_pd(a1, w0, z1);
818:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
819:       z2 = _mm256_fmadd_pd(a2, w0, z2);
820:     }

822:     _mm256_storeu_pd(&z[0], z0);
823:     _mm256_storeu_pd(&z[4], z1);
824:     _mm256_maskstore_pd(&z[8], mask1, z2);

826:     v += n * bs2;
827:     if (!usecprow) z += bs;
828:   }
829:   VecRestoreArrayRead(xx, &x);
830:   VecRestoreArrayWrite(zz, &zarray);
831:   PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
832:   return 0;
833: }
834: #endif

836: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
837: {
838:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
839:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
840:   const PetscScalar *x, *xb;
841:   PetscScalar       *zarray, xv;
842:   const MatScalar   *v;
843:   const PetscInt    *ii, *ij = a->j, *idx;
844:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
845:   PetscBool          usecprow = a->compressedrow.use;

847:   VecGetArrayRead(xx, &x);
848:   VecGetArrayWrite(zz, &zarray);

850:   v = a->a;
851:   if (usecprow) {
852:     mbs  = a->compressedrow.nrows;
853:     ii   = a->compressedrow.i;
854:     ridx = a->compressedrow.rindex;
855:     PetscArrayzero(zarray, 11 * a->mbs);
856:   } else {
857:     mbs = a->mbs;
858:     ii  = a->i;
859:     z   = zarray;
860:   }

862:   for (i = 0; i < mbs; i++) {
863:     n     = ii[i + 1] - ii[i];
864:     idx   = ij + ii[i];
865:     sum1  = 0.0;
866:     sum2  = 0.0;
867:     sum3  = 0.0;
868:     sum4  = 0.0;
869:     sum5  = 0.0;
870:     sum6  = 0.0;
871:     sum7  = 0.0;
872:     sum8  = 0.0;
873:     sum9  = 0.0;
874:     sum10 = 0.0;
875:     sum11 = 0.0;

877:     for (j = 0; j < n; j++) {
878:       xb = x + 11 * (idx[j]);

880:       for (k = 0; k < 11; k++) {
881:         xv = xb[k];
882:         sum1 += v[0] * xv;
883:         sum2 += v[1] * xv;
884:         sum3 += v[2] * xv;
885:         sum4 += v[3] * xv;
886:         sum5 += v[4] * xv;
887:         sum6 += v[5] * xv;
888:         sum7 += v[6] * xv;
889:         sum8 += v[7] * xv;
890:         sum9 += v[8] * xv;
891:         sum10 += v[9] * xv;
892:         sum11 += v[10] * xv;
893:         v += 11;
894:       }
895:     }
896:     if (usecprow) z = zarray + 11 * ridx[i];
897:     z[0]  = sum1;
898:     z[1]  = sum2;
899:     z[2]  = sum3;
900:     z[3]  = sum4;
901:     z[4]  = sum5;
902:     z[5]  = sum6;
903:     z[6]  = sum7;
904:     z[7]  = sum8;
905:     z[8]  = sum9;
906:     z[9]  = sum10;
907:     z[10] = sum11;

909:     if (!usecprow) z += 11;
910:   }

912:   VecRestoreArrayRead(xx, &x);
913:   VecRestoreArrayWrite(zz, &zarray);
914:   PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt);
915:   return 0;
916: }

918: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
919: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
920: {
921:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
922:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
923:   const PetscScalar *x, *xb;
924:   PetscScalar       *zarray, xv;
925:   const MatScalar   *v;
926:   const PetscInt    *ii, *ij = a->j, *idx;
927:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
928:   PetscBool          usecprow = a->compressedrow.use;

930:   VecGetArrayRead(xx, &x);
931:   VecGetArrayWrite(zz, &zarray);

933:   v = a->a;
934:   if (usecprow) {
935:     mbs  = a->compressedrow.nrows;
936:     ii   = a->compressedrow.i;
937:     ridx = a->compressedrow.rindex;
938:     PetscArrayzero(zarray, 12 * a->mbs);
939:   } else {
940:     mbs = a->mbs;
941:     ii  = a->i;
942:     z   = zarray;
943:   }

945:   for (i = 0; i < mbs; i++) {
946:     n     = ii[i + 1] - ii[i];
947:     idx   = ij + ii[i];
948:     sum1  = 0.0;
949:     sum2  = 0.0;
950:     sum3  = 0.0;
951:     sum4  = 0.0;
952:     sum5  = 0.0;
953:     sum6  = 0.0;
954:     sum7  = 0.0;
955:     sum8  = 0.0;
956:     sum9  = 0.0;
957:     sum10 = 0.0;
958:     sum11 = 0.0;
959:     sum12 = 0.0;

961:     for (j = 0; j < n; j++) {
962:       xb = x + 12 * (idx[j]);

964:       for (k = 0; k < 12; k++) {
965:         xv = xb[k];
966:         sum1 += v[0] * xv;
967:         sum2 += v[1] * xv;
968:         sum3 += v[2] * xv;
969:         sum4 += v[3] * xv;
970:         sum5 += v[4] * xv;
971:         sum6 += v[5] * xv;
972:         sum7 += v[6] * xv;
973:         sum8 += v[7] * xv;
974:         sum9 += v[8] * xv;
975:         sum10 += v[9] * xv;
976:         sum11 += v[10] * xv;
977:         sum12 += v[11] * xv;
978:         v += 12;
979:       }
980:     }
981:     if (usecprow) z = zarray + 12 * ridx[i];
982:     z[0]  = sum1;
983:     z[1]  = sum2;
984:     z[2]  = sum3;
985:     z[3]  = sum4;
986:     z[4]  = sum5;
987:     z[5]  = sum6;
988:     z[6]  = sum7;
989:     z[7]  = sum8;
990:     z[8]  = sum9;
991:     z[9]  = sum10;
992:     z[10] = sum11;
993:     z[11] = sum12;
994:     if (!usecprow) z += 12;
995:   }
996:   VecRestoreArrayRead(xx, &x);
997:   VecRestoreArrayWrite(zz, &zarray);
998:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
999:   return 0;
1000: }

1002: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1003: {
1004:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1005:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1006:   const PetscScalar *x, *xb;
1007:   PetscScalar       *zarray, *yarray, xv;
1008:   const MatScalar   *v;
1009:   const PetscInt    *ii, *ij = a->j, *idx;
1010:   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1011:   PetscBool          usecprow = a->compressedrow.use;

1013:   VecGetArrayRead(xx, &x);
1014:   VecGetArrayPair(yy, zz, &yarray, &zarray);

1016:   v = a->a;
1017:   if (usecprow) {
1018:     if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1019:     mbs  = a->compressedrow.nrows;
1020:     ii   = a->compressedrow.i;
1021:     ridx = a->compressedrow.rindex;
1022:   } else {
1023:     ii = a->i;
1024:     y  = yarray;
1025:     z  = zarray;
1026:   }

1028:   for (i = 0; i < mbs; i++) {
1029:     n   = ii[i + 1] - ii[i];
1030:     idx = ij + ii[i];

1032:     if (usecprow) {
1033:       y = yarray + 12 * ridx[i];
1034:       z = zarray + 12 * ridx[i];
1035:     }
1036:     sum1  = y[0];
1037:     sum2  = y[1];
1038:     sum3  = y[2];
1039:     sum4  = y[3];
1040:     sum5  = y[4];
1041:     sum6  = y[5];
1042:     sum7  = y[6];
1043:     sum8  = y[7];
1044:     sum9  = y[8];
1045:     sum10 = y[9];
1046:     sum11 = y[10];
1047:     sum12 = y[11];

1049:     for (j = 0; j < n; j++) {
1050:       xb = x + 12 * (idx[j]);

1052:       for (k = 0; k < 12; k++) {
1053:         xv = xb[k];
1054:         sum1 += v[0] * xv;
1055:         sum2 += v[1] * xv;
1056:         sum3 += v[2] * xv;
1057:         sum4 += v[3] * xv;
1058:         sum5 += v[4] * xv;
1059:         sum6 += v[5] * xv;
1060:         sum7 += v[6] * xv;
1061:         sum8 += v[7] * xv;
1062:         sum9 += v[8] * xv;
1063:         sum10 += v[9] * xv;
1064:         sum11 += v[10] * xv;
1065:         sum12 += v[11] * xv;
1066:         v += 12;
1067:       }
1068:     }

1070:     z[0]  = sum1;
1071:     z[1]  = sum2;
1072:     z[2]  = sum3;
1073:     z[3]  = sum4;
1074:     z[4]  = sum5;
1075:     z[5]  = sum6;
1076:     z[6]  = sum7;
1077:     z[7]  = sum8;
1078:     z[8]  = sum9;
1079:     z[9]  = sum10;
1080:     z[10] = sum11;
1081:     z[11] = sum12;
1082:     if (!usecprow) {
1083:       y += 12;
1084:       z += 12;
1085:     }
1086:   }
1087:   VecRestoreArrayRead(xx, &x);
1088:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1089:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1090:   return 0;
1091: }

1093: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1094: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1095: {
1096:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1097:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1098:   const PetscScalar *x, *xb;
1099:   PetscScalar        x1, x2, x3, x4, *zarray;
1100:   const MatScalar   *v;
1101:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1102:   PetscInt           mbs, i, j, n;
1103:   PetscBool          usecprow = a->compressedrow.use;

1105:   VecGetArrayRead(xx, &x);
1106:   VecGetArrayWrite(zz, &zarray);

1108:   v = a->a;
1109:   if (usecprow) {
1110:     mbs  = a->compressedrow.nrows;
1111:     ii   = a->compressedrow.i;
1112:     ridx = a->compressedrow.rindex;
1113:     PetscArrayzero(zarray, 12 * a->mbs);
1114:   } else {
1115:     mbs = a->mbs;
1116:     ii  = a->i;
1117:     z   = zarray;
1118:   }

1120:   for (i = 0; i < mbs; i++) {
1121:     n   = ii[i + 1] - ii[i];
1122:     idx = ij + ii[i];

1124:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1125:     for (j = 0; j < n; j++) {
1126:       xb = x + 12 * (idx[j]);
1127:       x1 = xb[0];
1128:       x2 = xb[1];
1129:       x3 = xb[2];
1130:       x4 = xb[3];

1132:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1133:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1134:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1135:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1136:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1137:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1138:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1139:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1140:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1141:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1142:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1143:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1144:       v += 48;

1146:       x1 = xb[4];
1147:       x2 = xb[5];
1148:       x3 = xb[6];
1149:       x4 = xb[7];

1151:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1152:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1153:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1154:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1155:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1156:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1157:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1158:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1159:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1160:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1161:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1162:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1163:       v += 48;

1165:       x1 = xb[8];
1166:       x2 = xb[9];
1167:       x3 = xb[10];
1168:       x4 = xb[11];
1169:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1170:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1171:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1172:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1173:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1174:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1175:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1176:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1177:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1178:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1179:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1180:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1181:       v += 48;
1182:     }
1183:     if (usecprow) z = zarray + 12 * ridx[i];
1184:     z[0]  = sum1;
1185:     z[1]  = sum2;
1186:     z[2]  = sum3;
1187:     z[3]  = sum4;
1188:     z[4]  = sum5;
1189:     z[5]  = sum6;
1190:     z[6]  = sum7;
1191:     z[7]  = sum8;
1192:     z[8]  = sum9;
1193:     z[9]  = sum10;
1194:     z[10] = sum11;
1195:     z[11] = sum12;
1196:     if (!usecprow) z += 12;
1197:   }
1198:   VecRestoreArrayRead(xx, &x);
1199:   VecRestoreArrayWrite(zz, &zarray);
1200:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1201:   return 0;
1202: }

1204: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1205: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1206: {
1207:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1208:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1209:   const PetscScalar *x, *xb;
1210:   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1211:   const MatScalar   *v;
1212:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1213:   PetscInt           mbs      = a->mbs, i, j, n;
1214:   PetscBool          usecprow = a->compressedrow.use;

1216:   VecGetArrayRead(xx, &x);
1217:   VecGetArrayPair(yy, zz, &yarray, &zarray);

1219:   v = a->a;
1220:   if (usecprow) {
1221:     if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1222:     mbs  = a->compressedrow.nrows;
1223:     ii   = a->compressedrow.i;
1224:     ridx = a->compressedrow.rindex;
1225:   } else {
1226:     ii = a->i;
1227:     y  = yarray;
1228:     z  = zarray;
1229:   }

1231:   for (i = 0; i < mbs; i++) {
1232:     n   = ii[i + 1] - ii[i];
1233:     idx = ij + ii[i];

1235:     if (usecprow) {
1236:       y = yarray + 12 * ridx[i];
1237:       z = zarray + 12 * ridx[i];
1238:     }
1239:     sum1  = y[0];
1240:     sum2  = y[1];
1241:     sum3  = y[2];
1242:     sum4  = y[3];
1243:     sum5  = y[4];
1244:     sum6  = y[5];
1245:     sum7  = y[6];
1246:     sum8  = y[7];
1247:     sum9  = y[8];
1248:     sum10 = y[9];
1249:     sum11 = y[10];
1250:     sum12 = y[11];

1252:     for (j = 0; j < n; j++) {
1253:       xb = x + 12 * (idx[j]);
1254:       x1 = xb[0];
1255:       x2 = xb[1];
1256:       x3 = xb[2];
1257:       x4 = xb[3];

1259:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1260:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1261:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1262:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1263:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1264:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1265:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1266:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1267:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1268:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1269:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1270:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1271:       v += 48;

1273:       x1 = xb[4];
1274:       x2 = xb[5];
1275:       x3 = xb[6];
1276:       x4 = xb[7];

1278:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1279:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1280:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1281:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1282:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1283:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1284:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1285:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1286:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1287:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1288:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1289:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1290:       v += 48;

1292:       x1 = xb[8];
1293:       x2 = xb[9];
1294:       x3 = xb[10];
1295:       x4 = xb[11];
1296:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1297:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1298:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1299:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1300:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1301:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1302:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1303:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1304:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1305:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1306:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1307:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1308:       v += 48;
1309:     }
1310:     z[0]  = sum1;
1311:     z[1]  = sum2;
1312:     z[2]  = sum3;
1313:     z[3]  = sum4;
1314:     z[4]  = sum5;
1315:     z[5]  = sum6;
1316:     z[6]  = sum7;
1317:     z[7]  = sum8;
1318:     z[8]  = sum9;
1319:     z[9]  = sum10;
1320:     z[10] = sum11;
1321:     z[11] = sum12;
1322:     if (!usecprow) {
1323:       y += 12;
1324:       z += 12;
1325:     }
1326:   }
1327:   VecRestoreArrayRead(xx, &x);
1328:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1329:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1330:   return 0;
1331: }

1333: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1334: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1335: {
1336:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1337:   PetscScalar       *z = NULL, *zarray;
1338:   const PetscScalar *x, *work;
1339:   const MatScalar   *v = a->a;
1340:   PetscInt           mbs, i, j, n;
1341:   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1342:   PetscBool          usecprow = a->compressedrow.use;
1343:   const PetscInt     bs = 12, bs2 = 144;

1345:   __m256d a0, a1, a2, a3, a4, a5;
1346:   __m256d w0, w1, w2, w3;
1347:   __m256d z0, z1, z2;

1349:   VecGetArrayRead(xx, &x);
1350:   VecGetArrayWrite(zz, &zarray);

1352:   if (usecprow) {
1353:     mbs  = a->compressedrow.nrows;
1354:     ii   = a->compressedrow.i;
1355:     ridx = a->compressedrow.rindex;
1356:     PetscArrayzero(zarray, bs * a->mbs);
1357:   } else {
1358:     mbs = a->mbs;
1359:     ii  = a->i;
1360:     z   = zarray;
1361:   }

1363:   for (i = 0; i < mbs; i++) {
1364:     z0 = _mm256_setzero_pd();
1365:     z1 = _mm256_setzero_pd();
1366:     z2 = _mm256_setzero_pd();

1368:     n = ii[1] - ii[0];
1369:     ii++;
1370:     for (j = 0; j < n; j++) {
1371:       work = x + bs * (*idx++);

1373:       /* first column of a */
1374:       w0 = _mm256_set1_pd(work[0]);
1375:       a0 = _mm256_loadu_pd(v + 0);
1376:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1377:       a1 = _mm256_loadu_pd(v + 4);
1378:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1379:       a2 = _mm256_loadu_pd(v + 8);
1380:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1382:       /* second column of a */
1383:       w1 = _mm256_set1_pd(work[1]);
1384:       a3 = _mm256_loadu_pd(v + 12);
1385:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1386:       a4 = _mm256_loadu_pd(v + 16);
1387:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1388:       a5 = _mm256_loadu_pd(v + 20);
1389:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1391:       /* third column of a */
1392:       w2 = _mm256_set1_pd(work[2]);
1393:       a0 = _mm256_loadu_pd(v + 24);
1394:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1395:       a1 = _mm256_loadu_pd(v + 28);
1396:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1397:       a2 = _mm256_loadu_pd(v + 32);
1398:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1400:       /* fourth column of a */
1401:       w3 = _mm256_set1_pd(work[3]);
1402:       a3 = _mm256_loadu_pd(v + 36);
1403:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1404:       a4 = _mm256_loadu_pd(v + 40);
1405:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1406:       a5 = _mm256_loadu_pd(v + 44);
1407:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1409:       /* fifth column of a */
1410:       w0 = _mm256_set1_pd(work[4]);
1411:       a0 = _mm256_loadu_pd(v + 48);
1412:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1413:       a1 = _mm256_loadu_pd(v + 52);
1414:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1415:       a2 = _mm256_loadu_pd(v + 56);
1416:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1418:       /* sixth column of a */
1419:       w1 = _mm256_set1_pd(work[5]);
1420:       a3 = _mm256_loadu_pd(v + 60);
1421:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1422:       a4 = _mm256_loadu_pd(v + 64);
1423:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1424:       a5 = _mm256_loadu_pd(v + 68);
1425:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1427:       /* seventh column of a */
1428:       w2 = _mm256_set1_pd(work[6]);
1429:       a0 = _mm256_loadu_pd(v + 72);
1430:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1431:       a1 = _mm256_loadu_pd(v + 76);
1432:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1433:       a2 = _mm256_loadu_pd(v + 80);
1434:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1436:       /* eighth column of a */
1437:       w3 = _mm256_set1_pd(work[7]);
1438:       a3 = _mm256_loadu_pd(v + 84);
1439:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1440:       a4 = _mm256_loadu_pd(v + 88);
1441:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1442:       a5 = _mm256_loadu_pd(v + 92);
1443:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1445:       /* ninth column of a */
1446:       w0 = _mm256_set1_pd(work[8]);
1447:       a0 = _mm256_loadu_pd(v + 96);
1448:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1449:       a1 = _mm256_loadu_pd(v + 100);
1450:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1451:       a2 = _mm256_loadu_pd(v + 104);
1452:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1454:       /* tenth column of a */
1455:       w1 = _mm256_set1_pd(work[9]);
1456:       a3 = _mm256_loadu_pd(v + 108);
1457:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1458:       a4 = _mm256_loadu_pd(v + 112);
1459:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1460:       a5 = _mm256_loadu_pd(v + 116);
1461:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1463:       /* eleventh column of a */
1464:       w2 = _mm256_set1_pd(work[10]);
1465:       a0 = _mm256_loadu_pd(v + 120);
1466:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1467:       a1 = _mm256_loadu_pd(v + 124);
1468:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1469:       a2 = _mm256_loadu_pd(v + 128);
1470:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1472:       /* twelveth column of a */
1473:       w3 = _mm256_set1_pd(work[11]);
1474:       a3 = _mm256_loadu_pd(v + 132);
1475:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1476:       a4 = _mm256_loadu_pd(v + 136);
1477:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1478:       a5 = _mm256_loadu_pd(v + 140);
1479:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1481:       v += bs2;
1482:     }
1483:     if (usecprow) z = zarray + bs * ridx[i];
1484:     _mm256_storeu_pd(&z[0], z0);
1485:     _mm256_storeu_pd(&z[4], z1);
1486:     _mm256_storeu_pd(&z[8], z2);
1487:     if (!usecprow) z += bs;
1488:   }
1489:   VecRestoreArrayRead(xx, &x);
1490:   VecRestoreArrayWrite(zz, &zarray);
1491:   PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
1492:   return 0;
1493: }
1494: #endif

1496: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1497: /* Default MatMult for block size 15 */
1498: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1499: {
1500:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1501:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1502:   const PetscScalar *x, *xb;
1503:   PetscScalar       *zarray, xv;
1504:   const MatScalar   *v;
1505:   const PetscInt    *ii, *ij = a->j, *idx;
1506:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1507:   PetscBool          usecprow = a->compressedrow.use;

1509:   VecGetArrayRead(xx, &x);
1510:   VecGetArrayWrite(zz, &zarray);

1512:   v = a->a;
1513:   if (usecprow) {
1514:     mbs  = a->compressedrow.nrows;
1515:     ii   = a->compressedrow.i;
1516:     ridx = a->compressedrow.rindex;
1517:     PetscArrayzero(zarray, 15 * a->mbs);
1518:   } else {
1519:     mbs = a->mbs;
1520:     ii  = a->i;
1521:     z   = zarray;
1522:   }

1524:   for (i = 0; i < mbs; i++) {
1525:     n     = ii[i + 1] - ii[i];
1526:     idx   = ij + ii[i];
1527:     sum1  = 0.0;
1528:     sum2  = 0.0;
1529:     sum3  = 0.0;
1530:     sum4  = 0.0;
1531:     sum5  = 0.0;
1532:     sum6  = 0.0;
1533:     sum7  = 0.0;
1534:     sum8  = 0.0;
1535:     sum9  = 0.0;
1536:     sum10 = 0.0;
1537:     sum11 = 0.0;
1538:     sum12 = 0.0;
1539:     sum13 = 0.0;
1540:     sum14 = 0.0;
1541:     sum15 = 0.0;

1543:     for (j = 0; j < n; j++) {
1544:       xb = x + 15 * (idx[j]);

1546:       for (k = 0; k < 15; k++) {
1547:         xv = xb[k];
1548:         sum1 += v[0] * xv;
1549:         sum2 += v[1] * xv;
1550:         sum3 += v[2] * xv;
1551:         sum4 += v[3] * xv;
1552:         sum5 += v[4] * xv;
1553:         sum6 += v[5] * xv;
1554:         sum7 += v[6] * xv;
1555:         sum8 += v[7] * xv;
1556:         sum9 += v[8] * xv;
1557:         sum10 += v[9] * xv;
1558:         sum11 += v[10] * xv;
1559:         sum12 += v[11] * xv;
1560:         sum13 += v[12] * xv;
1561:         sum14 += v[13] * xv;
1562:         sum15 += v[14] * xv;
1563:         v += 15;
1564:       }
1565:     }
1566:     if (usecprow) z = zarray + 15 * ridx[i];
1567:     z[0]  = sum1;
1568:     z[1]  = sum2;
1569:     z[2]  = sum3;
1570:     z[3]  = sum4;
1571:     z[4]  = sum5;
1572:     z[5]  = sum6;
1573:     z[6]  = sum7;
1574:     z[7]  = sum8;
1575:     z[8]  = sum9;
1576:     z[9]  = sum10;
1577:     z[10] = sum11;
1578:     z[11] = sum12;
1579:     z[12] = sum13;
1580:     z[13] = sum14;
1581:     z[14] = sum15;

1583:     if (!usecprow) z += 15;
1584:   }

1586:   VecRestoreArrayRead(xx, &x);
1587:   VecRestoreArrayWrite(zz, &zarray);
1588:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1589:   return 0;
1590: }

1592: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1593: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1594: {
1595:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1596:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1597:   const PetscScalar *x, *xb;
1598:   PetscScalar        x1, x2, x3, x4, *zarray;
1599:   const MatScalar   *v;
1600:   const PetscInt    *ii, *ij = a->j, *idx;
1601:   PetscInt           mbs, i, j, n, *ridx = NULL;
1602:   PetscBool          usecprow = a->compressedrow.use;

1604:   VecGetArrayRead(xx, &x);
1605:   VecGetArrayWrite(zz, &zarray);

1607:   v = a->a;
1608:   if (usecprow) {
1609:     mbs  = a->compressedrow.nrows;
1610:     ii   = a->compressedrow.i;
1611:     ridx = a->compressedrow.rindex;
1612:     PetscArrayzero(zarray, 15 * a->mbs);
1613:   } else {
1614:     mbs = a->mbs;
1615:     ii  = a->i;
1616:     z   = zarray;
1617:   }

1619:   for (i = 0; i < mbs; i++) {
1620:     n     = ii[i + 1] - ii[i];
1621:     idx   = ij + ii[i];
1622:     sum1  = 0.0;
1623:     sum2  = 0.0;
1624:     sum3  = 0.0;
1625:     sum4  = 0.0;
1626:     sum5  = 0.0;
1627:     sum6  = 0.0;
1628:     sum7  = 0.0;
1629:     sum8  = 0.0;
1630:     sum9  = 0.0;
1631:     sum10 = 0.0;
1632:     sum11 = 0.0;
1633:     sum12 = 0.0;
1634:     sum13 = 0.0;
1635:     sum14 = 0.0;
1636:     sum15 = 0.0;

1638:     for (j = 0; j < n; j++) {
1639:       xb = x + 15 * (idx[j]);
1640:       x1 = xb[0];
1641:       x2 = xb[1];
1642:       x3 = xb[2];
1643:       x4 = xb[3];

1645:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1646:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1647:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1648:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1649:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1650:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1651:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1652:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1653:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1654:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1655:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1656:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1657:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1658:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1659:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;

1661:       v += 60;

1663:       x1 = xb[4];
1664:       x2 = xb[5];
1665:       x3 = xb[6];
1666:       x4 = xb[7];

1668:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1669:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1670:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1671:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1672:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1673:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1674:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1675:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1676:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1677:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1678:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1679:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1680:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1681:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1682:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1683:       v += 60;

1685:       x1 = xb[8];
1686:       x2 = xb[9];
1687:       x3 = xb[10];
1688:       x4 = xb[11];
1689:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1690:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1691:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1692:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1693:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1694:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1695:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1696:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1697:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1698:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1699:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1700:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1701:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1702:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1703:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1704:       v += 60;

1706:       x1 = xb[12];
1707:       x2 = xb[13];
1708:       x3 = xb[14];
1709:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1710:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1711:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1712:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1713:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1714:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1715:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1716:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1717:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1718:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1719:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1720:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1721:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1722:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1723:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1724:       v += 45;
1725:     }
1726:     if (usecprow) z = zarray + 15 * ridx[i];
1727:     z[0]  = sum1;
1728:     z[1]  = sum2;
1729:     z[2]  = sum3;
1730:     z[3]  = sum4;
1731:     z[4]  = sum5;
1732:     z[5]  = sum6;
1733:     z[6]  = sum7;
1734:     z[7]  = sum8;
1735:     z[8]  = sum9;
1736:     z[9]  = sum10;
1737:     z[10] = sum11;
1738:     z[11] = sum12;
1739:     z[12] = sum13;
1740:     z[13] = sum14;
1741:     z[14] = sum15;

1743:     if (!usecprow) z += 15;
1744:   }

1746:   VecRestoreArrayRead(xx, &x);
1747:   VecRestoreArrayWrite(zz, &zarray);
1748:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1749:   return 0;
1750: }

1752: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1753: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1754: {
1755:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1756:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1757:   const PetscScalar *x, *xb;
1758:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1759:   const MatScalar   *v;
1760:   const PetscInt    *ii, *ij = a->j, *idx;
1761:   PetscInt           mbs, i, j, n, *ridx = NULL;
1762:   PetscBool          usecprow = a->compressedrow.use;

1764:   VecGetArrayRead(xx, &x);
1765:   VecGetArrayWrite(zz, &zarray);

1767:   v = a->a;
1768:   if (usecprow) {
1769:     mbs  = a->compressedrow.nrows;
1770:     ii   = a->compressedrow.i;
1771:     ridx = a->compressedrow.rindex;
1772:     PetscArrayzero(zarray, 15 * a->mbs);
1773:   } else {
1774:     mbs = a->mbs;
1775:     ii  = a->i;
1776:     z   = zarray;
1777:   }

1779:   for (i = 0; i < mbs; i++) {
1780:     n     = ii[i + 1] - ii[i];
1781:     idx   = ij + ii[i];
1782:     sum1  = 0.0;
1783:     sum2  = 0.0;
1784:     sum3  = 0.0;
1785:     sum4  = 0.0;
1786:     sum5  = 0.0;
1787:     sum6  = 0.0;
1788:     sum7  = 0.0;
1789:     sum8  = 0.0;
1790:     sum9  = 0.0;
1791:     sum10 = 0.0;
1792:     sum11 = 0.0;
1793:     sum12 = 0.0;
1794:     sum13 = 0.0;
1795:     sum14 = 0.0;
1796:     sum15 = 0.0;

1798:     for (j = 0; j < n; j++) {
1799:       xb = x + 15 * (idx[j]);
1800:       x1 = xb[0];
1801:       x2 = xb[1];
1802:       x3 = xb[2];
1803:       x4 = xb[3];
1804:       x5 = xb[4];
1805:       x6 = xb[5];
1806:       x7 = xb[6];
1807:       x8 = xb[7];

1809:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1810:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1811:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1812:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1813:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1814:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1815:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1816:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1817:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1818:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1819:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1820:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1821:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1822:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1823:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1824:       v += 120;

1826:       x1 = xb[8];
1827:       x2 = xb[9];
1828:       x3 = xb[10];
1829:       x4 = xb[11];
1830:       x5 = xb[12];
1831:       x6 = xb[13];
1832:       x7 = xb[14];

1834:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1835:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1836:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1837:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1838:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1839:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1840:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1841:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1842:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1843:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1844:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1845:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1846:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1847:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1848:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1849:       v += 105;
1850:     }
1851:     if (usecprow) z = zarray + 15 * ridx[i];
1852:     z[0]  = sum1;
1853:     z[1]  = sum2;
1854:     z[2]  = sum3;
1855:     z[3]  = sum4;
1856:     z[4]  = sum5;
1857:     z[5]  = sum6;
1858:     z[6]  = sum7;
1859:     z[7]  = sum8;
1860:     z[8]  = sum9;
1861:     z[9]  = sum10;
1862:     z[10] = sum11;
1863:     z[11] = sum12;
1864:     z[12] = sum13;
1865:     z[13] = sum14;
1866:     z[14] = sum15;

1868:     if (!usecprow) z += 15;
1869:   }

1871:   VecRestoreArrayRead(xx, &x);
1872:   VecRestoreArrayWrite(zz, &zarray);
1873:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1874:   return 0;
1875: }

1877: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1878: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1879: {
1880:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1881:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1882:   const PetscScalar *x, *xb;
1883:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1884:   const MatScalar   *v;
1885:   const PetscInt    *ii, *ij = a->j, *idx;
1886:   PetscInt           mbs, i, j, n, *ridx = NULL;
1887:   PetscBool          usecprow = a->compressedrow.use;

1889:   VecGetArrayRead(xx, &x);
1890:   VecGetArrayWrite(zz, &zarray);

1892:   v = a->a;
1893:   if (usecprow) {
1894:     mbs  = a->compressedrow.nrows;
1895:     ii   = a->compressedrow.i;
1896:     ridx = a->compressedrow.rindex;
1897:     PetscArrayzero(zarray, 15 * a->mbs);
1898:   } else {
1899:     mbs = a->mbs;
1900:     ii  = a->i;
1901:     z   = zarray;
1902:   }

1904:   for (i = 0; i < mbs; i++) {
1905:     n     = ii[i + 1] - ii[i];
1906:     idx   = ij + ii[i];
1907:     sum1  = 0.0;
1908:     sum2  = 0.0;
1909:     sum3  = 0.0;
1910:     sum4  = 0.0;
1911:     sum5  = 0.0;
1912:     sum6  = 0.0;
1913:     sum7  = 0.0;
1914:     sum8  = 0.0;
1915:     sum9  = 0.0;
1916:     sum10 = 0.0;
1917:     sum11 = 0.0;
1918:     sum12 = 0.0;
1919:     sum13 = 0.0;
1920:     sum14 = 0.0;
1921:     sum15 = 0.0;

1923:     for (j = 0; j < n; j++) {
1924:       xb  = x + 15 * (idx[j]);
1925:       x1  = xb[0];
1926:       x2  = xb[1];
1927:       x3  = xb[2];
1928:       x4  = xb[3];
1929:       x5  = xb[4];
1930:       x6  = xb[5];
1931:       x7  = xb[6];
1932:       x8  = xb[7];
1933:       x9  = xb[8];
1934:       x10 = xb[9];
1935:       x11 = xb[10];
1936:       x12 = xb[11];
1937:       x13 = xb[12];
1938:       x14 = xb[13];
1939:       x15 = xb[14];

1941:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1942:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1943:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1944:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1945:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1946:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1947:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1948:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1949:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1950:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1951:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1952:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1953:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1954:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1955:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1956:       v += 225;
1957:     }
1958:     if (usecprow) z = zarray + 15 * ridx[i];
1959:     z[0]  = sum1;
1960:     z[1]  = sum2;
1961:     z[2]  = sum3;
1962:     z[3]  = sum4;
1963:     z[4]  = sum5;
1964:     z[5]  = sum6;
1965:     z[6]  = sum7;
1966:     z[7]  = sum8;
1967:     z[8]  = sum9;
1968:     z[9]  = sum10;
1969:     z[10] = sum11;
1970:     z[11] = sum12;
1971:     z[12] = sum13;
1972:     z[13] = sum14;
1973:     z[14] = sum15;

1975:     if (!usecprow) z += 15;
1976:   }

1978:   VecRestoreArrayRead(xx, &x);
1979:   VecRestoreArrayWrite(zz, &zarray);
1980:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1981:   return 0;
1982: }

1984: /*
1985:     This will not work with MatScalar == float because it calls the BLAS
1986: */
1987: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1988: {
1989:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1990:   PetscScalar       *z = NULL, *work, *workt, *zarray;
1991:   const PetscScalar *x, *xb;
1992:   const MatScalar   *v;
1993:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1994:   const PetscInt    *idx, *ii, *ridx = NULL;
1995:   PetscInt           ncols, k;
1996:   PetscBool          usecprow = a->compressedrow.use;

1998:   VecGetArrayRead(xx, &x);
1999:   VecGetArrayWrite(zz, &zarray);

2001:   idx = a->j;
2002:   v   = a->a;
2003:   if (usecprow) {
2004:     mbs  = a->compressedrow.nrows;
2005:     ii   = a->compressedrow.i;
2006:     ridx = a->compressedrow.rindex;
2007:     PetscArrayzero(zarray, bs * a->mbs);
2008:   } else {
2009:     mbs = a->mbs;
2010:     ii  = a->i;
2011:     z   = zarray;
2012:   }

2014:   if (!a->mult_work) {
2015:     k = PetscMax(A->rmap->n, A->cmap->n);
2016:     PetscMalloc1(k + 1, &a->mult_work);
2017:   }
2018:   work = a->mult_work;
2019:   for (i = 0; i < mbs; i++) {
2020:     n = ii[1] - ii[0];
2021:     ii++;
2022:     ncols = n * bs;
2023:     workt = work;
2024:     for (j = 0; j < n; j++) {
2025:       xb = x + bs * (*idx++);
2026:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2027:       workt += bs;
2028:     }
2029:     if (usecprow) z = zarray + bs * ridx[i];
2030:     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2031:     v += n * bs2;
2032:     if (!usecprow) z += bs;
2033:   }
2034:   VecRestoreArrayRead(xx, &x);
2035:   VecRestoreArrayWrite(zz, &zarray);
2036:   PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
2037:   return 0;
2038: }

2040: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2041: {
2042:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2043:   const PetscScalar *x;
2044:   PetscScalar       *y, *z, sum;
2045:   const MatScalar   *v;
2046:   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2047:   const PetscInt    *idx, *ii;
2048:   PetscBool          usecprow = a->compressedrow.use;

2050:   VecGetArrayRead(xx, &x);
2051:   VecGetArrayPair(yy, zz, &y, &z);

2053:   idx = a->j;
2054:   v   = a->a;
2055:   if (usecprow) {
2056:     if (zz != yy) PetscArraycpy(z, y, mbs);
2057:     mbs  = a->compressedrow.nrows;
2058:     ii   = a->compressedrow.i;
2059:     ridx = a->compressedrow.rindex;
2060:   } else {
2061:     ii = a->i;
2062:   }

2064:   for (i = 0; i < mbs; i++) {
2065:     n = ii[1] - ii[0];
2066:     ii++;
2067:     if (!usecprow) {
2068:       sum = y[i];
2069:     } else {
2070:       sum = y[ridx[i]];
2071:     }
2072:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2073:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2074:     PetscSparseDensePlusDot(sum, x, v, idx, n);
2075:     v += n;
2076:     idx += n;
2077:     if (usecprow) {
2078:       z[ridx[i]] = sum;
2079:     } else {
2080:       z[i] = sum;
2081:     }
2082:   }
2083:   VecRestoreArrayRead(xx, &x);
2084:   VecRestoreArrayPair(yy, zz, &y, &z);
2085:   PetscLogFlops(2.0 * a->nz);
2086:   return 0;
2087: }

2089: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2090: {
2091:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2092:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2093:   const PetscScalar *x, *xb;
2094:   PetscScalar        x1, x2, *yarray, *zarray;
2095:   const MatScalar   *v;
2096:   PetscInt           mbs = a->mbs, i, n, j;
2097:   const PetscInt    *idx, *ii, *ridx = NULL;
2098:   PetscBool          usecprow = a->compressedrow.use;

2100:   VecGetArrayRead(xx, &x);
2101:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2103:   idx = a->j;
2104:   v   = a->a;
2105:   if (usecprow) {
2106:     if (zz != yy) PetscArraycpy(zarray, yarray, 2 * mbs);
2107:     mbs  = a->compressedrow.nrows;
2108:     ii   = a->compressedrow.i;
2109:     ridx = a->compressedrow.rindex;
2110:   } else {
2111:     ii = a->i;
2112:     y  = yarray;
2113:     z  = zarray;
2114:   }

2116:   for (i = 0; i < mbs; i++) {
2117:     n = ii[1] - ii[0];
2118:     ii++;
2119:     if (usecprow) {
2120:       z = zarray + 2 * ridx[i];
2121:       y = yarray + 2 * ridx[i];
2122:     }
2123:     sum1 = y[0];
2124:     sum2 = y[1];
2125:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2126:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2127:     for (j = 0; j < n; j++) {
2128:       xb = x + 2 * (*idx++);
2129:       x1 = xb[0];
2130:       x2 = xb[1];

2132:       sum1 += v[0] * x1 + v[2] * x2;
2133:       sum2 += v[1] * x1 + v[3] * x2;
2134:       v += 4;
2135:     }
2136:     z[0] = sum1;
2137:     z[1] = sum2;
2138:     if (!usecprow) {
2139:       z += 2;
2140:       y += 2;
2141:     }
2142:   }
2143:   VecRestoreArrayRead(xx, &x);
2144:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2145:   PetscLogFlops(4.0 * a->nz);
2146:   return 0;
2147: }

2149: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2150: {
2151:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2152:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2153:   const PetscScalar *x, *xb;
2154:   const MatScalar   *v;
2155:   PetscInt           mbs = a->mbs, i, j, n;
2156:   const PetscInt    *idx, *ii, *ridx = NULL;
2157:   PetscBool          usecprow = a->compressedrow.use;

2159:   VecGetArrayRead(xx, &x);
2160:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2162:   idx = a->j;
2163:   v   = a->a;
2164:   if (usecprow) {
2165:     if (zz != yy) PetscArraycpy(zarray, yarray, 3 * mbs);
2166:     mbs  = a->compressedrow.nrows;
2167:     ii   = a->compressedrow.i;
2168:     ridx = a->compressedrow.rindex;
2169:   } else {
2170:     ii = a->i;
2171:     y  = yarray;
2172:     z  = zarray;
2173:   }

2175:   for (i = 0; i < mbs; i++) {
2176:     n = ii[1] - ii[0];
2177:     ii++;
2178:     if (usecprow) {
2179:       z = zarray + 3 * ridx[i];
2180:       y = yarray + 3 * ridx[i];
2181:     }
2182:     sum1 = y[0];
2183:     sum2 = y[1];
2184:     sum3 = y[2];
2185:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2186:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2187:     for (j = 0; j < n; j++) {
2188:       xb = x + 3 * (*idx++);
2189:       x1 = xb[0];
2190:       x2 = xb[1];
2191:       x3 = xb[2];
2192:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2193:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2194:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2195:       v += 9;
2196:     }
2197:     z[0] = sum1;
2198:     z[1] = sum2;
2199:     z[2] = sum3;
2200:     if (!usecprow) {
2201:       z += 3;
2202:       y += 3;
2203:     }
2204:   }
2205:   VecRestoreArrayRead(xx, &x);
2206:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2207:   PetscLogFlops(18.0 * a->nz);
2208:   return 0;
2209: }

2211: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2212: {
2213:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2214:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2215:   const PetscScalar *x, *xb;
2216:   const MatScalar   *v;
2217:   PetscInt           mbs = a->mbs, i, j, n;
2218:   const PetscInt    *idx, *ii, *ridx = NULL;
2219:   PetscBool          usecprow = a->compressedrow.use;

2221:   VecGetArrayRead(xx, &x);
2222:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2224:   idx = a->j;
2225:   v   = a->a;
2226:   if (usecprow) {
2227:     if (zz != yy) PetscArraycpy(zarray, yarray, 4 * mbs);
2228:     mbs  = a->compressedrow.nrows;
2229:     ii   = a->compressedrow.i;
2230:     ridx = a->compressedrow.rindex;
2231:   } else {
2232:     ii = a->i;
2233:     y  = yarray;
2234:     z  = zarray;
2235:   }

2237:   for (i = 0; i < mbs; i++) {
2238:     n = ii[1] - ii[0];
2239:     ii++;
2240:     if (usecprow) {
2241:       z = zarray + 4 * ridx[i];
2242:       y = yarray + 4 * ridx[i];
2243:     }
2244:     sum1 = y[0];
2245:     sum2 = y[1];
2246:     sum3 = y[2];
2247:     sum4 = y[3];
2248:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2249:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2250:     for (j = 0; j < n; j++) {
2251:       xb = x + 4 * (*idx++);
2252:       x1 = xb[0];
2253:       x2 = xb[1];
2254:       x3 = xb[2];
2255:       x4 = xb[3];
2256:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2257:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2258:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2259:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2260:       v += 16;
2261:     }
2262:     z[0] = sum1;
2263:     z[1] = sum2;
2264:     z[2] = sum3;
2265:     z[3] = sum4;
2266:     if (!usecprow) {
2267:       z += 4;
2268:       y += 4;
2269:     }
2270:   }
2271:   VecRestoreArrayRead(xx, &x);
2272:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2273:   PetscLogFlops(32.0 * a->nz);
2274:   return 0;
2275: }

2277: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2278: {
2279:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2280:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2281:   const PetscScalar *x, *xb;
2282:   PetscScalar       *yarray, *zarray;
2283:   const MatScalar   *v;
2284:   PetscInt           mbs = a->mbs, i, j, n;
2285:   const PetscInt    *idx, *ii, *ridx = NULL;
2286:   PetscBool          usecprow = a->compressedrow.use;

2288:   VecGetArrayRead(xx, &x);
2289:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2291:   idx = a->j;
2292:   v   = a->a;
2293:   if (usecprow) {
2294:     if (zz != yy) PetscArraycpy(zarray, yarray, 5 * mbs);
2295:     mbs  = a->compressedrow.nrows;
2296:     ii   = a->compressedrow.i;
2297:     ridx = a->compressedrow.rindex;
2298:   } else {
2299:     ii = a->i;
2300:     y  = yarray;
2301:     z  = zarray;
2302:   }

2304:   for (i = 0; i < mbs; i++) {
2305:     n = ii[1] - ii[0];
2306:     ii++;
2307:     if (usecprow) {
2308:       z = zarray + 5 * ridx[i];
2309:       y = yarray + 5 * ridx[i];
2310:     }
2311:     sum1 = y[0];
2312:     sum2 = y[1];
2313:     sum3 = y[2];
2314:     sum4 = y[3];
2315:     sum5 = y[4];
2316:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2317:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2318:     for (j = 0; j < n; j++) {
2319:       xb = x + 5 * (*idx++);
2320:       x1 = xb[0];
2321:       x2 = xb[1];
2322:       x3 = xb[2];
2323:       x4 = xb[3];
2324:       x5 = xb[4];
2325:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2326:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2327:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2328:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2329:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2330:       v += 25;
2331:     }
2332:     z[0] = sum1;
2333:     z[1] = sum2;
2334:     z[2] = sum3;
2335:     z[3] = sum4;
2336:     z[4] = sum5;
2337:     if (!usecprow) {
2338:       z += 5;
2339:       y += 5;
2340:     }
2341:   }
2342:   VecRestoreArrayRead(xx, &x);
2343:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2344:   PetscLogFlops(50.0 * a->nz);
2345:   return 0;
2346: }

2348: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2349: {
2350:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2351:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2352:   const PetscScalar *x, *xb;
2353:   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2354:   const MatScalar   *v;
2355:   PetscInt           mbs = a->mbs, i, j, n;
2356:   const PetscInt    *idx, *ii, *ridx = NULL;
2357:   PetscBool          usecprow = a->compressedrow.use;

2359:   VecGetArrayRead(xx, &x);
2360:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2362:   idx = a->j;
2363:   v   = a->a;
2364:   if (usecprow) {
2365:     if (zz != yy) PetscArraycpy(zarray, yarray, 6 * mbs);
2366:     mbs  = a->compressedrow.nrows;
2367:     ii   = a->compressedrow.i;
2368:     ridx = a->compressedrow.rindex;
2369:   } else {
2370:     ii = a->i;
2371:     y  = yarray;
2372:     z  = zarray;
2373:   }

2375:   for (i = 0; i < mbs; i++) {
2376:     n = ii[1] - ii[0];
2377:     ii++;
2378:     if (usecprow) {
2379:       z = zarray + 6 * ridx[i];
2380:       y = yarray + 6 * ridx[i];
2381:     }
2382:     sum1 = y[0];
2383:     sum2 = y[1];
2384:     sum3 = y[2];
2385:     sum4 = y[3];
2386:     sum5 = y[4];
2387:     sum6 = y[5];
2388:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2389:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2390:     for (j = 0; j < n; j++) {
2391:       xb = x + 6 * (*idx++);
2392:       x1 = xb[0];
2393:       x2 = xb[1];
2394:       x3 = xb[2];
2395:       x4 = xb[3];
2396:       x5 = xb[4];
2397:       x6 = xb[5];
2398:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2399:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2400:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2401:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2402:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2403:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2404:       v += 36;
2405:     }
2406:     z[0] = sum1;
2407:     z[1] = sum2;
2408:     z[2] = sum3;
2409:     z[3] = sum4;
2410:     z[4] = sum5;
2411:     z[5] = sum6;
2412:     if (!usecprow) {
2413:       z += 6;
2414:       y += 6;
2415:     }
2416:   }
2417:   VecRestoreArrayRead(xx, &x);
2418:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2419:   PetscLogFlops(72.0 * a->nz);
2420:   return 0;
2421: }

2423: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2424: {
2425:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2426:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2427:   const PetscScalar *x, *xb;
2428:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2429:   const MatScalar   *v;
2430:   PetscInt           mbs = a->mbs, i, j, n;
2431:   const PetscInt    *idx, *ii, *ridx = NULL;
2432:   PetscBool          usecprow = a->compressedrow.use;

2434:   VecGetArrayRead(xx, &x);
2435:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2437:   idx = a->j;
2438:   v   = a->a;
2439:   if (usecprow) {
2440:     if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2441:     mbs  = a->compressedrow.nrows;
2442:     ii   = a->compressedrow.i;
2443:     ridx = a->compressedrow.rindex;
2444:   } else {
2445:     ii = a->i;
2446:     y  = yarray;
2447:     z  = zarray;
2448:   }

2450:   for (i = 0; i < mbs; i++) {
2451:     n = ii[1] - ii[0];
2452:     ii++;
2453:     if (usecprow) {
2454:       z = zarray + 7 * ridx[i];
2455:       y = yarray + 7 * ridx[i];
2456:     }
2457:     sum1 = y[0];
2458:     sum2 = y[1];
2459:     sum3 = y[2];
2460:     sum4 = y[3];
2461:     sum5 = y[4];
2462:     sum6 = y[5];
2463:     sum7 = y[6];
2464:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2465:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2466:     for (j = 0; j < n; j++) {
2467:       xb = x + 7 * (*idx++);
2468:       x1 = xb[0];
2469:       x2 = xb[1];
2470:       x3 = xb[2];
2471:       x4 = xb[3];
2472:       x5 = xb[4];
2473:       x6 = xb[5];
2474:       x7 = xb[6];
2475:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2476:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2477:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2478:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2479:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2480:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2481:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2482:       v += 49;
2483:     }
2484:     z[0] = sum1;
2485:     z[1] = sum2;
2486:     z[2] = sum3;
2487:     z[3] = sum4;
2488:     z[4] = sum5;
2489:     z[5] = sum6;
2490:     z[6] = sum7;
2491:     if (!usecprow) {
2492:       z += 7;
2493:       y += 7;
2494:     }
2495:   }
2496:   VecRestoreArrayRead(xx, &x);
2497:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2498:   PetscLogFlops(98.0 * a->nz);
2499:   return 0;
2500: }

2502: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2503: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2504: {
2505:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2506:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2507:   const PetscScalar *x, *xb;
2508:   const MatScalar   *v;
2509:   PetscInt           mbs, i, j, n;
2510:   PetscInt           k;
2511:   PetscBool          usecprow = a->compressedrow.use;
2512:   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;

2514:   __m256d a0, a1, a2, a3, a4, a5;
2515:   __m256d w0, w1, w2, w3;
2516:   __m256d z0, z1, z2;
2517:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

2519:   VecCopy(yy, zz);
2520:   VecGetArrayRead(xx, &x);
2521:   VecGetArray(zz, &zarray);

2523:   idx = a->j;
2524:   v   = a->a;
2525:   if (usecprow) {
2526:     mbs  = a->compressedrow.nrows;
2527:     ii   = a->compressedrow.i;
2528:     ridx = a->compressedrow.rindex;
2529:   } else {
2530:     mbs = a->mbs;
2531:     ii  = a->i;
2532:     z   = zarray;
2533:   }

2535:   if (!a->mult_work) {
2536:     k = PetscMax(A->rmap->n, A->cmap->n);
2537:     PetscMalloc1(k + 1, &a->mult_work);
2538:   }

2540:   work = a->mult_work;
2541:   for (i = 0; i < mbs; i++) {
2542:     n = ii[1] - ii[0];
2543:     ii++;
2544:     workt = work;
2545:     for (j = 0; j < n; j++) {
2546:       xb = x + bs * (*idx++);
2547:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2548:       workt += bs;
2549:     }
2550:     if (usecprow) z = zarray + bs * ridx[i];

2552:     z0 = _mm256_loadu_pd(&z[0]);
2553:     z1 = _mm256_loadu_pd(&z[4]);
2554:     z2 = _mm256_set1_pd(z[8]);

2556:     for (j = 0; j < n; j++) {
2557:       /* first column of a */
2558:       w0 = _mm256_set1_pd(work[j * 9]);
2559:       a0 = _mm256_loadu_pd(&v[j * 81]);
2560:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2561:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2562:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2563:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2564:       z2 = _mm256_fmadd_pd(a2, w0, z2);

2566:       /* second column of a */
2567:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2568:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2569:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2570:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2571:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2572:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2573:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2575:       /* third column of a */
2576:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2577:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2578:       z0 = _mm256_fmadd_pd(a3, w2, z0);
2579:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2580:       z1 = _mm256_fmadd_pd(a4, w2, z1);
2581:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2582:       z2 = _mm256_fmadd_pd(a5, w2, z2);

2584:       /* fourth column of a */
2585:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2586:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2587:       z0 = _mm256_fmadd_pd(a0, w3, z0);
2588:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2589:       z1 = _mm256_fmadd_pd(a1, w3, z1);
2590:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2591:       z2 = _mm256_fmadd_pd(a2, w3, z2);

2593:       /* fifth column of a */
2594:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2595:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2596:       z0 = _mm256_fmadd_pd(a3, w0, z0);
2597:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2598:       z1 = _mm256_fmadd_pd(a4, w0, z1);
2599:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2600:       z2 = _mm256_fmadd_pd(a5, w0, z2);

2602:       /* sixth column of a */
2603:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2604:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2605:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2606:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2607:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2608:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2609:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2611:       /* seventh column of a */
2612:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2613:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2614:       z0 = _mm256_fmadd_pd(a0, w2, z0);
2615:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2616:       z1 = _mm256_fmadd_pd(a1, w2, z1);
2617:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2618:       z2 = _mm256_fmadd_pd(a2, w2, z2);

2620:       /* eighth column of a */
2621:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2622:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2623:       z0 = _mm256_fmadd_pd(a3, w3, z0);
2624:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2625:       z1 = _mm256_fmadd_pd(a4, w3, z1);
2626:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2627:       z2 = _mm256_fmadd_pd(a5, w3, z2);

2629:       /* ninth column of a */
2630:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2631:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2632:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2633:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2634:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2635:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2636:       z2 = _mm256_fmadd_pd(a2, w0, z2);
2637:     }

2639:     _mm256_storeu_pd(&z[0], z0);
2640:     _mm256_storeu_pd(&z[4], z1);
2641:     _mm256_maskstore_pd(&z[8], mask1, z2);

2643:     v += n * bs2;
2644:     if (!usecprow) z += bs;
2645:   }
2646:   VecRestoreArrayRead(xx, &x);
2647:   VecRestoreArray(zz, &zarray);
2648:   PetscLogFlops(162.0 * a->nz);
2649:   return 0;
2650: }
2651: #endif

2653: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2654: {
2655:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2656:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2657:   const PetscScalar *x, *xb;
2658:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2659:   const MatScalar   *v;
2660:   PetscInt           mbs = a->mbs, i, j, n;
2661:   const PetscInt    *idx, *ii, *ridx = NULL;
2662:   PetscBool          usecprow = a->compressedrow.use;

2664:   VecGetArrayRead(xx, &x);
2665:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2667:   idx = a->j;
2668:   v   = a->a;
2669:   if (usecprow) {
2670:     if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2671:     mbs  = a->compressedrow.nrows;
2672:     ii   = a->compressedrow.i;
2673:     ridx = a->compressedrow.rindex;
2674:   } else {
2675:     ii = a->i;
2676:     y  = yarray;
2677:     z  = zarray;
2678:   }

2680:   for (i = 0; i < mbs; i++) {
2681:     n = ii[1] - ii[0];
2682:     ii++;
2683:     if (usecprow) {
2684:       z = zarray + 11 * ridx[i];
2685:       y = yarray + 11 * ridx[i];
2686:     }
2687:     sum1  = y[0];
2688:     sum2  = y[1];
2689:     sum3  = y[2];
2690:     sum4  = y[3];
2691:     sum5  = y[4];
2692:     sum6  = y[5];
2693:     sum7  = y[6];
2694:     sum8  = y[7];
2695:     sum9  = y[8];
2696:     sum10 = y[9];
2697:     sum11 = y[10];
2698:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2699:     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2700:     for (j = 0; j < n; j++) {
2701:       xb  = x + 11 * (*idx++);
2702:       x1  = xb[0];
2703:       x2  = xb[1];
2704:       x3  = xb[2];
2705:       x4  = xb[3];
2706:       x5  = xb[4];
2707:       x6  = xb[5];
2708:       x7  = xb[6];
2709:       x8  = xb[7];
2710:       x9  = xb[8];
2711:       x10 = xb[9];
2712:       x11 = xb[10];
2713:       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2714:       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2715:       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2716:       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2717:       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2718:       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2719:       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2720:       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2721:       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2722:       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2723:       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2724:       v += 121;
2725:     }
2726:     z[0]  = sum1;
2727:     z[1]  = sum2;
2728:     z[2]  = sum3;
2729:     z[3]  = sum4;
2730:     z[4]  = sum5;
2731:     z[5]  = sum6;
2732:     z[6]  = sum7;
2733:     z[7]  = sum8;
2734:     z[8]  = sum9;
2735:     z[9]  = sum10;
2736:     z[10] = sum11;
2737:     if (!usecprow) {
2738:       z += 11;
2739:       y += 11;
2740:     }
2741:   }
2742:   VecRestoreArrayRead(xx, &x);
2743:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2744:   PetscLogFlops(242.0 * a->nz);
2745:   return 0;
2746: }

2748: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2749: {
2750:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2751:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2752:   const PetscScalar *x, *xb;
2753:   const MatScalar   *v;
2754:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2755:   PetscInt           ncols, k;
2756:   const PetscInt    *ridx     = NULL, *idx, *ii;
2757:   PetscBool          usecprow = a->compressedrow.use;

2759:   VecCopy(yy, zz);
2760:   VecGetArrayRead(xx, &x);
2761:   VecGetArray(zz, &zarray);

2763:   idx = a->j;
2764:   v   = a->a;
2765:   if (usecprow) {
2766:     mbs  = a->compressedrow.nrows;
2767:     ii   = a->compressedrow.i;
2768:     ridx = a->compressedrow.rindex;
2769:   } else {
2770:     mbs = a->mbs;
2771:     ii  = a->i;
2772:     z   = zarray;
2773:   }

2775:   if (!a->mult_work) {
2776:     k = PetscMax(A->rmap->n, A->cmap->n);
2777:     PetscMalloc1(k + 1, &a->mult_work);
2778:   }
2779:   work = a->mult_work;
2780:   for (i = 0; i < mbs; i++) {
2781:     n = ii[1] - ii[0];
2782:     ii++;
2783:     ncols = n * bs;
2784:     workt = work;
2785:     for (j = 0; j < n; j++) {
2786:       xb = x + bs * (*idx++);
2787:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2788:       workt += bs;
2789:     }
2790:     if (usecprow) z = zarray + bs * ridx[i];
2791:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2792:     v += n * bs2;
2793:     if (!usecprow) z += bs;
2794:   }
2795:   VecRestoreArrayRead(xx, &x);
2796:   VecRestoreArray(zz, &zarray);
2797:   PetscLogFlops(2.0 * a->nz * bs2);
2798:   return 0;
2799: }

2801: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2802: {
2803:   PetscScalar zero = 0.0;

2805:   VecSet(zz, zero);
2806:   MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2807:   return 0;
2808: }

2810: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2811: {
2812:   PetscScalar zero = 0.0;

2814:   VecSet(zz, zero);
2815:   MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2816:   return 0;
2817: }

2819: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2820: {
2821:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2822:   PetscScalar       *z, x1, x2, x3, x4, x5;
2823:   const PetscScalar *x, *xb = NULL;
2824:   const MatScalar   *v;
2825:   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2826:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2827:   Mat_CompressedRow  cprow    = a->compressedrow;
2828:   PetscBool          usecprow = cprow.use;

2830:   if (yy != zz) VecCopy(yy, zz);
2831:   VecGetArrayRead(xx, &x);
2832:   VecGetArray(zz, &z);

2834:   idx = a->j;
2835:   v   = a->a;
2836:   if (usecprow) {
2837:     mbs  = cprow.nrows;
2838:     ii   = cprow.i;
2839:     ridx = cprow.rindex;
2840:   } else {
2841:     mbs = a->mbs;
2842:     ii  = a->i;
2843:     xb  = x;
2844:   }

2846:   switch (bs) {
2847:   case 1:
2848:     for (i = 0; i < mbs; i++) {
2849:       if (usecprow) xb = x + ridx[i];
2850:       x1 = xb[0];
2851:       ib = idx + ii[0];
2852:       n  = ii[1] - ii[0];
2853:       ii++;
2854:       for (j = 0; j < n; j++) {
2855:         rval = ib[j];
2856:         z[rval] += PetscConj(*v) * x1;
2857:         v++;
2858:       }
2859:       if (!usecprow) xb++;
2860:     }
2861:     break;
2862:   case 2:
2863:     for (i = 0; i < mbs; i++) {
2864:       if (usecprow) xb = x + 2 * ridx[i];
2865:       x1 = xb[0];
2866:       x2 = xb[1];
2867:       ib = idx + ii[0];
2868:       n  = ii[1] - ii[0];
2869:       ii++;
2870:       for (j = 0; j < n; j++) {
2871:         rval = ib[j] * 2;
2872:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2873:         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2874:         v += 4;
2875:       }
2876:       if (!usecprow) xb += 2;
2877:     }
2878:     break;
2879:   case 3:
2880:     for (i = 0; i < mbs; i++) {
2881:       if (usecprow) xb = x + 3 * ridx[i];
2882:       x1 = xb[0];
2883:       x2 = xb[1];
2884:       x3 = xb[2];
2885:       ib = idx + ii[0];
2886:       n  = ii[1] - ii[0];
2887:       ii++;
2888:       for (j = 0; j < n; j++) {
2889:         rval = ib[j] * 3;
2890:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2891:         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2892:         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2893:         v += 9;
2894:       }
2895:       if (!usecprow) xb += 3;
2896:     }
2897:     break;
2898:   case 4:
2899:     for (i = 0; i < mbs; i++) {
2900:       if (usecprow) xb = x + 4 * ridx[i];
2901:       x1 = xb[0];
2902:       x2 = xb[1];
2903:       x3 = xb[2];
2904:       x4 = xb[3];
2905:       ib = idx + ii[0];
2906:       n  = ii[1] - ii[0];
2907:       ii++;
2908:       for (j = 0; j < n; j++) {
2909:         rval = ib[j] * 4;
2910:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2911:         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2912:         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2913:         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2914:         v += 16;
2915:       }
2916:       if (!usecprow) xb += 4;
2917:     }
2918:     break;
2919:   case 5:
2920:     for (i = 0; i < mbs; i++) {
2921:       if (usecprow) xb = x + 5 * ridx[i];
2922:       x1 = xb[0];
2923:       x2 = xb[1];
2924:       x3 = xb[2];
2925:       x4 = xb[3];
2926:       x5 = xb[4];
2927:       ib = idx + ii[0];
2928:       n  = ii[1] - ii[0];
2929:       ii++;
2930:       for (j = 0; j < n; j++) {
2931:         rval = ib[j] * 5;
2932:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2933:         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2934:         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2935:         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2936:         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2937:         v += 25;
2938:       }
2939:       if (!usecprow) xb += 5;
2940:     }
2941:     break;
2942:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2943:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2944: #if 0
2945:     {
2946:       PetscInt          ncols,k,bs2=a->bs2;
2947:       PetscScalar       *work,*workt,zb;
2948:       const PetscScalar *xtmp;
2949:       if (!a->mult_work) {
2950:         k    = PetscMax(A->rmap->n,A->cmap->n);
2951:         PetscMalloc1(k+1,&a->mult_work);
2952:       }
2953:       work = a->mult_work;
2954:       xtmp = x;
2955:       for (i=0; i<mbs; i++) {
2956:         n     = ii[1] - ii[0]; ii++;
2957:         ncols = n*bs;
2958:         PetscArrayzero(work,ncols);
2959:         if (usecprow) xtmp = x + bs*ridx[i];
2960:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2961:         v += n*bs2;
2962:         if (!usecprow) xtmp += bs;
2963:         workt = work;
2964:         for (j=0; j<n; j++) {
2965:           zb = z + bs*(*idx++);
2966:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2967:           workt += bs;
2968:         }
2969:       }
2970:     }
2971: #endif
2972:   }
2973:   VecRestoreArrayRead(xx, &x);
2974:   VecRestoreArray(zz, &z);
2975:   PetscLogFlops(2.0 * a->nz * a->bs2);
2976:   return 0;
2977: }

2979: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2980: {
2981:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2982:   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2983:   const PetscScalar *x, *xb = NULL;
2984:   const MatScalar   *v;
2985:   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2986:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2987:   Mat_CompressedRow  cprow    = a->compressedrow;
2988:   PetscBool          usecprow = cprow.use;

2990:   if (yy != zz) VecCopy(yy, zz);
2991:   VecGetArrayRead(xx, &x);
2992:   VecGetArray(zz, &z);

2994:   idx = a->j;
2995:   v   = a->a;
2996:   if (usecprow) {
2997:     mbs  = cprow.nrows;
2998:     ii   = cprow.i;
2999:     ridx = cprow.rindex;
3000:   } else {
3001:     mbs = a->mbs;
3002:     ii  = a->i;
3003:     xb  = x;
3004:   }

3006:   switch (bs) {
3007:   case 1:
3008:     for (i = 0; i < mbs; i++) {
3009:       if (usecprow) xb = x + ridx[i];
3010:       x1 = xb[0];
3011:       ib = idx + ii[0];
3012:       n  = ii[1] - ii[0];
3013:       ii++;
3014:       for (j = 0; j < n; j++) {
3015:         rval = ib[j];
3016:         z[rval] += *v * x1;
3017:         v++;
3018:       }
3019:       if (!usecprow) xb++;
3020:     }
3021:     break;
3022:   case 2:
3023:     for (i = 0; i < mbs; i++) {
3024:       if (usecprow) xb = x + 2 * ridx[i];
3025:       x1 = xb[0];
3026:       x2 = xb[1];
3027:       ib = idx + ii[0];
3028:       n  = ii[1] - ii[0];
3029:       ii++;
3030:       for (j = 0; j < n; j++) {
3031:         rval = ib[j] * 2;
3032:         z[rval++] += v[0] * x1 + v[1] * x2;
3033:         z[rval++] += v[2] * x1 + v[3] * x2;
3034:         v += 4;
3035:       }
3036:       if (!usecprow) xb += 2;
3037:     }
3038:     break;
3039:   case 3:
3040:     for (i = 0; i < mbs; i++) {
3041:       if (usecprow) xb = x + 3 * ridx[i];
3042:       x1 = xb[0];
3043:       x2 = xb[1];
3044:       x3 = xb[2];
3045:       ib = idx + ii[0];
3046:       n  = ii[1] - ii[0];
3047:       ii++;
3048:       for (j = 0; j < n; j++) {
3049:         rval = ib[j] * 3;
3050:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3051:         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3052:         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3053:         v += 9;
3054:       }
3055:       if (!usecprow) xb += 3;
3056:     }
3057:     break;
3058:   case 4:
3059:     for (i = 0; i < mbs; i++) {
3060:       if (usecprow) xb = x + 4 * ridx[i];
3061:       x1 = xb[0];
3062:       x2 = xb[1];
3063:       x3 = xb[2];
3064:       x4 = xb[3];
3065:       ib = idx + ii[0];
3066:       n  = ii[1] - ii[0];
3067:       ii++;
3068:       for (j = 0; j < n; j++) {
3069:         rval = ib[j] * 4;
3070:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3071:         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3072:         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3073:         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3074:         v += 16;
3075:       }
3076:       if (!usecprow) xb += 4;
3077:     }
3078:     break;
3079:   case 5:
3080:     for (i = 0; i < mbs; i++) {
3081:       if (usecprow) xb = x + 5 * ridx[i];
3082:       x1 = xb[0];
3083:       x2 = xb[1];
3084:       x3 = xb[2];
3085:       x4 = xb[3];
3086:       x5 = xb[4];
3087:       ib = idx + ii[0];
3088:       n  = ii[1] - ii[0];
3089:       ii++;
3090:       for (j = 0; j < n; j++) {
3091:         rval = ib[j] * 5;
3092:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3093:         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3094:         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3095:         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3096:         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3097:         v += 25;
3098:       }
3099:       if (!usecprow) xb += 5;
3100:     }
3101:     break;
3102:   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3103:     PetscInt           ncols, k;
3104:     PetscScalar       *work, *workt;
3105:     const PetscScalar *xtmp;
3106:     if (!a->mult_work) {
3107:       k = PetscMax(A->rmap->n, A->cmap->n);
3108:       PetscMalloc1(k + 1, &a->mult_work);
3109:     }
3110:     work = a->mult_work;
3111:     xtmp = x;
3112:     for (i = 0; i < mbs; i++) {
3113:       n = ii[1] - ii[0];
3114:       ii++;
3115:       ncols = n * bs;
3116:       PetscArrayzero(work, ncols);
3117:       if (usecprow) xtmp = x + bs * ridx[i];
3118:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3119:       v += n * bs2;
3120:       if (!usecprow) xtmp += bs;
3121:       workt = work;
3122:       for (j = 0; j < n; j++) {
3123:         zb = z + bs * (*idx++);
3124:         for (k = 0; k < bs; k++) zb[k] += workt[k];
3125:         workt += bs;
3126:       }
3127:     }
3128:   }
3129:   }
3130:   VecRestoreArrayRead(xx, &x);
3131:   VecRestoreArray(zz, &z);
3132:   PetscLogFlops(2.0 * a->nz * a->bs2);
3133:   return 0;
3134: }

3136: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3137: {
3138:   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3139:   PetscInt     totalnz = a->bs2 * a->nz;
3140:   PetscScalar  oalpha  = alpha;
3141:   PetscBLASInt one     = 1, tnz;

3143:   PetscBLASIntCast(totalnz, &tnz);
3144:   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3145:   PetscLogFlops(totalnz);
3146:   return 0;
3147: }

3149: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3150: {
3151:   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3152:   MatScalar   *v   = a->a;
3153:   PetscReal    sum = 0.0;
3154:   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;

3156:   if (type == NORM_FROBENIUS) {
3157: #if defined(PETSC_USE_REAL___FP16)
3158:     PetscBLASInt one = 1, cnt = bs2 * nz;
3159:     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3160: #else
3161:     for (i = 0; i < bs2 * nz; i++) {
3162:       sum += PetscRealPart(PetscConj(*v) * (*v));
3163:       v++;
3164:     }
3165: #endif
3166:     *norm = PetscSqrtReal(sum);
3167:     PetscLogFlops(2.0 * bs2 * nz);
3168:   } else if (type == NORM_1) { /* maximum column sum */
3169:     PetscReal *tmp;
3170:     PetscInt  *bcol = a->j;
3171:     PetscCalloc1(A->cmap->n + 1, &tmp);
3172:     for (i = 0; i < nz; i++) {
3173:       for (j = 0; j < bs; j++) {
3174:         k1 = bs * (*bcol) + j; /* column index */
3175:         for (k = 0; k < bs; k++) {
3176:           tmp[k1] += PetscAbsScalar(*v);
3177:           v++;
3178:         }
3179:       }
3180:       bcol++;
3181:     }
3182:     *norm = 0.0;
3183:     for (j = 0; j < A->cmap->n; j++) {
3184:       if (tmp[j] > *norm) *norm = tmp[j];
3185:     }
3186:     PetscFree(tmp);
3187:     PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3188:   } else if (type == NORM_INFINITY) { /* maximum row sum */
3189:     *norm = 0.0;
3190:     for (k = 0; k < bs; k++) {
3191:       for (j = 0; j < a->mbs; j++) {
3192:         v   = a->a + bs2 * a->i[j] + k;
3193:         sum = 0.0;
3194:         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3195:           for (k1 = 0; k1 < bs; k1++) {
3196:             sum += PetscAbsScalar(*v);
3197:             v += bs;
3198:           }
3199:         }
3200:         if (sum > *norm) *norm = sum;
3201:       }
3202:     }
3203:     PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3204:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3205:   return 0;
3206: }

3208: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3209: {
3210:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;

3212:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3213:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3214:     *flg = PETSC_FALSE;
3215:     return 0;
3216:   }

3218:   /* if the a->i are the same */
3219:   PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
3220:   if (!*flg) return 0;

3222:   /* if a->j are the same */
3223:   PetscArraycmp(a->j, b->j, a->nz, flg);
3224:   if (!*flg) return 0;

3226:   /* if a->a are the same */
3227:   PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg);
3228:   return 0;
3229: }

3231: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3232: {
3233:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3234:   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3235:   PetscScalar *x, zero = 0.0;
3236:   MatScalar   *aa, *aa_j;

3239:   bs   = A->rmap->bs;
3240:   aa   = a->a;
3241:   ai   = a->i;
3242:   aj   = a->j;
3243:   ambs = a->mbs;
3244:   bs2  = a->bs2;

3246:   VecSet(v, zero);
3247:   VecGetArray(v, &x);
3248:   VecGetLocalSize(v, &n);
3250:   for (i = 0; i < ambs; i++) {
3251:     for (j = ai[i]; j < ai[i + 1]; j++) {
3252:       if (aj[j] == i) {
3253:         row  = i * bs;
3254:         aa_j = aa + j * bs2;
3255:         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3256:         break;
3257:       }
3258:     }
3259:   }
3260:   VecRestoreArray(v, &x);
3261:   return 0;
3262: }

3264: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3265: {
3266:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3267:   const PetscScalar *l, *r, *li, *ri;
3268:   PetscScalar        x;
3269:   MatScalar         *aa, *v;
3270:   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3271:   const PetscInt    *ai, *aj;

3273:   ai  = a->i;
3274:   aj  = a->j;
3275:   aa  = a->a;
3276:   m   = A->rmap->n;
3277:   n   = A->cmap->n;
3278:   bs  = A->rmap->bs;
3279:   mbs = a->mbs;
3280:   bs2 = a->bs2;
3281:   if (ll) {
3282:     VecGetArrayRead(ll, &l);
3283:     VecGetLocalSize(ll, &lm);
3285:     for (i = 0; i < mbs; i++) { /* for each block row */
3286:       M  = ai[i + 1] - ai[i];
3287:       li = l + i * bs;
3288:       v  = aa + bs2 * ai[i];
3289:       for (j = 0; j < M; j++) { /* for each block */
3290:         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3291:       }
3292:     }
3293:     VecRestoreArrayRead(ll, &l);
3294:     PetscLogFlops(a->nz);
3295:   }

3297:   if (rr) {
3298:     VecGetArrayRead(rr, &r);
3299:     VecGetLocalSize(rr, &rn);
3301:     for (i = 0; i < mbs; i++) { /* for each block row */
3302:       iai = ai[i];
3303:       M   = ai[i + 1] - iai;
3304:       v   = aa + bs2 * iai;
3305:       for (j = 0; j < M; j++) { /* for each block */
3306:         ri = r + bs * aj[iai + j];
3307:         for (k = 0; k < bs; k++) {
3308:           x = ri[k];
3309:           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3310:           v += bs;
3311:         }
3312:       }
3313:     }
3314:     VecRestoreArrayRead(rr, &r);
3315:     PetscLogFlops(a->nz);
3316:   }
3317:   return 0;
3318: }

3320: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3321: {
3322:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3324:   info->block_size   = a->bs2;
3325:   info->nz_allocated = a->bs2 * a->maxnz;
3326:   info->nz_used      = a->bs2 * a->nz;
3327:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3328:   info->assemblies   = A->num_ass;
3329:   info->mallocs      = A->info.mallocs;
3330:   info->memory       = 0; /* REVIEW ME */
3331:   if (A->factortype) {
3332:     info->fill_ratio_given  = A->info.fill_ratio_given;
3333:     info->fill_ratio_needed = A->info.fill_ratio_needed;
3334:     info->factor_mallocs    = A->info.factor_mallocs;
3335:   } else {
3336:     info->fill_ratio_given  = 0;
3337:     info->fill_ratio_needed = 0;
3338:     info->factor_mallocs    = 0;
3339:   }
3340:   return 0;
3341: }

3343: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3344: {
3345:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3347:   PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
3348:   return 0;
3349: }

3351: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3352: {
3353:   MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
3354:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3355:   return 0;
3356: }

3358: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3359: {
3360:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3361:   PetscScalar       *z = NULL, sum1;
3362:   const PetscScalar *xb;
3363:   PetscScalar        x1;
3364:   const MatScalar   *v, *vv;
3365:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3366:   PetscBool          usecprow = a->compressedrow.use;

3368:   idx = a->j;
3369:   v   = a->a;
3370:   if (usecprow) {
3371:     mbs  = a->compressedrow.nrows;
3372:     ii   = a->compressedrow.i;
3373:     ridx = a->compressedrow.rindex;
3374:   } else {
3375:     mbs = a->mbs;
3376:     ii  = a->i;
3377:     z   = c;
3378:   }

3380:   for (i = 0; i < mbs; i++) {
3381:     n = ii[1] - ii[0];
3382:     ii++;
3383:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3384:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3385:     if (usecprow) z = c + ridx[i];
3386:     jj = idx;
3387:     vv = v;
3388:     for (k = 0; k < cn; k++) {
3389:       idx  = jj;
3390:       v    = vv;
3391:       sum1 = 0.0;
3392:       for (j = 0; j < n; j++) {
3393:         xb = b + (*idx++);
3394:         x1 = xb[0 + k * bm];
3395:         sum1 += v[0] * x1;
3396:         v += 1;
3397:       }
3398:       z[0 + k * cm] = sum1;
3399:     }
3400:     if (!usecprow) z += 1;
3401:   }
3402:   return 0;
3403: }

3405: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3406: {
3407:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3408:   PetscScalar       *z = NULL, sum1, sum2;
3409:   const PetscScalar *xb;
3410:   PetscScalar        x1, x2;
3411:   const MatScalar   *v, *vv;
3412:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3413:   PetscBool          usecprow = a->compressedrow.use;

3415:   idx = a->j;
3416:   v   = a->a;
3417:   if (usecprow) {
3418:     mbs  = a->compressedrow.nrows;
3419:     ii   = a->compressedrow.i;
3420:     ridx = a->compressedrow.rindex;
3421:   } else {
3422:     mbs = a->mbs;
3423:     ii  = a->i;
3424:     z   = c;
3425:   }

3427:   for (i = 0; i < mbs; i++) {
3428:     n = ii[1] - ii[0];
3429:     ii++;
3430:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3431:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3432:     if (usecprow) z = c + 2 * ridx[i];
3433:     jj = idx;
3434:     vv = v;
3435:     for (k = 0; k < cn; k++) {
3436:       idx  = jj;
3437:       v    = vv;
3438:       sum1 = 0.0;
3439:       sum2 = 0.0;
3440:       for (j = 0; j < n; j++) {
3441:         xb = b + 2 * (*idx++);
3442:         x1 = xb[0 + k * bm];
3443:         x2 = xb[1 + k * bm];
3444:         sum1 += v[0] * x1 + v[2] * x2;
3445:         sum2 += v[1] * x1 + v[3] * x2;
3446:         v += 4;
3447:       }
3448:       z[0 + k * cm] = sum1;
3449:       z[1 + k * cm] = sum2;
3450:     }
3451:     if (!usecprow) z += 2;
3452:   }
3453:   return 0;
3454: }

3456: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3457: {
3458:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3459:   PetscScalar       *z = NULL, sum1, sum2, sum3;
3460:   const PetscScalar *xb;
3461:   PetscScalar        x1, x2, x3;
3462:   const MatScalar   *v, *vv;
3463:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3464:   PetscBool          usecprow = a->compressedrow.use;

3466:   idx = a->j;
3467:   v   = a->a;
3468:   if (usecprow) {
3469:     mbs  = a->compressedrow.nrows;
3470:     ii   = a->compressedrow.i;
3471:     ridx = a->compressedrow.rindex;
3472:   } else {
3473:     mbs = a->mbs;
3474:     ii  = a->i;
3475:     z   = c;
3476:   }

3478:   for (i = 0; i < mbs; i++) {
3479:     n = ii[1] - ii[0];
3480:     ii++;
3481:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3482:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3483:     if (usecprow) z = c + 3 * ridx[i];
3484:     jj = idx;
3485:     vv = v;
3486:     for (k = 0; k < cn; k++) {
3487:       idx  = jj;
3488:       v    = vv;
3489:       sum1 = 0.0;
3490:       sum2 = 0.0;
3491:       sum3 = 0.0;
3492:       for (j = 0; j < n; j++) {
3493:         xb = b + 3 * (*idx++);
3494:         x1 = xb[0 + k * bm];
3495:         x2 = xb[1 + k * bm];
3496:         x3 = xb[2 + k * bm];
3497:         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3498:         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3499:         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3500:         v += 9;
3501:       }
3502:       z[0 + k * cm] = sum1;
3503:       z[1 + k * cm] = sum2;
3504:       z[2 + k * cm] = sum3;
3505:     }
3506:     if (!usecprow) z += 3;
3507:   }
3508:   return 0;
3509: }

3511: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3512: {
3513:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3514:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3515:   const PetscScalar *xb;
3516:   PetscScalar        x1, x2, x3, x4;
3517:   const MatScalar   *v, *vv;
3518:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3519:   PetscBool          usecprow = a->compressedrow.use;

3521:   idx = a->j;
3522:   v   = a->a;
3523:   if (usecprow) {
3524:     mbs  = a->compressedrow.nrows;
3525:     ii   = a->compressedrow.i;
3526:     ridx = a->compressedrow.rindex;
3527:   } else {
3528:     mbs = a->mbs;
3529:     ii  = a->i;
3530:     z   = c;
3531:   }

3533:   for (i = 0; i < mbs; i++) {
3534:     n = ii[1] - ii[0];
3535:     ii++;
3536:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3537:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3538:     if (usecprow) z = c + 4 * ridx[i];
3539:     jj = idx;
3540:     vv = v;
3541:     for (k = 0; k < cn; k++) {
3542:       idx  = jj;
3543:       v    = vv;
3544:       sum1 = 0.0;
3545:       sum2 = 0.0;
3546:       sum3 = 0.0;
3547:       sum4 = 0.0;
3548:       for (j = 0; j < n; j++) {
3549:         xb = b + 4 * (*idx++);
3550:         x1 = xb[0 + k * bm];
3551:         x2 = xb[1 + k * bm];
3552:         x3 = xb[2 + k * bm];
3553:         x4 = xb[3 + k * bm];
3554:         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3555:         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3556:         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3557:         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3558:         v += 16;
3559:       }
3560:       z[0 + k * cm] = sum1;
3561:       z[1 + k * cm] = sum2;
3562:       z[2 + k * cm] = sum3;
3563:       z[3 + k * cm] = sum4;
3564:     }
3565:     if (!usecprow) z += 4;
3566:   }
3567:   return 0;
3568: }

3570: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3571: {
3572:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3573:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3574:   const PetscScalar *xb;
3575:   PetscScalar        x1, x2, x3, x4, x5;
3576:   const MatScalar   *v, *vv;
3577:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3578:   PetscBool          usecprow = a->compressedrow.use;

3580:   idx = a->j;
3581:   v   = a->a;
3582:   if (usecprow) {
3583:     mbs  = a->compressedrow.nrows;
3584:     ii   = a->compressedrow.i;
3585:     ridx = a->compressedrow.rindex;
3586:   } else {
3587:     mbs = a->mbs;
3588:     ii  = a->i;
3589:     z   = c;
3590:   }

3592:   for (i = 0; i < mbs; i++) {
3593:     n = ii[1] - ii[0];
3594:     ii++;
3595:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3596:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3597:     if (usecprow) z = c + 5 * ridx[i];
3598:     jj = idx;
3599:     vv = v;
3600:     for (k = 0; k < cn; k++) {
3601:       idx  = jj;
3602:       v    = vv;
3603:       sum1 = 0.0;
3604:       sum2 = 0.0;
3605:       sum3 = 0.0;
3606:       sum4 = 0.0;
3607:       sum5 = 0.0;
3608:       for (j = 0; j < n; j++) {
3609:         xb = b + 5 * (*idx++);
3610:         x1 = xb[0 + k * bm];
3611:         x2 = xb[1 + k * bm];
3612:         x3 = xb[2 + k * bm];
3613:         x4 = xb[3 + k * bm];
3614:         x5 = xb[4 + k * bm];
3615:         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3616:         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3617:         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3618:         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3619:         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3620:         v += 25;
3621:       }
3622:       z[0 + k * cm] = sum1;
3623:       z[1 + k * cm] = sum2;
3624:       z[2 + k * cm] = sum3;
3625:       z[3 + k * cm] = sum4;
3626:       z[4 + k * cm] = sum5;
3627:     }
3628:     if (!usecprow) z += 5;
3629:   }
3630:   return 0;
3631: }

3633: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3634: {
3635:   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3636:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3637:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3638:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3639:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3640:   PetscBLASInt     bbs, bcn, bbm, bcm;
3641:   PetscScalar     *z = NULL;
3642:   PetscScalar     *c, *b;
3643:   const MatScalar *v;
3644:   const PetscInt  *idx, *ii, *ridx = NULL;
3645:   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3646:   PetscBool        usecprow = a->compressedrow.use;

3648:   if (!cm || !cn) return 0;
3652:   b = bd->v;
3653:   if (a->nonzerorowcnt != A->rmap->n) MatZeroEntries(C);
3654:   MatDenseGetArray(C, &c);
3655:   switch (bs) {
3656:   case 1:
3657:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3658:     break;
3659:   case 2:
3660:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3661:     break;
3662:   case 3:
3663:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3664:     break;
3665:   case 4:
3666:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3667:     break;
3668:   case 5:
3669:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3670:     break;
3671:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3672:     PetscBLASIntCast(bs, &bbs);
3673:     PetscBLASIntCast(cn, &bcn);
3674:     PetscBLASIntCast(bm, &bbm);
3675:     PetscBLASIntCast(cm, &bcm);
3676:     idx = a->j;
3677:     v   = a->a;
3678:     if (usecprow) {
3679:       mbs  = a->compressedrow.nrows;
3680:       ii   = a->compressedrow.i;
3681:       ridx = a->compressedrow.rindex;
3682:     } else {
3683:       mbs = a->mbs;
3684:       ii  = a->i;
3685:       z   = c;
3686:     }
3687:     for (i = 0; i < mbs; i++) {
3688:       n = ii[1] - ii[0];
3689:       ii++;
3690:       if (usecprow) z = c + bs * ridx[i];
3691:       if (n) {
3692:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3693:         v += bs2;
3694:       }
3695:       for (j = 1; j < n; j++) {
3696:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3697:         v += bs2;
3698:       }
3699:       if (!usecprow) z += bs;
3700:     }
3701:   }
3702:   MatDenseRestoreArray(C, &c);
3703:   PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn);
3704:   return 0;
3705: }