Actual source code: sbaij2.c


  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  5: #include <petsc/private/kernels/blockinvert.h>
  6: #include <petscbt.h>
  7: #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 10: {
 11:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
 12:   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
 13:   const PetscInt *idx;
 14:   PetscBT         table_out, table_in;

 17:   mbs = a->mbs;
 18:   ai  = a->i;
 19:   aj  = a->j;
 20:   bs  = A->rmap->bs;
 21:   PetscBTCreate(mbs, &table_out);
 22:   PetscMalloc1(mbs + 1, &nidx);
 23:   PetscBTCreate(mbs, &table_in);

 25:   for (i = 0; i < is_max; i++) { /* for each is */
 26:     isz = 0;
 27:     PetscBTMemzero(mbs, table_out);

 29:     /* Extract the indices, assume there can be duplicate entries */
 30:     ISGetIndices(is[i], &idx);
 31:     ISGetLocalSize(is[i], &n);

 33:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 34:     bcol_max = 0;
 35:     for (j = 0; j < n; ++j) {
 36:       brow = idx[j] / bs; /* convert the indices into block indices */
 38:       if (!PetscBTLookupSet(table_out, brow)) {
 39:         nidx[isz++] = brow;
 40:         if (bcol_max < brow) bcol_max = brow;
 41:       }
 42:     }
 43:     ISRestoreIndices(is[i], &idx);
 44:     ISDestroy(&is[i]);

 46:     k = 0;
 47:     for (j = 0; j < ov; j++) { /* for each overlap */
 48:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 49:       PetscBTMemzero(mbs, table_in);
 50:       for (l = k; l < isz; l++) PetscBTSet(table_in, nidx[l]);

 52:       n = isz; /* length of the updated is[i] */
 53:       for (brow = 0; brow < mbs; brow++) {
 54:         start = ai[brow];
 55:         end   = ai[brow + 1];
 56:         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 57:           for (l = start; l < end; l++) {
 58:             bcol = aj[l];
 59:             if (!PetscBTLookupSet(table_out, bcol)) {
 60:               nidx[isz++] = bcol;
 61:               if (bcol_max < bcol) bcol_max = bcol;
 62:             }
 63:           }
 64:           k++;
 65:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 66:         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
 67:           for (l = start; l < end; l++) {
 68:             bcol = aj[l];
 69:             if (bcol > bcol_max) break;
 70:             if (PetscBTLookup(table_in, bcol)) {
 71:               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
 72:               break; /* for l = start; l<end ; l++) */
 73:             }
 74:           }
 75:         }
 76:       }
 77:     } /* for each overlap */
 78:     ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i);
 79:   } /* for each is */
 80:   PetscBTDestroy(&table_out);
 81:   PetscFree(nidx);
 82:   PetscBTDestroy(&table_in);
 83:   return 0;
 84: }

 86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 87:         Zero some ops' to avoid invalid usse */
 88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 89: {
 90:   MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE);
 91:   Bseq->ops->mult                   = NULL;
 92:   Bseq->ops->multadd                = NULL;
 93:   Bseq->ops->multtranspose          = NULL;
 94:   Bseq->ops->multtransposeadd       = NULL;
 95:   Bseq->ops->lufactor               = NULL;
 96:   Bseq->ops->choleskyfactor         = NULL;
 97:   Bseq->ops->lufactorsymbolic       = NULL;
 98:   Bseq->ops->choleskyfactorsymbolic = NULL;
 99:   Bseq->ops->getinertia             = NULL;
100:   return 0;
101: }

103: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
104: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
105: {
106:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
107:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
108:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
109:   const PetscInt *irow, *icol;
110:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
111:   PetscInt       *aj = a->j, *ai = a->i;
112:   MatScalar      *mat_a;
113:   Mat             C;
114:   PetscBool       flag;


117:   ISGetIndices(isrow, &irow);
118:   ISGetIndices(iscol, &icol);
119:   ISGetLocalSize(isrow, &nrows);
120:   ISGetLocalSize(iscol, &ncols);

122:   PetscCalloc1(1 + oldcols, &smap);
123:   ssmap = smap;
124:   PetscMalloc1(1 + nrows, &lens);
125:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
126:   /* determine lens of each row */
127:   for (i = 0; i < nrows; i++) {
128:     kstart  = ai[irow[i]];
129:     kend    = kstart + a->ilen[irow[i]];
130:     lens[i] = 0;
131:     for (k = kstart; k < kend; k++) {
132:       if (ssmap[aj[k]]) lens[i]++;
133:     }
134:   }
135:   /* Create and fill new matrix */
136:   if (scall == MAT_REUSE_MATRIX) {
137:     c = (Mat_SeqSBAIJ *)((*B)->data);

140:     PetscArraycmp(c->ilen, lens, c->mbs, &flag);
142:     PetscArrayzero(c->ilen, c->mbs);
143:     C = *B;
144:   } else {
145:     MatCreate(PetscObjectComm((PetscObject)A), &C);
146:     MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
147:     MatSetType(C, ((PetscObject)A)->type_name);
148:     MatSeqSBAIJSetPreallocation(C, bs, 0, lens);
149:   }
150:   c = (Mat_SeqSBAIJ *)(C->data);
151:   for (i = 0; i < nrows; i++) {
152:     row      = irow[i];
153:     kstart   = ai[row];
154:     kend     = kstart + a->ilen[row];
155:     mat_i    = c->i[i];
156:     mat_j    = c->j + mat_i;
157:     mat_a    = c->a + mat_i * bs2;
158:     mat_ilen = c->ilen + i;
159:     for (k = kstart; k < kend; k++) {
160:       if ((tcol = ssmap[a->j[k]])) {
161:         *mat_j++ = tcol - 1;
162:         PetscArraycpy(mat_a, a->a + k * bs2, bs2);
163:         mat_a += bs2;
164:         (*mat_ilen)++;
165:       }
166:     }
167:   }
168:   /* sort */
169:   {
170:     MatScalar *work;

172:     PetscMalloc1(bs2, &work);
173:     for (i = 0; i < nrows; i++) {
174:       PetscInt ilen;
175:       mat_i = c->i[i];
176:       mat_j = c->j + mat_i;
177:       mat_a = c->a + mat_i * bs2;
178:       ilen  = c->ilen[i];
179:       PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
180:     }
181:     PetscFree(work);
182:   }

184:   /* Free work space */
185:   ISRestoreIndices(iscol, &icol);
186:   PetscFree(smap);
187:   PetscFree(lens);
188:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
189:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

191:   ISRestoreIndices(isrow, &irow);
192:   *B = C;
193:   return 0;
194: }

196: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
197: {
198:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
199:   IS              is1, is2;
200:   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
201:   const PetscInt *irow, *icol;

203:   ISGetIndices(isrow, &irow);
204:   ISGetIndices(iscol, &icol);
205:   ISGetLocalSize(isrow, &nrows);
206:   ISGetLocalSize(iscol, &ncols);

208:   /* Verify if the indices correspond to each element in a block
209:    and form the IS with compressed IS */
210:   maxmnbs = PetscMax(a->mbs, a->nbs);
211:   PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
212:   PetscArrayzero(vary, a->mbs);
213:   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
215:   count = 0;
216:   for (i = 0; i < nrows; i++) {
217:     PetscInt j = irow[i] / bs;
218:     if ((vary[j]--) == bs) iary[count++] = j;
219:   }
220:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);

222:   PetscArrayzero(vary, a->nbs);
223:   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
225:   count = 0;
226:   for (i = 0; i < ncols; i++) {
227:     PetscInt j = icol[i] / bs;
228:     if ((vary[j]--) == bs) iary[count++] = j;
229:   }
230:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
231:   ISRestoreIndices(isrow, &irow);
232:   ISRestoreIndices(iscol, &icol);
233:   PetscFree2(vary, iary);

235:   MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B);
236:   ISDestroy(&is1);
237:   ISDestroy(&is2);

239:   if (isrow != iscol) {
240:     PetscBool isequal;
241:     ISEqual(isrow, iscol, &isequal);
242:     if (!isequal) MatSeqSBAIJZeroOps_Private(*B);
243:   }
244:   return 0;
245: }

247: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
248: {
249:   PetscInt i;

251:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);

253:   for (i = 0; i < n; i++) MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
254:   return 0;
255: }

257: /* -------------------------------------------------------*/
258: /* Should check that shapes of vectors and matrices match */
259: /* -------------------------------------------------------*/

261: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
262: {
263:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
264:   PetscScalar       *z, x1, x2, zero = 0.0;
265:   const PetscScalar *x, *xb;
266:   const MatScalar   *v;
267:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
268:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
269:   PetscInt           nonzerorow = 0;

271:   VecSet(zz, zero);
272:   if (!a->nz) return 0;
273:   VecGetArrayRead(xx, &x);
274:   VecGetArray(zz, &z);

276:   v  = a->a;
277:   xb = x;

279:   for (i = 0; i < mbs; i++) {
280:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
281:     x1   = xb[0];
282:     x2   = xb[1];
283:     ib   = aj + *ai;
284:     jmin = 0;
285:     nonzerorow += (n > 0);
286:     if (*ib == i) { /* (diag of A)*x */
287:       z[2 * i] += v[0] * x1 + v[2] * x2;
288:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
289:       v += 4;
290:       jmin++;
291:     }
292:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
293:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
294:     for (j = jmin; j < n; j++) {
295:       /* (strict lower triangular part of A)*x  */
296:       cval = ib[j] * 2;
297:       z[cval] += v[0] * x1 + v[1] * x2;
298:       z[cval + 1] += v[2] * x1 + v[3] * x2;
299:       /* (strict upper triangular part of A)*x  */
300:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
301:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
302:       v += 4;
303:     }
304:     xb += 2;
305:     ai++;
306:   }

308:   VecRestoreArrayRead(xx, &x);
309:   VecRestoreArray(zz, &z);
310:   PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
311:   return 0;
312: }

314: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
315: {
316:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
317:   PetscScalar       *z, x1, x2, x3, zero = 0.0;
318:   const PetscScalar *x, *xb;
319:   const MatScalar   *v;
320:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
321:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
322:   PetscInt           nonzerorow = 0;

324:   VecSet(zz, zero);
325:   if (!a->nz) return 0;
326:   VecGetArrayRead(xx, &x);
327:   VecGetArray(zz, &z);

329:   v  = a->a;
330:   xb = x;

332:   for (i = 0; i < mbs; i++) {
333:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
334:     x1   = xb[0];
335:     x2   = xb[1];
336:     x3   = xb[2];
337:     ib   = aj + *ai;
338:     jmin = 0;
339:     nonzerorow += (n > 0);
340:     if (*ib == i) { /* (diag of A)*x */
341:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
342:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
343:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
344:       v += 9;
345:       jmin++;
346:     }
347:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
348:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
349:     for (j = jmin; j < n; j++) {
350:       /* (strict lower triangular part of A)*x  */
351:       cval = ib[j] * 3;
352:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
353:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
354:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
355:       /* (strict upper triangular part of A)*x  */
356:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
357:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
358:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
359:       v += 9;
360:     }
361:     xb += 3;
362:     ai++;
363:   }

365:   VecRestoreArrayRead(xx, &x);
366:   VecRestoreArray(zz, &z);
367:   PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
368:   return 0;
369: }

371: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
372: {
373:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
374:   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
375:   const PetscScalar *x, *xb;
376:   const MatScalar   *v;
377:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
378:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
379:   PetscInt           nonzerorow = 0;

381:   VecSet(zz, zero);
382:   if (!a->nz) return 0;
383:   VecGetArrayRead(xx, &x);
384:   VecGetArray(zz, &z);

386:   v  = a->a;
387:   xb = x;

389:   for (i = 0; i < mbs; i++) {
390:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
391:     x1   = xb[0];
392:     x2   = xb[1];
393:     x3   = xb[2];
394:     x4   = xb[3];
395:     ib   = aj + *ai;
396:     jmin = 0;
397:     nonzerorow += (n > 0);
398:     if (*ib == i) { /* (diag of A)*x */
399:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
400:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
401:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
402:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
403:       v += 16;
404:       jmin++;
405:     }
406:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
407:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408:     for (j = jmin; j < n; j++) {
409:       /* (strict lower triangular part of A)*x  */
410:       cval = ib[j] * 4;
411:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
412:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
413:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
414:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
415:       /* (strict upper triangular part of A)*x  */
416:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
417:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
418:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
419:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
420:       v += 16;
421:     }
422:     xb += 4;
423:     ai++;
424:   }

426:   VecRestoreArrayRead(xx, &x);
427:   VecRestoreArray(zz, &z);
428:   PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
429:   return 0;
430: }

432: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
433: {
434:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
435:   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
436:   const PetscScalar *x, *xb;
437:   const MatScalar   *v;
438:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
439:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
440:   PetscInt           nonzerorow = 0;

442:   VecSet(zz, zero);
443:   if (!a->nz) return 0;
444:   VecGetArrayRead(xx, &x);
445:   VecGetArray(zz, &z);

447:   v  = a->a;
448:   xb = x;

450:   for (i = 0; i < mbs; i++) {
451:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
452:     x1   = xb[0];
453:     x2   = xb[1];
454:     x3   = xb[2];
455:     x4   = xb[3];
456:     x5   = xb[4];
457:     ib   = aj + *ai;
458:     jmin = 0;
459:     nonzerorow += (n > 0);
460:     if (*ib == i) { /* (diag of A)*x */
461:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
462:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
463:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
464:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
465:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
466:       v += 25;
467:       jmin++;
468:     }
469:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
470:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
471:     for (j = jmin; j < n; j++) {
472:       /* (strict lower triangular part of A)*x  */
473:       cval = ib[j] * 5;
474:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
475:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
476:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
477:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
478:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
479:       /* (strict upper triangular part of A)*x  */
480:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
481:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
482:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
483:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
484:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
485:       v += 25;
486:     }
487:     xb += 5;
488:     ai++;
489:   }

491:   VecRestoreArrayRead(xx, &x);
492:   VecRestoreArray(zz, &z);
493:   PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
494:   return 0;
495: }

497: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
498: {
499:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
500:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
501:   const PetscScalar *x, *xb;
502:   const MatScalar   *v;
503:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
504:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
505:   PetscInt           nonzerorow = 0;

507:   VecSet(zz, zero);
508:   if (!a->nz) return 0;
509:   VecGetArrayRead(xx, &x);
510:   VecGetArray(zz, &z);

512:   v  = a->a;
513:   xb = x;

515:   for (i = 0; i < mbs; i++) {
516:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
517:     x1   = xb[0];
518:     x2   = xb[1];
519:     x3   = xb[2];
520:     x4   = xb[3];
521:     x5   = xb[4];
522:     x6   = xb[5];
523:     ib   = aj + *ai;
524:     jmin = 0;
525:     nonzerorow += (n > 0);
526:     if (*ib == i) { /* (diag of A)*x */
527:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
528:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
529:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
530:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
531:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
532:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
533:       v += 36;
534:       jmin++;
535:     }
536:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
537:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
538:     for (j = jmin; j < n; j++) {
539:       /* (strict lower triangular part of A)*x  */
540:       cval = ib[j] * 6;
541:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
542:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
543:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
544:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
545:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
546:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
547:       /* (strict upper triangular part of A)*x  */
548:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
549:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
550:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
551:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
552:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
553:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
554:       v += 36;
555:     }
556:     xb += 6;
557:     ai++;
558:   }

560:   VecRestoreArrayRead(xx, &x);
561:   VecRestoreArray(zz, &z);
562:   PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
563:   return 0;
564: }

566: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
567: {
568:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
569:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
570:   const PetscScalar *x, *xb;
571:   const MatScalar   *v;
572:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
573:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
574:   PetscInt           nonzerorow = 0;

576:   VecSet(zz, zero);
577:   if (!a->nz) return 0;
578:   VecGetArrayRead(xx, &x);
579:   VecGetArray(zz, &z);

581:   v  = a->a;
582:   xb = x;

584:   for (i = 0; i < mbs; i++) {
585:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
586:     x1   = xb[0];
587:     x2   = xb[1];
588:     x3   = xb[2];
589:     x4   = xb[3];
590:     x5   = xb[4];
591:     x6   = xb[5];
592:     x7   = xb[6];
593:     ib   = aj + *ai;
594:     jmin = 0;
595:     nonzerorow += (n > 0);
596:     if (*ib == i) { /* (diag of A)*x */
597:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
598:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
599:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
600:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
601:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
602:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
603:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
604:       v += 49;
605:       jmin++;
606:     }
607:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
608:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
609:     for (j = jmin; j < n; j++) {
610:       /* (strict lower triangular part of A)*x  */
611:       cval = ib[j] * 7;
612:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
613:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
614:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
615:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
616:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
617:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
618:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
619:       /* (strict upper triangular part of A)*x  */
620:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
621:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
622:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
623:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
624:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
625:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
626:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
627:       v += 49;
628:     }
629:     xb += 7;
630:     ai++;
631:   }
632:   VecRestoreArrayRead(xx, &x);
633:   VecRestoreArray(zz, &z);
634:   PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
635:   return 0;
636: }

638: /*
639:     This will not work with MatScalar == float because it calls the BLAS
640: */
641: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
642: {
643:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
644:   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
645:   const PetscScalar *x, *x_ptr, *xb;
646:   const MatScalar   *v;
647:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
648:   const PetscInt    *idx, *aj, *ii;
649:   PetscInt           nonzerorow = 0;

651:   VecSet(zz, zero);
652:   if (!a->nz) return 0;
653:   VecGetArrayRead(xx, &x);
654:   VecGetArray(zz, &z);

656:   x_ptr = x;
657:   z_ptr = z;

659:   aj = a->j;
660:   v  = a->a;
661:   ii = a->i;

663:   if (!a->mult_work) PetscMalloc1(A->rmap->N + 1, &a->mult_work);
664:   work = a->mult_work;

666:   for (i = 0; i < mbs; i++) {
667:     n     = ii[1] - ii[0];
668:     ncols = n * bs;
669:     workt = work;
670:     idx   = aj + ii[0];
671:     nonzerorow += (n > 0);

673:     /* upper triangular part */
674:     for (j = 0; j < n; j++) {
675:       xb = x_ptr + bs * (*idx++);
676:       for (k = 0; k < bs; k++) workt[k] = xb[k];
677:       workt += bs;
678:     }
679:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
680:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

682:     /* strict lower triangular part */
683:     idx = aj + ii[0];
684:     if (n && *idx == i) {
685:       ncols -= bs;
686:       v += bs2;
687:       idx++;
688:       n--;
689:     }

691:     if (ncols > 0) {
692:       workt = work;
693:       PetscArrayzero(workt, ncols);
694:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
695:       for (j = 0; j < n; j++) {
696:         zb = z_ptr + bs * (*idx++);
697:         for (k = 0; k < bs; k++) zb[k] += workt[k];
698:         workt += bs;
699:       }
700:     }
701:     x += bs;
702:     v += n * bs2;
703:     z += bs;
704:     ii++;
705:   }

707:   VecRestoreArrayRead(xx, &x);
708:   VecRestoreArray(zz, &z);
709:   PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow);
710:   return 0;
711: }

713: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
714: {
715:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
716:   PetscScalar       *z, x1;
717:   const PetscScalar *x, *xb;
718:   const MatScalar   *v;
719:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
720:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
721:   PetscInt           nonzerorow = 0;
722: #if defined(PETSC_USE_COMPLEX)
723:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
724: #else
725:   const int aconj = 0;
726: #endif

728:   VecCopy(yy, zz);
729:   if (!a->nz) return 0;
730:   VecGetArrayRead(xx, &x);
731:   VecGetArray(zz, &z);
732:   v  = a->a;
733:   xb = x;

735:   for (i = 0; i < mbs; i++) {
736:     n    = ai[1] - ai[0]; /* length of i_th row of A */
737:     x1   = xb[0];
738:     ib   = aj + *ai;
739:     jmin = 0;
740:     nonzerorow += (n > 0);
741:     if (n && *ib == i) { /* (diag of A)*x */
742:       z[i] += *v++ * x[*ib++];
743:       jmin++;
744:     }
745:     if (aconj) {
746:       for (j = jmin; j < n; j++) {
747:         cval = *ib;
748:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
749:         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
750:       }
751:     } else {
752:       for (j = jmin; j < n; j++) {
753:         cval = *ib;
754:         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
755:         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
756:       }
757:     }
758:     xb++;
759:     ai++;
760:   }

762:   VecRestoreArrayRead(xx, &x);
763:   VecRestoreArray(zz, &z);

765:   PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
766:   return 0;
767: }

769: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
770: {
771:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
772:   PetscScalar       *z, x1, x2;
773:   const PetscScalar *x, *xb;
774:   const MatScalar   *v;
775:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
776:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
777:   PetscInt           nonzerorow = 0;

779:   VecCopy(yy, zz);
780:   if (!a->nz) return 0;
781:   VecGetArrayRead(xx, &x);
782:   VecGetArray(zz, &z);

784:   v  = a->a;
785:   xb = x;

787:   for (i = 0; i < mbs; i++) {
788:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
789:     x1   = xb[0];
790:     x2   = xb[1];
791:     ib   = aj + *ai;
792:     jmin = 0;
793:     nonzerorow += (n > 0);
794:     if (n && *ib == i) { /* (diag of A)*x */
795:       z[2 * i] += v[0] * x1 + v[2] * x2;
796:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
797:       v += 4;
798:       jmin++;
799:     }
800:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
801:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
802:     for (j = jmin; j < n; j++) {
803:       /* (strict lower triangular part of A)*x  */
804:       cval = ib[j] * 2;
805:       z[cval] += v[0] * x1 + v[1] * x2;
806:       z[cval + 1] += v[2] * x1 + v[3] * x2;
807:       /* (strict upper triangular part of A)*x  */
808:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
809:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
810:       v += 4;
811:     }
812:     xb += 2;
813:     ai++;
814:   }
815:   VecRestoreArrayRead(xx, &x);
816:   VecRestoreArray(zz, &z);

818:   PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow));
819:   return 0;
820: }

822: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
823: {
824:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
825:   PetscScalar       *z, x1, x2, x3;
826:   const PetscScalar *x, *xb;
827:   const MatScalar   *v;
828:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
829:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
830:   PetscInt           nonzerorow = 0;

832:   VecCopy(yy, zz);
833:   if (!a->nz) return 0;
834:   VecGetArrayRead(xx, &x);
835:   VecGetArray(zz, &z);

837:   v  = a->a;
838:   xb = x;

840:   for (i = 0; i < mbs; i++) {
841:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
842:     x1   = xb[0];
843:     x2   = xb[1];
844:     x3   = xb[2];
845:     ib   = aj + *ai;
846:     jmin = 0;
847:     nonzerorow += (n > 0);
848:     if (n && *ib == i) { /* (diag of A)*x */
849:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
850:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
851:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
852:       v += 9;
853:       jmin++;
854:     }
855:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
857:     for (j = jmin; j < n; j++) {
858:       /* (strict lower triangular part of A)*x  */
859:       cval = ib[j] * 3;
860:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
861:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
862:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
863:       /* (strict upper triangular part of A)*x  */
864:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
865:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
866:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
867:       v += 9;
868:     }
869:     xb += 3;
870:     ai++;
871:   }

873:   VecRestoreArrayRead(xx, &x);
874:   VecRestoreArray(zz, &z);

876:   PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow));
877:   return 0;
878: }

880: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
881: {
882:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
883:   PetscScalar       *z, x1, x2, x3, x4;
884:   const PetscScalar *x, *xb;
885:   const MatScalar   *v;
886:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
887:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
888:   PetscInt           nonzerorow = 0;

890:   VecCopy(yy, zz);
891:   if (!a->nz) return 0;
892:   VecGetArrayRead(xx, &x);
893:   VecGetArray(zz, &z);

895:   v  = a->a;
896:   xb = x;

898:   for (i = 0; i < mbs; i++) {
899:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
900:     x1   = xb[0];
901:     x2   = xb[1];
902:     x3   = xb[2];
903:     x4   = xb[3];
904:     ib   = aj + *ai;
905:     jmin = 0;
906:     nonzerorow += (n > 0);
907:     if (n && *ib == i) { /* (diag of A)*x */
908:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
909:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
910:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
911:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
912:       v += 16;
913:       jmin++;
914:     }
915:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
916:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
917:     for (j = jmin; j < n; j++) {
918:       /* (strict lower triangular part of A)*x  */
919:       cval = ib[j] * 4;
920:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
921:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
922:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
923:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
924:       /* (strict upper triangular part of A)*x  */
925:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
926:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
927:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
928:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
929:       v += 16;
930:     }
931:     xb += 4;
932:     ai++;
933:   }

935:   VecRestoreArrayRead(xx, &x);
936:   VecRestoreArray(zz, &z);

938:   PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow));
939:   return 0;
940: }

942: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
943: {
944:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
945:   PetscScalar       *z, x1, x2, x3, x4, x5;
946:   const PetscScalar *x, *xb;
947:   const MatScalar   *v;
948:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
949:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
950:   PetscInt           nonzerorow = 0;

952:   VecCopy(yy, zz);
953:   if (!a->nz) return 0;
954:   VecGetArrayRead(xx, &x);
955:   VecGetArray(zz, &z);

957:   v  = a->a;
958:   xb = x;

960:   for (i = 0; i < mbs; i++) {
961:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
962:     x1   = xb[0];
963:     x2   = xb[1];
964:     x3   = xb[2];
965:     x4   = xb[3];
966:     x5   = xb[4];
967:     ib   = aj + *ai;
968:     jmin = 0;
969:     nonzerorow += (n > 0);
970:     if (n && *ib == i) { /* (diag of A)*x */
971:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
972:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
973:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
974:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
975:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
976:       v += 25;
977:       jmin++;
978:     }
979:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
980:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
981:     for (j = jmin; j < n; j++) {
982:       /* (strict lower triangular part of A)*x  */
983:       cval = ib[j] * 5;
984:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
985:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
986:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
987:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
988:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
989:       /* (strict upper triangular part of A)*x  */
990:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
991:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
992:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
993:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
994:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
995:       v += 25;
996:     }
997:     xb += 5;
998:     ai++;
999:   }

1001:   VecRestoreArrayRead(xx, &x);
1002:   VecRestoreArray(zz, &z);

1004:   PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow));
1005:   return 0;
1006: }

1008: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1009: {
1010:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1011:   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1012:   const PetscScalar *x, *xb;
1013:   const MatScalar   *v;
1014:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1015:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1016:   PetscInt           nonzerorow = 0;

1018:   VecCopy(yy, zz);
1019:   if (!a->nz) return 0;
1020:   VecGetArrayRead(xx, &x);
1021:   VecGetArray(zz, &z);

1023:   v  = a->a;
1024:   xb = x;

1026:   for (i = 0; i < mbs; i++) {
1027:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1028:     x1   = xb[0];
1029:     x2   = xb[1];
1030:     x3   = xb[2];
1031:     x4   = xb[3];
1032:     x5   = xb[4];
1033:     x6   = xb[5];
1034:     ib   = aj + *ai;
1035:     jmin = 0;
1036:     nonzerorow += (n > 0);
1037:     if (n && *ib == i) { /* (diag of A)*x */
1038:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1039:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1040:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1041:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1042:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1043:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1044:       v += 36;
1045:       jmin++;
1046:     }
1047:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1048:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1049:     for (j = jmin; j < n; j++) {
1050:       /* (strict lower triangular part of A)*x  */
1051:       cval = ib[j] * 6;
1052:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1053:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1054:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1055:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1056:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1057:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1058:       /* (strict upper triangular part of A)*x  */
1059:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1060:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1061:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1062:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1063:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1064:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1065:       v += 36;
1066:     }
1067:     xb += 6;
1068:     ai++;
1069:   }

1071:   VecRestoreArrayRead(xx, &x);
1072:   VecRestoreArray(zz, &z);

1074:   PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow));
1075:   return 0;
1076: }

1078: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1079: {
1080:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1081:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1082:   const PetscScalar *x, *xb;
1083:   const MatScalar   *v;
1084:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1085:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1086:   PetscInt           nonzerorow = 0;

1088:   VecCopy(yy, zz);
1089:   if (!a->nz) return 0;
1090:   VecGetArrayRead(xx, &x);
1091:   VecGetArray(zz, &z);

1093:   v  = a->a;
1094:   xb = x;

1096:   for (i = 0; i < mbs; i++) {
1097:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1098:     x1   = xb[0];
1099:     x2   = xb[1];
1100:     x3   = xb[2];
1101:     x4   = xb[3];
1102:     x5   = xb[4];
1103:     x6   = xb[5];
1104:     x7   = xb[6];
1105:     ib   = aj + *ai;
1106:     jmin = 0;
1107:     nonzerorow += (n > 0);
1108:     if (n && *ib == i) { /* (diag of A)*x */
1109:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1110:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1111:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1112:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1113:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1114:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1115:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1116:       v += 49;
1117:       jmin++;
1118:     }
1119:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1120:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1121:     for (j = jmin; j < n; j++) {
1122:       /* (strict lower triangular part of A)*x  */
1123:       cval = ib[j] * 7;
1124:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1125:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1126:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1127:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1128:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1129:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1130:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1131:       /* (strict upper triangular part of A)*x  */
1132:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1133:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1134:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1135:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1136:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1137:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1138:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1139:       v += 49;
1140:     }
1141:     xb += 7;
1142:     ai++;
1143:   }

1145:   VecRestoreArrayRead(xx, &x);
1146:   VecRestoreArray(zz, &z);

1148:   PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow));
1149:   return 0;
1150: }

1152: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1153: {
1154:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1155:   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1156:   const PetscScalar *x, *x_ptr, *xb;
1157:   const MatScalar   *v;
1158:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1159:   const PetscInt    *idx, *aj, *ii;
1160:   PetscInt           nonzerorow = 0;

1162:   VecCopy(yy, zz);
1163:   if (!a->nz) return 0;
1164:   VecGetArrayRead(xx, &x);
1165:   x_ptr = x;
1166:   VecGetArray(zz, &z);
1167:   z_ptr = z;

1169:   aj = a->j;
1170:   v  = a->a;
1171:   ii = a->i;

1173:   if (!a->mult_work) PetscMalloc1(A->rmap->n + 1, &a->mult_work);
1174:   work = a->mult_work;

1176:   for (i = 0; i < mbs; i++) {
1177:     n     = ii[1] - ii[0];
1178:     ncols = n * bs;
1179:     workt = work;
1180:     idx   = aj + ii[0];
1181:     nonzerorow += (n > 0);

1183:     /* upper triangular part */
1184:     for (j = 0; j < n; j++) {
1185:       xb = x_ptr + bs * (*idx++);
1186:       for (k = 0; k < bs; k++) workt[k] = xb[k];
1187:       workt += bs;
1188:     }
1189:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1190:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

1192:     /* strict lower triangular part */
1193:     idx = aj + ii[0];
1194:     if (n && *idx == i) {
1195:       ncols -= bs;
1196:       v += bs2;
1197:       idx++;
1198:       n--;
1199:     }
1200:     if (ncols > 0) {
1201:       workt = work;
1202:       PetscArrayzero(workt, ncols);
1203:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1204:       for (j = 0; j < n; j++) {
1205:         zb = z_ptr + bs * (*idx++);
1206:         for (k = 0; k < bs; k++) zb[k] += workt[k];
1207:         workt += bs;
1208:       }
1209:     }

1211:     x += bs;
1212:     v += n * bs2;
1213:     z += bs;
1214:     ii++;
1215:   }

1217:   VecRestoreArrayRead(xx, &x);
1218:   VecRestoreArray(zz, &z);

1220:   PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
1221:   return 0;
1222: }

1224: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1225: {
1226:   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1227:   PetscScalar   oalpha = alpha;
1228:   PetscBLASInt  one    = 1, totalnz;

1230:   PetscBLASIntCast(a->bs2 * a->nz, &totalnz);
1231:   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1232:   PetscLogFlops(totalnz);
1233:   return 0;
1234: }

1236: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1237: {
1238:   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1239:   const MatScalar *v        = a->a;
1240:   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1241:   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1242:   const PetscInt  *aj = a->j, *col;

1244:   if (!a->nz) {
1245:     *norm = 0.0;
1246:     return 0;
1247:   }
1248:   if (type == NORM_FROBENIUS) {
1249:     for (k = 0; k < mbs; k++) {
1250:       jmin = a->i[k];
1251:       jmax = a->i[k + 1];
1252:       col  = aj + jmin;
1253:       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1254:         for (i = 0; i < bs2; i++) {
1255:           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1256:           v++;
1257:         }
1258:         jmin++;
1259:       }
1260:       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1261:         for (i = 0; i < bs2; i++) {
1262:           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1263:           v++;
1264:         }
1265:       }
1266:     }
1267:     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1268:     PetscLogFlops(2.0 * bs2 * a->nz);
1269:   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1270:     PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl);
1271:     for (i = 0; i < mbs; i++) jl[i] = mbs;
1272:     il[0] = 0;

1274:     *norm = 0.0;
1275:     for (k = 0; k < mbs; k++) { /* k_th block row */
1276:       for (j = 0; j < bs; j++) sum[j] = 0.0;
1277:       /*-- col sum --*/
1278:       i = jl[k]; /* first |A(i,k)| to be added */
1279:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1280:                   at step k */
1281:       while (i < mbs) {
1282:         nexti = jl[i]; /* next block row to be added */
1283:         ik    = il[i]; /* block index of A(i,k) in the array a */
1284:         for (j = 0; j < bs; j++) {
1285:           v = a->a + ik * bs2 + j * bs;
1286:           for (k1 = 0; k1 < bs; k1++) {
1287:             sum[j] += PetscAbsScalar(*v);
1288:             v++;
1289:           }
1290:         }
1291:         /* update il, jl */
1292:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1293:         jmax = a->i[i + 1];
1294:         if (jmin < jmax) {
1295:           il[i] = jmin;
1296:           j     = a->j[jmin];
1297:           jl[i] = jl[j];
1298:           jl[j] = i;
1299:         }
1300:         i = nexti;
1301:       }
1302:       /*-- row sum --*/
1303:       jmin = a->i[k];
1304:       jmax = a->i[k + 1];
1305:       for (i = jmin; i < jmax; i++) {
1306:         for (j = 0; j < bs; j++) {
1307:           v = a->a + i * bs2 + j;
1308:           for (k1 = 0; k1 < bs; k1++) {
1309:             sum[j] += PetscAbsScalar(*v);
1310:             v += bs;
1311:           }
1312:         }
1313:       }
1314:       /* add k_th block row to il, jl */
1315:       col = aj + jmin;
1316:       if (jmax - jmin > 0 && *col == k) jmin++;
1317:       if (jmin < jmax) {
1318:         il[k] = jmin;
1319:         j     = a->j[jmin];
1320:         jl[k] = jl[j];
1321:         jl[j] = k;
1322:       }
1323:       for (j = 0; j < bs; j++) {
1324:         if (sum[j] > *norm) *norm = sum[j];
1325:       }
1326:     }
1327:     PetscFree3(sum, il, jl);
1328:     PetscLogFlops(PetscMax(mbs * a->nz - 1, 0));
1329:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1330:   return 0;
1331: }

1333: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1334: {
1335:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;

1337:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1338:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1339:     *flg = PETSC_FALSE;
1340:     return 0;
1341:   }

1343:   /* if the a->i are the same */
1344:   PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
1345:   if (!*flg) return 0;

1347:   /* if a->j are the same */
1348:   PetscArraycmp(a->j, b->j, a->nz, flg);
1349:   if (!*flg) return 0;

1351:   /* if a->a are the same */
1352:   PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg);
1353:   return 0;
1354: }

1356: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1357: {
1358:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1359:   PetscInt         i, j, k, row, bs, ambs, bs2;
1360:   const PetscInt  *ai, *aj;
1361:   PetscScalar     *x, zero = 0.0;
1362:   const MatScalar *aa, *aa_j;

1364:   bs = A->rmap->bs;

1367:   aa   = a->a;
1368:   ambs = a->mbs;

1370:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1371:     PetscInt *diag = a->diag;
1372:     aa             = a->a;
1373:     ambs           = a->mbs;
1374:     VecGetArray(v, &x);
1375:     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1376:     VecRestoreArray(v, &x);
1377:     return 0;
1378:   }

1380:   ai  = a->i;
1381:   aj  = a->j;
1382:   bs2 = a->bs2;
1383:   VecSet(v, zero);
1384:   if (!a->nz) return 0;
1385:   VecGetArray(v, &x);
1386:   for (i = 0; i < ambs; i++) {
1387:     j = ai[i];
1388:     if (aj[j] == i) { /* if this is a diagonal element */
1389:       row  = i * bs;
1390:       aa_j = aa + j * bs2;
1391:       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1392:     }
1393:   }
1394:   VecRestoreArray(v, &x);
1395:   return 0;
1396: }

1398: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1399: {
1400:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1401:   PetscScalar        x;
1402:   const PetscScalar *l, *li, *ri;
1403:   MatScalar         *aa, *v;
1404:   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1405:   const PetscInt    *ai, *aj;
1406:   PetscBool          flg;

1408:   if (ll != rr) {
1409:     VecEqual(ll, rr, &flg);
1411:   }
1412:   if (!ll) return 0;
1413:   ai  = a->i;
1414:   aj  = a->j;
1415:   aa  = a->a;
1416:   m   = A->rmap->N;
1417:   bs  = A->rmap->bs;
1418:   mbs = a->mbs;
1419:   bs2 = a->bs2;

1421:   VecGetArrayRead(ll, &l);
1422:   VecGetLocalSize(ll, &lm);
1424:   for (i = 0; i < mbs; i++) { /* for each block row */
1425:     M  = ai[i + 1] - ai[i];
1426:     li = l + i * bs;
1427:     v  = aa + bs2 * ai[i];
1428:     for (j = 0; j < M; j++) { /* for each block */
1429:       ri = l + bs * aj[ai[i] + j];
1430:       for (k = 0; k < bs; k++) {
1431:         x = ri[k];
1432:         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1433:       }
1434:     }
1435:   }
1436:   VecRestoreArrayRead(ll, &l);
1437:   PetscLogFlops(2.0 * a->nz);
1438:   return 0;
1439: }

1441: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1442: {
1443:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1445:   info->block_size   = a->bs2;
1446:   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1447:   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1448:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1449:   info->assemblies   = A->num_ass;
1450:   info->mallocs      = A->info.mallocs;
1451:   info->memory       = 0; /* REVIEW ME */
1452:   if (A->factortype) {
1453:     info->fill_ratio_given  = A->info.fill_ratio_given;
1454:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1455:     info->factor_mallocs    = A->info.factor_mallocs;
1456:   } else {
1457:     info->fill_ratio_given  = 0;
1458:     info->fill_ratio_needed = 0;
1459:     info->factor_mallocs    = 0;
1460:   }
1461:   return 0;
1462: }

1464: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1465: {
1466:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1468:   PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
1469:   return 0;
1470: }

1472: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1473: {
1474:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1475:   PetscInt         i, j, n, row, col, bs, mbs;
1476:   const PetscInt  *ai, *aj;
1477:   PetscReal        atmp;
1478:   const MatScalar *aa;
1479:   PetscScalar     *x;
1480:   PetscInt         ncols, brow, bcol, krow, kcol;

1484:   bs  = A->rmap->bs;
1485:   aa  = a->a;
1486:   ai  = a->i;
1487:   aj  = a->j;
1488:   mbs = a->mbs;

1490:   VecSet(v, 0.0);
1491:   VecGetArray(v, &x);
1492:   VecGetLocalSize(v, &n);
1494:   for (i = 0; i < mbs; i++) {
1495:     ncols = ai[1] - ai[0];
1496:     ai++;
1497:     brow = bs * i;
1498:     for (j = 0; j < ncols; j++) {
1499:       bcol = bs * (*aj);
1500:       for (kcol = 0; kcol < bs; kcol++) {
1501:         col = bcol + kcol; /* col index */
1502:         for (krow = 0; krow < bs; krow++) {
1503:           atmp = PetscAbsScalar(*aa);
1504:           aa++;
1505:           row = brow + krow; /* row index */
1506:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1507:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1508:         }
1509:       }
1510:       aj++;
1511:     }
1512:   }
1513:   VecRestoreArray(v, &x);
1514:   return 0;
1515: }

1517: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1518: {
1519:   MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
1520:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1521:   return 0;
1522: }

1524: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1525: {
1526:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1527:   PetscScalar       *z = c;
1528:   const PetscScalar *xb;
1529:   PetscScalar        x1;
1530:   const MatScalar   *v   = a->a, *vv;
1531:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1532: #if defined(PETSC_USE_COMPLEX)
1533:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1534: #else
1535:   const int aconj = 0;
1536: #endif

1538:   for (i = 0; i < mbs; i++) {
1539:     n = ii[1] - ii[0];
1540:     ii++;
1541:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1542:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1543:     jj = idx;
1544:     vv = v;
1545:     for (k = 0; k < cn; k++) {
1546:       idx = jj;
1547:       v   = vv;
1548:       for (j = 0; j < n; j++) {
1549:         xb = b + (*idx);
1550:         x1 = xb[0 + k * bm];
1551:         z[0 + k * cm] += v[0] * x1;
1552:         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1553:         v += 1;
1554:         ++idx;
1555:       }
1556:     }
1557:     z += 1;
1558:   }
1559:   return 0;
1560: }

1562: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1563: {
1564:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1565:   PetscScalar       *z = c;
1566:   const PetscScalar *xb;
1567:   PetscScalar        x1, x2;
1568:   const MatScalar   *v   = a->a, *vv;
1569:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1571:   for (i = 0; i < mbs; i++) {
1572:     n = ii[1] - ii[0];
1573:     ii++;
1574:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1575:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1576:     jj = idx;
1577:     vv = v;
1578:     for (k = 0; k < cn; k++) {
1579:       idx = jj;
1580:       v   = vv;
1581:       for (j = 0; j < n; j++) {
1582:         xb = b + 2 * (*idx);
1583:         x1 = xb[0 + k * bm];
1584:         x2 = xb[1 + k * bm];
1585:         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1586:         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1587:         if (*idx != i) {
1588:           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1589:           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1590:         }
1591:         v += 4;
1592:         ++idx;
1593:       }
1594:     }
1595:     z += 2;
1596:   }
1597:   return 0;
1598: }

1600: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1601: {
1602:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1603:   PetscScalar       *z = c;
1604:   const PetscScalar *xb;
1605:   PetscScalar        x1, x2, x3;
1606:   const MatScalar   *v   = a->a, *vv;
1607:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1609:   for (i = 0; i < mbs; i++) {
1610:     n = ii[1] - ii[0];
1611:     ii++;
1612:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1613:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1614:     jj = idx;
1615:     vv = v;
1616:     for (k = 0; k < cn; k++) {
1617:       idx = jj;
1618:       v   = vv;
1619:       for (j = 0; j < n; j++) {
1620:         xb = b + 3 * (*idx);
1621:         x1 = xb[0 + k * bm];
1622:         x2 = xb[1 + k * bm];
1623:         x3 = xb[2 + k * bm];
1624:         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1625:         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1626:         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1627:         if (*idx != i) {
1628:           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1629:           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1630:           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1631:         }
1632:         v += 9;
1633:         ++idx;
1634:       }
1635:     }
1636:     z += 3;
1637:   }
1638:   return 0;
1639: }

1641: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1642: {
1643:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1644:   PetscScalar       *z = c;
1645:   const PetscScalar *xb;
1646:   PetscScalar        x1, x2, x3, x4;
1647:   const MatScalar   *v   = a->a, *vv;
1648:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1650:   for (i = 0; i < mbs; i++) {
1651:     n = ii[1] - ii[0];
1652:     ii++;
1653:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1654:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1655:     jj = idx;
1656:     vv = v;
1657:     for (k = 0; k < cn; k++) {
1658:       idx = jj;
1659:       v   = vv;
1660:       for (j = 0; j < n; j++) {
1661:         xb = b + 4 * (*idx);
1662:         x1 = xb[0 + k * bm];
1663:         x2 = xb[1 + k * bm];
1664:         x3 = xb[2 + k * bm];
1665:         x4 = xb[3 + k * bm];
1666:         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1667:         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1668:         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1669:         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1670:         if (*idx != i) {
1671:           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1672:           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1673:           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1674:           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1675:         }
1676:         v += 16;
1677:         ++idx;
1678:       }
1679:     }
1680:     z += 4;
1681:   }
1682:   return 0;
1683: }

1685: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1686: {
1687:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1688:   PetscScalar       *z = c;
1689:   const PetscScalar *xb;
1690:   PetscScalar        x1, x2, x3, x4, x5;
1691:   const MatScalar   *v   = a->a, *vv;
1692:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1694:   for (i = 0; i < mbs; i++) {
1695:     n = ii[1] - ii[0];
1696:     ii++;
1697:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1698:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1699:     jj = idx;
1700:     vv = v;
1701:     for (k = 0; k < cn; k++) {
1702:       idx = jj;
1703:       v   = vv;
1704:       for (j = 0; j < n; j++) {
1705:         xb = b + 5 * (*idx);
1706:         x1 = xb[0 + k * bm];
1707:         x2 = xb[1 + k * bm];
1708:         x3 = xb[2 + k * bm];
1709:         x4 = xb[3 + k * bm];
1710:         x5 = xb[4 + k * cm];
1711:         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1712:         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1713:         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1714:         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1715:         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1716:         if (*idx != i) {
1717:           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1718:           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1719:           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1720:           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1721:           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1722:         }
1723:         v += 25;
1724:         ++idx;
1725:       }
1726:     }
1727:     z += 5;
1728:   }
1729:   return 0;
1730: }

1732: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1733: {
1734:   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1735:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1736:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1737:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1738:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1739:   PetscBLASInt     bbs, bcn, bbm, bcm;
1740:   PetscScalar     *z = NULL;
1741:   PetscScalar     *c, *b;
1742:   const MatScalar *v;
1743:   const PetscInt  *idx, *ii;
1744:   PetscScalar      _DOne = 1.0;

1746:   if (!cm || !cn) return 0;
1750:   b = bd->v;
1751:   MatZeroEntries(C);
1752:   MatDenseGetArray(C, &c);
1753:   switch (bs) {
1754:   case 1:
1755:     MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1756:     break;
1757:   case 2:
1758:     MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1759:     break;
1760:   case 3:
1761:     MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1762:     break;
1763:   case 4:
1764:     MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1765:     break;
1766:   case 5:
1767:     MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1768:     break;
1769:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1770:     PetscBLASIntCast(bs, &bbs);
1771:     PetscBLASIntCast(cn, &bcn);
1772:     PetscBLASIntCast(bm, &bbm);
1773:     PetscBLASIntCast(cm, &bcm);
1774:     idx = a->j;
1775:     v   = a->a;
1776:     mbs = a->mbs;
1777:     ii  = a->i;
1778:     z   = c;
1779:     for (i = 0; i < mbs; i++) {
1780:       n = ii[1] - ii[0];
1781:       ii++;
1782:       for (j = 0; j < n; j++) {
1783:         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1784:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1785:         v += bs2;
1786:       }
1787:       z += bs;
1788:     }
1789:   }
1790:   MatDenseRestoreArray(C, &c);
1791:   PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn);
1792:   return 0;
1793: }