Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
14: PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival;
15: const PetscInt *idx;
16: PetscInt start, end, *ai, *aj, bs;
17: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
26: PetscBTCreate(m, &table);
27: PetscMalloc1(m + 1, &nidx);
29: for (i = 0; i < is_max; i++) {
30: /* Initialise the two local arrays */
31: isz = 0;
32: PetscBTMemzero(m, table);
34: /* Extract the indices, assume there can be duplicate entries */
35: ISGetIndices(is[i], &idx);
36: ISGetLocalSize(is[i], &n);
38: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
39: for (j = 0; j < n; ++j) {
40: ival = idx[j] / bs; /* convert the indices into block indices */
42: if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
43: }
44: ISRestoreIndices(is[i], &idx);
45: ISDestroy(&is[i]);
47: k = 0;
48: for (j = 0; j < ov; j++) { /* for each overlap*/
49: n = isz;
50: for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
51: row = nidx[k];
52: start = ai[row];
53: end = ai[row + 1];
54: for (l = start; l < end; l++) {
55: val = aj[l];
56: if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
57: }
58: }
59: }
60: ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i);
61: }
62: PetscBTDestroy(&table);
63: PetscFree(nidx);
64: return 0;
65: }
67: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
68: {
69: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c;
70: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
71: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
72: const PetscInt *irow, *icol;
73: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
74: PetscInt *aj = a->j, *ai = a->i;
75: MatScalar *mat_a;
76: Mat C;
77: PetscBool flag;
79: ISGetIndices(isrow, &irow);
80: ISGetIndices(iscol, &icol);
81: ISGetLocalSize(isrow, &nrows);
82: ISGetLocalSize(iscol, &ncols);
84: PetscCalloc1(1 + oldcols, &smap);
85: ssmap = smap;
86: PetscMalloc1(1 + nrows, &lens);
87: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
88: /* determine lens of each row */
89: for (i = 0; i < nrows; i++) {
90: kstart = ai[irow[i]];
91: kend = kstart + a->ilen[irow[i]];
92: lens[i] = 0;
93: for (k = kstart; k < kend; k++) {
94: if (ssmap[aj[k]]) lens[i]++;
95: }
96: }
97: /* Create and fill new matrix */
98: if (scall == MAT_REUSE_MATRIX) {
99: c = (Mat_SeqBAIJ *)((*B)->data);
102: PetscArraycmp(c->ilen, lens, c->mbs, &flag);
104: PetscArrayzero(c->ilen, c->mbs);
105: C = *B;
106: } else {
107: MatCreate(PetscObjectComm((PetscObject)A), &C);
108: MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
109: MatSetType(C, ((PetscObject)A)->type_name);
110: MatSeqBAIJSetPreallocation(C, bs, 0, lens);
111: }
112: c = (Mat_SeqBAIJ *)(C->data);
113: for (i = 0; i < nrows; i++) {
114: row = irow[i];
115: kstart = ai[row];
116: kend = kstart + a->ilen[row];
117: mat_i = c->i[i];
118: mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
119: mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
120: mat_ilen = c->ilen + i;
121: for (k = kstart; k < kend; k++) {
122: if ((tcol = ssmap[a->j[k]])) {
123: *mat_j++ = tcol - 1;
124: PetscArraycpy(mat_a, a->a + k * bs2, bs2);
125: mat_a += bs2;
126: (*mat_ilen)++;
127: }
128: }
129: }
130: /* sort */
131: if (c->j && c->a) {
132: MatScalar *work;
133: PetscMalloc1(bs2, &work);
134: for (i = 0; i < nrows; i++) {
135: PetscInt ilen;
136: mat_i = c->i[i];
137: mat_j = c->j + mat_i;
138: mat_a = c->a + mat_i * bs2;
139: ilen = c->ilen[i];
140: PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
141: }
142: PetscFree(work);
143: }
145: /* Free work space */
146: ISRestoreIndices(iscol, &icol);
147: PetscFree(smap);
148: PetscFree(lens);
149: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
150: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
152: ISRestoreIndices(isrow, &irow);
153: *B = C;
154: return 0;
155: }
157: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
158: {
159: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
160: IS is1, is2;
161: PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
162: const PetscInt *irow, *icol;
164: ISGetIndices(isrow, &irow);
165: ISGetIndices(iscol, &icol);
166: ISGetLocalSize(isrow, &nrows);
167: ISGetLocalSize(iscol, &ncols);
169: /* Verify if the indices correspond to each element in a block
170: and form the IS with compressed IS */
171: maxmnbs = PetscMax(a->mbs, a->nbs);
172: PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
173: PetscArrayzero(vary, a->mbs);
174: for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
176: count = 0;
177: for (i = 0; i < nrows; i++) {
178: j = irow[i] / bs;
179: if ((vary[j]--) == bs) iary[count++] = j;
180: }
181: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);
183: PetscArrayzero(vary, a->nbs);
184: for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
186: count = 0;
187: for (i = 0; i < ncols; i++) {
188: j = icol[i] / bs;
189: if ((vary[j]--) == bs) iary[count++] = j;
190: }
191: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
192: ISRestoreIndices(isrow, &irow);
193: ISRestoreIndices(iscol, &icol);
194: PetscFree2(vary, iary);
196: MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B);
197: ISDestroy(&is1);
198: ISDestroy(&is2);
199: return 0;
200: }
202: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
203: {
204: Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data;
205: Mat_SubSppt *submatj = c->submatis1;
207: (*submatj->destroy)(C);
208: MatDestroySubMatrix_Private(submatj);
209: return 0;
210: }
212: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
213: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
214: {
215: PetscInt i;
216: Mat C;
217: Mat_SeqBAIJ *c;
218: Mat_SubSppt *submatj;
220: for (i = 0; i < n; i++) {
221: C = (*mat)[i];
222: c = (Mat_SeqBAIJ *)C->data;
223: submatj = c->submatis1;
224: if (submatj) {
225: if (--((PetscObject)C)->refct <= 0) {
226: PetscFree(C->factorprefix);
227: (*submatj->destroy)(C);
228: MatDestroySubMatrix_Private(submatj);
229: PetscFree(C->defaultvectype);
230: PetscFree(C->defaultrandtype);
231: PetscLayoutDestroy(&C->rmap);
232: PetscLayoutDestroy(&C->cmap);
233: PetscHeaderDestroy(&C);
234: }
235: } else {
236: MatDestroy(&C);
237: }
238: }
240: /* Destroy Dummy submatrices created for reuse */
241: MatDestroySubMatrices_Dummy(n, mat);
243: PetscFree(*mat);
244: return 0;
245: }
247: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
248: {
249: PetscInt i;
251: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);
253: for (i = 0; i < n; i++) MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
254: return 0;
255: }
257: /* -------------------------------------------------------*/
258: /* Should check that shapes of vectors and matrices match */
259: /* -------------------------------------------------------*/
261: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
262: {
263: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
264: PetscScalar *z, sum;
265: const PetscScalar *x;
266: const MatScalar *v;
267: PetscInt mbs, i, n;
268: const PetscInt *idx, *ii, *ridx = NULL;
269: PetscBool usecprow = a->compressedrow.use;
271: VecGetArrayRead(xx, &x);
272: VecGetArrayWrite(zz, &z);
274: if (usecprow) {
275: mbs = a->compressedrow.nrows;
276: ii = a->compressedrow.i;
277: ridx = a->compressedrow.rindex;
278: PetscArrayzero(z, a->mbs);
279: } else {
280: mbs = a->mbs;
281: ii = a->i;
282: }
284: for (i = 0; i < mbs; i++) {
285: n = ii[1] - ii[0];
286: v = a->a + ii[0];
287: idx = a->j + ii[0];
288: ii++;
289: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
290: PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
291: sum = 0.0;
292: PetscSparseDensePlusDot(sum, x, v, idx, n);
293: if (usecprow) {
294: z[ridx[i]] = sum;
295: } else {
296: z[i] = sum;
297: }
298: }
299: VecRestoreArrayRead(xx, &x);
300: VecRestoreArrayWrite(zz, &z);
301: PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt);
302: return 0;
303: }
305: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
306: {
307: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
308: PetscScalar *z = NULL, sum1, sum2, *zarray;
309: const PetscScalar *x, *xb;
310: PetscScalar x1, x2;
311: const MatScalar *v;
312: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
313: PetscBool usecprow = a->compressedrow.use;
315: VecGetArrayRead(xx, &x);
316: VecGetArrayWrite(zz, &zarray);
318: idx = a->j;
319: v = a->a;
320: if (usecprow) {
321: mbs = a->compressedrow.nrows;
322: ii = a->compressedrow.i;
323: ridx = a->compressedrow.rindex;
324: PetscArrayzero(zarray, 2 * a->mbs);
325: } else {
326: mbs = a->mbs;
327: ii = a->i;
328: z = zarray;
329: }
331: for (i = 0; i < mbs; i++) {
332: n = ii[1] - ii[0];
333: ii++;
334: sum1 = 0.0;
335: sum2 = 0.0;
336: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
337: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
338: for (j = 0; j < n; j++) {
339: xb = x + 2 * (*idx++);
340: x1 = xb[0];
341: x2 = xb[1];
342: sum1 += v[0] * x1 + v[2] * x2;
343: sum2 += v[1] * x1 + v[3] * x2;
344: v += 4;
345: }
346: if (usecprow) z = zarray + 2 * ridx[i];
347: z[0] = sum1;
348: z[1] = sum2;
349: if (!usecprow) z += 2;
350: }
351: VecRestoreArrayRead(xx, &x);
352: VecRestoreArrayWrite(zz, &zarray);
353: PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt);
354: return 0;
355: }
357: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
358: {
359: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
360: PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
361: const PetscScalar *x, *xb;
362: const MatScalar *v;
363: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
364: PetscBool usecprow = a->compressedrow.use;
366: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
367: #pragma disjoint(*v, *z, *xb)
368: #endif
370: VecGetArrayRead(xx, &x);
371: VecGetArrayWrite(zz, &zarray);
373: idx = a->j;
374: v = a->a;
375: if (usecprow) {
376: mbs = a->compressedrow.nrows;
377: ii = a->compressedrow.i;
378: ridx = a->compressedrow.rindex;
379: PetscArrayzero(zarray, 3 * a->mbs);
380: } else {
381: mbs = a->mbs;
382: ii = a->i;
383: z = zarray;
384: }
386: for (i = 0; i < mbs; i++) {
387: n = ii[1] - ii[0];
388: ii++;
389: sum1 = 0.0;
390: sum2 = 0.0;
391: sum3 = 0.0;
392: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
393: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
394: for (j = 0; j < n; j++) {
395: xb = x + 3 * (*idx++);
396: x1 = xb[0];
397: x2 = xb[1];
398: x3 = xb[2];
400: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
401: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
402: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
403: v += 9;
404: }
405: if (usecprow) z = zarray + 3 * ridx[i];
406: z[0] = sum1;
407: z[1] = sum2;
408: z[2] = sum3;
409: if (!usecprow) z += 3;
410: }
411: VecRestoreArrayRead(xx, &x);
412: VecRestoreArrayWrite(zz, &zarray);
413: PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt);
414: return 0;
415: }
417: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
418: {
419: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
420: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
421: const PetscScalar *x, *xb;
422: const MatScalar *v;
423: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
424: PetscBool usecprow = a->compressedrow.use;
426: VecGetArrayRead(xx, &x);
427: VecGetArrayWrite(zz, &zarray);
429: idx = a->j;
430: v = a->a;
431: if (usecprow) {
432: mbs = a->compressedrow.nrows;
433: ii = a->compressedrow.i;
434: ridx = a->compressedrow.rindex;
435: PetscArrayzero(zarray, 4 * a->mbs);
436: } else {
437: mbs = a->mbs;
438: ii = a->i;
439: z = zarray;
440: }
442: for (i = 0; i < mbs; i++) {
443: n = ii[1] - ii[0];
444: ii++;
445: sum1 = 0.0;
446: sum2 = 0.0;
447: sum3 = 0.0;
448: sum4 = 0.0;
450: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
451: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
452: for (j = 0; j < n; j++) {
453: xb = x + 4 * (*idx++);
454: x1 = xb[0];
455: x2 = xb[1];
456: x3 = xb[2];
457: x4 = xb[3];
458: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
459: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
460: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
461: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
462: v += 16;
463: }
464: if (usecprow) z = zarray + 4 * ridx[i];
465: z[0] = sum1;
466: z[1] = sum2;
467: z[2] = sum3;
468: z[3] = sum4;
469: if (!usecprow) z += 4;
470: }
471: VecRestoreArrayRead(xx, &x);
472: VecRestoreArrayWrite(zz, &zarray);
473: PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt);
474: return 0;
475: }
477: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
478: {
479: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
480: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
481: const PetscScalar *xb, *x;
482: const MatScalar *v;
483: const PetscInt *idx, *ii, *ridx = NULL;
484: PetscInt mbs, i, j, n;
485: PetscBool usecprow = a->compressedrow.use;
487: VecGetArrayRead(xx, &x);
488: VecGetArrayWrite(zz, &zarray);
490: idx = a->j;
491: v = a->a;
492: if (usecprow) {
493: mbs = a->compressedrow.nrows;
494: ii = a->compressedrow.i;
495: ridx = a->compressedrow.rindex;
496: PetscArrayzero(zarray, 5 * a->mbs);
497: } else {
498: mbs = a->mbs;
499: ii = a->i;
500: z = zarray;
501: }
503: for (i = 0; i < mbs; i++) {
504: n = ii[1] - ii[0];
505: ii++;
506: sum1 = 0.0;
507: sum2 = 0.0;
508: sum3 = 0.0;
509: sum4 = 0.0;
510: sum5 = 0.0;
511: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
512: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
513: for (j = 0; j < n; j++) {
514: xb = x + 5 * (*idx++);
515: x1 = xb[0];
516: x2 = xb[1];
517: x3 = xb[2];
518: x4 = xb[3];
519: x5 = xb[4];
520: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
521: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
522: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
523: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
524: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
525: v += 25;
526: }
527: if (usecprow) z = zarray + 5 * ridx[i];
528: z[0] = sum1;
529: z[1] = sum2;
530: z[2] = sum3;
531: z[3] = sum4;
532: z[4] = sum5;
533: if (!usecprow) z += 5;
534: }
535: VecRestoreArrayRead(xx, &x);
536: VecRestoreArrayWrite(zz, &zarray);
537: PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt);
538: return 0;
539: }
541: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
542: {
543: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
544: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
545: const PetscScalar *x, *xb;
546: PetscScalar x1, x2, x3, x4, x5, x6, *zarray;
547: const MatScalar *v;
548: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
549: PetscBool usecprow = a->compressedrow.use;
551: VecGetArrayRead(xx, &x);
552: VecGetArrayWrite(zz, &zarray);
554: idx = a->j;
555: v = a->a;
556: if (usecprow) {
557: mbs = a->compressedrow.nrows;
558: ii = a->compressedrow.i;
559: ridx = a->compressedrow.rindex;
560: PetscArrayzero(zarray, 6 * a->mbs);
561: } else {
562: mbs = a->mbs;
563: ii = a->i;
564: z = zarray;
565: }
567: for (i = 0; i < mbs; i++) {
568: n = ii[1] - ii[0];
569: ii++;
570: sum1 = 0.0;
571: sum2 = 0.0;
572: sum3 = 0.0;
573: sum4 = 0.0;
574: sum5 = 0.0;
575: sum6 = 0.0;
577: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
578: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
579: for (j = 0; j < n; j++) {
580: xb = x + 6 * (*idx++);
581: x1 = xb[0];
582: x2 = xb[1];
583: x3 = xb[2];
584: x4 = xb[3];
585: x5 = xb[4];
586: x6 = xb[5];
587: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
588: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
589: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
590: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
591: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
592: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
593: v += 36;
594: }
595: if (usecprow) z = zarray + 6 * ridx[i];
596: z[0] = sum1;
597: z[1] = sum2;
598: z[2] = sum3;
599: z[3] = sum4;
600: z[4] = sum5;
601: z[5] = sum6;
602: if (!usecprow) z += 6;
603: }
605: VecRestoreArrayRead(xx, &x);
606: VecRestoreArrayWrite(zz, &zarray);
607: PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt);
608: return 0;
609: }
611: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
612: {
613: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
614: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
615: const PetscScalar *x, *xb;
616: PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray;
617: const MatScalar *v;
618: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
619: PetscBool usecprow = a->compressedrow.use;
621: VecGetArrayRead(xx, &x);
622: VecGetArrayWrite(zz, &zarray);
624: idx = a->j;
625: v = a->a;
626: if (usecprow) {
627: mbs = a->compressedrow.nrows;
628: ii = a->compressedrow.i;
629: ridx = a->compressedrow.rindex;
630: PetscArrayzero(zarray, 7 * a->mbs);
631: } else {
632: mbs = a->mbs;
633: ii = a->i;
634: z = zarray;
635: }
637: for (i = 0; i < mbs; i++) {
638: n = ii[1] - ii[0];
639: ii++;
640: sum1 = 0.0;
641: sum2 = 0.0;
642: sum3 = 0.0;
643: sum4 = 0.0;
644: sum5 = 0.0;
645: sum6 = 0.0;
646: sum7 = 0.0;
648: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
649: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
650: for (j = 0; j < n; j++) {
651: xb = x + 7 * (*idx++);
652: x1 = xb[0];
653: x2 = xb[1];
654: x3 = xb[2];
655: x4 = xb[3];
656: x5 = xb[4];
657: x6 = xb[5];
658: x7 = xb[6];
659: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
660: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
661: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
662: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
663: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
664: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
665: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
666: v += 49;
667: }
668: if (usecprow) z = zarray + 7 * ridx[i];
669: z[0] = sum1;
670: z[1] = sum2;
671: z[2] = sum3;
672: z[3] = sum4;
673: z[4] = sum5;
674: z[5] = sum6;
675: z[6] = sum7;
676: if (!usecprow) z += 7;
677: }
679: VecRestoreArrayRead(xx, &x);
680: VecRestoreArrayWrite(zz, &zarray);
681: PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt);
682: return 0;
683: }
685: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
686: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
687: {
688: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
689: PetscScalar *z = NULL, *work, *workt, *zarray;
690: const PetscScalar *x, *xb;
691: const MatScalar *v;
692: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
693: const PetscInt *idx, *ii, *ridx = NULL;
694: PetscInt k;
695: PetscBool usecprow = a->compressedrow.use;
697: __m256d a0, a1, a2, a3, a4, a5;
698: __m256d w0, w1, w2, w3;
699: __m256d z0, z1, z2;
700: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
702: VecGetArrayRead(xx, &x);
703: VecGetArrayWrite(zz, &zarray);
705: idx = a->j;
706: v = a->a;
707: if (usecprow) {
708: mbs = a->compressedrow.nrows;
709: ii = a->compressedrow.i;
710: ridx = a->compressedrow.rindex;
711: PetscArrayzero(zarray, bs * a->mbs);
712: } else {
713: mbs = a->mbs;
714: ii = a->i;
715: z = zarray;
716: }
718: if (!a->mult_work) {
719: k = PetscMax(A->rmap->n, A->cmap->n);
720: PetscMalloc1(k + 1, &a->mult_work);
721: }
723: work = a->mult_work;
724: for (i = 0; i < mbs; i++) {
725: n = ii[1] - ii[0];
726: ii++;
727: workt = work;
728: for (j = 0; j < n; j++) {
729: xb = x + bs * (*idx++);
730: for (k = 0; k < bs; k++) workt[k] = xb[k];
731: workt += bs;
732: }
733: if (usecprow) z = zarray + bs * ridx[i];
735: z0 = _mm256_setzero_pd();
736: z1 = _mm256_setzero_pd();
737: z2 = _mm256_setzero_pd();
739: for (j = 0; j < n; j++) {
740: /* first column of a */
741: w0 = _mm256_set1_pd(work[j * 9]);
742: a0 = _mm256_loadu_pd(&v[j * 81]);
743: z0 = _mm256_fmadd_pd(a0, w0, z0);
744: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
745: z1 = _mm256_fmadd_pd(a1, w0, z1);
746: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
747: z2 = _mm256_fmadd_pd(a2, w0, z2);
749: /* second column of a */
750: w1 = _mm256_set1_pd(work[j * 9 + 1]);
751: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
752: z0 = _mm256_fmadd_pd(a0, w1, z0);
753: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
754: z1 = _mm256_fmadd_pd(a1, w1, z1);
755: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
756: z2 = _mm256_fmadd_pd(a2, w1, z2);
758: /* third column of a */
759: w2 = _mm256_set1_pd(work[j * 9 + 2]);
760: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
761: z0 = _mm256_fmadd_pd(a3, w2, z0);
762: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
763: z1 = _mm256_fmadd_pd(a4, w2, z1);
764: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
765: z2 = _mm256_fmadd_pd(a5, w2, z2);
767: /* fourth column of a */
768: w3 = _mm256_set1_pd(work[j * 9 + 3]);
769: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
770: z0 = _mm256_fmadd_pd(a0, w3, z0);
771: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
772: z1 = _mm256_fmadd_pd(a1, w3, z1);
773: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
774: z2 = _mm256_fmadd_pd(a2, w3, z2);
776: /* fifth column of a */
777: w0 = _mm256_set1_pd(work[j * 9 + 4]);
778: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
779: z0 = _mm256_fmadd_pd(a3, w0, z0);
780: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
781: z1 = _mm256_fmadd_pd(a4, w0, z1);
782: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
783: z2 = _mm256_fmadd_pd(a5, w0, z2);
785: /* sixth column of a */
786: w1 = _mm256_set1_pd(work[j * 9 + 5]);
787: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
788: z0 = _mm256_fmadd_pd(a0, w1, z0);
789: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
790: z1 = _mm256_fmadd_pd(a1, w1, z1);
791: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
792: z2 = _mm256_fmadd_pd(a2, w1, z2);
794: /* seventh column of a */
795: w2 = _mm256_set1_pd(work[j * 9 + 6]);
796: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
797: z0 = _mm256_fmadd_pd(a0, w2, z0);
798: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
799: z1 = _mm256_fmadd_pd(a1, w2, z1);
800: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
801: z2 = _mm256_fmadd_pd(a2, w2, z2);
803: /* eighth column of a */
804: w3 = _mm256_set1_pd(work[j * 9 + 7]);
805: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
806: z0 = _mm256_fmadd_pd(a3, w3, z0);
807: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
808: z1 = _mm256_fmadd_pd(a4, w3, z1);
809: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
810: z2 = _mm256_fmadd_pd(a5, w3, z2);
812: /* ninth column of a */
813: w0 = _mm256_set1_pd(work[j * 9 + 8]);
814: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
815: z0 = _mm256_fmadd_pd(a0, w0, z0);
816: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
817: z1 = _mm256_fmadd_pd(a1, w0, z1);
818: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
819: z2 = _mm256_fmadd_pd(a2, w0, z2);
820: }
822: _mm256_storeu_pd(&z[0], z0);
823: _mm256_storeu_pd(&z[4], z1);
824: _mm256_maskstore_pd(&z[8], mask1, z2);
826: v += n * bs2;
827: if (!usecprow) z += bs;
828: }
829: VecRestoreArrayRead(xx, &x);
830: VecRestoreArrayWrite(zz, &zarray);
831: PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
832: return 0;
833: }
834: #endif
836: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
837: {
838: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
839: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
840: const PetscScalar *x, *xb;
841: PetscScalar *zarray, xv;
842: const MatScalar *v;
843: const PetscInt *ii, *ij = a->j, *idx;
844: PetscInt mbs, i, j, k, n, *ridx = NULL;
845: PetscBool usecprow = a->compressedrow.use;
847: VecGetArrayRead(xx, &x);
848: VecGetArrayWrite(zz, &zarray);
850: v = a->a;
851: if (usecprow) {
852: mbs = a->compressedrow.nrows;
853: ii = a->compressedrow.i;
854: ridx = a->compressedrow.rindex;
855: PetscArrayzero(zarray, 11 * a->mbs);
856: } else {
857: mbs = a->mbs;
858: ii = a->i;
859: z = zarray;
860: }
862: for (i = 0; i < mbs; i++) {
863: n = ii[i + 1] - ii[i];
864: idx = ij + ii[i];
865: sum1 = 0.0;
866: sum2 = 0.0;
867: sum3 = 0.0;
868: sum4 = 0.0;
869: sum5 = 0.0;
870: sum6 = 0.0;
871: sum7 = 0.0;
872: sum8 = 0.0;
873: sum9 = 0.0;
874: sum10 = 0.0;
875: sum11 = 0.0;
877: for (j = 0; j < n; j++) {
878: xb = x + 11 * (idx[j]);
880: for (k = 0; k < 11; k++) {
881: xv = xb[k];
882: sum1 += v[0] * xv;
883: sum2 += v[1] * xv;
884: sum3 += v[2] * xv;
885: sum4 += v[3] * xv;
886: sum5 += v[4] * xv;
887: sum6 += v[5] * xv;
888: sum7 += v[6] * xv;
889: sum8 += v[7] * xv;
890: sum9 += v[8] * xv;
891: sum10 += v[9] * xv;
892: sum11 += v[10] * xv;
893: v += 11;
894: }
895: }
896: if (usecprow) z = zarray + 11 * ridx[i];
897: z[0] = sum1;
898: z[1] = sum2;
899: z[2] = sum3;
900: z[3] = sum4;
901: z[4] = sum5;
902: z[5] = sum6;
903: z[6] = sum7;
904: z[7] = sum8;
905: z[8] = sum9;
906: z[9] = sum10;
907: z[10] = sum11;
909: if (!usecprow) z += 11;
910: }
912: VecRestoreArrayRead(xx, &x);
913: VecRestoreArrayWrite(zz, &zarray);
914: PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt);
915: return 0;
916: }
918: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
919: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
920: {
921: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
922: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
923: const PetscScalar *x, *xb;
924: PetscScalar *zarray, xv;
925: const MatScalar *v;
926: const PetscInt *ii, *ij = a->j, *idx;
927: PetscInt mbs, i, j, k, n, *ridx = NULL;
928: PetscBool usecprow = a->compressedrow.use;
930: VecGetArrayRead(xx, &x);
931: VecGetArrayWrite(zz, &zarray);
933: v = a->a;
934: if (usecprow) {
935: mbs = a->compressedrow.nrows;
936: ii = a->compressedrow.i;
937: ridx = a->compressedrow.rindex;
938: PetscArrayzero(zarray, 12 * a->mbs);
939: } else {
940: mbs = a->mbs;
941: ii = a->i;
942: z = zarray;
943: }
945: for (i = 0; i < mbs; i++) {
946: n = ii[i + 1] - ii[i];
947: idx = ij + ii[i];
948: sum1 = 0.0;
949: sum2 = 0.0;
950: sum3 = 0.0;
951: sum4 = 0.0;
952: sum5 = 0.0;
953: sum6 = 0.0;
954: sum7 = 0.0;
955: sum8 = 0.0;
956: sum9 = 0.0;
957: sum10 = 0.0;
958: sum11 = 0.0;
959: sum12 = 0.0;
961: for (j = 0; j < n; j++) {
962: xb = x + 12 * (idx[j]);
964: for (k = 0; k < 12; k++) {
965: xv = xb[k];
966: sum1 += v[0] * xv;
967: sum2 += v[1] * xv;
968: sum3 += v[2] * xv;
969: sum4 += v[3] * xv;
970: sum5 += v[4] * xv;
971: sum6 += v[5] * xv;
972: sum7 += v[6] * xv;
973: sum8 += v[7] * xv;
974: sum9 += v[8] * xv;
975: sum10 += v[9] * xv;
976: sum11 += v[10] * xv;
977: sum12 += v[11] * xv;
978: v += 12;
979: }
980: }
981: if (usecprow) z = zarray + 12 * ridx[i];
982: z[0] = sum1;
983: z[1] = sum2;
984: z[2] = sum3;
985: z[3] = sum4;
986: z[4] = sum5;
987: z[5] = sum6;
988: z[6] = sum7;
989: z[7] = sum8;
990: z[8] = sum9;
991: z[9] = sum10;
992: z[10] = sum11;
993: z[11] = sum12;
994: if (!usecprow) z += 12;
995: }
996: VecRestoreArrayRead(xx, &x);
997: VecRestoreArrayWrite(zz, &zarray);
998: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
999: return 0;
1000: }
1002: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1003: {
1004: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1005: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1006: const PetscScalar *x, *xb;
1007: PetscScalar *zarray, *yarray, xv;
1008: const MatScalar *v;
1009: const PetscInt *ii, *ij = a->j, *idx;
1010: PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL;
1011: PetscBool usecprow = a->compressedrow.use;
1013: VecGetArrayRead(xx, &x);
1014: VecGetArrayPair(yy, zz, &yarray, &zarray);
1016: v = a->a;
1017: if (usecprow) {
1018: if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1019: mbs = a->compressedrow.nrows;
1020: ii = a->compressedrow.i;
1021: ridx = a->compressedrow.rindex;
1022: } else {
1023: ii = a->i;
1024: y = yarray;
1025: z = zarray;
1026: }
1028: for (i = 0; i < mbs; i++) {
1029: n = ii[i + 1] - ii[i];
1030: idx = ij + ii[i];
1032: if (usecprow) {
1033: y = yarray + 12 * ridx[i];
1034: z = zarray + 12 * ridx[i];
1035: }
1036: sum1 = y[0];
1037: sum2 = y[1];
1038: sum3 = y[2];
1039: sum4 = y[3];
1040: sum5 = y[4];
1041: sum6 = y[5];
1042: sum7 = y[6];
1043: sum8 = y[7];
1044: sum9 = y[8];
1045: sum10 = y[9];
1046: sum11 = y[10];
1047: sum12 = y[11];
1049: for (j = 0; j < n; j++) {
1050: xb = x + 12 * (idx[j]);
1052: for (k = 0; k < 12; k++) {
1053: xv = xb[k];
1054: sum1 += v[0] * xv;
1055: sum2 += v[1] * xv;
1056: sum3 += v[2] * xv;
1057: sum4 += v[3] * xv;
1058: sum5 += v[4] * xv;
1059: sum6 += v[5] * xv;
1060: sum7 += v[6] * xv;
1061: sum8 += v[7] * xv;
1062: sum9 += v[8] * xv;
1063: sum10 += v[9] * xv;
1064: sum11 += v[10] * xv;
1065: sum12 += v[11] * xv;
1066: v += 12;
1067: }
1068: }
1070: z[0] = sum1;
1071: z[1] = sum2;
1072: z[2] = sum3;
1073: z[3] = sum4;
1074: z[4] = sum5;
1075: z[5] = sum6;
1076: z[6] = sum7;
1077: z[7] = sum8;
1078: z[8] = sum9;
1079: z[9] = sum10;
1080: z[10] = sum11;
1081: z[11] = sum12;
1082: if (!usecprow) {
1083: y += 12;
1084: z += 12;
1085: }
1086: }
1087: VecRestoreArrayRead(xx, &x);
1088: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1089: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1090: return 0;
1091: }
1093: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1094: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1095: {
1096: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1097: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1098: const PetscScalar *x, *xb;
1099: PetscScalar x1, x2, x3, x4, *zarray;
1100: const MatScalar *v;
1101: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1102: PetscInt mbs, i, j, n;
1103: PetscBool usecprow = a->compressedrow.use;
1105: VecGetArrayRead(xx, &x);
1106: VecGetArrayWrite(zz, &zarray);
1108: v = a->a;
1109: if (usecprow) {
1110: mbs = a->compressedrow.nrows;
1111: ii = a->compressedrow.i;
1112: ridx = a->compressedrow.rindex;
1113: PetscArrayzero(zarray, 12 * a->mbs);
1114: } else {
1115: mbs = a->mbs;
1116: ii = a->i;
1117: z = zarray;
1118: }
1120: for (i = 0; i < mbs; i++) {
1121: n = ii[i + 1] - ii[i];
1122: idx = ij + ii[i];
1124: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1125: for (j = 0; j < n; j++) {
1126: xb = x + 12 * (idx[j]);
1127: x1 = xb[0];
1128: x2 = xb[1];
1129: x3 = xb[2];
1130: x4 = xb[3];
1132: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1133: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1134: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1135: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1136: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1137: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1138: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1139: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1140: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1141: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1142: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1143: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1144: v += 48;
1146: x1 = xb[4];
1147: x2 = xb[5];
1148: x3 = xb[6];
1149: x4 = xb[7];
1151: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1152: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1153: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1154: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1155: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1156: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1157: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1158: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1159: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1160: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1161: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1162: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1163: v += 48;
1165: x1 = xb[8];
1166: x2 = xb[9];
1167: x3 = xb[10];
1168: x4 = xb[11];
1169: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1170: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1171: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1172: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1173: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1174: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1175: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1176: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1177: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1178: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1179: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1180: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1181: v += 48;
1182: }
1183: if (usecprow) z = zarray + 12 * ridx[i];
1184: z[0] = sum1;
1185: z[1] = sum2;
1186: z[2] = sum3;
1187: z[3] = sum4;
1188: z[4] = sum5;
1189: z[5] = sum6;
1190: z[6] = sum7;
1191: z[7] = sum8;
1192: z[8] = sum9;
1193: z[9] = sum10;
1194: z[10] = sum11;
1195: z[11] = sum12;
1196: if (!usecprow) z += 12;
1197: }
1198: VecRestoreArrayRead(xx, &x);
1199: VecRestoreArrayWrite(zz, &zarray);
1200: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1201: return 0;
1202: }
1204: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1205: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1206: {
1207: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1208: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1209: const PetscScalar *x, *xb;
1210: PetscScalar x1, x2, x3, x4, *zarray, *yarray;
1211: const MatScalar *v;
1212: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1213: PetscInt mbs = a->mbs, i, j, n;
1214: PetscBool usecprow = a->compressedrow.use;
1216: VecGetArrayRead(xx, &x);
1217: VecGetArrayPair(yy, zz, &yarray, &zarray);
1219: v = a->a;
1220: if (usecprow) {
1221: if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1222: mbs = a->compressedrow.nrows;
1223: ii = a->compressedrow.i;
1224: ridx = a->compressedrow.rindex;
1225: } else {
1226: ii = a->i;
1227: y = yarray;
1228: z = zarray;
1229: }
1231: for (i = 0; i < mbs; i++) {
1232: n = ii[i + 1] - ii[i];
1233: idx = ij + ii[i];
1235: if (usecprow) {
1236: y = yarray + 12 * ridx[i];
1237: z = zarray + 12 * ridx[i];
1238: }
1239: sum1 = y[0];
1240: sum2 = y[1];
1241: sum3 = y[2];
1242: sum4 = y[3];
1243: sum5 = y[4];
1244: sum6 = y[5];
1245: sum7 = y[6];
1246: sum8 = y[7];
1247: sum9 = y[8];
1248: sum10 = y[9];
1249: sum11 = y[10];
1250: sum12 = y[11];
1252: for (j = 0; j < n; j++) {
1253: xb = x + 12 * (idx[j]);
1254: x1 = xb[0];
1255: x2 = xb[1];
1256: x3 = xb[2];
1257: x4 = xb[3];
1259: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1260: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1261: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1262: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1263: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1264: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1265: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1266: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1267: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1268: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1269: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1270: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1271: v += 48;
1273: x1 = xb[4];
1274: x2 = xb[5];
1275: x3 = xb[6];
1276: x4 = xb[7];
1278: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1279: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1280: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1281: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1282: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1283: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1284: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1285: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1286: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1287: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1288: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1289: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1290: v += 48;
1292: x1 = xb[8];
1293: x2 = xb[9];
1294: x3 = xb[10];
1295: x4 = xb[11];
1296: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1297: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1298: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1299: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1300: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1301: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1302: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1303: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1304: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1305: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1306: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1307: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1308: v += 48;
1309: }
1310: z[0] = sum1;
1311: z[1] = sum2;
1312: z[2] = sum3;
1313: z[3] = sum4;
1314: z[4] = sum5;
1315: z[5] = sum6;
1316: z[6] = sum7;
1317: z[7] = sum8;
1318: z[8] = sum9;
1319: z[9] = sum10;
1320: z[10] = sum11;
1321: z[11] = sum12;
1322: if (!usecprow) {
1323: y += 12;
1324: z += 12;
1325: }
1326: }
1327: VecRestoreArrayRead(xx, &x);
1328: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1329: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1330: return 0;
1331: }
1333: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1334: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1335: {
1336: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1337: PetscScalar *z = NULL, *zarray;
1338: const PetscScalar *x, *work;
1339: const MatScalar *v = a->a;
1340: PetscInt mbs, i, j, n;
1341: const PetscInt *idx = a->j, *ii, *ridx = NULL;
1342: PetscBool usecprow = a->compressedrow.use;
1343: const PetscInt bs = 12, bs2 = 144;
1345: __m256d a0, a1, a2, a3, a4, a5;
1346: __m256d w0, w1, w2, w3;
1347: __m256d z0, z1, z2;
1349: VecGetArrayRead(xx, &x);
1350: VecGetArrayWrite(zz, &zarray);
1352: if (usecprow) {
1353: mbs = a->compressedrow.nrows;
1354: ii = a->compressedrow.i;
1355: ridx = a->compressedrow.rindex;
1356: PetscArrayzero(zarray, bs * a->mbs);
1357: } else {
1358: mbs = a->mbs;
1359: ii = a->i;
1360: z = zarray;
1361: }
1363: for (i = 0; i < mbs; i++) {
1364: z0 = _mm256_setzero_pd();
1365: z1 = _mm256_setzero_pd();
1366: z2 = _mm256_setzero_pd();
1368: n = ii[1] - ii[0];
1369: ii++;
1370: for (j = 0; j < n; j++) {
1371: work = x + bs * (*idx++);
1373: /* first column of a */
1374: w0 = _mm256_set1_pd(work[0]);
1375: a0 = _mm256_loadu_pd(v + 0);
1376: z0 = _mm256_fmadd_pd(a0, w0, z0);
1377: a1 = _mm256_loadu_pd(v + 4);
1378: z1 = _mm256_fmadd_pd(a1, w0, z1);
1379: a2 = _mm256_loadu_pd(v + 8);
1380: z2 = _mm256_fmadd_pd(a2, w0, z2);
1382: /* second column of a */
1383: w1 = _mm256_set1_pd(work[1]);
1384: a3 = _mm256_loadu_pd(v + 12);
1385: z0 = _mm256_fmadd_pd(a3, w1, z0);
1386: a4 = _mm256_loadu_pd(v + 16);
1387: z1 = _mm256_fmadd_pd(a4, w1, z1);
1388: a5 = _mm256_loadu_pd(v + 20);
1389: z2 = _mm256_fmadd_pd(a5, w1, z2);
1391: /* third column of a */
1392: w2 = _mm256_set1_pd(work[2]);
1393: a0 = _mm256_loadu_pd(v + 24);
1394: z0 = _mm256_fmadd_pd(a0, w2, z0);
1395: a1 = _mm256_loadu_pd(v + 28);
1396: z1 = _mm256_fmadd_pd(a1, w2, z1);
1397: a2 = _mm256_loadu_pd(v + 32);
1398: z2 = _mm256_fmadd_pd(a2, w2, z2);
1400: /* fourth column of a */
1401: w3 = _mm256_set1_pd(work[3]);
1402: a3 = _mm256_loadu_pd(v + 36);
1403: z0 = _mm256_fmadd_pd(a3, w3, z0);
1404: a4 = _mm256_loadu_pd(v + 40);
1405: z1 = _mm256_fmadd_pd(a4, w3, z1);
1406: a5 = _mm256_loadu_pd(v + 44);
1407: z2 = _mm256_fmadd_pd(a5, w3, z2);
1409: /* fifth column of a */
1410: w0 = _mm256_set1_pd(work[4]);
1411: a0 = _mm256_loadu_pd(v + 48);
1412: z0 = _mm256_fmadd_pd(a0, w0, z0);
1413: a1 = _mm256_loadu_pd(v + 52);
1414: z1 = _mm256_fmadd_pd(a1, w0, z1);
1415: a2 = _mm256_loadu_pd(v + 56);
1416: z2 = _mm256_fmadd_pd(a2, w0, z2);
1418: /* sixth column of a */
1419: w1 = _mm256_set1_pd(work[5]);
1420: a3 = _mm256_loadu_pd(v + 60);
1421: z0 = _mm256_fmadd_pd(a3, w1, z0);
1422: a4 = _mm256_loadu_pd(v + 64);
1423: z1 = _mm256_fmadd_pd(a4, w1, z1);
1424: a5 = _mm256_loadu_pd(v + 68);
1425: z2 = _mm256_fmadd_pd(a5, w1, z2);
1427: /* seventh column of a */
1428: w2 = _mm256_set1_pd(work[6]);
1429: a0 = _mm256_loadu_pd(v + 72);
1430: z0 = _mm256_fmadd_pd(a0, w2, z0);
1431: a1 = _mm256_loadu_pd(v + 76);
1432: z1 = _mm256_fmadd_pd(a1, w2, z1);
1433: a2 = _mm256_loadu_pd(v + 80);
1434: z2 = _mm256_fmadd_pd(a2, w2, z2);
1436: /* eighth column of a */
1437: w3 = _mm256_set1_pd(work[7]);
1438: a3 = _mm256_loadu_pd(v + 84);
1439: z0 = _mm256_fmadd_pd(a3, w3, z0);
1440: a4 = _mm256_loadu_pd(v + 88);
1441: z1 = _mm256_fmadd_pd(a4, w3, z1);
1442: a5 = _mm256_loadu_pd(v + 92);
1443: z2 = _mm256_fmadd_pd(a5, w3, z2);
1445: /* ninth column of a */
1446: w0 = _mm256_set1_pd(work[8]);
1447: a0 = _mm256_loadu_pd(v + 96);
1448: z0 = _mm256_fmadd_pd(a0, w0, z0);
1449: a1 = _mm256_loadu_pd(v + 100);
1450: z1 = _mm256_fmadd_pd(a1, w0, z1);
1451: a2 = _mm256_loadu_pd(v + 104);
1452: z2 = _mm256_fmadd_pd(a2, w0, z2);
1454: /* tenth column of a */
1455: w1 = _mm256_set1_pd(work[9]);
1456: a3 = _mm256_loadu_pd(v + 108);
1457: z0 = _mm256_fmadd_pd(a3, w1, z0);
1458: a4 = _mm256_loadu_pd(v + 112);
1459: z1 = _mm256_fmadd_pd(a4, w1, z1);
1460: a5 = _mm256_loadu_pd(v + 116);
1461: z2 = _mm256_fmadd_pd(a5, w1, z2);
1463: /* eleventh column of a */
1464: w2 = _mm256_set1_pd(work[10]);
1465: a0 = _mm256_loadu_pd(v + 120);
1466: z0 = _mm256_fmadd_pd(a0, w2, z0);
1467: a1 = _mm256_loadu_pd(v + 124);
1468: z1 = _mm256_fmadd_pd(a1, w2, z1);
1469: a2 = _mm256_loadu_pd(v + 128);
1470: z2 = _mm256_fmadd_pd(a2, w2, z2);
1472: /* twelveth column of a */
1473: w3 = _mm256_set1_pd(work[11]);
1474: a3 = _mm256_loadu_pd(v + 132);
1475: z0 = _mm256_fmadd_pd(a3, w3, z0);
1476: a4 = _mm256_loadu_pd(v + 136);
1477: z1 = _mm256_fmadd_pd(a4, w3, z1);
1478: a5 = _mm256_loadu_pd(v + 140);
1479: z2 = _mm256_fmadd_pd(a5, w3, z2);
1481: v += bs2;
1482: }
1483: if (usecprow) z = zarray + bs * ridx[i];
1484: _mm256_storeu_pd(&z[0], z0);
1485: _mm256_storeu_pd(&z[4], z1);
1486: _mm256_storeu_pd(&z[8], z2);
1487: if (!usecprow) z += bs;
1488: }
1489: VecRestoreArrayRead(xx, &x);
1490: VecRestoreArrayWrite(zz, &zarray);
1491: PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
1492: return 0;
1493: }
1494: #endif
1496: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1497: /* Default MatMult for block size 15 */
1498: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1499: {
1500: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1501: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1502: const PetscScalar *x, *xb;
1503: PetscScalar *zarray, xv;
1504: const MatScalar *v;
1505: const PetscInt *ii, *ij = a->j, *idx;
1506: PetscInt mbs, i, j, k, n, *ridx = NULL;
1507: PetscBool usecprow = a->compressedrow.use;
1509: VecGetArrayRead(xx, &x);
1510: VecGetArrayWrite(zz, &zarray);
1512: v = a->a;
1513: if (usecprow) {
1514: mbs = a->compressedrow.nrows;
1515: ii = a->compressedrow.i;
1516: ridx = a->compressedrow.rindex;
1517: PetscArrayzero(zarray, 15 * a->mbs);
1518: } else {
1519: mbs = a->mbs;
1520: ii = a->i;
1521: z = zarray;
1522: }
1524: for (i = 0; i < mbs; i++) {
1525: n = ii[i + 1] - ii[i];
1526: idx = ij + ii[i];
1527: sum1 = 0.0;
1528: sum2 = 0.0;
1529: sum3 = 0.0;
1530: sum4 = 0.0;
1531: sum5 = 0.0;
1532: sum6 = 0.0;
1533: sum7 = 0.0;
1534: sum8 = 0.0;
1535: sum9 = 0.0;
1536: sum10 = 0.0;
1537: sum11 = 0.0;
1538: sum12 = 0.0;
1539: sum13 = 0.0;
1540: sum14 = 0.0;
1541: sum15 = 0.0;
1543: for (j = 0; j < n; j++) {
1544: xb = x + 15 * (idx[j]);
1546: for (k = 0; k < 15; k++) {
1547: xv = xb[k];
1548: sum1 += v[0] * xv;
1549: sum2 += v[1] * xv;
1550: sum3 += v[2] * xv;
1551: sum4 += v[3] * xv;
1552: sum5 += v[4] * xv;
1553: sum6 += v[5] * xv;
1554: sum7 += v[6] * xv;
1555: sum8 += v[7] * xv;
1556: sum9 += v[8] * xv;
1557: sum10 += v[9] * xv;
1558: sum11 += v[10] * xv;
1559: sum12 += v[11] * xv;
1560: sum13 += v[12] * xv;
1561: sum14 += v[13] * xv;
1562: sum15 += v[14] * xv;
1563: v += 15;
1564: }
1565: }
1566: if (usecprow) z = zarray + 15 * ridx[i];
1567: z[0] = sum1;
1568: z[1] = sum2;
1569: z[2] = sum3;
1570: z[3] = sum4;
1571: z[4] = sum5;
1572: z[5] = sum6;
1573: z[6] = sum7;
1574: z[7] = sum8;
1575: z[8] = sum9;
1576: z[9] = sum10;
1577: z[10] = sum11;
1578: z[11] = sum12;
1579: z[12] = sum13;
1580: z[13] = sum14;
1581: z[14] = sum15;
1583: if (!usecprow) z += 15;
1584: }
1586: VecRestoreArrayRead(xx, &x);
1587: VecRestoreArrayWrite(zz, &zarray);
1588: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1589: return 0;
1590: }
1592: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1593: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1594: {
1595: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1596: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1597: const PetscScalar *x, *xb;
1598: PetscScalar x1, x2, x3, x4, *zarray;
1599: const MatScalar *v;
1600: const PetscInt *ii, *ij = a->j, *idx;
1601: PetscInt mbs, i, j, n, *ridx = NULL;
1602: PetscBool usecprow = a->compressedrow.use;
1604: VecGetArrayRead(xx, &x);
1605: VecGetArrayWrite(zz, &zarray);
1607: v = a->a;
1608: if (usecprow) {
1609: mbs = a->compressedrow.nrows;
1610: ii = a->compressedrow.i;
1611: ridx = a->compressedrow.rindex;
1612: PetscArrayzero(zarray, 15 * a->mbs);
1613: } else {
1614: mbs = a->mbs;
1615: ii = a->i;
1616: z = zarray;
1617: }
1619: for (i = 0; i < mbs; i++) {
1620: n = ii[i + 1] - ii[i];
1621: idx = ij + ii[i];
1622: sum1 = 0.0;
1623: sum2 = 0.0;
1624: sum3 = 0.0;
1625: sum4 = 0.0;
1626: sum5 = 0.0;
1627: sum6 = 0.0;
1628: sum7 = 0.0;
1629: sum8 = 0.0;
1630: sum9 = 0.0;
1631: sum10 = 0.0;
1632: sum11 = 0.0;
1633: sum12 = 0.0;
1634: sum13 = 0.0;
1635: sum14 = 0.0;
1636: sum15 = 0.0;
1638: for (j = 0; j < n; j++) {
1639: xb = x + 15 * (idx[j]);
1640: x1 = xb[0];
1641: x2 = xb[1];
1642: x3 = xb[2];
1643: x4 = xb[3];
1645: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1646: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1647: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1648: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1649: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1650: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1651: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1652: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1653: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1654: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1655: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1656: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1657: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1658: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1659: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1661: v += 60;
1663: x1 = xb[4];
1664: x2 = xb[5];
1665: x3 = xb[6];
1666: x4 = xb[7];
1668: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1669: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1670: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1671: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1672: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1673: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1674: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1675: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1676: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1677: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1678: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1679: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1680: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1681: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1682: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1683: v += 60;
1685: x1 = xb[8];
1686: x2 = xb[9];
1687: x3 = xb[10];
1688: x4 = xb[11];
1689: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1690: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1691: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1692: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1693: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1694: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1695: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1696: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1697: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1698: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1699: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1700: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1701: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1702: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1703: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1704: v += 60;
1706: x1 = xb[12];
1707: x2 = xb[13];
1708: x3 = xb[14];
1709: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1710: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1711: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1712: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1713: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1714: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1715: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1716: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1717: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1718: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1719: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1720: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1721: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1722: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1723: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1724: v += 45;
1725: }
1726: if (usecprow) z = zarray + 15 * ridx[i];
1727: z[0] = sum1;
1728: z[1] = sum2;
1729: z[2] = sum3;
1730: z[3] = sum4;
1731: z[4] = sum5;
1732: z[5] = sum6;
1733: z[6] = sum7;
1734: z[7] = sum8;
1735: z[8] = sum9;
1736: z[9] = sum10;
1737: z[10] = sum11;
1738: z[11] = sum12;
1739: z[12] = sum13;
1740: z[13] = sum14;
1741: z[14] = sum15;
1743: if (!usecprow) z += 15;
1744: }
1746: VecRestoreArrayRead(xx, &x);
1747: VecRestoreArrayWrite(zz, &zarray);
1748: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1749: return 0;
1750: }
1752: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1753: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1754: {
1755: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1756: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1757: const PetscScalar *x, *xb;
1758: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1759: const MatScalar *v;
1760: const PetscInt *ii, *ij = a->j, *idx;
1761: PetscInt mbs, i, j, n, *ridx = NULL;
1762: PetscBool usecprow = a->compressedrow.use;
1764: VecGetArrayRead(xx, &x);
1765: VecGetArrayWrite(zz, &zarray);
1767: v = a->a;
1768: if (usecprow) {
1769: mbs = a->compressedrow.nrows;
1770: ii = a->compressedrow.i;
1771: ridx = a->compressedrow.rindex;
1772: PetscArrayzero(zarray, 15 * a->mbs);
1773: } else {
1774: mbs = a->mbs;
1775: ii = a->i;
1776: z = zarray;
1777: }
1779: for (i = 0; i < mbs; i++) {
1780: n = ii[i + 1] - ii[i];
1781: idx = ij + ii[i];
1782: sum1 = 0.0;
1783: sum2 = 0.0;
1784: sum3 = 0.0;
1785: sum4 = 0.0;
1786: sum5 = 0.0;
1787: sum6 = 0.0;
1788: sum7 = 0.0;
1789: sum8 = 0.0;
1790: sum9 = 0.0;
1791: sum10 = 0.0;
1792: sum11 = 0.0;
1793: sum12 = 0.0;
1794: sum13 = 0.0;
1795: sum14 = 0.0;
1796: sum15 = 0.0;
1798: for (j = 0; j < n; j++) {
1799: xb = x + 15 * (idx[j]);
1800: x1 = xb[0];
1801: x2 = xb[1];
1802: x3 = xb[2];
1803: x4 = xb[3];
1804: x5 = xb[4];
1805: x6 = xb[5];
1806: x7 = xb[6];
1807: x8 = xb[7];
1809: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1810: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1811: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1812: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1813: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1814: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1815: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1816: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1817: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1818: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1819: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1820: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1821: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1822: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1823: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1824: v += 120;
1826: x1 = xb[8];
1827: x2 = xb[9];
1828: x3 = xb[10];
1829: x4 = xb[11];
1830: x5 = xb[12];
1831: x6 = xb[13];
1832: x7 = xb[14];
1834: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1835: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1836: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1837: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1838: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1839: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1840: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1841: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1842: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1843: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1844: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1845: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1846: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1847: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1848: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1849: v += 105;
1850: }
1851: if (usecprow) z = zarray + 15 * ridx[i];
1852: z[0] = sum1;
1853: z[1] = sum2;
1854: z[2] = sum3;
1855: z[3] = sum4;
1856: z[4] = sum5;
1857: z[5] = sum6;
1858: z[6] = sum7;
1859: z[7] = sum8;
1860: z[8] = sum9;
1861: z[9] = sum10;
1862: z[10] = sum11;
1863: z[11] = sum12;
1864: z[12] = sum13;
1865: z[13] = sum14;
1866: z[14] = sum15;
1868: if (!usecprow) z += 15;
1869: }
1871: VecRestoreArrayRead(xx, &x);
1872: VecRestoreArrayWrite(zz, &zarray);
1873: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1874: return 0;
1875: }
1877: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1878: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1879: {
1880: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1881: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1882: const PetscScalar *x, *xb;
1883: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1884: const MatScalar *v;
1885: const PetscInt *ii, *ij = a->j, *idx;
1886: PetscInt mbs, i, j, n, *ridx = NULL;
1887: PetscBool usecprow = a->compressedrow.use;
1889: VecGetArrayRead(xx, &x);
1890: VecGetArrayWrite(zz, &zarray);
1892: v = a->a;
1893: if (usecprow) {
1894: mbs = a->compressedrow.nrows;
1895: ii = a->compressedrow.i;
1896: ridx = a->compressedrow.rindex;
1897: PetscArrayzero(zarray, 15 * a->mbs);
1898: } else {
1899: mbs = a->mbs;
1900: ii = a->i;
1901: z = zarray;
1902: }
1904: for (i = 0; i < mbs; i++) {
1905: n = ii[i + 1] - ii[i];
1906: idx = ij + ii[i];
1907: sum1 = 0.0;
1908: sum2 = 0.0;
1909: sum3 = 0.0;
1910: sum4 = 0.0;
1911: sum5 = 0.0;
1912: sum6 = 0.0;
1913: sum7 = 0.0;
1914: sum8 = 0.0;
1915: sum9 = 0.0;
1916: sum10 = 0.0;
1917: sum11 = 0.0;
1918: sum12 = 0.0;
1919: sum13 = 0.0;
1920: sum14 = 0.0;
1921: sum15 = 0.0;
1923: for (j = 0; j < n; j++) {
1924: xb = x + 15 * (idx[j]);
1925: x1 = xb[0];
1926: x2 = xb[1];
1927: x3 = xb[2];
1928: x4 = xb[3];
1929: x5 = xb[4];
1930: x6 = xb[5];
1931: x7 = xb[6];
1932: x8 = xb[7];
1933: x9 = xb[8];
1934: x10 = xb[9];
1935: x11 = xb[10];
1936: x12 = xb[11];
1937: x13 = xb[12];
1938: x14 = xb[13];
1939: x15 = xb[14];
1941: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1942: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1943: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1944: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1945: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1946: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1947: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1948: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1949: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1950: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1951: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1952: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1953: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1954: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1955: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1956: v += 225;
1957: }
1958: if (usecprow) z = zarray + 15 * ridx[i];
1959: z[0] = sum1;
1960: z[1] = sum2;
1961: z[2] = sum3;
1962: z[3] = sum4;
1963: z[4] = sum5;
1964: z[5] = sum6;
1965: z[6] = sum7;
1966: z[7] = sum8;
1967: z[8] = sum9;
1968: z[9] = sum10;
1969: z[10] = sum11;
1970: z[11] = sum12;
1971: z[12] = sum13;
1972: z[13] = sum14;
1973: z[14] = sum15;
1975: if (!usecprow) z += 15;
1976: }
1978: VecRestoreArrayRead(xx, &x);
1979: VecRestoreArrayWrite(zz, &zarray);
1980: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1981: return 0;
1982: }
1984: /*
1985: This will not work with MatScalar == float because it calls the BLAS
1986: */
1987: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1988: {
1989: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1990: PetscScalar *z = NULL, *work, *workt, *zarray;
1991: const PetscScalar *x, *xb;
1992: const MatScalar *v;
1993: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1994: const PetscInt *idx, *ii, *ridx = NULL;
1995: PetscInt ncols, k;
1996: PetscBool usecprow = a->compressedrow.use;
1998: VecGetArrayRead(xx, &x);
1999: VecGetArrayWrite(zz, &zarray);
2001: idx = a->j;
2002: v = a->a;
2003: if (usecprow) {
2004: mbs = a->compressedrow.nrows;
2005: ii = a->compressedrow.i;
2006: ridx = a->compressedrow.rindex;
2007: PetscArrayzero(zarray, bs * a->mbs);
2008: } else {
2009: mbs = a->mbs;
2010: ii = a->i;
2011: z = zarray;
2012: }
2014: if (!a->mult_work) {
2015: k = PetscMax(A->rmap->n, A->cmap->n);
2016: PetscMalloc1(k + 1, &a->mult_work);
2017: }
2018: work = a->mult_work;
2019: for (i = 0; i < mbs; i++) {
2020: n = ii[1] - ii[0];
2021: ii++;
2022: ncols = n * bs;
2023: workt = work;
2024: for (j = 0; j < n; j++) {
2025: xb = x + bs * (*idx++);
2026: for (k = 0; k < bs; k++) workt[k] = xb[k];
2027: workt += bs;
2028: }
2029: if (usecprow) z = zarray + bs * ridx[i];
2030: PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2031: v += n * bs2;
2032: if (!usecprow) z += bs;
2033: }
2034: VecRestoreArrayRead(xx, &x);
2035: VecRestoreArrayWrite(zz, &zarray);
2036: PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
2037: return 0;
2038: }
2040: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2041: {
2042: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2043: const PetscScalar *x;
2044: PetscScalar *y, *z, sum;
2045: const MatScalar *v;
2046: PetscInt mbs = a->mbs, i, n, *ridx = NULL;
2047: const PetscInt *idx, *ii;
2048: PetscBool usecprow = a->compressedrow.use;
2050: VecGetArrayRead(xx, &x);
2051: VecGetArrayPair(yy, zz, &y, &z);
2053: idx = a->j;
2054: v = a->a;
2055: if (usecprow) {
2056: if (zz != yy) PetscArraycpy(z, y, mbs);
2057: mbs = a->compressedrow.nrows;
2058: ii = a->compressedrow.i;
2059: ridx = a->compressedrow.rindex;
2060: } else {
2061: ii = a->i;
2062: }
2064: for (i = 0; i < mbs; i++) {
2065: n = ii[1] - ii[0];
2066: ii++;
2067: if (!usecprow) {
2068: sum = y[i];
2069: } else {
2070: sum = y[ridx[i]];
2071: }
2072: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2073: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2074: PetscSparseDensePlusDot(sum, x, v, idx, n);
2075: v += n;
2076: idx += n;
2077: if (usecprow) {
2078: z[ridx[i]] = sum;
2079: } else {
2080: z[i] = sum;
2081: }
2082: }
2083: VecRestoreArrayRead(xx, &x);
2084: VecRestoreArrayPair(yy, zz, &y, &z);
2085: PetscLogFlops(2.0 * a->nz);
2086: return 0;
2087: }
2089: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2090: {
2091: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2092: PetscScalar *y = NULL, *z = NULL, sum1, sum2;
2093: const PetscScalar *x, *xb;
2094: PetscScalar x1, x2, *yarray, *zarray;
2095: const MatScalar *v;
2096: PetscInt mbs = a->mbs, i, n, j;
2097: const PetscInt *idx, *ii, *ridx = NULL;
2098: PetscBool usecprow = a->compressedrow.use;
2100: VecGetArrayRead(xx, &x);
2101: VecGetArrayPair(yy, zz, &yarray, &zarray);
2103: idx = a->j;
2104: v = a->a;
2105: if (usecprow) {
2106: if (zz != yy) PetscArraycpy(zarray, yarray, 2 * mbs);
2107: mbs = a->compressedrow.nrows;
2108: ii = a->compressedrow.i;
2109: ridx = a->compressedrow.rindex;
2110: } else {
2111: ii = a->i;
2112: y = yarray;
2113: z = zarray;
2114: }
2116: for (i = 0; i < mbs; i++) {
2117: n = ii[1] - ii[0];
2118: ii++;
2119: if (usecprow) {
2120: z = zarray + 2 * ridx[i];
2121: y = yarray + 2 * ridx[i];
2122: }
2123: sum1 = y[0];
2124: sum2 = y[1];
2125: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2126: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2127: for (j = 0; j < n; j++) {
2128: xb = x + 2 * (*idx++);
2129: x1 = xb[0];
2130: x2 = xb[1];
2132: sum1 += v[0] * x1 + v[2] * x2;
2133: sum2 += v[1] * x1 + v[3] * x2;
2134: v += 4;
2135: }
2136: z[0] = sum1;
2137: z[1] = sum2;
2138: if (!usecprow) {
2139: z += 2;
2140: y += 2;
2141: }
2142: }
2143: VecRestoreArrayRead(xx, &x);
2144: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2145: PetscLogFlops(4.0 * a->nz);
2146: return 0;
2147: }
2149: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2150: {
2151: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2152: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2153: const PetscScalar *x, *xb;
2154: const MatScalar *v;
2155: PetscInt mbs = a->mbs, i, j, n;
2156: const PetscInt *idx, *ii, *ridx = NULL;
2157: PetscBool usecprow = a->compressedrow.use;
2159: VecGetArrayRead(xx, &x);
2160: VecGetArrayPair(yy, zz, &yarray, &zarray);
2162: idx = a->j;
2163: v = a->a;
2164: if (usecprow) {
2165: if (zz != yy) PetscArraycpy(zarray, yarray, 3 * mbs);
2166: mbs = a->compressedrow.nrows;
2167: ii = a->compressedrow.i;
2168: ridx = a->compressedrow.rindex;
2169: } else {
2170: ii = a->i;
2171: y = yarray;
2172: z = zarray;
2173: }
2175: for (i = 0; i < mbs; i++) {
2176: n = ii[1] - ii[0];
2177: ii++;
2178: if (usecprow) {
2179: z = zarray + 3 * ridx[i];
2180: y = yarray + 3 * ridx[i];
2181: }
2182: sum1 = y[0];
2183: sum2 = y[1];
2184: sum3 = y[2];
2185: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2186: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2187: for (j = 0; j < n; j++) {
2188: xb = x + 3 * (*idx++);
2189: x1 = xb[0];
2190: x2 = xb[1];
2191: x3 = xb[2];
2192: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2193: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2194: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2195: v += 9;
2196: }
2197: z[0] = sum1;
2198: z[1] = sum2;
2199: z[2] = sum3;
2200: if (!usecprow) {
2201: z += 3;
2202: y += 3;
2203: }
2204: }
2205: VecRestoreArrayRead(xx, &x);
2206: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2207: PetscLogFlops(18.0 * a->nz);
2208: return 0;
2209: }
2211: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2212: {
2213: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2214: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2215: const PetscScalar *x, *xb;
2216: const MatScalar *v;
2217: PetscInt mbs = a->mbs, i, j, n;
2218: const PetscInt *idx, *ii, *ridx = NULL;
2219: PetscBool usecprow = a->compressedrow.use;
2221: VecGetArrayRead(xx, &x);
2222: VecGetArrayPair(yy, zz, &yarray, &zarray);
2224: idx = a->j;
2225: v = a->a;
2226: if (usecprow) {
2227: if (zz != yy) PetscArraycpy(zarray, yarray, 4 * mbs);
2228: mbs = a->compressedrow.nrows;
2229: ii = a->compressedrow.i;
2230: ridx = a->compressedrow.rindex;
2231: } else {
2232: ii = a->i;
2233: y = yarray;
2234: z = zarray;
2235: }
2237: for (i = 0; i < mbs; i++) {
2238: n = ii[1] - ii[0];
2239: ii++;
2240: if (usecprow) {
2241: z = zarray + 4 * ridx[i];
2242: y = yarray + 4 * ridx[i];
2243: }
2244: sum1 = y[0];
2245: sum2 = y[1];
2246: sum3 = y[2];
2247: sum4 = y[3];
2248: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2249: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2250: for (j = 0; j < n; j++) {
2251: xb = x + 4 * (*idx++);
2252: x1 = xb[0];
2253: x2 = xb[1];
2254: x3 = xb[2];
2255: x4 = xb[3];
2256: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2257: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2258: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2259: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2260: v += 16;
2261: }
2262: z[0] = sum1;
2263: z[1] = sum2;
2264: z[2] = sum3;
2265: z[3] = sum4;
2266: if (!usecprow) {
2267: z += 4;
2268: y += 4;
2269: }
2270: }
2271: VecRestoreArrayRead(xx, &x);
2272: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2273: PetscLogFlops(32.0 * a->nz);
2274: return 0;
2275: }
2277: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2278: {
2279: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2280: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2281: const PetscScalar *x, *xb;
2282: PetscScalar *yarray, *zarray;
2283: const MatScalar *v;
2284: PetscInt mbs = a->mbs, i, j, n;
2285: const PetscInt *idx, *ii, *ridx = NULL;
2286: PetscBool usecprow = a->compressedrow.use;
2288: VecGetArrayRead(xx, &x);
2289: VecGetArrayPair(yy, zz, &yarray, &zarray);
2291: idx = a->j;
2292: v = a->a;
2293: if (usecprow) {
2294: if (zz != yy) PetscArraycpy(zarray, yarray, 5 * mbs);
2295: mbs = a->compressedrow.nrows;
2296: ii = a->compressedrow.i;
2297: ridx = a->compressedrow.rindex;
2298: } else {
2299: ii = a->i;
2300: y = yarray;
2301: z = zarray;
2302: }
2304: for (i = 0; i < mbs; i++) {
2305: n = ii[1] - ii[0];
2306: ii++;
2307: if (usecprow) {
2308: z = zarray + 5 * ridx[i];
2309: y = yarray + 5 * ridx[i];
2310: }
2311: sum1 = y[0];
2312: sum2 = y[1];
2313: sum3 = y[2];
2314: sum4 = y[3];
2315: sum5 = y[4];
2316: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2317: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2318: for (j = 0; j < n; j++) {
2319: xb = x + 5 * (*idx++);
2320: x1 = xb[0];
2321: x2 = xb[1];
2322: x3 = xb[2];
2323: x4 = xb[3];
2324: x5 = xb[4];
2325: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2326: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2327: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2328: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2329: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2330: v += 25;
2331: }
2332: z[0] = sum1;
2333: z[1] = sum2;
2334: z[2] = sum3;
2335: z[3] = sum4;
2336: z[4] = sum5;
2337: if (!usecprow) {
2338: z += 5;
2339: y += 5;
2340: }
2341: }
2342: VecRestoreArrayRead(xx, &x);
2343: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2344: PetscLogFlops(50.0 * a->nz);
2345: return 0;
2346: }
2348: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2349: {
2350: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2351: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2352: const PetscScalar *x, *xb;
2353: PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2354: const MatScalar *v;
2355: PetscInt mbs = a->mbs, i, j, n;
2356: const PetscInt *idx, *ii, *ridx = NULL;
2357: PetscBool usecprow = a->compressedrow.use;
2359: VecGetArrayRead(xx, &x);
2360: VecGetArrayPair(yy, zz, &yarray, &zarray);
2362: idx = a->j;
2363: v = a->a;
2364: if (usecprow) {
2365: if (zz != yy) PetscArraycpy(zarray, yarray, 6 * mbs);
2366: mbs = a->compressedrow.nrows;
2367: ii = a->compressedrow.i;
2368: ridx = a->compressedrow.rindex;
2369: } else {
2370: ii = a->i;
2371: y = yarray;
2372: z = zarray;
2373: }
2375: for (i = 0; i < mbs; i++) {
2376: n = ii[1] - ii[0];
2377: ii++;
2378: if (usecprow) {
2379: z = zarray + 6 * ridx[i];
2380: y = yarray + 6 * ridx[i];
2381: }
2382: sum1 = y[0];
2383: sum2 = y[1];
2384: sum3 = y[2];
2385: sum4 = y[3];
2386: sum5 = y[4];
2387: sum6 = y[5];
2388: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2389: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2390: for (j = 0; j < n; j++) {
2391: xb = x + 6 * (*idx++);
2392: x1 = xb[0];
2393: x2 = xb[1];
2394: x3 = xb[2];
2395: x4 = xb[3];
2396: x5 = xb[4];
2397: x6 = xb[5];
2398: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2399: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2400: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2401: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2402: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2403: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2404: v += 36;
2405: }
2406: z[0] = sum1;
2407: z[1] = sum2;
2408: z[2] = sum3;
2409: z[3] = sum4;
2410: z[4] = sum5;
2411: z[5] = sum6;
2412: if (!usecprow) {
2413: z += 6;
2414: y += 6;
2415: }
2416: }
2417: VecRestoreArrayRead(xx, &x);
2418: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2419: PetscLogFlops(72.0 * a->nz);
2420: return 0;
2421: }
2423: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2424: {
2425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2426: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2427: const PetscScalar *x, *xb;
2428: PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2429: const MatScalar *v;
2430: PetscInt mbs = a->mbs, i, j, n;
2431: const PetscInt *idx, *ii, *ridx = NULL;
2432: PetscBool usecprow = a->compressedrow.use;
2434: VecGetArrayRead(xx, &x);
2435: VecGetArrayPair(yy, zz, &yarray, &zarray);
2437: idx = a->j;
2438: v = a->a;
2439: if (usecprow) {
2440: if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2441: mbs = a->compressedrow.nrows;
2442: ii = a->compressedrow.i;
2443: ridx = a->compressedrow.rindex;
2444: } else {
2445: ii = a->i;
2446: y = yarray;
2447: z = zarray;
2448: }
2450: for (i = 0; i < mbs; i++) {
2451: n = ii[1] - ii[0];
2452: ii++;
2453: if (usecprow) {
2454: z = zarray + 7 * ridx[i];
2455: y = yarray + 7 * ridx[i];
2456: }
2457: sum1 = y[0];
2458: sum2 = y[1];
2459: sum3 = y[2];
2460: sum4 = y[3];
2461: sum5 = y[4];
2462: sum6 = y[5];
2463: sum7 = y[6];
2464: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2465: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2466: for (j = 0; j < n; j++) {
2467: xb = x + 7 * (*idx++);
2468: x1 = xb[0];
2469: x2 = xb[1];
2470: x3 = xb[2];
2471: x4 = xb[3];
2472: x5 = xb[4];
2473: x6 = xb[5];
2474: x7 = xb[6];
2475: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2476: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2477: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2478: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2479: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2480: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2481: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2482: v += 49;
2483: }
2484: z[0] = sum1;
2485: z[1] = sum2;
2486: z[2] = sum3;
2487: z[3] = sum4;
2488: z[4] = sum5;
2489: z[5] = sum6;
2490: z[6] = sum7;
2491: if (!usecprow) {
2492: z += 7;
2493: y += 7;
2494: }
2495: }
2496: VecRestoreArrayRead(xx, &x);
2497: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2498: PetscLogFlops(98.0 * a->nz);
2499: return 0;
2500: }
2502: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2503: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2504: {
2505: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2506: PetscScalar *z = NULL, *work, *workt, *zarray;
2507: const PetscScalar *x, *xb;
2508: const MatScalar *v;
2509: PetscInt mbs, i, j, n;
2510: PetscInt k;
2511: PetscBool usecprow = a->compressedrow.use;
2512: const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2514: __m256d a0, a1, a2, a3, a4, a5;
2515: __m256d w0, w1, w2, w3;
2516: __m256d z0, z1, z2;
2517: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2519: VecCopy(yy, zz);
2520: VecGetArrayRead(xx, &x);
2521: VecGetArray(zz, &zarray);
2523: idx = a->j;
2524: v = a->a;
2525: if (usecprow) {
2526: mbs = a->compressedrow.nrows;
2527: ii = a->compressedrow.i;
2528: ridx = a->compressedrow.rindex;
2529: } else {
2530: mbs = a->mbs;
2531: ii = a->i;
2532: z = zarray;
2533: }
2535: if (!a->mult_work) {
2536: k = PetscMax(A->rmap->n, A->cmap->n);
2537: PetscMalloc1(k + 1, &a->mult_work);
2538: }
2540: work = a->mult_work;
2541: for (i = 0; i < mbs; i++) {
2542: n = ii[1] - ii[0];
2543: ii++;
2544: workt = work;
2545: for (j = 0; j < n; j++) {
2546: xb = x + bs * (*idx++);
2547: for (k = 0; k < bs; k++) workt[k] = xb[k];
2548: workt += bs;
2549: }
2550: if (usecprow) z = zarray + bs * ridx[i];
2552: z0 = _mm256_loadu_pd(&z[0]);
2553: z1 = _mm256_loadu_pd(&z[4]);
2554: z2 = _mm256_set1_pd(z[8]);
2556: for (j = 0; j < n; j++) {
2557: /* first column of a */
2558: w0 = _mm256_set1_pd(work[j * 9]);
2559: a0 = _mm256_loadu_pd(&v[j * 81]);
2560: z0 = _mm256_fmadd_pd(a0, w0, z0);
2561: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2562: z1 = _mm256_fmadd_pd(a1, w0, z1);
2563: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2564: z2 = _mm256_fmadd_pd(a2, w0, z2);
2566: /* second column of a */
2567: w1 = _mm256_set1_pd(work[j * 9 + 1]);
2568: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2569: z0 = _mm256_fmadd_pd(a0, w1, z0);
2570: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2571: z1 = _mm256_fmadd_pd(a1, w1, z1);
2572: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2573: z2 = _mm256_fmadd_pd(a2, w1, z2);
2575: /* third column of a */
2576: w2 = _mm256_set1_pd(work[j * 9 + 2]);
2577: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2578: z0 = _mm256_fmadd_pd(a3, w2, z0);
2579: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2580: z1 = _mm256_fmadd_pd(a4, w2, z1);
2581: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2582: z2 = _mm256_fmadd_pd(a5, w2, z2);
2584: /* fourth column of a */
2585: w3 = _mm256_set1_pd(work[j * 9 + 3]);
2586: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2587: z0 = _mm256_fmadd_pd(a0, w3, z0);
2588: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2589: z1 = _mm256_fmadd_pd(a1, w3, z1);
2590: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2591: z2 = _mm256_fmadd_pd(a2, w3, z2);
2593: /* fifth column of a */
2594: w0 = _mm256_set1_pd(work[j * 9 + 4]);
2595: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2596: z0 = _mm256_fmadd_pd(a3, w0, z0);
2597: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2598: z1 = _mm256_fmadd_pd(a4, w0, z1);
2599: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2600: z2 = _mm256_fmadd_pd(a5, w0, z2);
2602: /* sixth column of a */
2603: w1 = _mm256_set1_pd(work[j * 9 + 5]);
2604: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2605: z0 = _mm256_fmadd_pd(a0, w1, z0);
2606: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2607: z1 = _mm256_fmadd_pd(a1, w1, z1);
2608: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2609: z2 = _mm256_fmadd_pd(a2, w1, z2);
2611: /* seventh column of a */
2612: w2 = _mm256_set1_pd(work[j * 9 + 6]);
2613: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2614: z0 = _mm256_fmadd_pd(a0, w2, z0);
2615: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2616: z1 = _mm256_fmadd_pd(a1, w2, z1);
2617: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2618: z2 = _mm256_fmadd_pd(a2, w2, z2);
2620: /* eighth column of a */
2621: w3 = _mm256_set1_pd(work[j * 9 + 7]);
2622: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2623: z0 = _mm256_fmadd_pd(a3, w3, z0);
2624: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2625: z1 = _mm256_fmadd_pd(a4, w3, z1);
2626: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2627: z2 = _mm256_fmadd_pd(a5, w3, z2);
2629: /* ninth column of a */
2630: w0 = _mm256_set1_pd(work[j * 9 + 8]);
2631: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2632: z0 = _mm256_fmadd_pd(a0, w0, z0);
2633: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2634: z1 = _mm256_fmadd_pd(a1, w0, z1);
2635: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2636: z2 = _mm256_fmadd_pd(a2, w0, z2);
2637: }
2639: _mm256_storeu_pd(&z[0], z0);
2640: _mm256_storeu_pd(&z[4], z1);
2641: _mm256_maskstore_pd(&z[8], mask1, z2);
2643: v += n * bs2;
2644: if (!usecprow) z += bs;
2645: }
2646: VecRestoreArrayRead(xx, &x);
2647: VecRestoreArray(zz, &zarray);
2648: PetscLogFlops(162.0 * a->nz);
2649: return 0;
2650: }
2651: #endif
2653: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2654: {
2655: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2656: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2657: const PetscScalar *x, *xb;
2658: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2659: const MatScalar *v;
2660: PetscInt mbs = a->mbs, i, j, n;
2661: const PetscInt *idx, *ii, *ridx = NULL;
2662: PetscBool usecprow = a->compressedrow.use;
2664: VecGetArrayRead(xx, &x);
2665: VecGetArrayPair(yy, zz, &yarray, &zarray);
2667: idx = a->j;
2668: v = a->a;
2669: if (usecprow) {
2670: if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2671: mbs = a->compressedrow.nrows;
2672: ii = a->compressedrow.i;
2673: ridx = a->compressedrow.rindex;
2674: } else {
2675: ii = a->i;
2676: y = yarray;
2677: z = zarray;
2678: }
2680: for (i = 0; i < mbs; i++) {
2681: n = ii[1] - ii[0];
2682: ii++;
2683: if (usecprow) {
2684: z = zarray + 11 * ridx[i];
2685: y = yarray + 11 * ridx[i];
2686: }
2687: sum1 = y[0];
2688: sum2 = y[1];
2689: sum3 = y[2];
2690: sum4 = y[3];
2691: sum5 = y[4];
2692: sum6 = y[5];
2693: sum7 = y[6];
2694: sum8 = y[7];
2695: sum9 = y[8];
2696: sum10 = y[9];
2697: sum11 = y[10];
2698: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2699: PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2700: for (j = 0; j < n; j++) {
2701: xb = x + 11 * (*idx++);
2702: x1 = xb[0];
2703: x2 = xb[1];
2704: x3 = xb[2];
2705: x4 = xb[3];
2706: x5 = xb[4];
2707: x6 = xb[5];
2708: x7 = xb[6];
2709: x8 = xb[7];
2710: x9 = xb[8];
2711: x10 = xb[9];
2712: x11 = xb[10];
2713: sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2714: sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2715: sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2716: sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2717: sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2718: sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2719: sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2720: sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2721: sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2722: sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2723: sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2724: v += 121;
2725: }
2726: z[0] = sum1;
2727: z[1] = sum2;
2728: z[2] = sum3;
2729: z[3] = sum4;
2730: z[4] = sum5;
2731: z[5] = sum6;
2732: z[6] = sum7;
2733: z[7] = sum8;
2734: z[8] = sum9;
2735: z[9] = sum10;
2736: z[10] = sum11;
2737: if (!usecprow) {
2738: z += 11;
2739: y += 11;
2740: }
2741: }
2742: VecRestoreArrayRead(xx, &x);
2743: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2744: PetscLogFlops(242.0 * a->nz);
2745: return 0;
2746: }
2748: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2749: {
2750: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2751: PetscScalar *z = NULL, *work, *workt, *zarray;
2752: const PetscScalar *x, *xb;
2753: const MatScalar *v;
2754: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2755: PetscInt ncols, k;
2756: const PetscInt *ridx = NULL, *idx, *ii;
2757: PetscBool usecprow = a->compressedrow.use;
2759: VecCopy(yy, zz);
2760: VecGetArrayRead(xx, &x);
2761: VecGetArray(zz, &zarray);
2763: idx = a->j;
2764: v = a->a;
2765: if (usecprow) {
2766: mbs = a->compressedrow.nrows;
2767: ii = a->compressedrow.i;
2768: ridx = a->compressedrow.rindex;
2769: } else {
2770: mbs = a->mbs;
2771: ii = a->i;
2772: z = zarray;
2773: }
2775: if (!a->mult_work) {
2776: k = PetscMax(A->rmap->n, A->cmap->n);
2777: PetscMalloc1(k + 1, &a->mult_work);
2778: }
2779: work = a->mult_work;
2780: for (i = 0; i < mbs; i++) {
2781: n = ii[1] - ii[0];
2782: ii++;
2783: ncols = n * bs;
2784: workt = work;
2785: for (j = 0; j < n; j++) {
2786: xb = x + bs * (*idx++);
2787: for (k = 0; k < bs; k++) workt[k] = xb[k];
2788: workt += bs;
2789: }
2790: if (usecprow) z = zarray + bs * ridx[i];
2791: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2792: v += n * bs2;
2793: if (!usecprow) z += bs;
2794: }
2795: VecRestoreArrayRead(xx, &x);
2796: VecRestoreArray(zz, &zarray);
2797: PetscLogFlops(2.0 * a->nz * bs2);
2798: return 0;
2799: }
2801: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2802: {
2803: PetscScalar zero = 0.0;
2805: VecSet(zz, zero);
2806: MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2807: return 0;
2808: }
2810: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2811: {
2812: PetscScalar zero = 0.0;
2814: VecSet(zz, zero);
2815: MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2816: return 0;
2817: }
2819: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2820: {
2821: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2822: PetscScalar *z, x1, x2, x3, x4, x5;
2823: const PetscScalar *x, *xb = NULL;
2824: const MatScalar *v;
2825: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n;
2826: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2827: Mat_CompressedRow cprow = a->compressedrow;
2828: PetscBool usecprow = cprow.use;
2830: if (yy != zz) VecCopy(yy, zz);
2831: VecGetArrayRead(xx, &x);
2832: VecGetArray(zz, &z);
2834: idx = a->j;
2835: v = a->a;
2836: if (usecprow) {
2837: mbs = cprow.nrows;
2838: ii = cprow.i;
2839: ridx = cprow.rindex;
2840: } else {
2841: mbs = a->mbs;
2842: ii = a->i;
2843: xb = x;
2844: }
2846: switch (bs) {
2847: case 1:
2848: for (i = 0; i < mbs; i++) {
2849: if (usecprow) xb = x + ridx[i];
2850: x1 = xb[0];
2851: ib = idx + ii[0];
2852: n = ii[1] - ii[0];
2853: ii++;
2854: for (j = 0; j < n; j++) {
2855: rval = ib[j];
2856: z[rval] += PetscConj(*v) * x1;
2857: v++;
2858: }
2859: if (!usecprow) xb++;
2860: }
2861: break;
2862: case 2:
2863: for (i = 0; i < mbs; i++) {
2864: if (usecprow) xb = x + 2 * ridx[i];
2865: x1 = xb[0];
2866: x2 = xb[1];
2867: ib = idx + ii[0];
2868: n = ii[1] - ii[0];
2869: ii++;
2870: for (j = 0; j < n; j++) {
2871: rval = ib[j] * 2;
2872: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2873: z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2874: v += 4;
2875: }
2876: if (!usecprow) xb += 2;
2877: }
2878: break;
2879: case 3:
2880: for (i = 0; i < mbs; i++) {
2881: if (usecprow) xb = x + 3 * ridx[i];
2882: x1 = xb[0];
2883: x2 = xb[1];
2884: x3 = xb[2];
2885: ib = idx + ii[0];
2886: n = ii[1] - ii[0];
2887: ii++;
2888: for (j = 0; j < n; j++) {
2889: rval = ib[j] * 3;
2890: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2891: z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2892: z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2893: v += 9;
2894: }
2895: if (!usecprow) xb += 3;
2896: }
2897: break;
2898: case 4:
2899: for (i = 0; i < mbs; i++) {
2900: if (usecprow) xb = x + 4 * ridx[i];
2901: x1 = xb[0];
2902: x2 = xb[1];
2903: x3 = xb[2];
2904: x4 = xb[3];
2905: ib = idx + ii[0];
2906: n = ii[1] - ii[0];
2907: ii++;
2908: for (j = 0; j < n; j++) {
2909: rval = ib[j] * 4;
2910: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2911: z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2912: z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2913: z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2914: v += 16;
2915: }
2916: if (!usecprow) xb += 4;
2917: }
2918: break;
2919: case 5:
2920: for (i = 0; i < mbs; i++) {
2921: if (usecprow) xb = x + 5 * ridx[i];
2922: x1 = xb[0];
2923: x2 = xb[1];
2924: x3 = xb[2];
2925: x4 = xb[3];
2926: x5 = xb[4];
2927: ib = idx + ii[0];
2928: n = ii[1] - ii[0];
2929: ii++;
2930: for (j = 0; j < n; j++) {
2931: rval = ib[j] * 5;
2932: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2933: z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2934: z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2935: z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2936: z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2937: v += 25;
2938: }
2939: if (!usecprow) xb += 5;
2940: }
2941: break;
2942: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2943: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2944: #if 0
2945: {
2946: PetscInt ncols,k,bs2=a->bs2;
2947: PetscScalar *work,*workt,zb;
2948: const PetscScalar *xtmp;
2949: if (!a->mult_work) {
2950: k = PetscMax(A->rmap->n,A->cmap->n);
2951: PetscMalloc1(k+1,&a->mult_work);
2952: }
2953: work = a->mult_work;
2954: xtmp = x;
2955: for (i=0; i<mbs; i++) {
2956: n = ii[1] - ii[0]; ii++;
2957: ncols = n*bs;
2958: PetscArrayzero(work,ncols);
2959: if (usecprow) xtmp = x + bs*ridx[i];
2960: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2961: v += n*bs2;
2962: if (!usecprow) xtmp += bs;
2963: workt = work;
2964: for (j=0; j<n; j++) {
2965: zb = z + bs*(*idx++);
2966: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2967: workt += bs;
2968: }
2969: }
2970: }
2971: #endif
2972: }
2973: VecRestoreArrayRead(xx, &x);
2974: VecRestoreArray(zz, &z);
2975: PetscLogFlops(2.0 * a->nz * a->bs2);
2976: return 0;
2977: }
2979: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2980: {
2981: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2982: PetscScalar *zb, *z, x1, x2, x3, x4, x5;
2983: const PetscScalar *x, *xb = NULL;
2984: const MatScalar *v;
2985: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2986: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2987: Mat_CompressedRow cprow = a->compressedrow;
2988: PetscBool usecprow = cprow.use;
2990: if (yy != zz) VecCopy(yy, zz);
2991: VecGetArrayRead(xx, &x);
2992: VecGetArray(zz, &z);
2994: idx = a->j;
2995: v = a->a;
2996: if (usecprow) {
2997: mbs = cprow.nrows;
2998: ii = cprow.i;
2999: ridx = cprow.rindex;
3000: } else {
3001: mbs = a->mbs;
3002: ii = a->i;
3003: xb = x;
3004: }
3006: switch (bs) {
3007: case 1:
3008: for (i = 0; i < mbs; i++) {
3009: if (usecprow) xb = x + ridx[i];
3010: x1 = xb[0];
3011: ib = idx + ii[0];
3012: n = ii[1] - ii[0];
3013: ii++;
3014: for (j = 0; j < n; j++) {
3015: rval = ib[j];
3016: z[rval] += *v * x1;
3017: v++;
3018: }
3019: if (!usecprow) xb++;
3020: }
3021: break;
3022: case 2:
3023: for (i = 0; i < mbs; i++) {
3024: if (usecprow) xb = x + 2 * ridx[i];
3025: x1 = xb[0];
3026: x2 = xb[1];
3027: ib = idx + ii[0];
3028: n = ii[1] - ii[0];
3029: ii++;
3030: for (j = 0; j < n; j++) {
3031: rval = ib[j] * 2;
3032: z[rval++] += v[0] * x1 + v[1] * x2;
3033: z[rval++] += v[2] * x1 + v[3] * x2;
3034: v += 4;
3035: }
3036: if (!usecprow) xb += 2;
3037: }
3038: break;
3039: case 3:
3040: for (i = 0; i < mbs; i++) {
3041: if (usecprow) xb = x + 3 * ridx[i];
3042: x1 = xb[0];
3043: x2 = xb[1];
3044: x3 = xb[2];
3045: ib = idx + ii[0];
3046: n = ii[1] - ii[0];
3047: ii++;
3048: for (j = 0; j < n; j++) {
3049: rval = ib[j] * 3;
3050: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3051: z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3052: z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3053: v += 9;
3054: }
3055: if (!usecprow) xb += 3;
3056: }
3057: break;
3058: case 4:
3059: for (i = 0; i < mbs; i++) {
3060: if (usecprow) xb = x + 4 * ridx[i];
3061: x1 = xb[0];
3062: x2 = xb[1];
3063: x3 = xb[2];
3064: x4 = xb[3];
3065: ib = idx + ii[0];
3066: n = ii[1] - ii[0];
3067: ii++;
3068: for (j = 0; j < n; j++) {
3069: rval = ib[j] * 4;
3070: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3071: z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3072: z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3073: z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3074: v += 16;
3075: }
3076: if (!usecprow) xb += 4;
3077: }
3078: break;
3079: case 5:
3080: for (i = 0; i < mbs; i++) {
3081: if (usecprow) xb = x + 5 * ridx[i];
3082: x1 = xb[0];
3083: x2 = xb[1];
3084: x3 = xb[2];
3085: x4 = xb[3];
3086: x5 = xb[4];
3087: ib = idx + ii[0];
3088: n = ii[1] - ii[0];
3089: ii++;
3090: for (j = 0; j < n; j++) {
3091: rval = ib[j] * 5;
3092: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3093: z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3094: z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3095: z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3096: z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3097: v += 25;
3098: }
3099: if (!usecprow) xb += 5;
3100: }
3101: break;
3102: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3103: PetscInt ncols, k;
3104: PetscScalar *work, *workt;
3105: const PetscScalar *xtmp;
3106: if (!a->mult_work) {
3107: k = PetscMax(A->rmap->n, A->cmap->n);
3108: PetscMalloc1(k + 1, &a->mult_work);
3109: }
3110: work = a->mult_work;
3111: xtmp = x;
3112: for (i = 0; i < mbs; i++) {
3113: n = ii[1] - ii[0];
3114: ii++;
3115: ncols = n * bs;
3116: PetscArrayzero(work, ncols);
3117: if (usecprow) xtmp = x + bs * ridx[i];
3118: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3119: v += n * bs2;
3120: if (!usecprow) xtmp += bs;
3121: workt = work;
3122: for (j = 0; j < n; j++) {
3123: zb = z + bs * (*idx++);
3124: for (k = 0; k < bs; k++) zb[k] += workt[k];
3125: workt += bs;
3126: }
3127: }
3128: }
3129: }
3130: VecRestoreArrayRead(xx, &x);
3131: VecRestoreArray(zz, &z);
3132: PetscLogFlops(2.0 * a->nz * a->bs2);
3133: return 0;
3134: }
3136: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3137: {
3138: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3139: PetscInt totalnz = a->bs2 * a->nz;
3140: PetscScalar oalpha = alpha;
3141: PetscBLASInt one = 1, tnz;
3143: PetscBLASIntCast(totalnz, &tnz);
3144: PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3145: PetscLogFlops(totalnz);
3146: return 0;
3147: }
3149: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3150: {
3151: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3152: MatScalar *v = a->a;
3153: PetscReal sum = 0.0;
3154: PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3156: if (type == NORM_FROBENIUS) {
3157: #if defined(PETSC_USE_REAL___FP16)
3158: PetscBLASInt one = 1, cnt = bs2 * nz;
3159: PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3160: #else
3161: for (i = 0; i < bs2 * nz; i++) {
3162: sum += PetscRealPart(PetscConj(*v) * (*v));
3163: v++;
3164: }
3165: #endif
3166: *norm = PetscSqrtReal(sum);
3167: PetscLogFlops(2.0 * bs2 * nz);
3168: } else if (type == NORM_1) { /* maximum column sum */
3169: PetscReal *tmp;
3170: PetscInt *bcol = a->j;
3171: PetscCalloc1(A->cmap->n + 1, &tmp);
3172: for (i = 0; i < nz; i++) {
3173: for (j = 0; j < bs; j++) {
3174: k1 = bs * (*bcol) + j; /* column index */
3175: for (k = 0; k < bs; k++) {
3176: tmp[k1] += PetscAbsScalar(*v);
3177: v++;
3178: }
3179: }
3180: bcol++;
3181: }
3182: *norm = 0.0;
3183: for (j = 0; j < A->cmap->n; j++) {
3184: if (tmp[j] > *norm) *norm = tmp[j];
3185: }
3186: PetscFree(tmp);
3187: PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3188: } else if (type == NORM_INFINITY) { /* maximum row sum */
3189: *norm = 0.0;
3190: for (k = 0; k < bs; k++) {
3191: for (j = 0; j < a->mbs; j++) {
3192: v = a->a + bs2 * a->i[j] + k;
3193: sum = 0.0;
3194: for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3195: for (k1 = 0; k1 < bs; k1++) {
3196: sum += PetscAbsScalar(*v);
3197: v += bs;
3198: }
3199: }
3200: if (sum > *norm) *norm = sum;
3201: }
3202: }
3203: PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3204: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3205: return 0;
3206: }
3208: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3209: {
3210: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3212: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
3213: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3214: *flg = PETSC_FALSE;
3215: return 0;
3216: }
3218: /* if the a->i are the same */
3219: PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
3220: if (!*flg) return 0;
3222: /* if a->j are the same */
3223: PetscArraycmp(a->j, b->j, a->nz, flg);
3224: if (!*flg) return 0;
3226: /* if a->a are the same */
3227: PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg);
3228: return 0;
3229: }
3231: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3232: {
3233: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3234: PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3235: PetscScalar *x, zero = 0.0;
3236: MatScalar *aa, *aa_j;
3239: bs = A->rmap->bs;
3240: aa = a->a;
3241: ai = a->i;
3242: aj = a->j;
3243: ambs = a->mbs;
3244: bs2 = a->bs2;
3246: VecSet(v, zero);
3247: VecGetArray(v, &x);
3248: VecGetLocalSize(v, &n);
3250: for (i = 0; i < ambs; i++) {
3251: for (j = ai[i]; j < ai[i + 1]; j++) {
3252: if (aj[j] == i) {
3253: row = i * bs;
3254: aa_j = aa + j * bs2;
3255: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3256: break;
3257: }
3258: }
3259: }
3260: VecRestoreArray(v, &x);
3261: return 0;
3262: }
3264: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3265: {
3266: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3267: const PetscScalar *l, *r, *li, *ri;
3268: PetscScalar x;
3269: MatScalar *aa, *v;
3270: PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3271: const PetscInt *ai, *aj;
3273: ai = a->i;
3274: aj = a->j;
3275: aa = a->a;
3276: m = A->rmap->n;
3277: n = A->cmap->n;
3278: bs = A->rmap->bs;
3279: mbs = a->mbs;
3280: bs2 = a->bs2;
3281: if (ll) {
3282: VecGetArrayRead(ll, &l);
3283: VecGetLocalSize(ll, &lm);
3285: for (i = 0; i < mbs; i++) { /* for each block row */
3286: M = ai[i + 1] - ai[i];
3287: li = l + i * bs;
3288: v = aa + bs2 * ai[i];
3289: for (j = 0; j < M; j++) { /* for each block */
3290: for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3291: }
3292: }
3293: VecRestoreArrayRead(ll, &l);
3294: PetscLogFlops(a->nz);
3295: }
3297: if (rr) {
3298: VecGetArrayRead(rr, &r);
3299: VecGetLocalSize(rr, &rn);
3301: for (i = 0; i < mbs; i++) { /* for each block row */
3302: iai = ai[i];
3303: M = ai[i + 1] - iai;
3304: v = aa + bs2 * iai;
3305: for (j = 0; j < M; j++) { /* for each block */
3306: ri = r + bs * aj[iai + j];
3307: for (k = 0; k < bs; k++) {
3308: x = ri[k];
3309: for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3310: v += bs;
3311: }
3312: }
3313: }
3314: VecRestoreArrayRead(rr, &r);
3315: PetscLogFlops(a->nz);
3316: }
3317: return 0;
3318: }
3320: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3321: {
3322: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3324: info->block_size = a->bs2;
3325: info->nz_allocated = a->bs2 * a->maxnz;
3326: info->nz_used = a->bs2 * a->nz;
3327: info->nz_unneeded = info->nz_allocated - info->nz_used;
3328: info->assemblies = A->num_ass;
3329: info->mallocs = A->info.mallocs;
3330: info->memory = 0; /* REVIEW ME */
3331: if (A->factortype) {
3332: info->fill_ratio_given = A->info.fill_ratio_given;
3333: info->fill_ratio_needed = A->info.fill_ratio_needed;
3334: info->factor_mallocs = A->info.factor_mallocs;
3335: } else {
3336: info->fill_ratio_given = 0;
3337: info->fill_ratio_needed = 0;
3338: info->factor_mallocs = 0;
3339: }
3340: return 0;
3341: }
3343: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3344: {
3345: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3347: PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
3348: return 0;
3349: }
3351: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3352: {
3353: MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
3354: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3355: return 0;
3356: }
3358: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3359: {
3360: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3361: PetscScalar *z = NULL, sum1;
3362: const PetscScalar *xb;
3363: PetscScalar x1;
3364: const MatScalar *v, *vv;
3365: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3366: PetscBool usecprow = a->compressedrow.use;
3368: idx = a->j;
3369: v = a->a;
3370: if (usecprow) {
3371: mbs = a->compressedrow.nrows;
3372: ii = a->compressedrow.i;
3373: ridx = a->compressedrow.rindex;
3374: } else {
3375: mbs = a->mbs;
3376: ii = a->i;
3377: z = c;
3378: }
3380: for (i = 0; i < mbs; i++) {
3381: n = ii[1] - ii[0];
3382: ii++;
3383: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3384: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3385: if (usecprow) z = c + ridx[i];
3386: jj = idx;
3387: vv = v;
3388: for (k = 0; k < cn; k++) {
3389: idx = jj;
3390: v = vv;
3391: sum1 = 0.0;
3392: for (j = 0; j < n; j++) {
3393: xb = b + (*idx++);
3394: x1 = xb[0 + k * bm];
3395: sum1 += v[0] * x1;
3396: v += 1;
3397: }
3398: z[0 + k * cm] = sum1;
3399: }
3400: if (!usecprow) z += 1;
3401: }
3402: return 0;
3403: }
3405: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3406: {
3407: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3408: PetscScalar *z = NULL, sum1, sum2;
3409: const PetscScalar *xb;
3410: PetscScalar x1, x2;
3411: const MatScalar *v, *vv;
3412: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3413: PetscBool usecprow = a->compressedrow.use;
3415: idx = a->j;
3416: v = a->a;
3417: if (usecprow) {
3418: mbs = a->compressedrow.nrows;
3419: ii = a->compressedrow.i;
3420: ridx = a->compressedrow.rindex;
3421: } else {
3422: mbs = a->mbs;
3423: ii = a->i;
3424: z = c;
3425: }
3427: for (i = 0; i < mbs; i++) {
3428: n = ii[1] - ii[0];
3429: ii++;
3430: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3431: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3432: if (usecprow) z = c + 2 * ridx[i];
3433: jj = idx;
3434: vv = v;
3435: for (k = 0; k < cn; k++) {
3436: idx = jj;
3437: v = vv;
3438: sum1 = 0.0;
3439: sum2 = 0.0;
3440: for (j = 0; j < n; j++) {
3441: xb = b + 2 * (*idx++);
3442: x1 = xb[0 + k * bm];
3443: x2 = xb[1 + k * bm];
3444: sum1 += v[0] * x1 + v[2] * x2;
3445: sum2 += v[1] * x1 + v[3] * x2;
3446: v += 4;
3447: }
3448: z[0 + k * cm] = sum1;
3449: z[1 + k * cm] = sum2;
3450: }
3451: if (!usecprow) z += 2;
3452: }
3453: return 0;
3454: }
3456: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3457: {
3458: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3459: PetscScalar *z = NULL, sum1, sum2, sum3;
3460: const PetscScalar *xb;
3461: PetscScalar x1, x2, x3;
3462: const MatScalar *v, *vv;
3463: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3464: PetscBool usecprow = a->compressedrow.use;
3466: idx = a->j;
3467: v = a->a;
3468: if (usecprow) {
3469: mbs = a->compressedrow.nrows;
3470: ii = a->compressedrow.i;
3471: ridx = a->compressedrow.rindex;
3472: } else {
3473: mbs = a->mbs;
3474: ii = a->i;
3475: z = c;
3476: }
3478: for (i = 0; i < mbs; i++) {
3479: n = ii[1] - ii[0];
3480: ii++;
3481: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3482: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3483: if (usecprow) z = c + 3 * ridx[i];
3484: jj = idx;
3485: vv = v;
3486: for (k = 0; k < cn; k++) {
3487: idx = jj;
3488: v = vv;
3489: sum1 = 0.0;
3490: sum2 = 0.0;
3491: sum3 = 0.0;
3492: for (j = 0; j < n; j++) {
3493: xb = b + 3 * (*idx++);
3494: x1 = xb[0 + k * bm];
3495: x2 = xb[1 + k * bm];
3496: x3 = xb[2 + k * bm];
3497: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3498: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3499: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3500: v += 9;
3501: }
3502: z[0 + k * cm] = sum1;
3503: z[1 + k * cm] = sum2;
3504: z[2 + k * cm] = sum3;
3505: }
3506: if (!usecprow) z += 3;
3507: }
3508: return 0;
3509: }
3511: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3512: {
3513: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3514: PetscScalar *z = NULL, sum1, sum2, sum3, sum4;
3515: const PetscScalar *xb;
3516: PetscScalar x1, x2, x3, x4;
3517: const MatScalar *v, *vv;
3518: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3519: PetscBool usecprow = a->compressedrow.use;
3521: idx = a->j;
3522: v = a->a;
3523: if (usecprow) {
3524: mbs = a->compressedrow.nrows;
3525: ii = a->compressedrow.i;
3526: ridx = a->compressedrow.rindex;
3527: } else {
3528: mbs = a->mbs;
3529: ii = a->i;
3530: z = c;
3531: }
3533: for (i = 0; i < mbs; i++) {
3534: n = ii[1] - ii[0];
3535: ii++;
3536: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3537: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3538: if (usecprow) z = c + 4 * ridx[i];
3539: jj = idx;
3540: vv = v;
3541: for (k = 0; k < cn; k++) {
3542: idx = jj;
3543: v = vv;
3544: sum1 = 0.0;
3545: sum2 = 0.0;
3546: sum3 = 0.0;
3547: sum4 = 0.0;
3548: for (j = 0; j < n; j++) {
3549: xb = b + 4 * (*idx++);
3550: x1 = xb[0 + k * bm];
3551: x2 = xb[1 + k * bm];
3552: x3 = xb[2 + k * bm];
3553: x4 = xb[3 + k * bm];
3554: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3555: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3556: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3557: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3558: v += 16;
3559: }
3560: z[0 + k * cm] = sum1;
3561: z[1 + k * cm] = sum2;
3562: z[2 + k * cm] = sum3;
3563: z[3 + k * cm] = sum4;
3564: }
3565: if (!usecprow) z += 4;
3566: }
3567: return 0;
3568: }
3570: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3571: {
3572: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3573: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5;
3574: const PetscScalar *xb;
3575: PetscScalar x1, x2, x3, x4, x5;
3576: const MatScalar *v, *vv;
3577: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3578: PetscBool usecprow = a->compressedrow.use;
3580: idx = a->j;
3581: v = a->a;
3582: if (usecprow) {
3583: mbs = a->compressedrow.nrows;
3584: ii = a->compressedrow.i;
3585: ridx = a->compressedrow.rindex;
3586: } else {
3587: mbs = a->mbs;
3588: ii = a->i;
3589: z = c;
3590: }
3592: for (i = 0; i < mbs; i++) {
3593: n = ii[1] - ii[0];
3594: ii++;
3595: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3596: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3597: if (usecprow) z = c + 5 * ridx[i];
3598: jj = idx;
3599: vv = v;
3600: for (k = 0; k < cn; k++) {
3601: idx = jj;
3602: v = vv;
3603: sum1 = 0.0;
3604: sum2 = 0.0;
3605: sum3 = 0.0;
3606: sum4 = 0.0;
3607: sum5 = 0.0;
3608: for (j = 0; j < n; j++) {
3609: xb = b + 5 * (*idx++);
3610: x1 = xb[0 + k * bm];
3611: x2 = xb[1 + k * bm];
3612: x3 = xb[2 + k * bm];
3613: x4 = xb[3 + k * bm];
3614: x5 = xb[4 + k * bm];
3615: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3616: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3617: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3618: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3619: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3620: v += 25;
3621: }
3622: z[0 + k * cm] = sum1;
3623: z[1 + k * cm] = sum2;
3624: z[2 + k * cm] = sum3;
3625: z[3 + k * cm] = sum4;
3626: z[4 + k * cm] = sum5;
3627: }
3628: if (!usecprow) z += 5;
3629: }
3630: return 0;
3631: }
3633: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3634: {
3635: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3636: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3637: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3638: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3639: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3640: PetscBLASInt bbs, bcn, bbm, bcm;
3641: PetscScalar *z = NULL;
3642: PetscScalar *c, *b;
3643: const MatScalar *v;
3644: const PetscInt *idx, *ii, *ridx = NULL;
3645: PetscScalar _DZero = 0.0, _DOne = 1.0;
3646: PetscBool usecprow = a->compressedrow.use;
3648: if (!cm || !cn) return 0;
3652: b = bd->v;
3653: if (a->nonzerorowcnt != A->rmap->n) MatZeroEntries(C);
3654: MatDenseGetArray(C, &c);
3655: switch (bs) {
3656: case 1:
3657: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3658: break;
3659: case 2:
3660: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3661: break;
3662: case 3:
3663: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3664: break;
3665: case 4:
3666: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3667: break;
3668: case 5:
3669: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3670: break;
3671: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3672: PetscBLASIntCast(bs, &bbs);
3673: PetscBLASIntCast(cn, &bcn);
3674: PetscBLASIntCast(bm, &bbm);
3675: PetscBLASIntCast(cm, &bcm);
3676: idx = a->j;
3677: v = a->a;
3678: if (usecprow) {
3679: mbs = a->compressedrow.nrows;
3680: ii = a->compressedrow.i;
3681: ridx = a->compressedrow.rindex;
3682: } else {
3683: mbs = a->mbs;
3684: ii = a->i;
3685: z = c;
3686: }
3687: for (i = 0; i < mbs; i++) {
3688: n = ii[1] - ii[0];
3689: ii++;
3690: if (usecprow) z = c + bs * ridx[i];
3691: if (n) {
3692: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3693: v += bs2;
3694: }
3695: for (j = 1; j < n; j++) {
3696: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3697: v += bs2;
3698: }
3699: if (!usecprow) z += bs;
3700: }
3701: }
3702: MatDenseRestoreArray(C, &c);
3703: PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn);
3704: return 0;
3705: }