Actual source code: sbaij2.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
12: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13: const PetscInt *idx;
14: PetscBT table_out, table_in;
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscBTCreate(mbs, &table_out);
22: PetscMalloc1(mbs + 1, &nidx);
23: PetscBTCreate(mbs, &table_in);
25: for (i = 0; i < is_max; i++) { /* for each is */
26: isz = 0;
27: PetscBTMemzero(mbs, table_out);
29: /* Extract the indices, assume there can be duplicate entries */
30: ISGetIndices(is[i], &idx);
31: ISGetLocalSize(is[i], &n);
33: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34: bcol_max = 0;
35: for (j = 0; j < n; ++j) {
36: brow = idx[j] / bs; /* convert the indices into block indices */
38: if (!PetscBTLookupSet(table_out, brow)) {
39: nidx[isz++] = brow;
40: if (bcol_max < brow) bcol_max = brow;
41: }
42: }
43: ISRestoreIndices(is[i], &idx);
44: ISDestroy(&is[i]);
46: k = 0;
47: for (j = 0; j < ov; j++) { /* for each overlap */
48: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49: PetscBTMemzero(mbs, table_in);
50: for (l = k; l < isz; l++) PetscBTSet(table_in, nidx[l]);
52: n = isz; /* length of the updated is[i] */
53: for (brow = 0; brow < mbs; brow++) {
54: start = ai[brow];
55: end = ai[brow + 1];
56: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57: for (l = start; l < end; l++) {
58: bcol = aj[l];
59: if (!PetscBTLookupSet(table_out, bcol)) {
60: nidx[isz++] = bcol;
61: if (bcol_max < bcol) bcol_max = bcol;
62: }
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l < end; l++) {
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table_in, bcol)) {
71: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
78: ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i);
79: } /* for each is */
80: PetscBTDestroy(&table_out);
81: PetscFree(nidx);
82: PetscBTDestroy(&table_in);
83: return 0;
84: }
86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87: Zero some ops' to avoid invalid usse */
88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89: {
90: MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE);
91: Bseq->ops->mult = NULL;
92: Bseq->ops->multadd = NULL;
93: Bseq->ops->multtranspose = NULL;
94: Bseq->ops->multtransposeadd = NULL;
95: Bseq->ops->lufactor = NULL;
96: Bseq->ops->choleskyfactor = NULL;
97: Bseq->ops->lufactorsymbolic = NULL;
98: Bseq->ops->choleskyfactorsymbolic = NULL;
99: Bseq->ops->getinertia = NULL;
100: return 0;
101: }
103: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
104: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
105: {
106: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c;
107: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
108: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
109: const PetscInt *irow, *icol;
110: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
111: PetscInt *aj = a->j, *ai = a->i;
112: MatScalar *mat_a;
113: Mat C;
114: PetscBool flag;
117: ISGetIndices(isrow, &irow);
118: ISGetIndices(iscol, &icol);
119: ISGetLocalSize(isrow, &nrows);
120: ISGetLocalSize(iscol, &ncols);
122: PetscCalloc1(1 + oldcols, &smap);
123: ssmap = smap;
124: PetscMalloc1(1 + nrows, &lens);
125: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
126: /* determine lens of each row */
127: for (i = 0; i < nrows; i++) {
128: kstart = ai[irow[i]];
129: kend = kstart + a->ilen[irow[i]];
130: lens[i] = 0;
131: for (k = kstart; k < kend; k++) {
132: if (ssmap[aj[k]]) lens[i]++;
133: }
134: }
135: /* Create and fill new matrix */
136: if (scall == MAT_REUSE_MATRIX) {
137: c = (Mat_SeqSBAIJ *)((*B)->data);
140: PetscArraycmp(c->ilen, lens, c->mbs, &flag);
142: PetscArrayzero(c->ilen, c->mbs);
143: C = *B;
144: } else {
145: MatCreate(PetscObjectComm((PetscObject)A), &C);
146: MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
147: MatSetType(C, ((PetscObject)A)->type_name);
148: MatSeqSBAIJSetPreallocation(C, bs, 0, lens);
149: }
150: c = (Mat_SeqSBAIJ *)(C->data);
151: for (i = 0; i < nrows; i++) {
152: row = irow[i];
153: kstart = ai[row];
154: kend = kstart + a->ilen[row];
155: mat_i = c->i[i];
156: mat_j = c->j + mat_i;
157: mat_a = c->a + mat_i * bs2;
158: mat_ilen = c->ilen + i;
159: for (k = kstart; k < kend; k++) {
160: if ((tcol = ssmap[a->j[k]])) {
161: *mat_j++ = tcol - 1;
162: PetscArraycpy(mat_a, a->a + k * bs2, bs2);
163: mat_a += bs2;
164: (*mat_ilen)++;
165: }
166: }
167: }
168: /* sort */
169: {
170: MatScalar *work;
172: PetscMalloc1(bs2, &work);
173: for (i = 0; i < nrows; i++) {
174: PetscInt ilen;
175: mat_i = c->i[i];
176: mat_j = c->j + mat_i;
177: mat_a = c->a + mat_i * bs2;
178: ilen = c->ilen[i];
179: PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
180: }
181: PetscFree(work);
182: }
184: /* Free work space */
185: ISRestoreIndices(iscol, &icol);
186: PetscFree(smap);
187: PetscFree(lens);
188: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
189: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
191: ISRestoreIndices(isrow, &irow);
192: *B = C;
193: return 0;
194: }
196: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
197: {
198: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
199: IS is1, is2;
200: PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
201: const PetscInt *irow, *icol;
203: ISGetIndices(isrow, &irow);
204: ISGetIndices(iscol, &icol);
205: ISGetLocalSize(isrow, &nrows);
206: ISGetLocalSize(iscol, &ncols);
208: /* Verify if the indices correspond to each element in a block
209: and form the IS with compressed IS */
210: maxmnbs = PetscMax(a->mbs, a->nbs);
211: PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
212: PetscArrayzero(vary, a->mbs);
213: for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
215: count = 0;
216: for (i = 0; i < nrows; i++) {
217: PetscInt j = irow[i] / bs;
218: if ((vary[j]--) == bs) iary[count++] = j;
219: }
220: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);
222: PetscArrayzero(vary, a->nbs);
223: for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
225: count = 0;
226: for (i = 0; i < ncols; i++) {
227: PetscInt j = icol[i] / bs;
228: if ((vary[j]--) == bs) iary[count++] = j;
229: }
230: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
231: ISRestoreIndices(isrow, &irow);
232: ISRestoreIndices(iscol, &icol);
233: PetscFree2(vary, iary);
235: MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B);
236: ISDestroy(&is1);
237: ISDestroy(&is2);
239: if (isrow != iscol) {
240: PetscBool isequal;
241: ISEqual(isrow, iscol, &isequal);
242: if (!isequal) MatSeqSBAIJZeroOps_Private(*B);
243: }
244: return 0;
245: }
247: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
248: {
249: PetscInt i;
251: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);
253: for (i = 0; i < n; i++) MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
254: return 0;
255: }
257: /* -------------------------------------------------------*/
258: /* Should check that shapes of vectors and matrices match */
259: /* -------------------------------------------------------*/
261: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
262: {
263: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
264: PetscScalar *z, x1, x2, zero = 0.0;
265: const PetscScalar *x, *xb;
266: const MatScalar *v;
267: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
268: const PetscInt *aj = a->j, *ai = a->i, *ib;
269: PetscInt nonzerorow = 0;
271: VecSet(zz, zero);
272: if (!a->nz) return 0;
273: VecGetArrayRead(xx, &x);
274: VecGetArray(zz, &z);
276: v = a->a;
277: xb = x;
279: for (i = 0; i < mbs; i++) {
280: n = ai[1] - ai[0]; /* length of i_th block row of A */
281: x1 = xb[0];
282: x2 = xb[1];
283: ib = aj + *ai;
284: jmin = 0;
285: nonzerorow += (n > 0);
286: if (*ib == i) { /* (diag of A)*x */
287: z[2 * i] += v[0] * x1 + v[2] * x2;
288: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
289: v += 4;
290: jmin++;
291: }
292: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
293: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
294: for (j = jmin; j < n; j++) {
295: /* (strict lower triangular part of A)*x */
296: cval = ib[j] * 2;
297: z[cval] += v[0] * x1 + v[1] * x2;
298: z[cval + 1] += v[2] * x1 + v[3] * x2;
299: /* (strict upper triangular part of A)*x */
300: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
301: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
302: v += 4;
303: }
304: xb += 2;
305: ai++;
306: }
308: VecRestoreArrayRead(xx, &x);
309: VecRestoreArray(zz, &z);
310: PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
311: return 0;
312: }
314: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
315: {
316: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
317: PetscScalar *z, x1, x2, x3, zero = 0.0;
318: const PetscScalar *x, *xb;
319: const MatScalar *v;
320: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
321: const PetscInt *aj = a->j, *ai = a->i, *ib;
322: PetscInt nonzerorow = 0;
324: VecSet(zz, zero);
325: if (!a->nz) return 0;
326: VecGetArrayRead(xx, &x);
327: VecGetArray(zz, &z);
329: v = a->a;
330: xb = x;
332: for (i = 0; i < mbs; i++) {
333: n = ai[1] - ai[0]; /* length of i_th block row of A */
334: x1 = xb[0];
335: x2 = xb[1];
336: x3 = xb[2];
337: ib = aj + *ai;
338: jmin = 0;
339: nonzerorow += (n > 0);
340: if (*ib == i) { /* (diag of A)*x */
341: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
342: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
343: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
344: v += 9;
345: jmin++;
346: }
347: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
348: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
349: for (j = jmin; j < n; j++) {
350: /* (strict lower triangular part of A)*x */
351: cval = ib[j] * 3;
352: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
353: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
354: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
355: /* (strict upper triangular part of A)*x */
356: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
357: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
358: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
359: v += 9;
360: }
361: xb += 3;
362: ai++;
363: }
365: VecRestoreArrayRead(xx, &x);
366: VecRestoreArray(zz, &z);
367: PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
368: return 0;
369: }
371: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
372: {
373: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
374: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
375: const PetscScalar *x, *xb;
376: const MatScalar *v;
377: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
378: const PetscInt *aj = a->j, *ai = a->i, *ib;
379: PetscInt nonzerorow = 0;
381: VecSet(zz, zero);
382: if (!a->nz) return 0;
383: VecGetArrayRead(xx, &x);
384: VecGetArray(zz, &z);
386: v = a->a;
387: xb = x;
389: for (i = 0; i < mbs; i++) {
390: n = ai[1] - ai[0]; /* length of i_th block row of A */
391: x1 = xb[0];
392: x2 = xb[1];
393: x3 = xb[2];
394: x4 = xb[3];
395: ib = aj + *ai;
396: jmin = 0;
397: nonzerorow += (n > 0);
398: if (*ib == i) { /* (diag of A)*x */
399: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
400: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
401: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
402: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
403: v += 16;
404: jmin++;
405: }
406: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
407: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408: for (j = jmin; j < n; j++) {
409: /* (strict lower triangular part of A)*x */
410: cval = ib[j] * 4;
411: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
412: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
413: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
414: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
415: /* (strict upper triangular part of A)*x */
416: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
417: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
418: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
419: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
420: v += 16;
421: }
422: xb += 4;
423: ai++;
424: }
426: VecRestoreArrayRead(xx, &x);
427: VecRestoreArray(zz, &z);
428: PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
429: return 0;
430: }
432: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
433: {
434: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
435: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
436: const PetscScalar *x, *xb;
437: const MatScalar *v;
438: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
439: const PetscInt *aj = a->j, *ai = a->i, *ib;
440: PetscInt nonzerorow = 0;
442: VecSet(zz, zero);
443: if (!a->nz) return 0;
444: VecGetArrayRead(xx, &x);
445: VecGetArray(zz, &z);
447: v = a->a;
448: xb = x;
450: for (i = 0; i < mbs; i++) {
451: n = ai[1] - ai[0]; /* length of i_th block row of A */
452: x1 = xb[0];
453: x2 = xb[1];
454: x3 = xb[2];
455: x4 = xb[3];
456: x5 = xb[4];
457: ib = aj + *ai;
458: jmin = 0;
459: nonzerorow += (n > 0);
460: if (*ib == i) { /* (diag of A)*x */
461: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
462: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
463: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
464: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
465: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
466: v += 25;
467: jmin++;
468: }
469: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
470: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
471: for (j = jmin; j < n; j++) {
472: /* (strict lower triangular part of A)*x */
473: cval = ib[j] * 5;
474: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
475: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
476: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
477: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
478: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
479: /* (strict upper triangular part of A)*x */
480: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
481: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
482: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
483: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
484: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
485: v += 25;
486: }
487: xb += 5;
488: ai++;
489: }
491: VecRestoreArrayRead(xx, &x);
492: VecRestoreArray(zz, &z);
493: PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
494: return 0;
495: }
497: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
498: {
499: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
500: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
501: const PetscScalar *x, *xb;
502: const MatScalar *v;
503: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
504: const PetscInt *aj = a->j, *ai = a->i, *ib;
505: PetscInt nonzerorow = 0;
507: VecSet(zz, zero);
508: if (!a->nz) return 0;
509: VecGetArrayRead(xx, &x);
510: VecGetArray(zz, &z);
512: v = a->a;
513: xb = x;
515: for (i = 0; i < mbs; i++) {
516: n = ai[1] - ai[0]; /* length of i_th block row of A */
517: x1 = xb[0];
518: x2 = xb[1];
519: x3 = xb[2];
520: x4 = xb[3];
521: x5 = xb[4];
522: x6 = xb[5];
523: ib = aj + *ai;
524: jmin = 0;
525: nonzerorow += (n > 0);
526: if (*ib == i) { /* (diag of A)*x */
527: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
528: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
529: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
530: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
531: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
532: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
533: v += 36;
534: jmin++;
535: }
536: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
537: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
538: for (j = jmin; j < n; j++) {
539: /* (strict lower triangular part of A)*x */
540: cval = ib[j] * 6;
541: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
542: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
543: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
544: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
545: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
546: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
547: /* (strict upper triangular part of A)*x */
548: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
549: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
550: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
551: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
552: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
553: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
554: v += 36;
555: }
556: xb += 6;
557: ai++;
558: }
560: VecRestoreArrayRead(xx, &x);
561: VecRestoreArray(zz, &z);
562: PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
563: return 0;
564: }
566: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
567: {
568: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
569: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
570: const PetscScalar *x, *xb;
571: const MatScalar *v;
572: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
573: const PetscInt *aj = a->j, *ai = a->i, *ib;
574: PetscInt nonzerorow = 0;
576: VecSet(zz, zero);
577: if (!a->nz) return 0;
578: VecGetArrayRead(xx, &x);
579: VecGetArray(zz, &z);
581: v = a->a;
582: xb = x;
584: for (i = 0; i < mbs; i++) {
585: n = ai[1] - ai[0]; /* length of i_th block row of A */
586: x1 = xb[0];
587: x2 = xb[1];
588: x3 = xb[2];
589: x4 = xb[3];
590: x5 = xb[4];
591: x6 = xb[5];
592: x7 = xb[6];
593: ib = aj + *ai;
594: jmin = 0;
595: nonzerorow += (n > 0);
596: if (*ib == i) { /* (diag of A)*x */
597: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
598: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
599: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
600: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
601: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
602: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
603: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
604: v += 49;
605: jmin++;
606: }
607: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
608: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
609: for (j = jmin; j < n; j++) {
610: /* (strict lower triangular part of A)*x */
611: cval = ib[j] * 7;
612: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
613: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
614: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
615: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
616: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
617: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
618: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
619: /* (strict upper triangular part of A)*x */
620: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
621: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
622: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
623: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
624: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
625: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
626: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
627: v += 49;
628: }
629: xb += 7;
630: ai++;
631: }
632: VecRestoreArrayRead(xx, &x);
633: VecRestoreArray(zz, &z);
634: PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
635: return 0;
636: }
638: /*
639: This will not work with MatScalar == float because it calls the BLAS
640: */
641: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
642: {
643: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
644: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
645: const PetscScalar *x, *x_ptr, *xb;
646: const MatScalar *v;
647: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
648: const PetscInt *idx, *aj, *ii;
649: PetscInt nonzerorow = 0;
651: VecSet(zz, zero);
652: if (!a->nz) return 0;
653: VecGetArrayRead(xx, &x);
654: VecGetArray(zz, &z);
656: x_ptr = x;
657: z_ptr = z;
659: aj = a->j;
660: v = a->a;
661: ii = a->i;
663: if (!a->mult_work) PetscMalloc1(A->rmap->N + 1, &a->mult_work);
664: work = a->mult_work;
666: for (i = 0; i < mbs; i++) {
667: n = ii[1] - ii[0];
668: ncols = n * bs;
669: workt = work;
670: idx = aj + ii[0];
671: nonzerorow += (n > 0);
673: /* upper triangular part */
674: for (j = 0; j < n; j++) {
675: xb = x_ptr + bs * (*idx++);
676: for (k = 0; k < bs; k++) workt[k] = xb[k];
677: workt += bs;
678: }
679: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
680: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
682: /* strict lower triangular part */
683: idx = aj + ii[0];
684: if (n && *idx == i) {
685: ncols -= bs;
686: v += bs2;
687: idx++;
688: n--;
689: }
691: if (ncols > 0) {
692: workt = work;
693: PetscArrayzero(workt, ncols);
694: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
695: for (j = 0; j < n; j++) {
696: zb = z_ptr + bs * (*idx++);
697: for (k = 0; k < bs; k++) zb[k] += workt[k];
698: workt += bs;
699: }
700: }
701: x += bs;
702: v += n * bs2;
703: z += bs;
704: ii++;
705: }
707: VecRestoreArrayRead(xx, &x);
708: VecRestoreArray(zz, &z);
709: PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow);
710: return 0;
711: }
713: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
714: {
715: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
716: PetscScalar *z, x1;
717: const PetscScalar *x, *xb;
718: const MatScalar *v;
719: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
720: const PetscInt *aj = a->j, *ai = a->i, *ib;
721: PetscInt nonzerorow = 0;
722: #if defined(PETSC_USE_COMPLEX)
723: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
724: #else
725: const int aconj = 0;
726: #endif
728: VecCopy(yy, zz);
729: if (!a->nz) return 0;
730: VecGetArrayRead(xx, &x);
731: VecGetArray(zz, &z);
732: v = a->a;
733: xb = x;
735: for (i = 0; i < mbs; i++) {
736: n = ai[1] - ai[0]; /* length of i_th row of A */
737: x1 = xb[0];
738: ib = aj + *ai;
739: jmin = 0;
740: nonzerorow += (n > 0);
741: if (n && *ib == i) { /* (diag of A)*x */
742: z[i] += *v++ * x[*ib++];
743: jmin++;
744: }
745: if (aconj) {
746: for (j = jmin; j < n; j++) {
747: cval = *ib;
748: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
749: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
750: }
751: } else {
752: for (j = jmin; j < n; j++) {
753: cval = *ib;
754: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
755: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
756: }
757: }
758: xb++;
759: ai++;
760: }
762: VecRestoreArrayRead(xx, &x);
763: VecRestoreArray(zz, &z);
765: PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
766: return 0;
767: }
769: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
770: {
771: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
772: PetscScalar *z, x1, x2;
773: const PetscScalar *x, *xb;
774: const MatScalar *v;
775: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
776: const PetscInt *aj = a->j, *ai = a->i, *ib;
777: PetscInt nonzerorow = 0;
779: VecCopy(yy, zz);
780: if (!a->nz) return 0;
781: VecGetArrayRead(xx, &x);
782: VecGetArray(zz, &z);
784: v = a->a;
785: xb = x;
787: for (i = 0; i < mbs; i++) {
788: n = ai[1] - ai[0]; /* length of i_th block row of A */
789: x1 = xb[0];
790: x2 = xb[1];
791: ib = aj + *ai;
792: jmin = 0;
793: nonzerorow += (n > 0);
794: if (n && *ib == i) { /* (diag of A)*x */
795: z[2 * i] += v[0] * x1 + v[2] * x2;
796: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
797: v += 4;
798: jmin++;
799: }
800: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
801: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
802: for (j = jmin; j < n; j++) {
803: /* (strict lower triangular part of A)*x */
804: cval = ib[j] * 2;
805: z[cval] += v[0] * x1 + v[1] * x2;
806: z[cval + 1] += v[2] * x1 + v[3] * x2;
807: /* (strict upper triangular part of A)*x */
808: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
809: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
810: v += 4;
811: }
812: xb += 2;
813: ai++;
814: }
815: VecRestoreArrayRead(xx, &x);
816: VecRestoreArray(zz, &z);
818: PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow));
819: return 0;
820: }
822: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
823: {
824: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
825: PetscScalar *z, x1, x2, x3;
826: const PetscScalar *x, *xb;
827: const MatScalar *v;
828: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
829: const PetscInt *aj = a->j, *ai = a->i, *ib;
830: PetscInt nonzerorow = 0;
832: VecCopy(yy, zz);
833: if (!a->nz) return 0;
834: VecGetArrayRead(xx, &x);
835: VecGetArray(zz, &z);
837: v = a->a;
838: xb = x;
840: for (i = 0; i < mbs; i++) {
841: n = ai[1] - ai[0]; /* length of i_th block row of A */
842: x1 = xb[0];
843: x2 = xb[1];
844: x3 = xb[2];
845: ib = aj + *ai;
846: jmin = 0;
847: nonzerorow += (n > 0);
848: if (n && *ib == i) { /* (diag of A)*x */
849: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
850: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
851: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
852: v += 9;
853: jmin++;
854: }
855: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
857: for (j = jmin; j < n; j++) {
858: /* (strict lower triangular part of A)*x */
859: cval = ib[j] * 3;
860: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
861: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
862: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
863: /* (strict upper triangular part of A)*x */
864: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
865: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
866: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
867: v += 9;
868: }
869: xb += 3;
870: ai++;
871: }
873: VecRestoreArrayRead(xx, &x);
874: VecRestoreArray(zz, &z);
876: PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow));
877: return 0;
878: }
880: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
881: {
882: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
883: PetscScalar *z, x1, x2, x3, x4;
884: const PetscScalar *x, *xb;
885: const MatScalar *v;
886: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
887: const PetscInt *aj = a->j, *ai = a->i, *ib;
888: PetscInt nonzerorow = 0;
890: VecCopy(yy, zz);
891: if (!a->nz) return 0;
892: VecGetArrayRead(xx, &x);
893: VecGetArray(zz, &z);
895: v = a->a;
896: xb = x;
898: for (i = 0; i < mbs; i++) {
899: n = ai[1] - ai[0]; /* length of i_th block row of A */
900: x1 = xb[0];
901: x2 = xb[1];
902: x3 = xb[2];
903: x4 = xb[3];
904: ib = aj + *ai;
905: jmin = 0;
906: nonzerorow += (n > 0);
907: if (n && *ib == i) { /* (diag of A)*x */
908: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
909: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
910: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
911: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
912: v += 16;
913: jmin++;
914: }
915: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
916: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
917: for (j = jmin; j < n; j++) {
918: /* (strict lower triangular part of A)*x */
919: cval = ib[j] * 4;
920: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
921: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
922: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
923: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
924: /* (strict upper triangular part of A)*x */
925: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
926: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
927: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
928: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
929: v += 16;
930: }
931: xb += 4;
932: ai++;
933: }
935: VecRestoreArrayRead(xx, &x);
936: VecRestoreArray(zz, &z);
938: PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow));
939: return 0;
940: }
942: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
943: {
944: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
945: PetscScalar *z, x1, x2, x3, x4, x5;
946: const PetscScalar *x, *xb;
947: const MatScalar *v;
948: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
949: const PetscInt *aj = a->j, *ai = a->i, *ib;
950: PetscInt nonzerorow = 0;
952: VecCopy(yy, zz);
953: if (!a->nz) return 0;
954: VecGetArrayRead(xx, &x);
955: VecGetArray(zz, &z);
957: v = a->a;
958: xb = x;
960: for (i = 0; i < mbs; i++) {
961: n = ai[1] - ai[0]; /* length of i_th block row of A */
962: x1 = xb[0];
963: x2 = xb[1];
964: x3 = xb[2];
965: x4 = xb[3];
966: x5 = xb[4];
967: ib = aj + *ai;
968: jmin = 0;
969: nonzerorow += (n > 0);
970: if (n && *ib == i) { /* (diag of A)*x */
971: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
972: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
973: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
974: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
975: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
976: v += 25;
977: jmin++;
978: }
979: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
980: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
981: for (j = jmin; j < n; j++) {
982: /* (strict lower triangular part of A)*x */
983: cval = ib[j] * 5;
984: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
985: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
986: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
987: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
988: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
989: /* (strict upper triangular part of A)*x */
990: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
991: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
992: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
993: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
994: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
995: v += 25;
996: }
997: xb += 5;
998: ai++;
999: }
1001: VecRestoreArrayRead(xx, &x);
1002: VecRestoreArray(zz, &z);
1004: PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow));
1005: return 0;
1006: }
1008: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1009: {
1010: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1011: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1012: const PetscScalar *x, *xb;
1013: const MatScalar *v;
1014: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1015: const PetscInt *aj = a->j, *ai = a->i, *ib;
1016: PetscInt nonzerorow = 0;
1018: VecCopy(yy, zz);
1019: if (!a->nz) return 0;
1020: VecGetArrayRead(xx, &x);
1021: VecGetArray(zz, &z);
1023: v = a->a;
1024: xb = x;
1026: for (i = 0; i < mbs; i++) {
1027: n = ai[1] - ai[0]; /* length of i_th block row of A */
1028: x1 = xb[0];
1029: x2 = xb[1];
1030: x3 = xb[2];
1031: x4 = xb[3];
1032: x5 = xb[4];
1033: x6 = xb[5];
1034: ib = aj + *ai;
1035: jmin = 0;
1036: nonzerorow += (n > 0);
1037: if (n && *ib == i) { /* (diag of A)*x */
1038: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1039: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1040: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1041: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1042: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1043: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1044: v += 36;
1045: jmin++;
1046: }
1047: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1048: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1049: for (j = jmin; j < n; j++) {
1050: /* (strict lower triangular part of A)*x */
1051: cval = ib[j] * 6;
1052: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1053: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1054: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1055: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1056: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1057: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1058: /* (strict upper triangular part of A)*x */
1059: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1060: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1061: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1062: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1063: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1064: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1065: v += 36;
1066: }
1067: xb += 6;
1068: ai++;
1069: }
1071: VecRestoreArrayRead(xx, &x);
1072: VecRestoreArray(zz, &z);
1074: PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow));
1075: return 0;
1076: }
1078: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1079: {
1080: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1081: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1082: const PetscScalar *x, *xb;
1083: const MatScalar *v;
1084: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1085: const PetscInt *aj = a->j, *ai = a->i, *ib;
1086: PetscInt nonzerorow = 0;
1088: VecCopy(yy, zz);
1089: if (!a->nz) return 0;
1090: VecGetArrayRead(xx, &x);
1091: VecGetArray(zz, &z);
1093: v = a->a;
1094: xb = x;
1096: for (i = 0; i < mbs; i++) {
1097: n = ai[1] - ai[0]; /* length of i_th block row of A */
1098: x1 = xb[0];
1099: x2 = xb[1];
1100: x3 = xb[2];
1101: x4 = xb[3];
1102: x5 = xb[4];
1103: x6 = xb[5];
1104: x7 = xb[6];
1105: ib = aj + *ai;
1106: jmin = 0;
1107: nonzerorow += (n > 0);
1108: if (n && *ib == i) { /* (diag of A)*x */
1109: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1110: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1111: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1112: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1113: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1114: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1115: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1116: v += 49;
1117: jmin++;
1118: }
1119: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1120: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1121: for (j = jmin; j < n; j++) {
1122: /* (strict lower triangular part of A)*x */
1123: cval = ib[j] * 7;
1124: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1125: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1126: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1127: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1128: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1129: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1130: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1131: /* (strict upper triangular part of A)*x */
1132: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1133: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1134: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1135: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1136: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1137: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1138: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1139: v += 49;
1140: }
1141: xb += 7;
1142: ai++;
1143: }
1145: VecRestoreArrayRead(xx, &x);
1146: VecRestoreArray(zz, &z);
1148: PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow));
1149: return 0;
1150: }
1152: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1153: {
1154: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1155: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1156: const PetscScalar *x, *x_ptr, *xb;
1157: const MatScalar *v;
1158: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1159: const PetscInt *idx, *aj, *ii;
1160: PetscInt nonzerorow = 0;
1162: VecCopy(yy, zz);
1163: if (!a->nz) return 0;
1164: VecGetArrayRead(xx, &x);
1165: x_ptr = x;
1166: VecGetArray(zz, &z);
1167: z_ptr = z;
1169: aj = a->j;
1170: v = a->a;
1171: ii = a->i;
1173: if (!a->mult_work) PetscMalloc1(A->rmap->n + 1, &a->mult_work);
1174: work = a->mult_work;
1176: for (i = 0; i < mbs; i++) {
1177: n = ii[1] - ii[0];
1178: ncols = n * bs;
1179: workt = work;
1180: idx = aj + ii[0];
1181: nonzerorow += (n > 0);
1183: /* upper triangular part */
1184: for (j = 0; j < n; j++) {
1185: xb = x_ptr + bs * (*idx++);
1186: for (k = 0; k < bs; k++) workt[k] = xb[k];
1187: workt += bs;
1188: }
1189: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1190: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1192: /* strict lower triangular part */
1193: idx = aj + ii[0];
1194: if (n && *idx == i) {
1195: ncols -= bs;
1196: v += bs2;
1197: idx++;
1198: n--;
1199: }
1200: if (ncols > 0) {
1201: workt = work;
1202: PetscArrayzero(workt, ncols);
1203: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1204: for (j = 0; j < n; j++) {
1205: zb = z_ptr + bs * (*idx++);
1206: for (k = 0; k < bs; k++) zb[k] += workt[k];
1207: workt += bs;
1208: }
1209: }
1211: x += bs;
1212: v += n * bs2;
1213: z += bs;
1214: ii++;
1215: }
1217: VecRestoreArrayRead(xx, &x);
1218: VecRestoreArray(zz, &z);
1220: PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
1221: return 0;
1222: }
1224: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1225: {
1226: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1227: PetscScalar oalpha = alpha;
1228: PetscBLASInt one = 1, totalnz;
1230: PetscBLASIntCast(a->bs2 * a->nz, &totalnz);
1231: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1232: PetscLogFlops(totalnz);
1233: return 0;
1234: }
1236: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1237: {
1238: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1239: const MatScalar *v = a->a;
1240: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1241: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1242: const PetscInt *aj = a->j, *col;
1244: if (!a->nz) {
1245: *norm = 0.0;
1246: return 0;
1247: }
1248: if (type == NORM_FROBENIUS) {
1249: for (k = 0; k < mbs; k++) {
1250: jmin = a->i[k];
1251: jmax = a->i[k + 1];
1252: col = aj + jmin;
1253: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1254: for (i = 0; i < bs2; i++) {
1255: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1256: v++;
1257: }
1258: jmin++;
1259: }
1260: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1261: for (i = 0; i < bs2; i++) {
1262: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1263: v++;
1264: }
1265: }
1266: }
1267: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1268: PetscLogFlops(2.0 * bs2 * a->nz);
1269: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1270: PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl);
1271: for (i = 0; i < mbs; i++) jl[i] = mbs;
1272: il[0] = 0;
1274: *norm = 0.0;
1275: for (k = 0; k < mbs; k++) { /* k_th block row */
1276: for (j = 0; j < bs; j++) sum[j] = 0.0;
1277: /*-- col sum --*/
1278: i = jl[k]; /* first |A(i,k)| to be added */
1279: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1280: at step k */
1281: while (i < mbs) {
1282: nexti = jl[i]; /* next block row to be added */
1283: ik = il[i]; /* block index of A(i,k) in the array a */
1284: for (j = 0; j < bs; j++) {
1285: v = a->a + ik * bs2 + j * bs;
1286: for (k1 = 0; k1 < bs; k1++) {
1287: sum[j] += PetscAbsScalar(*v);
1288: v++;
1289: }
1290: }
1291: /* update il, jl */
1292: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1293: jmax = a->i[i + 1];
1294: if (jmin < jmax) {
1295: il[i] = jmin;
1296: j = a->j[jmin];
1297: jl[i] = jl[j];
1298: jl[j] = i;
1299: }
1300: i = nexti;
1301: }
1302: /*-- row sum --*/
1303: jmin = a->i[k];
1304: jmax = a->i[k + 1];
1305: for (i = jmin; i < jmax; i++) {
1306: for (j = 0; j < bs; j++) {
1307: v = a->a + i * bs2 + j;
1308: for (k1 = 0; k1 < bs; k1++) {
1309: sum[j] += PetscAbsScalar(*v);
1310: v += bs;
1311: }
1312: }
1313: }
1314: /* add k_th block row to il, jl */
1315: col = aj + jmin;
1316: if (jmax - jmin > 0 && *col == k) jmin++;
1317: if (jmin < jmax) {
1318: il[k] = jmin;
1319: j = a->j[jmin];
1320: jl[k] = jl[j];
1321: jl[j] = k;
1322: }
1323: for (j = 0; j < bs; j++) {
1324: if (sum[j] > *norm) *norm = sum[j];
1325: }
1326: }
1327: PetscFree3(sum, il, jl);
1328: PetscLogFlops(PetscMax(mbs * a->nz - 1, 0));
1329: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1330: return 0;
1331: }
1333: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1334: {
1335: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1337: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1338: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1339: *flg = PETSC_FALSE;
1340: return 0;
1341: }
1343: /* if the a->i are the same */
1344: PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
1345: if (!*flg) return 0;
1347: /* if a->j are the same */
1348: PetscArraycmp(a->j, b->j, a->nz, flg);
1349: if (!*flg) return 0;
1351: /* if a->a are the same */
1352: PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg);
1353: return 0;
1354: }
1356: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1357: {
1358: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1359: PetscInt i, j, k, row, bs, ambs, bs2;
1360: const PetscInt *ai, *aj;
1361: PetscScalar *x, zero = 0.0;
1362: const MatScalar *aa, *aa_j;
1364: bs = A->rmap->bs;
1367: aa = a->a;
1368: ambs = a->mbs;
1370: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1371: PetscInt *diag = a->diag;
1372: aa = a->a;
1373: ambs = a->mbs;
1374: VecGetArray(v, &x);
1375: for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1376: VecRestoreArray(v, &x);
1377: return 0;
1378: }
1380: ai = a->i;
1381: aj = a->j;
1382: bs2 = a->bs2;
1383: VecSet(v, zero);
1384: if (!a->nz) return 0;
1385: VecGetArray(v, &x);
1386: for (i = 0; i < ambs; i++) {
1387: j = ai[i];
1388: if (aj[j] == i) { /* if this is a diagonal element */
1389: row = i * bs;
1390: aa_j = aa + j * bs2;
1391: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1392: }
1393: }
1394: VecRestoreArray(v, &x);
1395: return 0;
1396: }
1398: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1399: {
1400: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1401: PetscScalar x;
1402: const PetscScalar *l, *li, *ri;
1403: MatScalar *aa, *v;
1404: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1405: const PetscInt *ai, *aj;
1406: PetscBool flg;
1408: if (ll != rr) {
1409: VecEqual(ll, rr, &flg);
1411: }
1412: if (!ll) return 0;
1413: ai = a->i;
1414: aj = a->j;
1415: aa = a->a;
1416: m = A->rmap->N;
1417: bs = A->rmap->bs;
1418: mbs = a->mbs;
1419: bs2 = a->bs2;
1421: VecGetArrayRead(ll, &l);
1422: VecGetLocalSize(ll, &lm);
1424: for (i = 0; i < mbs; i++) { /* for each block row */
1425: M = ai[i + 1] - ai[i];
1426: li = l + i * bs;
1427: v = aa + bs2 * ai[i];
1428: for (j = 0; j < M; j++) { /* for each block */
1429: ri = l + bs * aj[ai[i] + j];
1430: for (k = 0; k < bs; k++) {
1431: x = ri[k];
1432: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1433: }
1434: }
1435: }
1436: VecRestoreArrayRead(ll, &l);
1437: PetscLogFlops(2.0 * a->nz);
1438: return 0;
1439: }
1441: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1442: {
1443: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1445: info->block_size = a->bs2;
1446: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1447: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1448: info->nz_unneeded = info->nz_allocated - info->nz_used;
1449: info->assemblies = A->num_ass;
1450: info->mallocs = A->info.mallocs;
1451: info->memory = 0; /* REVIEW ME */
1452: if (A->factortype) {
1453: info->fill_ratio_given = A->info.fill_ratio_given;
1454: info->fill_ratio_needed = A->info.fill_ratio_needed;
1455: info->factor_mallocs = A->info.factor_mallocs;
1456: } else {
1457: info->fill_ratio_given = 0;
1458: info->fill_ratio_needed = 0;
1459: info->factor_mallocs = 0;
1460: }
1461: return 0;
1462: }
1464: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1465: {
1466: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1468: PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
1469: return 0;
1470: }
1472: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1473: {
1474: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1475: PetscInt i, j, n, row, col, bs, mbs;
1476: const PetscInt *ai, *aj;
1477: PetscReal atmp;
1478: const MatScalar *aa;
1479: PetscScalar *x;
1480: PetscInt ncols, brow, bcol, krow, kcol;
1484: bs = A->rmap->bs;
1485: aa = a->a;
1486: ai = a->i;
1487: aj = a->j;
1488: mbs = a->mbs;
1490: VecSet(v, 0.0);
1491: VecGetArray(v, &x);
1492: VecGetLocalSize(v, &n);
1494: for (i = 0; i < mbs; i++) {
1495: ncols = ai[1] - ai[0];
1496: ai++;
1497: brow = bs * i;
1498: for (j = 0; j < ncols; j++) {
1499: bcol = bs * (*aj);
1500: for (kcol = 0; kcol < bs; kcol++) {
1501: col = bcol + kcol; /* col index */
1502: for (krow = 0; krow < bs; krow++) {
1503: atmp = PetscAbsScalar(*aa);
1504: aa++;
1505: row = brow + krow; /* row index */
1506: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1507: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1508: }
1509: }
1510: aj++;
1511: }
1512: }
1513: VecRestoreArray(v, &x);
1514: return 0;
1515: }
1517: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1518: {
1519: MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
1520: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1521: return 0;
1522: }
1524: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1525: {
1526: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1527: PetscScalar *z = c;
1528: const PetscScalar *xb;
1529: PetscScalar x1;
1530: const MatScalar *v = a->a, *vv;
1531: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1532: #if defined(PETSC_USE_COMPLEX)
1533: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1534: #else
1535: const int aconj = 0;
1536: #endif
1538: for (i = 0; i < mbs; i++) {
1539: n = ii[1] - ii[0];
1540: ii++;
1541: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1542: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1543: jj = idx;
1544: vv = v;
1545: for (k = 0; k < cn; k++) {
1546: idx = jj;
1547: v = vv;
1548: for (j = 0; j < n; j++) {
1549: xb = b + (*idx);
1550: x1 = xb[0 + k * bm];
1551: z[0 + k * cm] += v[0] * x1;
1552: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1553: v += 1;
1554: ++idx;
1555: }
1556: }
1557: z += 1;
1558: }
1559: return 0;
1560: }
1562: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1563: {
1564: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1565: PetscScalar *z = c;
1566: const PetscScalar *xb;
1567: PetscScalar x1, x2;
1568: const MatScalar *v = a->a, *vv;
1569: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1571: for (i = 0; i < mbs; i++) {
1572: n = ii[1] - ii[0];
1573: ii++;
1574: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1575: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1576: jj = idx;
1577: vv = v;
1578: for (k = 0; k < cn; k++) {
1579: idx = jj;
1580: v = vv;
1581: for (j = 0; j < n; j++) {
1582: xb = b + 2 * (*idx);
1583: x1 = xb[0 + k * bm];
1584: x2 = xb[1 + k * bm];
1585: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1586: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1587: if (*idx != i) {
1588: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1589: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1590: }
1591: v += 4;
1592: ++idx;
1593: }
1594: }
1595: z += 2;
1596: }
1597: return 0;
1598: }
1600: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1601: {
1602: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1603: PetscScalar *z = c;
1604: const PetscScalar *xb;
1605: PetscScalar x1, x2, x3;
1606: const MatScalar *v = a->a, *vv;
1607: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1609: for (i = 0; i < mbs; i++) {
1610: n = ii[1] - ii[0];
1611: ii++;
1612: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1613: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1614: jj = idx;
1615: vv = v;
1616: for (k = 0; k < cn; k++) {
1617: idx = jj;
1618: v = vv;
1619: for (j = 0; j < n; j++) {
1620: xb = b + 3 * (*idx);
1621: x1 = xb[0 + k * bm];
1622: x2 = xb[1 + k * bm];
1623: x3 = xb[2 + k * bm];
1624: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1625: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1626: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1627: if (*idx != i) {
1628: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1629: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1630: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1631: }
1632: v += 9;
1633: ++idx;
1634: }
1635: }
1636: z += 3;
1637: }
1638: return 0;
1639: }
1641: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1642: {
1643: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1644: PetscScalar *z = c;
1645: const PetscScalar *xb;
1646: PetscScalar x1, x2, x3, x4;
1647: const MatScalar *v = a->a, *vv;
1648: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1650: for (i = 0; i < mbs; i++) {
1651: n = ii[1] - ii[0];
1652: ii++;
1653: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1654: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1655: jj = idx;
1656: vv = v;
1657: for (k = 0; k < cn; k++) {
1658: idx = jj;
1659: v = vv;
1660: for (j = 0; j < n; j++) {
1661: xb = b + 4 * (*idx);
1662: x1 = xb[0 + k * bm];
1663: x2 = xb[1 + k * bm];
1664: x3 = xb[2 + k * bm];
1665: x4 = xb[3 + k * bm];
1666: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1667: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1668: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1669: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1670: if (*idx != i) {
1671: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1672: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1673: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1674: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1675: }
1676: v += 16;
1677: ++idx;
1678: }
1679: }
1680: z += 4;
1681: }
1682: return 0;
1683: }
1685: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1686: {
1687: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1688: PetscScalar *z = c;
1689: const PetscScalar *xb;
1690: PetscScalar x1, x2, x3, x4, x5;
1691: const MatScalar *v = a->a, *vv;
1692: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1694: for (i = 0; i < mbs; i++) {
1695: n = ii[1] - ii[0];
1696: ii++;
1697: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1698: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1699: jj = idx;
1700: vv = v;
1701: for (k = 0; k < cn; k++) {
1702: idx = jj;
1703: v = vv;
1704: for (j = 0; j < n; j++) {
1705: xb = b + 5 * (*idx);
1706: x1 = xb[0 + k * bm];
1707: x2 = xb[1 + k * bm];
1708: x3 = xb[2 + k * bm];
1709: x4 = xb[3 + k * bm];
1710: x5 = xb[4 + k * cm];
1711: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1712: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1713: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1714: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1715: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1716: if (*idx != i) {
1717: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1718: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1719: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1720: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1721: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1722: }
1723: v += 25;
1724: ++idx;
1725: }
1726: }
1727: z += 5;
1728: }
1729: return 0;
1730: }
1732: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1733: {
1734: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1735: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1736: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1737: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1738: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1739: PetscBLASInt bbs, bcn, bbm, bcm;
1740: PetscScalar *z = NULL;
1741: PetscScalar *c, *b;
1742: const MatScalar *v;
1743: const PetscInt *idx, *ii;
1744: PetscScalar _DOne = 1.0;
1746: if (!cm || !cn) return 0;
1750: b = bd->v;
1751: MatZeroEntries(C);
1752: MatDenseGetArray(C, &c);
1753: switch (bs) {
1754: case 1:
1755: MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1756: break;
1757: case 2:
1758: MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1759: break;
1760: case 3:
1761: MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1762: break;
1763: case 4:
1764: MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1765: break;
1766: case 5:
1767: MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1768: break;
1769: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1770: PetscBLASIntCast(bs, &bbs);
1771: PetscBLASIntCast(cn, &bcn);
1772: PetscBLASIntCast(bm, &bbm);
1773: PetscBLASIntCast(cm, &bcm);
1774: idx = a->j;
1775: v = a->a;
1776: mbs = a->mbs;
1777: ii = a->i;
1778: z = c;
1779: for (i = 0; i < mbs; i++) {
1780: n = ii[1] - ii[0];
1781: ii++;
1782: for (j = 0; j < n; j++) {
1783: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1784: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1785: v += bs2;
1786: }
1787: z += bs;
1788: }
1789: }
1790: MatDenseRestoreArray(C, &c);
1791: PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn);
1792: return 0;
1793: }