Actual source code: inode.c
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
11: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
13: n = A->cmap->n;
14: m = A->rmap->n;
16: ns_row = a->inode.size;
18: min_mn = (m < n) ? m : n;
19: if (!ns) {
20: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
21: ;
22: for (; count + 1 < n; count++, i++)
23: ;
24: if (count < n) i++;
25: *size = i;
26: return 0;
27: }
28: PetscMalloc1(n + 1, &ns_col);
30: /* Use the same row structure wherever feasible. */
31: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
33: /* if m < n; pad up the remainder with inode_limit */
34: for (; count + 1 < n; count++, i++) ns_col[i] = 1;
35: /* The last node is the odd ball. padd it up with the remaining rows; */
36: if (count < n) {
37: ns_col[i] = n - count;
38: i++;
39: } else if (count > n) {
40: /* Adjust for the over estimation */
41: ns_col[i - 1] += n - count;
42: }
43: *size = i;
44: *ns = ns_col;
45: return 0;
46: }
48: /*
49: This builds symmetric version of nonzero structure,
50: */
51: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
52: {
53: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
54: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
55: PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
56: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
58: nslim_row = a->inode.node_count;
59: m = A->rmap->n;
60: n = A->cmap->n;
64: /* Use the row_inode as column_inode */
65: nslim_col = nslim_row;
66: ns_col = ns_row;
68: /* allocate space for reformatted inode structure */
69: PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc);
70: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
72: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
73: nsz = ns_col[i1];
74: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
75: }
76: /* allocate space for row pointers */
77: PetscCalloc1(nslim_row + 1, &ia);
78: *iia = ia;
79: PetscMalloc1(nslim_row + 1, &work);
81: /* determine the number of columns in each row */
82: ia[0] = oshift;
83: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
84: j = aj + ai[row] + ishift;
85: jmax = aj + ai[row + 1] + ishift;
86: if (j == jmax) continue; /* empty row */
87: col = *j++ + ishift;
88: i2 = tvc[col];
89: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
90: ia[i1 + 1]++;
91: ia[i2 + 1]++;
92: i2++; /* Start col of next node */
93: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
94: i2 = tvc[col];
95: }
96: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
97: }
99: /* shift ia[i] to point to next row */
100: for (i1 = 1; i1 < nslim_row + 1; i1++) {
101: row = ia[i1 - 1];
102: ia[i1] += row;
103: work[i1 - 1] = row - oshift;
104: }
106: /* allocate space for column pointers */
107: nz = ia[nslim_row] + (!ishift);
108: PetscMalloc1(nz, &ja);
109: *jja = ja;
111: /* loop over lower triangular part putting into ja */
112: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
113: j = aj + ai[row] + ishift;
114: jmax = aj + ai[row + 1] + ishift;
115: if (j == jmax) continue; /* empty row */
116: col = *j++ + ishift;
117: i2 = tvc[col];
118: while (i2 < i1 && j < jmax) {
119: ja[work[i2]++] = i1 + oshift;
120: ja[work[i1]++] = i2 + oshift;
121: ++i2;
122: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
123: i2 = tvc[col];
124: }
125: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
126: }
127: PetscFree(work);
128: PetscFree2(tns, tvc);
129: return 0;
130: }
132: /*
133: This builds nonsymmetric version of nonzero structure,
134: */
135: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
136: {
137: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
138: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
139: PetscInt *tns, *tvc, nsz, i1, i2;
140: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
143: nslim_row = a->inode.node_count;
144: n = A->cmap->n;
146: /* Create The column_inode for this matrix */
147: MatCreateColInode_Private(A, &nslim_col, &ns_col);
149: /* allocate space for reformatted column_inode structure */
150: PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc);
151: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
153: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
154: nsz = ns_col[i1];
155: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
156: }
157: /* allocate space for row pointers */
158: PetscCalloc1(nslim_row + 1, &ia);
159: *iia = ia;
160: PetscMalloc1(nslim_row + 1, &work);
162: /* determine the number of columns in each row */
163: ia[0] = oshift;
164: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
165: j = aj + ai[row] + ishift;
166: nz = ai[row + 1] - ai[row];
167: if (!nz) continue; /* empty row */
168: col = *j++ + ishift;
169: i2 = tvc[col];
170: while (nz-- > 0) { /* off-diagonal elements */
171: ia[i1 + 1]++;
172: i2++; /* Start col of next node */
173: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
174: if (nz > 0) i2 = tvc[col];
175: }
176: }
178: /* shift ia[i] to point to next row */
179: for (i1 = 1; i1 < nslim_row + 1; i1++) {
180: row = ia[i1 - 1];
181: ia[i1] += row;
182: work[i1 - 1] = row - oshift;
183: }
185: /* allocate space for column pointers */
186: nz = ia[nslim_row] + (!ishift);
187: PetscMalloc1(nz, &ja);
188: *jja = ja;
190: /* loop over matrix putting into ja */
191: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
192: j = aj + ai[row] + ishift;
193: nz = ai[row + 1] - ai[row];
194: if (!nz) continue; /* empty row */
195: col = *j++ + ishift;
196: i2 = tvc[col];
197: while (nz-- > 0) {
198: ja[work[i1]++] = i2 + oshift;
199: ++i2;
200: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
201: if (nz > 0) i2 = tvc[col];
202: }
203: }
204: PetscFree(ns_col);
205: PetscFree(work);
206: PetscFree2(tns, tvc);
207: return 0;
208: }
210: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
211: {
212: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
214: if (n) *n = a->inode.node_count;
215: if (!ia) return 0;
216: if (!blockcompressed) {
217: MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
218: } else if (symmetric) {
219: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift);
220: } else {
221: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift);
222: }
223: return 0;
224: }
226: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
227: {
228: if (!ia) return 0;
230: if (!blockcompressed) {
231: MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
232: } else {
233: PetscFree(*ia);
234: PetscFree(*ja);
235: }
236: return 0;
237: }
239: /* ----------------------------------------------------------- */
241: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
242: {
243: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
244: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
245: PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
248: nslim_row = a->inode.node_count;
249: n = A->cmap->n;
251: /* Create The column_inode for this matrix */
252: MatCreateColInode_Private(A, &nslim_col, &ns_col);
254: /* allocate space for reformatted column_inode structure */
255: PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc);
256: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
258: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
259: nsz = ns_col[i1];
260: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
261: }
262: /* allocate space for column pointers */
263: PetscCalloc1(nslim_col + 1, &ia);
264: *iia = ia;
265: PetscMalloc1(nslim_col + 1, &work);
267: /* determine the number of columns in each row */
268: ia[0] = oshift;
269: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
270: j = aj + ai[row] + ishift;
271: col = *j++ + ishift;
272: i2 = tvc[col];
273: nz = ai[row + 1] - ai[row];
274: while (nz-- > 0) { /* off-diagonal elements */
275: /* ia[i1+1]++; */
276: ia[i2 + 1]++;
277: i2++;
278: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
279: if (nz > 0) i2 = tvc[col];
280: }
281: }
283: /* shift ia[i] to point to next col */
284: for (i1 = 1; i1 < nslim_col + 1; i1++) {
285: col = ia[i1 - 1];
286: ia[i1] += col;
287: work[i1 - 1] = col - oshift;
288: }
290: /* allocate space for column pointers */
291: nz = ia[nslim_col] + (!ishift);
292: PetscMalloc1(nz, &ja);
293: *jja = ja;
295: /* loop over matrix putting into ja */
296: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
297: j = aj + ai[row] + ishift;
298: col = *j++ + ishift;
299: i2 = tvc[col];
300: nz = ai[row + 1] - ai[row];
301: while (nz-- > 0) {
302: /* ja[work[i1]++] = i2 + oshift; */
303: ja[work[i2]++] = i1 + oshift;
304: i2++;
305: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
306: if (nz > 0) i2 = tvc[col];
307: }
308: }
309: PetscFree(ns_col);
310: PetscFree(work);
311: PetscFree2(tns, tvc);
312: return 0;
313: }
315: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
316: {
317: MatCreateColInode_Private(A, n, NULL);
318: if (!ia) return 0;
320: if (!blockcompressed) {
321: MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
322: } else if (symmetric) {
323: /* Since the indices are symmetric it doesn't matter */
324: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift);
325: } else {
326: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift);
327: }
328: return 0;
329: }
331: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
332: {
333: if (!ia) return 0;
334: if (!blockcompressed) {
335: MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
336: } else {
337: PetscFree(*ia);
338: PetscFree(*ja);
339: }
340: return 0;
341: }
343: /* ----------------------------------------------------------- */
345: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
346: {
347: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
348: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
349: PetscScalar *y;
350: const PetscScalar *x;
351: const MatScalar *v1, *v2, *v3, *v4, *v5;
352: PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
353: const PetscInt *idx, *ns, *ii;
355: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
356: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
357: #endif
360: node_max = a->inode.node_count;
361: ns = a->inode.size; /* Node Size array */
362: VecGetArrayRead(xx, &x);
363: VecGetArray(yy, &y);
364: idx = a->j;
365: v1 = a->a;
366: ii = a->i;
368: for (i = 0, row = 0; i < node_max; ++i) {
369: nsz = ns[i];
370: n = ii[1] - ii[0];
371: nonzerorow += (n > 0) * nsz;
372: ii += nsz;
373: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
374: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
375: sz = n; /* No of non zeros in this row */
376: /* Switch on the size of Node */
377: switch (nsz) { /* Each loop in 'case' is unrolled */
378: case 1:
379: sum1 = 0.;
381: for (n = 0; n < sz - 1; n += 2) {
382: i1 = idx[0]; /* The instructions are ordered to */
383: i2 = idx[1]; /* make the compiler's job easy */
384: idx += 2;
385: tmp0 = x[i1];
386: tmp1 = x[i2];
387: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
388: v1 += 2;
389: }
391: if (n == sz - 1) { /* Take care of the last nonzero */
392: tmp0 = x[*idx++];
393: sum1 += *v1++ * tmp0;
394: }
395: y[row++] = sum1;
396: break;
397: case 2:
398: sum1 = 0.;
399: sum2 = 0.;
400: v2 = v1 + n;
402: for (n = 0; n < sz - 1; n += 2) {
403: i1 = idx[0];
404: i2 = idx[1];
405: idx += 2;
406: tmp0 = x[i1];
407: tmp1 = x[i2];
408: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
409: v1 += 2;
410: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
411: v2 += 2;
412: }
413: if (n == sz - 1) {
414: tmp0 = x[*idx++];
415: sum1 += *v1++ * tmp0;
416: sum2 += *v2++ * tmp0;
417: }
418: y[row++] = sum1;
419: y[row++] = sum2;
420: v1 = v2; /* Since the next block to be processed starts there*/
421: idx += sz;
422: break;
423: case 3:
424: sum1 = 0.;
425: sum2 = 0.;
426: sum3 = 0.;
427: v2 = v1 + n;
428: v3 = v2 + n;
430: for (n = 0; n < sz - 1; n += 2) {
431: i1 = idx[0];
432: i2 = idx[1];
433: idx += 2;
434: tmp0 = x[i1];
435: tmp1 = x[i2];
436: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
437: v1 += 2;
438: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
439: v2 += 2;
440: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
441: v3 += 2;
442: }
443: if (n == sz - 1) {
444: tmp0 = x[*idx++];
445: sum1 += *v1++ * tmp0;
446: sum2 += *v2++ * tmp0;
447: sum3 += *v3++ * tmp0;
448: }
449: y[row++] = sum1;
450: y[row++] = sum2;
451: y[row++] = sum3;
452: v1 = v3; /* Since the next block to be processed starts there*/
453: idx += 2 * sz;
454: break;
455: case 4:
456: sum1 = 0.;
457: sum2 = 0.;
458: sum3 = 0.;
459: sum4 = 0.;
460: v2 = v1 + n;
461: v3 = v2 + n;
462: v4 = v3 + n;
464: for (n = 0; n < sz - 1; n += 2) {
465: i1 = idx[0];
466: i2 = idx[1];
467: idx += 2;
468: tmp0 = x[i1];
469: tmp1 = x[i2];
470: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
471: v1 += 2;
472: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
473: v2 += 2;
474: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
475: v3 += 2;
476: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
477: v4 += 2;
478: }
479: if (n == sz - 1) {
480: tmp0 = x[*idx++];
481: sum1 += *v1++ * tmp0;
482: sum2 += *v2++ * tmp0;
483: sum3 += *v3++ * tmp0;
484: sum4 += *v4++ * tmp0;
485: }
486: y[row++] = sum1;
487: y[row++] = sum2;
488: y[row++] = sum3;
489: y[row++] = sum4;
490: v1 = v4; /* Since the next block to be processed starts there*/
491: idx += 3 * sz;
492: break;
493: case 5:
494: sum1 = 0.;
495: sum2 = 0.;
496: sum3 = 0.;
497: sum4 = 0.;
498: sum5 = 0.;
499: v2 = v1 + n;
500: v3 = v2 + n;
501: v4 = v3 + n;
502: v5 = v4 + n;
504: for (n = 0; n < sz - 1; n += 2) {
505: i1 = idx[0];
506: i2 = idx[1];
507: idx += 2;
508: tmp0 = x[i1];
509: tmp1 = x[i2];
510: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
511: v1 += 2;
512: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
513: v2 += 2;
514: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
515: v3 += 2;
516: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
517: v4 += 2;
518: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
519: v5 += 2;
520: }
521: if (n == sz - 1) {
522: tmp0 = x[*idx++];
523: sum1 += *v1++ * tmp0;
524: sum2 += *v2++ * tmp0;
525: sum3 += *v3++ * tmp0;
526: sum4 += *v4++ * tmp0;
527: sum5 += *v5++ * tmp0;
528: }
529: y[row++] = sum1;
530: y[row++] = sum2;
531: y[row++] = sum3;
532: y[row++] = sum4;
533: y[row++] = sum5;
534: v1 = v5; /* Since the next block to be processed starts there */
535: idx += 4 * sz;
536: break;
537: default:
538: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
539: }
540: }
541: VecRestoreArrayRead(xx, &x);
542: VecRestoreArray(yy, &y);
543: PetscLogFlops(2.0 * a->nz - nonzerorow);
544: return 0;
545: }
546: /* ----------------------------------------------------------- */
547: /* Almost same code as the MatMult_SeqAIJ_Inode() */
548: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
549: {
550: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
551: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
552: const MatScalar *v1, *v2, *v3, *v4, *v5;
553: const PetscScalar *x;
554: PetscScalar *y, *z, *zt;
555: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
556: const PetscInt *idx, *ns, *ii;
559: node_max = a->inode.node_count;
560: ns = a->inode.size; /* Node Size array */
562: VecGetArrayRead(xx, &x);
563: VecGetArrayPair(zz, yy, &z, &y);
564: zt = z;
566: idx = a->j;
567: v1 = a->a;
568: ii = a->i;
570: for (i = 0, row = 0; i < node_max; ++i) {
571: nsz = ns[i];
572: n = ii[1] - ii[0];
573: ii += nsz;
574: sz = n; /* No of non zeros in this row */
575: /* Switch on the size of Node */
576: switch (nsz) { /* Each loop in 'case' is unrolled */
577: case 1:
578: sum1 = *zt++;
580: for (n = 0; n < sz - 1; n += 2) {
581: i1 = idx[0]; /* The instructions are ordered to */
582: i2 = idx[1]; /* make the compiler's job easy */
583: idx += 2;
584: tmp0 = x[i1];
585: tmp1 = x[i2];
586: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
587: v1 += 2;
588: }
590: if (n == sz - 1) { /* Take care of the last nonzero */
591: tmp0 = x[*idx++];
592: sum1 += *v1++ * tmp0;
593: }
594: y[row++] = sum1;
595: break;
596: case 2:
597: sum1 = *zt++;
598: sum2 = *zt++;
599: v2 = v1 + n;
601: for (n = 0; n < sz - 1; n += 2) {
602: i1 = idx[0];
603: i2 = idx[1];
604: idx += 2;
605: tmp0 = x[i1];
606: tmp1 = x[i2];
607: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
608: v1 += 2;
609: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
610: v2 += 2;
611: }
612: if (n == sz - 1) {
613: tmp0 = x[*idx++];
614: sum1 += *v1++ * tmp0;
615: sum2 += *v2++ * tmp0;
616: }
617: y[row++] = sum1;
618: y[row++] = sum2;
619: v1 = v2; /* Since the next block to be processed starts there*/
620: idx += sz;
621: break;
622: case 3:
623: sum1 = *zt++;
624: sum2 = *zt++;
625: sum3 = *zt++;
626: v2 = v1 + n;
627: v3 = v2 + n;
629: for (n = 0; n < sz - 1; n += 2) {
630: i1 = idx[0];
631: i2 = idx[1];
632: idx += 2;
633: tmp0 = x[i1];
634: tmp1 = x[i2];
635: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
636: v1 += 2;
637: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
638: v2 += 2;
639: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
640: v3 += 2;
641: }
642: if (n == sz - 1) {
643: tmp0 = x[*idx++];
644: sum1 += *v1++ * tmp0;
645: sum2 += *v2++ * tmp0;
646: sum3 += *v3++ * tmp0;
647: }
648: y[row++] = sum1;
649: y[row++] = sum2;
650: y[row++] = sum3;
651: v1 = v3; /* Since the next block to be processed starts there*/
652: idx += 2 * sz;
653: break;
654: case 4:
655: sum1 = *zt++;
656: sum2 = *zt++;
657: sum3 = *zt++;
658: sum4 = *zt++;
659: v2 = v1 + n;
660: v3 = v2 + n;
661: v4 = v3 + n;
663: for (n = 0; n < sz - 1; n += 2) {
664: i1 = idx[0];
665: i2 = idx[1];
666: idx += 2;
667: tmp0 = x[i1];
668: tmp1 = x[i2];
669: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
670: v1 += 2;
671: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
672: v2 += 2;
673: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
674: v3 += 2;
675: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
676: v4 += 2;
677: }
678: if (n == sz - 1) {
679: tmp0 = x[*idx++];
680: sum1 += *v1++ * tmp0;
681: sum2 += *v2++ * tmp0;
682: sum3 += *v3++ * tmp0;
683: sum4 += *v4++ * tmp0;
684: }
685: y[row++] = sum1;
686: y[row++] = sum2;
687: y[row++] = sum3;
688: y[row++] = sum4;
689: v1 = v4; /* Since the next block to be processed starts there*/
690: idx += 3 * sz;
691: break;
692: case 5:
693: sum1 = *zt++;
694: sum2 = *zt++;
695: sum3 = *zt++;
696: sum4 = *zt++;
697: sum5 = *zt++;
698: v2 = v1 + n;
699: v3 = v2 + n;
700: v4 = v3 + n;
701: v5 = v4 + n;
703: for (n = 0; n < sz - 1; n += 2) {
704: i1 = idx[0];
705: i2 = idx[1];
706: idx += 2;
707: tmp0 = x[i1];
708: tmp1 = x[i2];
709: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
710: v1 += 2;
711: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
712: v2 += 2;
713: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
714: v3 += 2;
715: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
716: v4 += 2;
717: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
718: v5 += 2;
719: }
720: if (n == sz - 1) {
721: tmp0 = x[*idx++];
722: sum1 += *v1++ * tmp0;
723: sum2 += *v2++ * tmp0;
724: sum3 += *v3++ * tmp0;
725: sum4 += *v4++ * tmp0;
726: sum5 += *v5++ * tmp0;
727: }
728: y[row++] = sum1;
729: y[row++] = sum2;
730: y[row++] = sum3;
731: y[row++] = sum4;
732: y[row++] = sum5;
733: v1 = v5; /* Since the next block to be processed starts there */
734: idx += 4 * sz;
735: break;
736: default:
737: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
738: }
739: }
740: VecRestoreArrayRead(xx, &x);
741: VecRestoreArrayPair(zz, yy, &z, &y);
742: PetscLogFlops(2.0 * a->nz);
743: return 0;
744: }
746: /* ----------------------------------------------------------- */
747: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
748: {
749: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
750: IS iscol = a->col, isrow = a->row;
751: const PetscInt *r, *c, *rout, *cout;
752: PetscInt i, j, n = A->rmap->n, nz;
753: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
754: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
755: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
756: PetscScalar sum1, sum2, sum3, sum4, sum5;
757: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
758: const PetscScalar *b;
761: node_max = a->inode.node_count;
762: ns = a->inode.size; /* Node Size array */
764: VecGetArrayRead(bb, &b);
765: VecGetArrayWrite(xx, &x);
766: tmp = a->solve_work;
768: ISGetIndices(isrow, &rout);
769: r = rout;
770: ISGetIndices(iscol, &cout);
771: c = cout + (n - 1);
773: /* forward solve the lower triangular */
774: tmps = tmp;
775: aa = a_a;
776: aj = a_j;
777: ad = a->diag;
779: for (i = 0, row = 0; i < node_max; ++i) {
780: nsz = ns[i];
781: aii = ai[row];
782: v1 = aa + aii;
783: vi = aj + aii;
784: nz = ad[row] - aii;
785: if (i < node_max - 1) {
786: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
787: * but our indexing to determine it's size could. */
788: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
789: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
790: PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
791: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
792: }
794: switch (nsz) { /* Each loop in 'case' is unrolled */
795: case 1:
796: sum1 = b[*r++];
797: for (j = 0; j < nz - 1; j += 2) {
798: i0 = vi[0];
799: i1 = vi[1];
800: vi += 2;
801: tmp0 = tmps[i0];
802: tmp1 = tmps[i1];
803: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
804: v1 += 2;
805: }
806: if (j == nz - 1) {
807: tmp0 = tmps[*vi++];
808: sum1 -= *v1++ * tmp0;
809: }
810: tmp[row++] = sum1;
811: break;
812: case 2:
813: sum1 = b[*r++];
814: sum2 = b[*r++];
815: v2 = aa + ai[row + 1];
817: for (j = 0; j < nz - 1; j += 2) {
818: i0 = vi[0];
819: i1 = vi[1];
820: vi += 2;
821: tmp0 = tmps[i0];
822: tmp1 = tmps[i1];
823: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
824: v1 += 2;
825: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
826: v2 += 2;
827: }
828: if (j == nz - 1) {
829: tmp0 = tmps[*vi++];
830: sum1 -= *v1++ * tmp0;
831: sum2 -= *v2++ * tmp0;
832: }
833: sum2 -= *v2++ * sum1;
834: tmp[row++] = sum1;
835: tmp[row++] = sum2;
836: break;
837: case 3:
838: sum1 = b[*r++];
839: sum2 = b[*r++];
840: sum3 = b[*r++];
841: v2 = aa + ai[row + 1];
842: v3 = aa + ai[row + 2];
844: for (j = 0; j < nz - 1; j += 2) {
845: i0 = vi[0];
846: i1 = vi[1];
847: vi += 2;
848: tmp0 = tmps[i0];
849: tmp1 = tmps[i1];
850: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
851: v1 += 2;
852: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
853: v2 += 2;
854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
855: v3 += 2;
856: }
857: if (j == nz - 1) {
858: tmp0 = tmps[*vi++];
859: sum1 -= *v1++ * tmp0;
860: sum2 -= *v2++ * tmp0;
861: sum3 -= *v3++ * tmp0;
862: }
863: sum2 -= *v2++ * sum1;
864: sum3 -= *v3++ * sum1;
865: sum3 -= *v3++ * sum2;
867: tmp[row++] = sum1;
868: tmp[row++] = sum2;
869: tmp[row++] = sum3;
870: break;
872: case 4:
873: sum1 = b[*r++];
874: sum2 = b[*r++];
875: sum3 = b[*r++];
876: sum4 = b[*r++];
877: v2 = aa + ai[row + 1];
878: v3 = aa + ai[row + 2];
879: v4 = aa + ai[row + 3];
881: for (j = 0; j < nz - 1; j += 2) {
882: i0 = vi[0];
883: i1 = vi[1];
884: vi += 2;
885: tmp0 = tmps[i0];
886: tmp1 = tmps[i1];
887: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
888: v1 += 2;
889: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
890: v2 += 2;
891: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
892: v3 += 2;
893: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
894: v4 += 2;
895: }
896: if (j == nz - 1) {
897: tmp0 = tmps[*vi++];
898: sum1 -= *v1++ * tmp0;
899: sum2 -= *v2++ * tmp0;
900: sum3 -= *v3++ * tmp0;
901: sum4 -= *v4++ * tmp0;
902: }
903: sum2 -= *v2++ * sum1;
904: sum3 -= *v3++ * sum1;
905: sum4 -= *v4++ * sum1;
906: sum3 -= *v3++ * sum2;
907: sum4 -= *v4++ * sum2;
908: sum4 -= *v4++ * sum3;
910: tmp[row++] = sum1;
911: tmp[row++] = sum2;
912: tmp[row++] = sum3;
913: tmp[row++] = sum4;
914: break;
915: case 5:
916: sum1 = b[*r++];
917: sum2 = b[*r++];
918: sum3 = b[*r++];
919: sum4 = b[*r++];
920: sum5 = b[*r++];
921: v2 = aa + ai[row + 1];
922: v3 = aa + ai[row + 2];
923: v4 = aa + ai[row + 3];
924: v5 = aa + ai[row + 4];
926: for (j = 0; j < nz - 1; j += 2) {
927: i0 = vi[0];
928: i1 = vi[1];
929: vi += 2;
930: tmp0 = tmps[i0];
931: tmp1 = tmps[i1];
932: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
933: v1 += 2;
934: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
935: v2 += 2;
936: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
937: v3 += 2;
938: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
939: v4 += 2;
940: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
941: v5 += 2;
942: }
943: if (j == nz - 1) {
944: tmp0 = tmps[*vi++];
945: sum1 -= *v1++ * tmp0;
946: sum2 -= *v2++ * tmp0;
947: sum3 -= *v3++ * tmp0;
948: sum4 -= *v4++ * tmp0;
949: sum5 -= *v5++ * tmp0;
950: }
952: sum2 -= *v2++ * sum1;
953: sum3 -= *v3++ * sum1;
954: sum4 -= *v4++ * sum1;
955: sum5 -= *v5++ * sum1;
956: sum3 -= *v3++ * sum2;
957: sum4 -= *v4++ * sum2;
958: sum5 -= *v5++ * sum2;
959: sum4 -= *v4++ * sum3;
960: sum5 -= *v5++ * sum3;
961: sum5 -= *v5++ * sum4;
963: tmp[row++] = sum1;
964: tmp[row++] = sum2;
965: tmp[row++] = sum3;
966: tmp[row++] = sum4;
967: tmp[row++] = sum5;
968: break;
969: default:
970: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
971: }
972: }
973: /* backward solve the upper triangular */
974: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
975: nsz = ns[i];
976: aii = ai[row + 1] - 1;
977: v1 = aa + aii;
978: vi = aj + aii;
979: nz = aii - ad[row];
980: switch (nsz) { /* Each loop in 'case' is unrolled */
981: case 1:
982: sum1 = tmp[row];
984: for (j = nz; j > 1; j -= 2) {
985: vi -= 2;
986: i0 = vi[2];
987: i1 = vi[1];
988: tmp0 = tmps[i0];
989: tmp1 = tmps[i1];
990: v1 -= 2;
991: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
992: }
993: if (j == 1) {
994: tmp0 = tmps[*vi--];
995: sum1 -= *v1-- * tmp0;
996: }
997: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
998: row--;
999: break;
1000: case 2:
1001: sum1 = tmp[row];
1002: sum2 = tmp[row - 1];
1003: v2 = aa + ai[row] - 1;
1004: for (j = nz; j > 1; j -= 2) {
1005: vi -= 2;
1006: i0 = vi[2];
1007: i1 = vi[1];
1008: tmp0 = tmps[i0];
1009: tmp1 = tmps[i1];
1010: v1 -= 2;
1011: v2 -= 2;
1012: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1013: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1014: }
1015: if (j == 1) {
1016: tmp0 = tmps[*vi--];
1017: sum1 -= *v1-- * tmp0;
1018: sum2 -= *v2-- * tmp0;
1019: }
1021: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1022: row--;
1023: sum2 -= *v2-- * tmp0;
1024: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1025: row--;
1026: break;
1027: case 3:
1028: sum1 = tmp[row];
1029: sum2 = tmp[row - 1];
1030: sum3 = tmp[row - 2];
1031: v2 = aa + ai[row] - 1;
1032: v3 = aa + ai[row - 1] - 1;
1033: for (j = nz; j > 1; j -= 2) {
1034: vi -= 2;
1035: i0 = vi[2];
1036: i1 = vi[1];
1037: tmp0 = tmps[i0];
1038: tmp1 = tmps[i1];
1039: v1 -= 2;
1040: v2 -= 2;
1041: v3 -= 2;
1042: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1043: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1044: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1045: }
1046: if (j == 1) {
1047: tmp0 = tmps[*vi--];
1048: sum1 -= *v1-- * tmp0;
1049: sum2 -= *v2-- * tmp0;
1050: sum3 -= *v3-- * tmp0;
1051: }
1052: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1053: row--;
1054: sum2 -= *v2-- * tmp0;
1055: sum3 -= *v3-- * tmp0;
1056: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1057: row--;
1058: sum3 -= *v3-- * tmp0;
1059: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1060: row--;
1062: break;
1063: case 4:
1064: sum1 = tmp[row];
1065: sum2 = tmp[row - 1];
1066: sum3 = tmp[row - 2];
1067: sum4 = tmp[row - 3];
1068: v2 = aa + ai[row] - 1;
1069: v3 = aa + ai[row - 1] - 1;
1070: v4 = aa + ai[row - 2] - 1;
1072: for (j = nz; j > 1; j -= 2) {
1073: vi -= 2;
1074: i0 = vi[2];
1075: i1 = vi[1];
1076: tmp0 = tmps[i0];
1077: tmp1 = tmps[i1];
1078: v1 -= 2;
1079: v2 -= 2;
1080: v3 -= 2;
1081: v4 -= 2;
1082: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1083: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1084: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1085: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1086: }
1087: if (j == 1) {
1088: tmp0 = tmps[*vi--];
1089: sum1 -= *v1-- * tmp0;
1090: sum2 -= *v2-- * tmp0;
1091: sum3 -= *v3-- * tmp0;
1092: sum4 -= *v4-- * tmp0;
1093: }
1095: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1096: row--;
1097: sum2 -= *v2-- * tmp0;
1098: sum3 -= *v3-- * tmp0;
1099: sum4 -= *v4-- * tmp0;
1100: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1101: row--;
1102: sum3 -= *v3-- * tmp0;
1103: sum4 -= *v4-- * tmp0;
1104: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1105: row--;
1106: sum4 -= *v4-- * tmp0;
1107: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1108: row--;
1109: break;
1110: case 5:
1111: sum1 = tmp[row];
1112: sum2 = tmp[row - 1];
1113: sum3 = tmp[row - 2];
1114: sum4 = tmp[row - 3];
1115: sum5 = tmp[row - 4];
1116: v2 = aa + ai[row] - 1;
1117: v3 = aa + ai[row - 1] - 1;
1118: v4 = aa + ai[row - 2] - 1;
1119: v5 = aa + ai[row - 3] - 1;
1120: for (j = nz; j > 1; j -= 2) {
1121: vi -= 2;
1122: i0 = vi[2];
1123: i1 = vi[1];
1124: tmp0 = tmps[i0];
1125: tmp1 = tmps[i1];
1126: v1 -= 2;
1127: v2 -= 2;
1128: v3 -= 2;
1129: v4 -= 2;
1130: v5 -= 2;
1131: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136: }
1137: if (j == 1) {
1138: tmp0 = tmps[*vi--];
1139: sum1 -= *v1-- * tmp0;
1140: sum2 -= *v2-- * tmp0;
1141: sum3 -= *v3-- * tmp0;
1142: sum4 -= *v4-- * tmp0;
1143: sum5 -= *v5-- * tmp0;
1144: }
1146: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1147: row--;
1148: sum2 -= *v2-- * tmp0;
1149: sum3 -= *v3-- * tmp0;
1150: sum4 -= *v4-- * tmp0;
1151: sum5 -= *v5-- * tmp0;
1152: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1153: row--;
1154: sum3 -= *v3-- * tmp0;
1155: sum4 -= *v4-- * tmp0;
1156: sum5 -= *v5-- * tmp0;
1157: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1158: row--;
1159: sum4 -= *v4-- * tmp0;
1160: sum5 -= *v5-- * tmp0;
1161: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1162: row--;
1163: sum5 -= *v5-- * tmp0;
1164: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1165: row--;
1166: break;
1167: default:
1168: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1169: }
1170: }
1171: ISRestoreIndices(isrow, &rout);
1172: ISRestoreIndices(iscol, &cout);
1173: VecRestoreArrayRead(bb, &b);
1174: VecRestoreArrayWrite(xx, &x);
1175: PetscLogFlops(2.0 * a->nz - A->cmap->n);
1176: return 0;
1177: }
1179: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1180: {
1181: Mat C = B;
1182: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1183: IS isrow = b->row, isicol = b->icol;
1184: const PetscInt *r, *ic, *ics;
1185: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1186: PetscInt i, j, k, nz, nzL, row, *pj;
1187: const PetscInt *ajtmp, *bjtmp;
1188: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1189: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1190: FactorShiftCtx sctx;
1191: const PetscInt *ddiag;
1192: PetscReal rs;
1193: MatScalar d;
1194: PetscInt inod, nodesz, node_max, col;
1195: const PetscInt *ns;
1196: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1198: /* MatPivotSetUp(): initialize shift context sctx */
1199: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
1201: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1202: ddiag = a->diag;
1203: sctx.shift_top = info->zeropivot;
1204: for (i = 0; i < n; i++) {
1205: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1206: d = (aa)[ddiag[i]];
1207: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1208: v = aa + ai[i];
1209: nz = ai[i + 1] - ai[i];
1210: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1211: if (rs > sctx.shift_top) sctx.shift_top = rs;
1212: }
1213: sctx.shift_top *= 1.1;
1214: sctx.nshift_max = 5;
1215: sctx.shift_lo = 0.;
1216: sctx.shift_hi = 1.;
1217: }
1219: ISGetIndices(isrow, &r);
1220: ISGetIndices(isicol, &ic);
1222: PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4);
1223: ics = ic;
1225: node_max = a->inode.node_count;
1226: ns = a->inode.size;
1229: /* If max inode size > 4, split it into two inodes.*/
1230: /* also map the inode sizes according to the ordering */
1231: PetscMalloc1(n + 1, &tmp_vec1);
1232: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1233: if (ns[i] > 4) {
1234: tmp_vec1[j] = 4;
1235: ++j;
1236: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1237: } else {
1238: tmp_vec1[j] = ns[i];
1239: }
1240: }
1241: /* Use the correct node_max */
1242: node_max = j;
1244: /* Now reorder the inode info based on mat re-ordering info */
1245: /* First create a row -> inode_size_array_index map */
1246: PetscMalloc1(n + 1, &nsmap);
1247: PetscMalloc1(node_max + 1, &tmp_vec2);
1248: for (i = 0, row = 0; i < node_max; i++) {
1249: nodesz = tmp_vec1[i];
1250: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1251: }
1252: /* Using nsmap, create a reordered ns structure */
1253: for (i = 0, j = 0; i < node_max; i++) {
1254: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1255: tmp_vec2[i] = nodesz;
1256: j += nodesz;
1257: }
1258: PetscFree(nsmap);
1259: PetscFree(tmp_vec1);
1261: /* Now use the correct ns */
1262: ns = tmp_vec2;
1264: do {
1265: sctx.newshift = PETSC_FALSE;
1266: /* Now loop over each block-row, and do the factorization */
1267: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1268: nodesz = ns[inod];
1270: switch (nodesz) {
1271: case 1:
1272: /*----------*/
1273: /* zero rtmp1 */
1274: /* L part */
1275: nz = bi[i + 1] - bi[i];
1276: bjtmp = bj + bi[i];
1277: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1279: /* U part */
1280: nz = bdiag[i] - bdiag[i + 1];
1281: bjtmp = bj + bdiag[i + 1] + 1;
1282: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1284: /* load in initial (unfactored row) */
1285: nz = ai[r[i] + 1] - ai[r[i]];
1286: ajtmp = aj + ai[r[i]];
1287: v = aa + ai[r[i]];
1288: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1290: /* ZeropivotApply() */
1291: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1293: /* elimination */
1294: bjtmp = bj + bi[i];
1295: row = *bjtmp++;
1296: nzL = bi[i + 1] - bi[i];
1297: for (k = 0; k < nzL; k++) {
1298: pc = rtmp1 + row;
1299: if (*pc != 0.0) {
1300: pv = b->a + bdiag[row];
1301: mul1 = *pc * (*pv);
1302: *pc = mul1;
1303: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1304: pv = b->a + bdiag[row + 1] + 1;
1305: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1306: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1307: PetscLogFlops(1 + 2.0 * nz);
1308: }
1309: row = *bjtmp++;
1310: }
1312: /* finished row so stick it into b->a */
1313: rs = 0.0;
1314: /* L part */
1315: pv = b->a + bi[i];
1316: pj = b->j + bi[i];
1317: nz = bi[i + 1] - bi[i];
1318: for (j = 0; j < nz; j++) {
1319: pv[j] = rtmp1[pj[j]];
1320: rs += PetscAbsScalar(pv[j]);
1321: }
1323: /* U part */
1324: pv = b->a + bdiag[i + 1] + 1;
1325: pj = b->j + bdiag[i + 1] + 1;
1326: nz = bdiag[i] - bdiag[i + 1] - 1;
1327: for (j = 0; j < nz; j++) {
1328: pv[j] = rtmp1[pj[j]];
1329: rs += PetscAbsScalar(pv[j]);
1330: }
1332: /* Check zero pivot */
1333: sctx.rs = rs;
1334: sctx.pv = rtmp1[i];
1335: MatPivotCheck(B, A, info, &sctx, i);
1336: if (sctx.newshift) break;
1338: /* Mark diagonal and invert diagonal for simpler triangular solves */
1339: pv = b->a + bdiag[i];
1340: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1341: break;
1343: case 2:
1344: /*----------*/
1345: /* zero rtmp1 and rtmp2 */
1346: /* L part */
1347: nz = bi[i + 1] - bi[i];
1348: bjtmp = bj + bi[i];
1349: for (j = 0; j < nz; j++) {
1350: col = bjtmp[j];
1351: rtmp1[col] = 0.0;
1352: rtmp2[col] = 0.0;
1353: }
1355: /* U part */
1356: nz = bdiag[i] - bdiag[i + 1];
1357: bjtmp = bj + bdiag[i + 1] + 1;
1358: for (j = 0; j < nz; j++) {
1359: col = bjtmp[j];
1360: rtmp1[col] = 0.0;
1361: rtmp2[col] = 0.0;
1362: }
1364: /* load in initial (unfactored row) */
1365: nz = ai[r[i] + 1] - ai[r[i]];
1366: ajtmp = aj + ai[r[i]];
1367: v1 = aa + ai[r[i]];
1368: v2 = aa + ai[r[i] + 1];
1369: for (j = 0; j < nz; j++) {
1370: col = ics[ajtmp[j]];
1371: rtmp1[col] = v1[j];
1372: rtmp2[col] = v2[j];
1373: }
1374: /* ZeropivotApply(): shift the diagonal of the matrix */
1375: rtmp1[i] += sctx.shift_amount;
1376: rtmp2[i + 1] += sctx.shift_amount;
1378: /* elimination */
1379: bjtmp = bj + bi[i];
1380: row = *bjtmp++; /* pivot row */
1381: nzL = bi[i + 1] - bi[i];
1382: for (k = 0; k < nzL; k++) {
1383: pc1 = rtmp1 + row;
1384: pc2 = rtmp2 + row;
1385: if (*pc1 != 0.0 || *pc2 != 0.0) {
1386: pv = b->a + bdiag[row];
1387: mul1 = *pc1 * (*pv);
1388: mul2 = *pc2 * (*pv);
1389: *pc1 = mul1;
1390: *pc2 = mul2;
1392: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1393: pv = b->a + bdiag[row + 1] + 1;
1394: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1395: for (j = 0; j < nz; j++) {
1396: col = pj[j];
1397: rtmp1[col] -= mul1 * pv[j];
1398: rtmp2[col] -= mul2 * pv[j];
1399: }
1400: PetscLogFlops(2 + 4.0 * nz);
1401: }
1402: row = *bjtmp++;
1403: }
1405: /* finished row i; check zero pivot, then stick row i into b->a */
1406: rs = 0.0;
1407: /* L part */
1408: pc1 = b->a + bi[i];
1409: pj = b->j + bi[i];
1410: nz = bi[i + 1] - bi[i];
1411: for (j = 0; j < nz; j++) {
1412: col = pj[j];
1413: pc1[j] = rtmp1[col];
1414: rs += PetscAbsScalar(pc1[j]);
1415: }
1416: /* U part */
1417: pc1 = b->a + bdiag[i + 1] + 1;
1418: pj = b->j + bdiag[i + 1] + 1;
1419: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1420: for (j = 0; j < nz; j++) {
1421: col = pj[j];
1422: pc1[j] = rtmp1[col];
1423: rs += PetscAbsScalar(pc1[j]);
1424: }
1426: sctx.rs = rs;
1427: sctx.pv = rtmp1[i];
1428: MatPivotCheck(B, A, info, &sctx, i);
1429: if (sctx.newshift) break;
1430: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1431: *pc1 = 1.0 / sctx.pv;
1433: /* Now take care of diagonal 2x2 block. */
1434: pc2 = rtmp2 + i;
1435: if (*pc2 != 0.0) {
1436: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1437: *pc2 = mul1; /* insert L entry */
1438: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1439: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1440: for (j = 0; j < nz; j++) {
1441: col = pj[j];
1442: rtmp2[col] -= mul1 * rtmp1[col];
1443: }
1444: PetscLogFlops(1 + 2.0 * nz);
1445: }
1447: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1448: rs = 0.0;
1449: /* L part */
1450: pc2 = b->a + bi[i + 1];
1451: pj = b->j + bi[i + 1];
1452: nz = bi[i + 2] - bi[i + 1];
1453: for (j = 0; j < nz; j++) {
1454: col = pj[j];
1455: pc2[j] = rtmp2[col];
1456: rs += PetscAbsScalar(pc2[j]);
1457: }
1458: /* U part */
1459: pc2 = b->a + bdiag[i + 2] + 1;
1460: pj = b->j + bdiag[i + 2] + 1;
1461: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1462: for (j = 0; j < nz; j++) {
1463: col = pj[j];
1464: pc2[j] = rtmp2[col];
1465: rs += PetscAbsScalar(pc2[j]);
1466: }
1468: sctx.rs = rs;
1469: sctx.pv = rtmp2[i + 1];
1470: MatPivotCheck(B, A, info, &sctx, i + 1);
1471: if (sctx.newshift) break;
1472: pc2 = b->a + bdiag[i + 1];
1473: *pc2 = 1.0 / sctx.pv;
1474: break;
1476: case 3:
1477: /*----------*/
1478: /* zero rtmp */
1479: /* L part */
1480: nz = bi[i + 1] - bi[i];
1481: bjtmp = bj + bi[i];
1482: for (j = 0; j < nz; j++) {
1483: col = bjtmp[j];
1484: rtmp1[col] = 0.0;
1485: rtmp2[col] = 0.0;
1486: rtmp3[col] = 0.0;
1487: }
1489: /* U part */
1490: nz = bdiag[i] - bdiag[i + 1];
1491: bjtmp = bj + bdiag[i + 1] + 1;
1492: for (j = 0; j < nz; j++) {
1493: col = bjtmp[j];
1494: rtmp1[col] = 0.0;
1495: rtmp2[col] = 0.0;
1496: rtmp3[col] = 0.0;
1497: }
1499: /* load in initial (unfactored row) */
1500: nz = ai[r[i] + 1] - ai[r[i]];
1501: ajtmp = aj + ai[r[i]];
1502: v1 = aa + ai[r[i]];
1503: v2 = aa + ai[r[i] + 1];
1504: v3 = aa + ai[r[i] + 2];
1505: for (j = 0; j < nz; j++) {
1506: col = ics[ajtmp[j]];
1507: rtmp1[col] = v1[j];
1508: rtmp2[col] = v2[j];
1509: rtmp3[col] = v3[j];
1510: }
1511: /* ZeropivotApply(): shift the diagonal of the matrix */
1512: rtmp1[i] += sctx.shift_amount;
1513: rtmp2[i + 1] += sctx.shift_amount;
1514: rtmp3[i + 2] += sctx.shift_amount;
1516: /* elimination */
1517: bjtmp = bj + bi[i];
1518: row = *bjtmp++; /* pivot row */
1519: nzL = bi[i + 1] - bi[i];
1520: for (k = 0; k < nzL; k++) {
1521: pc1 = rtmp1 + row;
1522: pc2 = rtmp2 + row;
1523: pc3 = rtmp3 + row;
1524: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1525: pv = b->a + bdiag[row];
1526: mul1 = *pc1 * (*pv);
1527: mul2 = *pc2 * (*pv);
1528: mul3 = *pc3 * (*pv);
1529: *pc1 = mul1;
1530: *pc2 = mul2;
1531: *pc3 = mul3;
1533: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1534: pv = b->a + bdiag[row + 1] + 1;
1535: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1536: for (j = 0; j < nz; j++) {
1537: col = pj[j];
1538: rtmp1[col] -= mul1 * pv[j];
1539: rtmp2[col] -= mul2 * pv[j];
1540: rtmp3[col] -= mul3 * pv[j];
1541: }
1542: PetscLogFlops(3 + 6.0 * nz);
1543: }
1544: row = *bjtmp++;
1545: }
1547: /* finished row i; check zero pivot, then stick row i into b->a */
1548: rs = 0.0;
1549: /* L part */
1550: pc1 = b->a + bi[i];
1551: pj = b->j + bi[i];
1552: nz = bi[i + 1] - bi[i];
1553: for (j = 0; j < nz; j++) {
1554: col = pj[j];
1555: pc1[j] = rtmp1[col];
1556: rs += PetscAbsScalar(pc1[j]);
1557: }
1558: /* U part */
1559: pc1 = b->a + bdiag[i + 1] + 1;
1560: pj = b->j + bdiag[i + 1] + 1;
1561: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1562: for (j = 0; j < nz; j++) {
1563: col = pj[j];
1564: pc1[j] = rtmp1[col];
1565: rs += PetscAbsScalar(pc1[j]);
1566: }
1568: sctx.rs = rs;
1569: sctx.pv = rtmp1[i];
1570: MatPivotCheck(B, A, info, &sctx, i);
1571: if (sctx.newshift) break;
1572: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1573: *pc1 = 1.0 / sctx.pv;
1575: /* Now take care of 1st column of diagonal 3x3 block. */
1576: pc2 = rtmp2 + i;
1577: pc3 = rtmp3 + i;
1578: if (*pc2 != 0.0 || *pc3 != 0.0) {
1579: mul2 = (*pc2) * (*pc1);
1580: *pc2 = mul2;
1581: mul3 = (*pc3) * (*pc1);
1582: *pc3 = mul3;
1583: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1584: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1585: for (j = 0; j < nz; j++) {
1586: col = pj[j];
1587: rtmp2[col] -= mul2 * rtmp1[col];
1588: rtmp3[col] -= mul3 * rtmp1[col];
1589: }
1590: PetscLogFlops(2 + 4.0 * nz);
1591: }
1593: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1594: rs = 0.0;
1595: /* L part */
1596: pc2 = b->a + bi[i + 1];
1597: pj = b->j + bi[i + 1];
1598: nz = bi[i + 2] - bi[i + 1];
1599: for (j = 0; j < nz; j++) {
1600: col = pj[j];
1601: pc2[j] = rtmp2[col];
1602: rs += PetscAbsScalar(pc2[j]);
1603: }
1604: /* U part */
1605: pc2 = b->a + bdiag[i + 2] + 1;
1606: pj = b->j + bdiag[i + 2] + 1;
1607: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1608: for (j = 0; j < nz; j++) {
1609: col = pj[j];
1610: pc2[j] = rtmp2[col];
1611: rs += PetscAbsScalar(pc2[j]);
1612: }
1614: sctx.rs = rs;
1615: sctx.pv = rtmp2[i + 1];
1616: MatPivotCheck(B, A, info, &sctx, i + 1);
1617: if (sctx.newshift) break;
1618: pc2 = b->a + bdiag[i + 1];
1619: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1621: /* Now take care of 2nd column of diagonal 3x3 block. */
1622: pc3 = rtmp3 + i + 1;
1623: if (*pc3 != 0.0) {
1624: mul3 = (*pc3) * (*pc2);
1625: *pc3 = mul3;
1626: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1627: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1628: for (j = 0; j < nz; j++) {
1629: col = pj[j];
1630: rtmp3[col] -= mul3 * rtmp2[col];
1631: }
1632: PetscLogFlops(1 + 2.0 * nz);
1633: }
1635: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1636: rs = 0.0;
1637: /* L part */
1638: pc3 = b->a + bi[i + 2];
1639: pj = b->j + bi[i + 2];
1640: nz = bi[i + 3] - bi[i + 2];
1641: for (j = 0; j < nz; j++) {
1642: col = pj[j];
1643: pc3[j] = rtmp3[col];
1644: rs += PetscAbsScalar(pc3[j]);
1645: }
1646: /* U part */
1647: pc3 = b->a + bdiag[i + 3] + 1;
1648: pj = b->j + bdiag[i + 3] + 1;
1649: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1650: for (j = 0; j < nz; j++) {
1651: col = pj[j];
1652: pc3[j] = rtmp3[col];
1653: rs += PetscAbsScalar(pc3[j]);
1654: }
1656: sctx.rs = rs;
1657: sctx.pv = rtmp3[i + 2];
1658: MatPivotCheck(B, A, info, &sctx, i + 2);
1659: if (sctx.newshift) break;
1660: pc3 = b->a + bdiag[i + 2];
1661: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1662: break;
1663: case 4:
1664: /*----------*/
1665: /* zero rtmp */
1666: /* L part */
1667: nz = bi[i + 1] - bi[i];
1668: bjtmp = bj + bi[i];
1669: for (j = 0; j < nz; j++) {
1670: col = bjtmp[j];
1671: rtmp1[col] = 0.0;
1672: rtmp2[col] = 0.0;
1673: rtmp3[col] = 0.0;
1674: rtmp4[col] = 0.0;
1675: }
1677: /* U part */
1678: nz = bdiag[i] - bdiag[i + 1];
1679: bjtmp = bj + bdiag[i + 1] + 1;
1680: for (j = 0; j < nz; j++) {
1681: col = bjtmp[j];
1682: rtmp1[col] = 0.0;
1683: rtmp2[col] = 0.0;
1684: rtmp3[col] = 0.0;
1685: rtmp4[col] = 0.0;
1686: }
1688: /* load in initial (unfactored row) */
1689: nz = ai[r[i] + 1] - ai[r[i]];
1690: ajtmp = aj + ai[r[i]];
1691: v1 = aa + ai[r[i]];
1692: v2 = aa + ai[r[i] + 1];
1693: v3 = aa + ai[r[i] + 2];
1694: v4 = aa + ai[r[i] + 3];
1695: for (j = 0; j < nz; j++) {
1696: col = ics[ajtmp[j]];
1697: rtmp1[col] = v1[j];
1698: rtmp2[col] = v2[j];
1699: rtmp3[col] = v3[j];
1700: rtmp4[col] = v4[j];
1701: }
1702: /* ZeropivotApply(): shift the diagonal of the matrix */
1703: rtmp1[i] += sctx.shift_amount;
1704: rtmp2[i + 1] += sctx.shift_amount;
1705: rtmp3[i + 2] += sctx.shift_amount;
1706: rtmp4[i + 3] += sctx.shift_amount;
1708: /* elimination */
1709: bjtmp = bj + bi[i];
1710: row = *bjtmp++; /* pivot row */
1711: nzL = bi[i + 1] - bi[i];
1712: for (k = 0; k < nzL; k++) {
1713: pc1 = rtmp1 + row;
1714: pc2 = rtmp2 + row;
1715: pc3 = rtmp3 + row;
1716: pc4 = rtmp4 + row;
1717: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1718: pv = b->a + bdiag[row];
1719: mul1 = *pc1 * (*pv);
1720: mul2 = *pc2 * (*pv);
1721: mul3 = *pc3 * (*pv);
1722: mul4 = *pc4 * (*pv);
1723: *pc1 = mul1;
1724: *pc2 = mul2;
1725: *pc3 = mul3;
1726: *pc4 = mul4;
1728: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1729: pv = b->a + bdiag[row + 1] + 1;
1730: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1731: for (j = 0; j < nz; j++) {
1732: col = pj[j];
1733: rtmp1[col] -= mul1 * pv[j];
1734: rtmp2[col] -= mul2 * pv[j];
1735: rtmp3[col] -= mul3 * pv[j];
1736: rtmp4[col] -= mul4 * pv[j];
1737: }
1738: PetscLogFlops(4 + 8.0 * nz);
1739: }
1740: row = *bjtmp++;
1741: }
1743: /* finished row i; check zero pivot, then stick row i into b->a */
1744: rs = 0.0;
1745: /* L part */
1746: pc1 = b->a + bi[i];
1747: pj = b->j + bi[i];
1748: nz = bi[i + 1] - bi[i];
1749: for (j = 0; j < nz; j++) {
1750: col = pj[j];
1751: pc1[j] = rtmp1[col];
1752: rs += PetscAbsScalar(pc1[j]);
1753: }
1754: /* U part */
1755: pc1 = b->a + bdiag[i + 1] + 1;
1756: pj = b->j + bdiag[i + 1] + 1;
1757: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1758: for (j = 0; j < nz; j++) {
1759: col = pj[j];
1760: pc1[j] = rtmp1[col];
1761: rs += PetscAbsScalar(pc1[j]);
1762: }
1764: sctx.rs = rs;
1765: sctx.pv = rtmp1[i];
1766: MatPivotCheck(B, A, info, &sctx, i);
1767: if (sctx.newshift) break;
1768: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1769: *pc1 = 1.0 / sctx.pv;
1771: /* Now take care of 1st column of diagonal 4x4 block. */
1772: pc2 = rtmp2 + i;
1773: pc3 = rtmp3 + i;
1774: pc4 = rtmp4 + i;
1775: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1776: mul2 = (*pc2) * (*pc1);
1777: *pc2 = mul2;
1778: mul3 = (*pc3) * (*pc1);
1779: *pc3 = mul3;
1780: mul4 = (*pc4) * (*pc1);
1781: *pc4 = mul4;
1782: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1783: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1784: for (j = 0; j < nz; j++) {
1785: col = pj[j];
1786: rtmp2[col] -= mul2 * rtmp1[col];
1787: rtmp3[col] -= mul3 * rtmp1[col];
1788: rtmp4[col] -= mul4 * rtmp1[col];
1789: }
1790: PetscLogFlops(3 + 6.0 * nz);
1791: }
1793: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1794: rs = 0.0;
1795: /* L part */
1796: pc2 = b->a + bi[i + 1];
1797: pj = b->j + bi[i + 1];
1798: nz = bi[i + 2] - bi[i + 1];
1799: for (j = 0; j < nz; j++) {
1800: col = pj[j];
1801: pc2[j] = rtmp2[col];
1802: rs += PetscAbsScalar(pc2[j]);
1803: }
1804: /* U part */
1805: pc2 = b->a + bdiag[i + 2] + 1;
1806: pj = b->j + bdiag[i + 2] + 1;
1807: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1808: for (j = 0; j < nz; j++) {
1809: col = pj[j];
1810: pc2[j] = rtmp2[col];
1811: rs += PetscAbsScalar(pc2[j]);
1812: }
1814: sctx.rs = rs;
1815: sctx.pv = rtmp2[i + 1];
1816: MatPivotCheck(B, A, info, &sctx, i + 1);
1817: if (sctx.newshift) break;
1818: pc2 = b->a + bdiag[i + 1];
1819: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1821: /* Now take care of 2nd column of diagonal 4x4 block. */
1822: pc3 = rtmp3 + i + 1;
1823: pc4 = rtmp4 + i + 1;
1824: if (*pc3 != 0.0 || *pc4 != 0.0) {
1825: mul3 = (*pc3) * (*pc2);
1826: *pc3 = mul3;
1827: mul4 = (*pc4) * (*pc2);
1828: *pc4 = mul4;
1829: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1830: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1831: for (j = 0; j < nz; j++) {
1832: col = pj[j];
1833: rtmp3[col] -= mul3 * rtmp2[col];
1834: rtmp4[col] -= mul4 * rtmp2[col];
1835: }
1836: PetscLogFlops(4.0 * nz);
1837: }
1839: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1840: rs = 0.0;
1841: /* L part */
1842: pc3 = b->a + bi[i + 2];
1843: pj = b->j + bi[i + 2];
1844: nz = bi[i + 3] - bi[i + 2];
1845: for (j = 0; j < nz; j++) {
1846: col = pj[j];
1847: pc3[j] = rtmp3[col];
1848: rs += PetscAbsScalar(pc3[j]);
1849: }
1850: /* U part */
1851: pc3 = b->a + bdiag[i + 3] + 1;
1852: pj = b->j + bdiag[i + 3] + 1;
1853: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1854: for (j = 0; j < nz; j++) {
1855: col = pj[j];
1856: pc3[j] = rtmp3[col];
1857: rs += PetscAbsScalar(pc3[j]);
1858: }
1860: sctx.rs = rs;
1861: sctx.pv = rtmp3[i + 2];
1862: MatPivotCheck(B, A, info, &sctx, i + 2);
1863: if (sctx.newshift) break;
1864: pc3 = b->a + bdiag[i + 2];
1865: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1867: /* Now take care of 3rd column of diagonal 4x4 block. */
1868: pc4 = rtmp4 + i + 2;
1869: if (*pc4 != 0.0) {
1870: mul4 = (*pc4) * (*pc3);
1871: *pc4 = mul4;
1872: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1873: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1874: for (j = 0; j < nz; j++) {
1875: col = pj[j];
1876: rtmp4[col] -= mul4 * rtmp3[col];
1877: }
1878: PetscLogFlops(1 + 2.0 * nz);
1879: }
1881: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1882: rs = 0.0;
1883: /* L part */
1884: pc4 = b->a + bi[i + 3];
1885: pj = b->j + bi[i + 3];
1886: nz = bi[i + 4] - bi[i + 3];
1887: for (j = 0; j < nz; j++) {
1888: col = pj[j];
1889: pc4[j] = rtmp4[col];
1890: rs += PetscAbsScalar(pc4[j]);
1891: }
1892: /* U part */
1893: pc4 = b->a + bdiag[i + 4] + 1;
1894: pj = b->j + bdiag[i + 4] + 1;
1895: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1896: for (j = 0; j < nz; j++) {
1897: col = pj[j];
1898: pc4[j] = rtmp4[col];
1899: rs += PetscAbsScalar(pc4[j]);
1900: }
1902: sctx.rs = rs;
1903: sctx.pv = rtmp4[i + 3];
1904: MatPivotCheck(B, A, info, &sctx, i + 3);
1905: if (sctx.newshift) break;
1906: pc4 = b->a + bdiag[i + 3];
1907: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1908: break;
1910: default:
1911: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1912: }
1913: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1914: i += nodesz; /* Update the row */
1915: }
1917: /* MatPivotRefine() */
1918: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1919: /*
1920: * if no shift in this attempt & shifting & started shifting & can refine,
1921: * then try lower shift
1922: */
1923: sctx.shift_hi = sctx.shift_fraction;
1924: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1925: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1926: sctx.newshift = PETSC_TRUE;
1927: sctx.nshift++;
1928: }
1929: } while (sctx.newshift);
1931: PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4);
1932: PetscFree(tmp_vec2);
1933: ISRestoreIndices(isicol, &ic);
1934: ISRestoreIndices(isrow, &r);
1936: if (b->inode.size) {
1937: C->ops->solve = MatSolve_SeqAIJ_Inode;
1938: } else {
1939: C->ops->solve = MatSolve_SeqAIJ;
1940: }
1941: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1942: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1943: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1944: C->ops->matsolve = MatMatSolve_SeqAIJ;
1945: C->assembled = PETSC_TRUE;
1946: C->preallocated = PETSC_TRUE;
1948: PetscLogFlops(C->cmap->n);
1950: /* MatShiftView(A,info,&sctx) */
1951: if (sctx.nshift) {
1952: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1953: PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
1954: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1955: PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1956: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1957: PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount);
1958: }
1959: }
1960: return 0;
1961: }
1963: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1964: {
1965: Mat C = B;
1966: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1967: IS iscol = b->col, isrow = b->row, isicol = b->icol;
1968: const PetscInt *r, *ic, *c, *ics;
1969: PetscInt n = A->rmap->n, *bi = b->i;
1970: PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1971: PetscInt i, j, idx, *bd = b->diag, node_max, nodesz;
1972: PetscInt *ai = a->i, *aj = a->j;
1973: PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1974: PetscScalar mul1, mul2, mul3, tmp;
1975: MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1976: const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
1977: PetscReal rs = 0.0;
1978: FactorShiftCtx sctx;
1980: sctx.shift_top = 0;
1981: sctx.nshift_max = 0;
1982: sctx.shift_lo = 0;
1983: sctx.shift_hi = 0;
1984: sctx.shift_fraction = 0;
1986: /* if both shift schemes are chosen by user, only use info->shiftpd */
1987: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1988: sctx.shift_top = 0;
1989: for (i = 0; i < n; i++) {
1990: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1991: rs = 0.0;
1992: ajtmp = aj + ai[i];
1993: rtmp1 = aa + ai[i];
1994: nz = ai[i + 1] - ai[i];
1995: for (j = 0; j < nz; j++) {
1996: if (*ajtmp != i) {
1997: rs += PetscAbsScalar(*rtmp1++);
1998: } else {
1999: rs -= PetscRealPart(*rtmp1++);
2000: }
2001: ajtmp++;
2002: }
2003: if (rs > sctx.shift_top) sctx.shift_top = rs;
2004: }
2005: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2006: sctx.shift_top *= 1.1;
2007: sctx.nshift_max = 5;
2008: sctx.shift_lo = 0.;
2009: sctx.shift_hi = 1.;
2010: }
2011: sctx.shift_amount = 0;
2012: sctx.nshift = 0;
2014: ISGetIndices(isrow, &r);
2015: ISGetIndices(iscol, &c);
2016: ISGetIndices(isicol, &ic);
2017: PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33);
2018: ics = ic;
2020: node_max = a->inode.node_count;
2021: ns = a->inode.size;
2024: /* If max inode size > 3, split it into two inodes.*/
2025: /* also map the inode sizes according to the ordering */
2026: PetscMalloc1(n + 1, &tmp_vec1);
2027: for (i = 0, j = 0; i < node_max; ++i, ++j) {
2028: if (ns[i] > 3) {
2029: tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */
2030: ++j;
2031: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2032: } else {
2033: tmp_vec1[j] = ns[i];
2034: }
2035: }
2036: /* Use the correct node_max */
2037: node_max = j;
2039: /* Now reorder the inode info based on mat re-ordering info */
2040: /* First create a row -> inode_size_array_index map */
2041: PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2);
2042: for (i = 0, row = 0; i < node_max; i++) {
2043: nodesz = tmp_vec1[i];
2044: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2045: }
2046: /* Using nsmap, create a reordered ns structure */
2047: for (i = 0, j = 0; i < node_max; i++) {
2048: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2049: tmp_vec2[i] = nodesz;
2050: j += nodesz;
2051: }
2052: PetscFree2(nsmap, tmp_vec1);
2053: /* Now use the correct ns */
2054: ns = tmp_vec2;
2056: do {
2057: sctx.newshift = PETSC_FALSE;
2058: /* Now loop over each block-row, and do the factorization */
2059: for (i = 0, row = 0; i < node_max; i++) {
2060: nodesz = ns[i];
2061: nz = bi[row + 1] - bi[row];
2062: bjtmp = bj + bi[row];
2064: switch (nodesz) {
2065: case 1:
2066: for (j = 0; j < nz; j++) {
2067: idx = bjtmp[j];
2068: rtmp11[idx] = 0.0;
2069: }
2071: /* load in initial (unfactored row) */
2072: idx = r[row];
2073: nz_tmp = ai[idx + 1] - ai[idx];
2074: ajtmp = aj + ai[idx];
2075: v1 = aa + ai[idx];
2077: for (j = 0; j < nz_tmp; j++) {
2078: idx = ics[ajtmp[j]];
2079: rtmp11[idx] = v1[j];
2080: }
2081: rtmp11[ics[r[row]]] += sctx.shift_amount;
2083: prow = *bjtmp++;
2084: while (prow < row) {
2085: pc1 = rtmp11 + prow;
2086: if (*pc1 != 0.0) {
2087: pv = ba + bd[prow];
2088: pj = nbj + bd[prow];
2089: mul1 = *pc1 * *pv++;
2090: *pc1 = mul1;
2091: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2092: PetscLogFlops(1 + 2.0 * nz_tmp);
2093: for (j = 0; j < nz_tmp; j++) {
2094: tmp = pv[j];
2095: idx = pj[j];
2096: rtmp11[idx] -= mul1 * tmp;
2097: }
2098: }
2099: prow = *bjtmp++;
2100: }
2101: pj = bj + bi[row];
2102: pc1 = ba + bi[row];
2104: sctx.pv = rtmp11[row];
2105: rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2106: rs = 0.0;
2107: for (j = 0; j < nz; j++) {
2108: idx = pj[j];
2109: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2110: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2111: }
2112: sctx.rs = rs;
2113: MatPivotCheck(B, A, info, &sctx, row);
2114: if (sctx.newshift) goto endofwhile;
2115: break;
2117: case 2:
2118: for (j = 0; j < nz; j++) {
2119: idx = bjtmp[j];
2120: rtmp11[idx] = 0.0;
2121: rtmp22[idx] = 0.0;
2122: }
2124: /* load in initial (unfactored row) */
2125: idx = r[row];
2126: nz_tmp = ai[idx + 1] - ai[idx];
2127: ajtmp = aj + ai[idx];
2128: v1 = aa + ai[idx];
2129: v2 = aa + ai[idx + 1];
2130: for (j = 0; j < nz_tmp; j++) {
2131: idx = ics[ajtmp[j]];
2132: rtmp11[idx] = v1[j];
2133: rtmp22[idx] = v2[j];
2134: }
2135: rtmp11[ics[r[row]]] += sctx.shift_amount;
2136: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2138: prow = *bjtmp++;
2139: while (prow < row) {
2140: pc1 = rtmp11 + prow;
2141: pc2 = rtmp22 + prow;
2142: if (*pc1 != 0.0 || *pc2 != 0.0) {
2143: pv = ba + bd[prow];
2144: pj = nbj + bd[prow];
2145: mul1 = *pc1 * *pv;
2146: mul2 = *pc2 * *pv;
2147: ++pv;
2148: *pc1 = mul1;
2149: *pc2 = mul2;
2151: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2152: for (j = 0; j < nz_tmp; j++) {
2153: tmp = pv[j];
2154: idx = pj[j];
2155: rtmp11[idx] -= mul1 * tmp;
2156: rtmp22[idx] -= mul2 * tmp;
2157: }
2158: PetscLogFlops(2 + 4.0 * nz_tmp);
2159: }
2160: prow = *bjtmp++;
2161: }
2163: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2164: pc1 = rtmp11 + prow;
2165: pc2 = rtmp22 + prow;
2167: sctx.pv = *pc1;
2168: pj = bj + bi[prow];
2169: rs = 0.0;
2170: for (j = 0; j < nz; j++) {
2171: idx = pj[j];
2172: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2173: }
2174: sctx.rs = rs;
2175: MatPivotCheck(B, A, info, &sctx, row);
2176: if (sctx.newshift) goto endofwhile;
2178: if (*pc2 != 0.0) {
2179: pj = nbj + bd[prow];
2180: mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2181: *pc2 = mul2;
2182: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2183: for (j = 0; j < nz_tmp; j++) {
2184: idx = pj[j];
2185: tmp = rtmp11[idx];
2186: rtmp22[idx] -= mul2 * tmp;
2187: }
2188: PetscLogFlops(1 + 2.0 * nz_tmp);
2189: }
2191: pj = bj + bi[row];
2192: pc1 = ba + bi[row];
2193: pc2 = ba + bi[row + 1];
2195: sctx.pv = rtmp22[row + 1];
2196: rs = 0.0;
2197: rtmp11[row] = 1.0 / rtmp11[row];
2198: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2199: /* copy row entries from dense representation to sparse */
2200: for (j = 0; j < nz; j++) {
2201: idx = pj[j];
2202: pc1[j] = rtmp11[idx];
2203: pc2[j] = rtmp22[idx];
2204: if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2205: }
2206: sctx.rs = rs;
2207: MatPivotCheck(B, A, info, &sctx, row + 1);
2208: if (sctx.newshift) goto endofwhile;
2209: break;
2211: case 3:
2212: for (j = 0; j < nz; j++) {
2213: idx = bjtmp[j];
2214: rtmp11[idx] = 0.0;
2215: rtmp22[idx] = 0.0;
2216: rtmp33[idx] = 0.0;
2217: }
2218: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2219: idx = r[row];
2220: nz_tmp = ai[idx + 1] - ai[idx];
2221: ajtmp = aj + ai[idx];
2222: v1 = aa + ai[idx];
2223: v2 = aa + ai[idx + 1];
2224: v3 = aa + ai[idx + 2];
2225: for (j = 0; j < nz_tmp; j++) {
2226: idx = ics[ajtmp[j]];
2227: rtmp11[idx] = v1[j];
2228: rtmp22[idx] = v2[j];
2229: rtmp33[idx] = v3[j];
2230: }
2231: rtmp11[ics[r[row]]] += sctx.shift_amount;
2232: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2233: rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2235: /* loop over all pivot row blocks above this row block */
2236: prow = *bjtmp++;
2237: while (prow < row) {
2238: pc1 = rtmp11 + prow;
2239: pc2 = rtmp22 + prow;
2240: pc3 = rtmp33 + prow;
2241: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2242: pv = ba + bd[prow];
2243: pj = nbj + bd[prow];
2244: mul1 = *pc1 * *pv;
2245: mul2 = *pc2 * *pv;
2246: mul3 = *pc3 * *pv;
2247: ++pv;
2248: *pc1 = mul1;
2249: *pc2 = mul2;
2250: *pc3 = mul3;
2252: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2253: /* update this row based on pivot row */
2254: for (j = 0; j < nz_tmp; j++) {
2255: tmp = pv[j];
2256: idx = pj[j];
2257: rtmp11[idx] -= mul1 * tmp;
2258: rtmp22[idx] -= mul2 * tmp;
2259: rtmp33[idx] -= mul3 * tmp;
2260: }
2261: PetscLogFlops(3 + 6.0 * nz_tmp);
2262: }
2263: prow = *bjtmp++;
2264: }
2266: /* Now take care of diagonal 3x3 block in this set of rows */
2267: /* note: prow = row here */
2268: pc1 = rtmp11 + prow;
2269: pc2 = rtmp22 + prow;
2270: pc3 = rtmp33 + prow;
2272: sctx.pv = *pc1;
2273: pj = bj + bi[prow];
2274: rs = 0.0;
2275: for (j = 0; j < nz; j++) {
2276: idx = pj[j];
2277: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2278: }
2279: sctx.rs = rs;
2280: MatPivotCheck(B, A, info, &sctx, row);
2281: if (sctx.newshift) goto endofwhile;
2283: if (*pc2 != 0.0 || *pc3 != 0.0) {
2284: mul2 = (*pc2) / (*pc1);
2285: mul3 = (*pc3) / (*pc1);
2286: *pc2 = mul2;
2287: *pc3 = mul3;
2288: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2289: pj = nbj + bd[prow];
2290: for (j = 0; j < nz_tmp; j++) {
2291: idx = pj[j];
2292: tmp = rtmp11[idx];
2293: rtmp22[idx] -= mul2 * tmp;
2294: rtmp33[idx] -= mul3 * tmp;
2295: }
2296: PetscLogFlops(2 + 4.0 * nz_tmp);
2297: }
2298: ++prow;
2300: pc2 = rtmp22 + prow;
2301: pc3 = rtmp33 + prow;
2302: sctx.pv = *pc2;
2303: pj = bj + bi[prow];
2304: rs = 0.0;
2305: for (j = 0; j < nz; j++) {
2306: idx = pj[j];
2307: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2308: }
2309: sctx.rs = rs;
2310: MatPivotCheck(B, A, info, &sctx, row + 1);
2311: if (sctx.newshift) goto endofwhile;
2313: if (*pc3 != 0.0) {
2314: mul3 = (*pc3) / (*pc2);
2315: *pc3 = mul3;
2316: pj = nbj + bd[prow];
2317: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2318: for (j = 0; j < nz_tmp; j++) {
2319: idx = pj[j];
2320: tmp = rtmp22[idx];
2321: rtmp33[idx] -= mul3 * tmp;
2322: }
2323: PetscLogFlops(1 + 2.0 * nz_tmp);
2324: }
2326: pj = bj + bi[row];
2327: pc1 = ba + bi[row];
2328: pc2 = ba + bi[row + 1];
2329: pc3 = ba + bi[row + 2];
2331: sctx.pv = rtmp33[row + 2];
2332: rs = 0.0;
2333: rtmp11[row] = 1.0 / rtmp11[row];
2334: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2335: rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2336: /* copy row entries from dense representation to sparse */
2337: for (j = 0; j < nz; j++) {
2338: idx = pj[j];
2339: pc1[j] = rtmp11[idx];
2340: pc2[j] = rtmp22[idx];
2341: pc3[j] = rtmp33[idx];
2342: if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2343: }
2345: sctx.rs = rs;
2346: MatPivotCheck(B, A, info, &sctx, row + 2);
2347: if (sctx.newshift) goto endofwhile;
2348: break;
2350: default:
2351: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2352: }
2353: row += nodesz; /* Update the row */
2354: }
2355: endofwhile:;
2356: } while (sctx.newshift);
2357: PetscFree3(rtmp11, rtmp22, rtmp33);
2358: PetscFree(tmp_vec2);
2359: ISRestoreIndices(isicol, &ic);
2360: ISRestoreIndices(isrow, &r);
2361: ISRestoreIndices(iscol, &c);
2363: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2364: /* do not set solve add, since MatSolve_Inode + Add is faster */
2365: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2366: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2367: C->assembled = PETSC_TRUE;
2368: C->preallocated = PETSC_TRUE;
2369: if (sctx.nshift) {
2370: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2371: PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
2372: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2373: PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
2374: }
2375: }
2376: PetscLogFlops(C->cmap->n);
2377: MatSeqAIJCheckInode(C);
2378: return 0;
2379: }
2381: /* ----------------------------------------------------------- */
2382: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2383: {
2384: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2385: IS iscol = a->col, isrow = a->row;
2386: const PetscInt *r, *c, *rout, *cout;
2387: PetscInt i, j, n = A->rmap->n;
2388: PetscInt node_max, row, nsz, aii, i0, i1, nz;
2389: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2390: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
2391: PetscScalar sum1, sum2, sum3, sum4, sum5;
2392: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2393: const PetscScalar *b;
2396: node_max = a->inode.node_count;
2397: ns = a->inode.size; /* Node Size array */
2399: VecGetArrayRead(bb, &b);
2400: VecGetArrayWrite(xx, &x);
2401: tmp = a->solve_work;
2403: ISGetIndices(isrow, &rout);
2404: r = rout;
2405: ISGetIndices(iscol, &cout);
2406: c = cout;
2408: /* forward solve the lower triangular */
2409: tmps = tmp;
2410: aa = a_a;
2411: aj = a_j;
2412: ad = a->diag;
2414: for (i = 0, row = 0; i < node_max; ++i) {
2415: nsz = ns[i];
2416: aii = ai[row];
2417: v1 = aa + aii;
2418: vi = aj + aii;
2419: nz = ai[row + 1] - ai[row];
2421: if (i < node_max - 1) {
2422: /* Prefetch the indices for the next block */
2423: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2424: /* Prefetch the data for the next block */
2425: PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2426: }
2428: switch (nsz) { /* Each loop in 'case' is unrolled */
2429: case 1:
2430: sum1 = b[r[row]];
2431: for (j = 0; j < nz - 1; j += 2) {
2432: i0 = vi[j];
2433: i1 = vi[j + 1];
2434: tmp0 = tmps[i0];
2435: tmp1 = tmps[i1];
2436: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2437: }
2438: if (j == nz - 1) {
2439: tmp0 = tmps[vi[j]];
2440: sum1 -= v1[j] * tmp0;
2441: }
2442: tmp[row++] = sum1;
2443: break;
2444: case 2:
2445: sum1 = b[r[row]];
2446: sum2 = b[r[row + 1]];
2447: v2 = aa + ai[row + 1];
2449: for (j = 0; j < nz - 1; j += 2) {
2450: i0 = vi[j];
2451: i1 = vi[j + 1];
2452: tmp0 = tmps[i0];
2453: tmp1 = tmps[i1];
2454: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2455: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2456: }
2457: if (j == nz - 1) {
2458: tmp0 = tmps[vi[j]];
2459: sum1 -= v1[j] * tmp0;
2460: sum2 -= v2[j] * tmp0;
2461: }
2462: sum2 -= v2[nz] * sum1;
2463: tmp[row++] = sum1;
2464: tmp[row++] = sum2;
2465: break;
2466: case 3:
2467: sum1 = b[r[row]];
2468: sum2 = b[r[row + 1]];
2469: sum3 = b[r[row + 2]];
2470: v2 = aa + ai[row + 1];
2471: v3 = aa + ai[row + 2];
2473: for (j = 0; j < nz - 1; j += 2) {
2474: i0 = vi[j];
2475: i1 = vi[j + 1];
2476: tmp0 = tmps[i0];
2477: tmp1 = tmps[i1];
2478: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2481: }
2482: if (j == nz - 1) {
2483: tmp0 = tmps[vi[j]];
2484: sum1 -= v1[j] * tmp0;
2485: sum2 -= v2[j] * tmp0;
2486: sum3 -= v3[j] * tmp0;
2487: }
2488: sum2 -= v2[nz] * sum1;
2489: sum3 -= v3[nz] * sum1;
2490: sum3 -= v3[nz + 1] * sum2;
2491: tmp[row++] = sum1;
2492: tmp[row++] = sum2;
2493: tmp[row++] = sum3;
2494: break;
2496: case 4:
2497: sum1 = b[r[row]];
2498: sum2 = b[r[row + 1]];
2499: sum3 = b[r[row + 2]];
2500: sum4 = b[r[row + 3]];
2501: v2 = aa + ai[row + 1];
2502: v3 = aa + ai[row + 2];
2503: v4 = aa + ai[row + 3];
2505: for (j = 0; j < nz - 1; j += 2) {
2506: i0 = vi[j];
2507: i1 = vi[j + 1];
2508: tmp0 = tmps[i0];
2509: tmp1 = tmps[i1];
2510: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2511: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2512: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2513: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2514: }
2515: if (j == nz - 1) {
2516: tmp0 = tmps[vi[j]];
2517: sum1 -= v1[j] * tmp0;
2518: sum2 -= v2[j] * tmp0;
2519: sum3 -= v3[j] * tmp0;
2520: sum4 -= v4[j] * tmp0;
2521: }
2522: sum2 -= v2[nz] * sum1;
2523: sum3 -= v3[nz] * sum1;
2524: sum4 -= v4[nz] * sum1;
2525: sum3 -= v3[nz + 1] * sum2;
2526: sum4 -= v4[nz + 1] * sum2;
2527: sum4 -= v4[nz + 2] * sum3;
2529: tmp[row++] = sum1;
2530: tmp[row++] = sum2;
2531: tmp[row++] = sum3;
2532: tmp[row++] = sum4;
2533: break;
2534: case 5:
2535: sum1 = b[r[row]];
2536: sum2 = b[r[row + 1]];
2537: sum3 = b[r[row + 2]];
2538: sum4 = b[r[row + 3]];
2539: sum5 = b[r[row + 4]];
2540: v2 = aa + ai[row + 1];
2541: v3 = aa + ai[row + 2];
2542: v4 = aa + ai[row + 3];
2543: v5 = aa + ai[row + 4];
2545: for (j = 0; j < nz - 1; j += 2) {
2546: i0 = vi[j];
2547: i1 = vi[j + 1];
2548: tmp0 = tmps[i0];
2549: tmp1 = tmps[i1];
2550: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2551: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2552: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2553: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2554: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2555: }
2556: if (j == nz - 1) {
2557: tmp0 = tmps[vi[j]];
2558: sum1 -= v1[j] * tmp0;
2559: sum2 -= v2[j] * tmp0;
2560: sum3 -= v3[j] * tmp0;
2561: sum4 -= v4[j] * tmp0;
2562: sum5 -= v5[j] * tmp0;
2563: }
2565: sum2 -= v2[nz] * sum1;
2566: sum3 -= v3[nz] * sum1;
2567: sum4 -= v4[nz] * sum1;
2568: sum5 -= v5[nz] * sum1;
2569: sum3 -= v3[nz + 1] * sum2;
2570: sum4 -= v4[nz + 1] * sum2;
2571: sum5 -= v5[nz + 1] * sum2;
2572: sum4 -= v4[nz + 2] * sum3;
2573: sum5 -= v5[nz + 2] * sum3;
2574: sum5 -= v5[nz + 3] * sum4;
2576: tmp[row++] = sum1;
2577: tmp[row++] = sum2;
2578: tmp[row++] = sum3;
2579: tmp[row++] = sum4;
2580: tmp[row++] = sum5;
2581: break;
2582: default:
2583: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2584: }
2585: }
2586: /* backward solve the upper triangular */
2587: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2588: nsz = ns[i];
2589: aii = ad[row + 1] + 1;
2590: v1 = aa + aii;
2591: vi = aj + aii;
2592: nz = ad[row] - ad[row + 1] - 1;
2594: if (i > 0) {
2595: /* Prefetch the indices for the next block */
2596: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2597: /* Prefetch the data for the next block */
2598: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2599: }
2601: switch (nsz) { /* Each loop in 'case' is unrolled */
2602: case 1:
2603: sum1 = tmp[row];
2605: for (j = 0; j < nz - 1; j += 2) {
2606: i0 = vi[j];
2607: i1 = vi[j + 1];
2608: tmp0 = tmps[i0];
2609: tmp1 = tmps[i1];
2610: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2611: }
2612: if (j == nz - 1) {
2613: tmp0 = tmps[vi[j]];
2614: sum1 -= v1[j] * tmp0;
2615: }
2616: x[c[row]] = tmp[row] = sum1 * v1[nz];
2617: row--;
2618: break;
2619: case 2:
2620: sum1 = tmp[row];
2621: sum2 = tmp[row - 1];
2622: v2 = aa + ad[row] + 1;
2623: for (j = 0; j < nz - 1; j += 2) {
2624: i0 = vi[j];
2625: i1 = vi[j + 1];
2626: tmp0 = tmps[i0];
2627: tmp1 = tmps[i1];
2628: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2629: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2630: }
2631: if (j == nz - 1) {
2632: tmp0 = tmps[vi[j]];
2633: sum1 -= v1[j] * tmp0;
2634: sum2 -= v2[j + 1] * tmp0;
2635: }
2637: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2638: row--;
2639: sum2 -= v2[0] * tmp0;
2640: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2641: row--;
2642: break;
2643: case 3:
2644: sum1 = tmp[row];
2645: sum2 = tmp[row - 1];
2646: sum3 = tmp[row - 2];
2647: v2 = aa + ad[row] + 1;
2648: v3 = aa + ad[row - 1] + 1;
2649: for (j = 0; j < nz - 1; j += 2) {
2650: i0 = vi[j];
2651: i1 = vi[j + 1];
2652: tmp0 = tmps[i0];
2653: tmp1 = tmps[i1];
2654: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2655: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2656: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2657: }
2658: if (j == nz - 1) {
2659: tmp0 = tmps[vi[j]];
2660: sum1 -= v1[j] * tmp0;
2661: sum2 -= v2[j + 1] * tmp0;
2662: sum3 -= v3[j + 2] * tmp0;
2663: }
2664: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2665: row--;
2666: sum2 -= v2[0] * tmp0;
2667: sum3 -= v3[1] * tmp0;
2668: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2669: row--;
2670: sum3 -= v3[0] * tmp0;
2671: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2672: row--;
2674: break;
2675: case 4:
2676: sum1 = tmp[row];
2677: sum2 = tmp[row - 1];
2678: sum3 = tmp[row - 2];
2679: sum4 = tmp[row - 3];
2680: v2 = aa + ad[row] + 1;
2681: v3 = aa + ad[row - 1] + 1;
2682: v4 = aa + ad[row - 2] + 1;
2684: for (j = 0; j < nz - 1; j += 2) {
2685: i0 = vi[j];
2686: i1 = vi[j + 1];
2687: tmp0 = tmps[i0];
2688: tmp1 = tmps[i1];
2689: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2690: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2691: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2692: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2693: }
2694: if (j == nz - 1) {
2695: tmp0 = tmps[vi[j]];
2696: sum1 -= v1[j] * tmp0;
2697: sum2 -= v2[j + 1] * tmp0;
2698: sum3 -= v3[j + 2] * tmp0;
2699: sum4 -= v4[j + 3] * tmp0;
2700: }
2702: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2703: row--;
2704: sum2 -= v2[0] * tmp0;
2705: sum3 -= v3[1] * tmp0;
2706: sum4 -= v4[2] * tmp0;
2707: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2708: row--;
2709: sum3 -= v3[0] * tmp0;
2710: sum4 -= v4[1] * tmp0;
2711: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2712: row--;
2713: sum4 -= v4[0] * tmp0;
2714: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2715: row--;
2716: break;
2717: case 5:
2718: sum1 = tmp[row];
2719: sum2 = tmp[row - 1];
2720: sum3 = tmp[row - 2];
2721: sum4 = tmp[row - 3];
2722: sum5 = tmp[row - 4];
2723: v2 = aa + ad[row] + 1;
2724: v3 = aa + ad[row - 1] + 1;
2725: v4 = aa + ad[row - 2] + 1;
2726: v5 = aa + ad[row - 3] + 1;
2727: for (j = 0; j < nz - 1; j += 2) {
2728: i0 = vi[j];
2729: i1 = vi[j + 1];
2730: tmp0 = tmps[i0];
2731: tmp1 = tmps[i1];
2732: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2733: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2734: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2735: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2736: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2737: }
2738: if (j == nz - 1) {
2739: tmp0 = tmps[vi[j]];
2740: sum1 -= v1[j] * tmp0;
2741: sum2 -= v2[j + 1] * tmp0;
2742: sum3 -= v3[j + 2] * tmp0;
2743: sum4 -= v4[j + 3] * tmp0;
2744: sum5 -= v5[j + 4] * tmp0;
2745: }
2747: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2748: row--;
2749: sum2 -= v2[0] * tmp0;
2750: sum3 -= v3[1] * tmp0;
2751: sum4 -= v4[2] * tmp0;
2752: sum5 -= v5[3] * tmp0;
2753: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2754: row--;
2755: sum3 -= v3[0] * tmp0;
2756: sum4 -= v4[1] * tmp0;
2757: sum5 -= v5[2] * tmp0;
2758: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2759: row--;
2760: sum4 -= v4[0] * tmp0;
2761: sum5 -= v5[1] * tmp0;
2762: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2763: row--;
2764: sum5 -= v5[0] * tmp0;
2765: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2766: row--;
2767: break;
2768: default:
2769: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2770: }
2771: }
2772: ISRestoreIndices(isrow, &rout);
2773: ISRestoreIndices(iscol, &cout);
2774: VecRestoreArrayRead(bb, &b);
2775: VecRestoreArrayWrite(xx, &x);
2776: PetscLogFlops(2.0 * a->nz - A->cmap->n);
2777: return 0;
2778: }
2780: /*
2781: Makes a longer coloring[] array and calls the usual code with that
2782: */
2783: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2784: {
2785: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2786: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2787: PetscInt *colorused, i;
2788: ISColoringValue *newcolor;
2791: PetscMalloc1(n + 1, &newcolor);
2792: /* loop over inodes, marking a color for each column*/
2793: row = 0;
2794: for (i = 0; i < m; i++) {
2795: for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2796: }
2798: /* eliminate unneeded colors */
2799: PetscCalloc1(5 * ncolors, &colorused);
2800: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2802: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2803: ncolors = colorused[5 * ncolors - 1];
2804: for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2805: PetscFree(colorused);
2806: ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring);
2807: PetscFree(coloring);
2808: return 0;
2809: }
2811: #include <petsc/private/kernels/blockinvert.h>
2813: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2814: {
2815: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2816: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2817: MatScalar *ibdiag, *bdiag, work[25], *t;
2818: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2819: const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2820: const PetscScalar *xb, *b;
2821: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2822: PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2823: PetscInt sz, k, ipvt[5];
2824: PetscBool allowzeropivot, zeropivotdetected;
2825: const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2827: allowzeropivot = PetscNot(A->erroriffailure);
2832: if (!a->inode.ibdiagvalid) {
2833: if (!a->inode.ibdiag) {
2834: /* calculate space needed for diagonal blocks */
2835: for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2836: a->inode.bdiagsize = cnt;
2838: PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work);
2839: }
2841: /* copy over the diagonal blocks and invert them */
2842: ibdiag = a->inode.ibdiag;
2843: bdiag = a->inode.bdiag;
2844: cnt = 0;
2845: for (i = 0, row = 0; i < m; i++) {
2846: for (j = 0; j < sizes[i]; j++) {
2847: for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2848: }
2849: PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]);
2851: switch (sizes[i]) {
2852: case 1:
2853: /* Create matrix data structure */
2854: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2855: if (allowzeropivot) {
2856: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2857: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2858: A->factorerror_zeropivot_row = row;
2859: PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row);
2860: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2861: }
2862: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2863: break;
2864: case 2:
2865: PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected);
2866: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2867: break;
2868: case 3:
2869: PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected);
2870: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2871: break;
2872: case 4:
2873: PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected);
2874: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2875: break;
2876: case 5:
2877: PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
2878: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2879: break;
2880: default:
2881: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2882: }
2883: cnt += sizes[i] * sizes[i];
2884: row += sizes[i];
2885: }
2886: a->inode.ibdiagvalid = PETSC_TRUE;
2887: }
2888: ibdiag = a->inode.ibdiag;
2889: bdiag = a->inode.bdiag;
2890: t = a->inode.ssor_work;
2892: VecGetArray(xx, &x);
2893: VecGetArrayRead(bb, &b);
2894: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2895: if (flag & SOR_ZERO_INITIAL_GUESS) {
2896: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2897: for (i = 0, row = 0; i < m; i++) {
2898: sz = diag[row] - ii[row];
2899: v1 = a->a + ii[row];
2900: idx = a->j + ii[row];
2902: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2903: switch (sizes[i]) {
2904: case 1:
2906: sum1 = b[row];
2907: for (n = 0; n < sz - 1; n += 2) {
2908: i1 = idx[0];
2909: i2 = idx[1];
2910: idx += 2;
2911: tmp0 = x[i1];
2912: tmp1 = x[i2];
2913: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2914: v1 += 2;
2915: }
2917: if (n == sz - 1) {
2918: tmp0 = x[*idx];
2919: sum1 -= *v1 * tmp0;
2920: }
2921: t[row] = sum1;
2922: x[row++] = sum1 * (*ibdiag++);
2923: break;
2924: case 2:
2925: v2 = a->a + ii[row + 1];
2926: sum1 = b[row];
2927: sum2 = b[row + 1];
2928: for (n = 0; n < sz - 1; n += 2) {
2929: i1 = idx[0];
2930: i2 = idx[1];
2931: idx += 2;
2932: tmp0 = x[i1];
2933: tmp1 = x[i2];
2934: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2935: v1 += 2;
2936: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2937: v2 += 2;
2938: }
2940: if (n == sz - 1) {
2941: tmp0 = x[*idx];
2942: sum1 -= v1[0] * tmp0;
2943: sum2 -= v2[0] * tmp0;
2944: }
2945: t[row] = sum1;
2946: t[row + 1] = sum2;
2947: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2948: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2949: ibdiag += 4;
2950: break;
2951: case 3:
2952: v2 = a->a + ii[row + 1];
2953: v3 = a->a + ii[row + 2];
2954: sum1 = b[row];
2955: sum2 = b[row + 1];
2956: sum3 = b[row + 2];
2957: for (n = 0; n < sz - 1; n += 2) {
2958: i1 = idx[0];
2959: i2 = idx[1];
2960: idx += 2;
2961: tmp0 = x[i1];
2962: tmp1 = x[i2];
2963: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2964: v1 += 2;
2965: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2966: v2 += 2;
2967: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2968: v3 += 2;
2969: }
2971: if (n == sz - 1) {
2972: tmp0 = x[*idx];
2973: sum1 -= v1[0] * tmp0;
2974: sum2 -= v2[0] * tmp0;
2975: sum3 -= v3[0] * tmp0;
2976: }
2977: t[row] = sum1;
2978: t[row + 1] = sum2;
2979: t[row + 2] = sum3;
2980: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2981: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2982: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2983: ibdiag += 9;
2984: break;
2985: case 4:
2986: v2 = a->a + ii[row + 1];
2987: v3 = a->a + ii[row + 2];
2988: v4 = a->a + ii[row + 3];
2989: sum1 = b[row];
2990: sum2 = b[row + 1];
2991: sum3 = b[row + 2];
2992: sum4 = b[row + 3];
2993: for (n = 0; n < sz - 1; n += 2) {
2994: i1 = idx[0];
2995: i2 = idx[1];
2996: idx += 2;
2997: tmp0 = x[i1];
2998: tmp1 = x[i2];
2999: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3000: v1 += 2;
3001: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3002: v2 += 2;
3003: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3004: v3 += 2;
3005: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3006: v4 += 2;
3007: }
3009: if (n == sz - 1) {
3010: tmp0 = x[*idx];
3011: sum1 -= v1[0] * tmp0;
3012: sum2 -= v2[0] * tmp0;
3013: sum3 -= v3[0] * tmp0;
3014: sum4 -= v4[0] * tmp0;
3015: }
3016: t[row] = sum1;
3017: t[row + 1] = sum2;
3018: t[row + 2] = sum3;
3019: t[row + 3] = sum4;
3020: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3021: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3022: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3023: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3024: ibdiag += 16;
3025: break;
3026: case 5:
3027: v2 = a->a + ii[row + 1];
3028: v3 = a->a + ii[row + 2];
3029: v4 = a->a + ii[row + 3];
3030: v5 = a->a + ii[row + 4];
3031: sum1 = b[row];
3032: sum2 = b[row + 1];
3033: sum3 = b[row + 2];
3034: sum4 = b[row + 3];
3035: sum5 = b[row + 4];
3036: for (n = 0; n < sz - 1; n += 2) {
3037: i1 = idx[0];
3038: i2 = idx[1];
3039: idx += 2;
3040: tmp0 = x[i1];
3041: tmp1 = x[i2];
3042: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3043: v1 += 2;
3044: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3045: v2 += 2;
3046: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3047: v3 += 2;
3048: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3049: v4 += 2;
3050: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3051: v5 += 2;
3052: }
3054: if (n == sz - 1) {
3055: tmp0 = x[*idx];
3056: sum1 -= v1[0] * tmp0;
3057: sum2 -= v2[0] * tmp0;
3058: sum3 -= v3[0] * tmp0;
3059: sum4 -= v4[0] * tmp0;
3060: sum5 -= v5[0] * tmp0;
3061: }
3062: t[row] = sum1;
3063: t[row + 1] = sum2;
3064: t[row + 2] = sum3;
3065: t[row + 3] = sum4;
3066: t[row + 4] = sum5;
3067: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3068: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3069: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3070: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3071: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3072: ibdiag += 25;
3073: break;
3074: default:
3075: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3076: }
3077: }
3079: xb = t;
3080: PetscLogFlops(a->nz);
3081: } else xb = b;
3082: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3083: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3084: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3085: ibdiag -= sizes[i] * sizes[i];
3086: sz = ii[row + 1] - diag[row] - 1;
3087: v1 = a->a + diag[row] + 1;
3088: idx = a->j + diag[row] + 1;
3090: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3091: switch (sizes[i]) {
3092: case 1:
3094: sum1 = xb[row];
3095: for (n = 0; n < sz - 1; n += 2) {
3096: i1 = idx[0];
3097: i2 = idx[1];
3098: idx += 2;
3099: tmp0 = x[i1];
3100: tmp1 = x[i2];
3101: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3102: v1 += 2;
3103: }
3105: if (n == sz - 1) {
3106: tmp0 = x[*idx];
3107: sum1 -= *v1 * tmp0;
3108: }
3109: x[row--] = sum1 * (*ibdiag);
3110: break;
3112: case 2:
3114: sum1 = xb[row];
3115: sum2 = xb[row - 1];
3116: /* note that sum1 is associated with the second of the two rows */
3117: v2 = a->a + diag[row - 1] + 2;
3118: for (n = 0; n < sz - 1; n += 2) {
3119: i1 = idx[0];
3120: i2 = idx[1];
3121: idx += 2;
3122: tmp0 = x[i1];
3123: tmp1 = x[i2];
3124: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3125: v1 += 2;
3126: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3127: v2 += 2;
3128: }
3130: if (n == sz - 1) {
3131: tmp0 = x[*idx];
3132: sum1 -= *v1 * tmp0;
3133: sum2 -= *v2 * tmp0;
3134: }
3135: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3136: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3137: break;
3138: case 3:
3140: sum1 = xb[row];
3141: sum2 = xb[row - 1];
3142: sum3 = xb[row - 2];
3143: v2 = a->a + diag[row - 1] + 2;
3144: v3 = a->a + diag[row - 2] + 3;
3145: for (n = 0; n < sz - 1; n += 2) {
3146: i1 = idx[0];
3147: i2 = idx[1];
3148: idx += 2;
3149: tmp0 = x[i1];
3150: tmp1 = x[i2];
3151: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3152: v1 += 2;
3153: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3154: v2 += 2;
3155: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3156: v3 += 2;
3157: }
3159: if (n == sz - 1) {
3160: tmp0 = x[*idx];
3161: sum1 -= *v1 * tmp0;
3162: sum2 -= *v2 * tmp0;
3163: sum3 -= *v3 * tmp0;
3164: }
3165: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3166: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3167: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3168: break;
3169: case 4:
3171: sum1 = xb[row];
3172: sum2 = xb[row - 1];
3173: sum3 = xb[row - 2];
3174: sum4 = xb[row - 3];
3175: v2 = a->a + diag[row - 1] + 2;
3176: v3 = a->a + diag[row - 2] + 3;
3177: v4 = a->a + diag[row - 3] + 4;
3178: for (n = 0; n < sz - 1; n += 2) {
3179: i1 = idx[0];
3180: i2 = idx[1];
3181: idx += 2;
3182: tmp0 = x[i1];
3183: tmp1 = x[i2];
3184: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3185: v1 += 2;
3186: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3187: v2 += 2;
3188: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3189: v3 += 2;
3190: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3191: v4 += 2;
3192: }
3194: if (n == sz - 1) {
3195: tmp0 = x[*idx];
3196: sum1 -= *v1 * tmp0;
3197: sum2 -= *v2 * tmp0;
3198: sum3 -= *v3 * tmp0;
3199: sum4 -= *v4 * tmp0;
3200: }
3201: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3202: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3203: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3204: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3205: break;
3206: case 5:
3208: sum1 = xb[row];
3209: sum2 = xb[row - 1];
3210: sum3 = xb[row - 2];
3211: sum4 = xb[row - 3];
3212: sum5 = xb[row - 4];
3213: v2 = a->a + diag[row - 1] + 2;
3214: v3 = a->a + diag[row - 2] + 3;
3215: v4 = a->a + diag[row - 3] + 4;
3216: v5 = a->a + diag[row - 4] + 5;
3217: for (n = 0; n < sz - 1; n += 2) {
3218: i1 = idx[0];
3219: i2 = idx[1];
3220: idx += 2;
3221: tmp0 = x[i1];
3222: tmp1 = x[i2];
3223: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3224: v1 += 2;
3225: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3226: v2 += 2;
3227: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3228: v3 += 2;
3229: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3230: v4 += 2;
3231: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3232: v5 += 2;
3233: }
3235: if (n == sz - 1) {
3236: tmp0 = x[*idx];
3237: sum1 -= *v1 * tmp0;
3238: sum2 -= *v2 * tmp0;
3239: sum3 -= *v3 * tmp0;
3240: sum4 -= *v4 * tmp0;
3241: sum5 -= *v5 * tmp0;
3242: }
3243: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3244: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3245: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3246: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3247: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3248: break;
3249: default:
3250: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3251: }
3252: }
3254: PetscLogFlops(a->nz);
3255: }
3256: its--;
3257: }
3258: while (its--) {
3259: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3260: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3261: sz = diag[row] - ii[row];
3262: v1 = a->a + ii[row];
3263: idx = a->j + ii[row];
3264: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3265: switch (sizes[i]) {
3266: case 1:
3267: sum1 = b[row];
3268: for (n = 0; n < sz - 1; n += 2) {
3269: i1 = idx[0];
3270: i2 = idx[1];
3271: idx += 2;
3272: tmp0 = x[i1];
3273: tmp1 = x[i2];
3274: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3275: v1 += 2;
3276: }
3277: if (n == sz - 1) {
3278: tmp0 = x[*idx++];
3279: sum1 -= *v1 * tmp0;
3280: v1++;
3281: }
3282: t[row] = sum1;
3283: sz = ii[row + 1] - diag[row] - 1;
3284: idx = a->j + diag[row] + 1;
3285: v1 += 1;
3286: for (n = 0; n < sz - 1; n += 2) {
3287: i1 = idx[0];
3288: i2 = idx[1];
3289: idx += 2;
3290: tmp0 = x[i1];
3291: tmp1 = x[i2];
3292: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3293: v1 += 2;
3294: }
3295: if (n == sz - 1) {
3296: tmp0 = x[*idx++];
3297: sum1 -= *v1 * tmp0;
3298: }
3299: /* in MatSOR_SeqAIJ this line would be
3300: *
3301: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3302: *
3303: * but omega == 1, so this becomes
3304: *
3305: * x[row] = sum1*(*ibdiag++);
3306: *
3307: */
3308: x[row] = sum1 * (*ibdiag);
3309: break;
3310: case 2:
3311: v2 = a->a + ii[row + 1];
3312: sum1 = b[row];
3313: sum2 = b[row + 1];
3314: for (n = 0; n < sz - 1; n += 2) {
3315: i1 = idx[0];
3316: i2 = idx[1];
3317: idx += 2;
3318: tmp0 = x[i1];
3319: tmp1 = x[i2];
3320: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3321: v1 += 2;
3322: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3323: v2 += 2;
3324: }
3325: if (n == sz - 1) {
3326: tmp0 = x[*idx++];
3327: sum1 -= v1[0] * tmp0;
3328: sum2 -= v2[0] * tmp0;
3329: v1++;
3330: v2++;
3331: }
3332: t[row] = sum1;
3333: t[row + 1] = sum2;
3334: sz = ii[row + 1] - diag[row] - 2;
3335: idx = a->j + diag[row] + 2;
3336: v1 += 2;
3337: v2 += 2;
3338: for (n = 0; n < sz - 1; n += 2) {
3339: i1 = idx[0];
3340: i2 = idx[1];
3341: idx += 2;
3342: tmp0 = x[i1];
3343: tmp1 = x[i2];
3344: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3345: v1 += 2;
3346: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3347: v2 += 2;
3348: }
3349: if (n == sz - 1) {
3350: tmp0 = x[*idx];
3351: sum1 -= v1[0] * tmp0;
3352: sum2 -= v2[0] * tmp0;
3353: }
3354: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3355: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3356: break;
3357: case 3:
3358: v2 = a->a + ii[row + 1];
3359: v3 = a->a + ii[row + 2];
3360: sum1 = b[row];
3361: sum2 = b[row + 1];
3362: sum3 = b[row + 2];
3363: for (n = 0; n < sz - 1; n += 2) {
3364: i1 = idx[0];
3365: i2 = idx[1];
3366: idx += 2;
3367: tmp0 = x[i1];
3368: tmp1 = x[i2];
3369: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3370: v1 += 2;
3371: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3372: v2 += 2;
3373: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3374: v3 += 2;
3375: }
3376: if (n == sz - 1) {
3377: tmp0 = x[*idx++];
3378: sum1 -= v1[0] * tmp0;
3379: sum2 -= v2[0] * tmp0;
3380: sum3 -= v3[0] * tmp0;
3381: v1++;
3382: v2++;
3383: v3++;
3384: }
3385: t[row] = sum1;
3386: t[row + 1] = sum2;
3387: t[row + 2] = sum3;
3388: sz = ii[row + 1] - diag[row] - 3;
3389: idx = a->j + diag[row] + 3;
3390: v1 += 3;
3391: v2 += 3;
3392: v3 += 3;
3393: for (n = 0; n < sz - 1; n += 2) {
3394: i1 = idx[0];
3395: i2 = idx[1];
3396: idx += 2;
3397: tmp0 = x[i1];
3398: tmp1 = x[i2];
3399: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3400: v1 += 2;
3401: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3402: v2 += 2;
3403: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3404: v3 += 2;
3405: }
3406: if (n == sz - 1) {
3407: tmp0 = x[*idx];
3408: sum1 -= v1[0] * tmp0;
3409: sum2 -= v2[0] * tmp0;
3410: sum3 -= v3[0] * tmp0;
3411: }
3412: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3413: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3414: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3415: break;
3416: case 4:
3417: v2 = a->a + ii[row + 1];
3418: v3 = a->a + ii[row + 2];
3419: v4 = a->a + ii[row + 3];
3420: sum1 = b[row];
3421: sum2 = b[row + 1];
3422: sum3 = b[row + 2];
3423: sum4 = b[row + 3];
3424: for (n = 0; n < sz - 1; n += 2) {
3425: i1 = idx[0];
3426: i2 = idx[1];
3427: idx += 2;
3428: tmp0 = x[i1];
3429: tmp1 = x[i2];
3430: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3431: v1 += 2;
3432: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3433: v2 += 2;
3434: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3435: v3 += 2;
3436: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3437: v4 += 2;
3438: }
3439: if (n == sz - 1) {
3440: tmp0 = x[*idx++];
3441: sum1 -= v1[0] * tmp0;
3442: sum2 -= v2[0] * tmp0;
3443: sum3 -= v3[0] * tmp0;
3444: sum4 -= v4[0] * tmp0;
3445: v1++;
3446: v2++;
3447: v3++;
3448: v4++;
3449: }
3450: t[row] = sum1;
3451: t[row + 1] = sum2;
3452: t[row + 2] = sum3;
3453: t[row + 3] = sum4;
3454: sz = ii[row + 1] - diag[row] - 4;
3455: idx = a->j + diag[row] + 4;
3456: v1 += 4;
3457: v2 += 4;
3458: v3 += 4;
3459: v4 += 4;
3460: for (n = 0; n < sz - 1; n += 2) {
3461: i1 = idx[0];
3462: i2 = idx[1];
3463: idx += 2;
3464: tmp0 = x[i1];
3465: tmp1 = x[i2];
3466: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3467: v1 += 2;
3468: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3469: v2 += 2;
3470: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3471: v3 += 2;
3472: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3473: v4 += 2;
3474: }
3475: if (n == sz - 1) {
3476: tmp0 = x[*idx];
3477: sum1 -= v1[0] * tmp0;
3478: sum2 -= v2[0] * tmp0;
3479: sum3 -= v3[0] * tmp0;
3480: sum4 -= v4[0] * tmp0;
3481: }
3482: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3483: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3484: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3485: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3486: break;
3487: case 5:
3488: v2 = a->a + ii[row + 1];
3489: v3 = a->a + ii[row + 2];
3490: v4 = a->a + ii[row + 3];
3491: v5 = a->a + ii[row + 4];
3492: sum1 = b[row];
3493: sum2 = b[row + 1];
3494: sum3 = b[row + 2];
3495: sum4 = b[row + 3];
3496: sum5 = b[row + 4];
3497: for (n = 0; n < sz - 1; n += 2) {
3498: i1 = idx[0];
3499: i2 = idx[1];
3500: idx += 2;
3501: tmp0 = x[i1];
3502: tmp1 = x[i2];
3503: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3504: v1 += 2;
3505: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3506: v2 += 2;
3507: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3508: v3 += 2;
3509: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3510: v4 += 2;
3511: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3512: v5 += 2;
3513: }
3514: if (n == sz - 1) {
3515: tmp0 = x[*idx++];
3516: sum1 -= v1[0] * tmp0;
3517: sum2 -= v2[0] * tmp0;
3518: sum3 -= v3[0] * tmp0;
3519: sum4 -= v4[0] * tmp0;
3520: sum5 -= v5[0] * tmp0;
3521: v1++;
3522: v2++;
3523: v3++;
3524: v4++;
3525: v5++;
3526: }
3527: t[row] = sum1;
3528: t[row + 1] = sum2;
3529: t[row + 2] = sum3;
3530: t[row + 3] = sum4;
3531: t[row + 4] = sum5;
3532: sz = ii[row + 1] - diag[row] - 5;
3533: idx = a->j + diag[row] + 5;
3534: v1 += 5;
3535: v2 += 5;
3536: v3 += 5;
3537: v4 += 5;
3538: v5 += 5;
3539: for (n = 0; n < sz - 1; n += 2) {
3540: i1 = idx[0];
3541: i2 = idx[1];
3542: idx += 2;
3543: tmp0 = x[i1];
3544: tmp1 = x[i2];
3545: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3546: v1 += 2;
3547: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3548: v2 += 2;
3549: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3550: v3 += 2;
3551: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3552: v4 += 2;
3553: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3554: v5 += 2;
3555: }
3556: if (n == sz - 1) {
3557: tmp0 = x[*idx];
3558: sum1 -= v1[0] * tmp0;
3559: sum2 -= v2[0] * tmp0;
3560: sum3 -= v3[0] * tmp0;
3561: sum4 -= v4[0] * tmp0;
3562: sum5 -= v5[0] * tmp0;
3563: }
3564: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3565: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3566: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3567: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3568: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3569: break;
3570: default:
3571: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3572: }
3573: }
3574: xb = t;
3575: PetscLogFlops(2.0 * a->nz); /* undercounts diag inverse */
3576: } else xb = b;
3578: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3579: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3580: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3581: ibdiag -= sizes[i] * sizes[i];
3583: /* set RHS */
3584: if (xb == b) {
3585: /* whole (old way) */
3586: sz = ii[row + 1] - ii[row];
3587: idx = a->j + ii[row];
3588: switch (sizes[i]) {
3589: case 5:
3590: v5 = a->a + ii[row - 4]; /* fall through */
3591: case 4:
3592: v4 = a->a + ii[row - 3]; /* fall through */
3593: case 3:
3594: v3 = a->a + ii[row - 2]; /* fall through */
3595: case 2:
3596: v2 = a->a + ii[row - 1]; /* fall through */
3597: case 1:
3598: v1 = a->a + ii[row];
3599: break;
3600: default:
3601: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3602: }
3603: } else {
3604: /* upper, no diag */
3605: sz = ii[row + 1] - diag[row] - 1;
3606: idx = a->j + diag[row] + 1;
3607: switch (sizes[i]) {
3608: case 5:
3609: v5 = a->a + diag[row - 4] + 5; /* fall through */
3610: case 4:
3611: v4 = a->a + diag[row - 3] + 4; /* fall through */
3612: case 3:
3613: v3 = a->a + diag[row - 2] + 3; /* fall through */
3614: case 2:
3615: v2 = a->a + diag[row - 1] + 2; /* fall through */
3616: case 1:
3617: v1 = a->a + diag[row] + 1;
3618: }
3619: }
3620: /* set sum */
3621: switch (sizes[i]) {
3622: case 5:
3623: sum5 = xb[row - 4]; /* fall through */
3624: case 4:
3625: sum4 = xb[row - 3]; /* fall through */
3626: case 3:
3627: sum3 = xb[row - 2]; /* fall through */
3628: case 2:
3629: sum2 = xb[row - 1]; /* fall through */
3630: case 1:
3631: /* note that sum1 is associated with the last row */
3632: sum1 = xb[row];
3633: }
3634: /* do sums */
3635: for (n = 0; n < sz - 1; n += 2) {
3636: i1 = idx[0];
3637: i2 = idx[1];
3638: idx += 2;
3639: tmp0 = x[i1];
3640: tmp1 = x[i2];
3641: switch (sizes[i]) {
3642: case 5:
3643: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3644: v5 += 2; /* fall through */
3645: case 4:
3646: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3647: v4 += 2; /* fall through */
3648: case 3:
3649: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3650: v3 += 2; /* fall through */
3651: case 2:
3652: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3653: v2 += 2; /* fall through */
3654: case 1:
3655: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3656: v1 += 2;
3657: }
3658: }
3659: /* ragged edge */
3660: if (n == sz - 1) {
3661: tmp0 = x[*idx];
3662: switch (sizes[i]) {
3663: case 5:
3664: sum5 -= *v5 * tmp0; /* fall through */
3665: case 4:
3666: sum4 -= *v4 * tmp0; /* fall through */
3667: case 3:
3668: sum3 -= *v3 * tmp0; /* fall through */
3669: case 2:
3670: sum2 -= *v2 * tmp0; /* fall through */
3671: case 1:
3672: sum1 -= *v1 * tmp0;
3673: }
3674: }
3675: /* update */
3676: if (xb == b) {
3677: /* whole (old way) w/ diag */
3678: switch (sizes[i]) {
3679: case 5:
3680: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3681: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3682: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3683: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3684: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3685: break;
3686: case 4:
3687: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3688: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3689: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3690: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3691: break;
3692: case 3:
3693: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3694: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3695: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3696: break;
3697: case 2:
3698: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3699: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3700: break;
3701: case 1:
3702: x[row--] += sum1 * (*ibdiag);
3703: break;
3704: }
3705: } else {
3706: /* no diag so set = */
3707: switch (sizes[i]) {
3708: case 5:
3709: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3710: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3711: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3712: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3713: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3714: break;
3715: case 4:
3716: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3717: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3718: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3719: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3720: break;
3721: case 3:
3722: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3723: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3724: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3725: break;
3726: case 2:
3727: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3728: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3729: break;
3730: case 1:
3731: x[row--] = sum1 * (*ibdiag);
3732: break;
3733: }
3734: }
3735: }
3736: if (xb == b) {
3737: PetscLogFlops(2.0 * a->nz);
3738: } else {
3739: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3740: }
3741: }
3742: }
3743: if (flag & SOR_EISENSTAT) {
3744: /*
3745: Apply (U + D)^-1 where D is now the block diagonal
3746: */
3747: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3748: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3749: ibdiag -= sizes[i] * sizes[i];
3750: sz = ii[row + 1] - diag[row] - 1;
3751: v1 = a->a + diag[row] + 1;
3752: idx = a->j + diag[row] + 1;
3753: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3754: switch (sizes[i]) {
3755: case 1:
3757: sum1 = b[row];
3758: for (n = 0; n < sz - 1; n += 2) {
3759: i1 = idx[0];
3760: i2 = idx[1];
3761: idx += 2;
3762: tmp0 = x[i1];
3763: tmp1 = x[i2];
3764: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3765: v1 += 2;
3766: }
3768: if (n == sz - 1) {
3769: tmp0 = x[*idx];
3770: sum1 -= *v1 * tmp0;
3771: }
3772: x[row] = sum1 * (*ibdiag);
3773: row--;
3774: break;
3776: case 2:
3778: sum1 = b[row];
3779: sum2 = b[row - 1];
3780: /* note that sum1 is associated with the second of the two rows */
3781: v2 = a->a + diag[row - 1] + 2;
3782: for (n = 0; n < sz - 1; n += 2) {
3783: i1 = idx[0];
3784: i2 = idx[1];
3785: idx += 2;
3786: tmp0 = x[i1];
3787: tmp1 = x[i2];
3788: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3789: v1 += 2;
3790: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3791: v2 += 2;
3792: }
3794: if (n == sz - 1) {
3795: tmp0 = x[*idx];
3796: sum1 -= *v1 * tmp0;
3797: sum2 -= *v2 * tmp0;
3798: }
3799: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3800: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3801: row -= 2;
3802: break;
3803: case 3:
3805: sum1 = b[row];
3806: sum2 = b[row - 1];
3807: sum3 = b[row - 2];
3808: v2 = a->a + diag[row - 1] + 2;
3809: v3 = a->a + diag[row - 2] + 3;
3810: for (n = 0; n < sz - 1; n += 2) {
3811: i1 = idx[0];
3812: i2 = idx[1];
3813: idx += 2;
3814: tmp0 = x[i1];
3815: tmp1 = x[i2];
3816: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3817: v1 += 2;
3818: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3819: v2 += 2;
3820: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3821: v3 += 2;
3822: }
3824: if (n == sz - 1) {
3825: tmp0 = x[*idx];
3826: sum1 -= *v1 * tmp0;
3827: sum2 -= *v2 * tmp0;
3828: sum3 -= *v3 * tmp0;
3829: }
3830: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3831: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3832: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3833: row -= 3;
3834: break;
3835: case 4:
3837: sum1 = b[row];
3838: sum2 = b[row - 1];
3839: sum3 = b[row - 2];
3840: sum4 = b[row - 3];
3841: v2 = a->a + diag[row - 1] + 2;
3842: v3 = a->a + diag[row - 2] + 3;
3843: v4 = a->a + diag[row - 3] + 4;
3844: for (n = 0; n < sz - 1; n += 2) {
3845: i1 = idx[0];
3846: i2 = idx[1];
3847: idx += 2;
3848: tmp0 = x[i1];
3849: tmp1 = x[i2];
3850: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3851: v1 += 2;
3852: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3853: v2 += 2;
3854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3855: v3 += 2;
3856: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3857: v4 += 2;
3858: }
3860: if (n == sz - 1) {
3861: tmp0 = x[*idx];
3862: sum1 -= *v1 * tmp0;
3863: sum2 -= *v2 * tmp0;
3864: sum3 -= *v3 * tmp0;
3865: sum4 -= *v4 * tmp0;
3866: }
3867: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3868: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3869: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3870: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3871: row -= 4;
3872: break;
3873: case 5:
3875: sum1 = b[row];
3876: sum2 = b[row - 1];
3877: sum3 = b[row - 2];
3878: sum4 = b[row - 3];
3879: sum5 = b[row - 4];
3880: v2 = a->a + diag[row - 1] + 2;
3881: v3 = a->a + diag[row - 2] + 3;
3882: v4 = a->a + diag[row - 3] + 4;
3883: v5 = a->a + diag[row - 4] + 5;
3884: for (n = 0; n < sz - 1; n += 2) {
3885: i1 = idx[0];
3886: i2 = idx[1];
3887: idx += 2;
3888: tmp0 = x[i1];
3889: tmp1 = x[i2];
3890: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3891: v1 += 2;
3892: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3893: v2 += 2;
3894: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3895: v3 += 2;
3896: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3897: v4 += 2;
3898: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3899: v5 += 2;
3900: }
3902: if (n == sz - 1) {
3903: tmp0 = x[*idx];
3904: sum1 -= *v1 * tmp0;
3905: sum2 -= *v2 * tmp0;
3906: sum3 -= *v3 * tmp0;
3907: sum4 -= *v4 * tmp0;
3908: sum5 -= *v5 * tmp0;
3909: }
3910: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3911: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3912: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3913: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3914: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3915: row -= 5;
3916: break;
3917: default:
3918: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3919: }
3920: }
3921: PetscLogFlops(a->nz);
3923: /*
3924: t = b - D x where D is the block diagonal
3925: */
3926: cnt = 0;
3927: for (i = 0, row = 0; i < m; i++) {
3928: switch (sizes[i]) {
3929: case 1:
3930: t[row] = b[row] - bdiag[cnt++] * x[row];
3931: row++;
3932: break;
3933: case 2:
3934: x1 = x[row];
3935: x2 = x[row + 1];
3936: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3937: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3938: t[row] = b[row] - tmp1;
3939: t[row + 1] = b[row + 1] - tmp2;
3940: row += 2;
3941: cnt += 4;
3942: break;
3943: case 3:
3944: x1 = x[row];
3945: x2 = x[row + 1];
3946: x3 = x[row + 2];
3947: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3948: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3949: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3950: t[row] = b[row] - tmp1;
3951: t[row + 1] = b[row + 1] - tmp2;
3952: t[row + 2] = b[row + 2] - tmp3;
3953: row += 3;
3954: cnt += 9;
3955: break;
3956: case 4:
3957: x1 = x[row];
3958: x2 = x[row + 1];
3959: x3 = x[row + 2];
3960: x4 = x[row + 3];
3961: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3962: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3963: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3964: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3965: t[row] = b[row] - tmp1;
3966: t[row + 1] = b[row + 1] - tmp2;
3967: t[row + 2] = b[row + 2] - tmp3;
3968: t[row + 3] = b[row + 3] - tmp4;
3969: row += 4;
3970: cnt += 16;
3971: break;
3972: case 5:
3973: x1 = x[row];
3974: x2 = x[row + 1];
3975: x3 = x[row + 2];
3976: x4 = x[row + 3];
3977: x5 = x[row + 4];
3978: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3979: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3980: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3981: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3982: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3983: t[row] = b[row] - tmp1;
3984: t[row + 1] = b[row + 1] - tmp2;
3985: t[row + 2] = b[row + 2] - tmp3;
3986: t[row + 3] = b[row + 3] - tmp4;
3987: t[row + 4] = b[row + 4] - tmp5;
3988: row += 5;
3989: cnt += 25;
3990: break;
3991: default:
3992: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3993: }
3994: }
3995: PetscLogFlops(m);
3997: /*
3998: Apply (L + D)^-1 where D is the block diagonal
3999: */
4000: for (i = 0, row = 0; i < m; i++) {
4001: sz = diag[row] - ii[row];
4002: v1 = a->a + ii[row];
4003: idx = a->j + ii[row];
4004: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4005: switch (sizes[i]) {
4006: case 1:
4008: sum1 = t[row];
4009: for (n = 0; n < sz - 1; n += 2) {
4010: i1 = idx[0];
4011: i2 = idx[1];
4012: idx += 2;
4013: tmp0 = t[i1];
4014: tmp1 = t[i2];
4015: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4016: v1 += 2;
4017: }
4019: if (n == sz - 1) {
4020: tmp0 = t[*idx];
4021: sum1 -= *v1 * tmp0;
4022: }
4023: x[row] += t[row] = sum1 * (*ibdiag++);
4024: row++;
4025: break;
4026: case 2:
4027: v2 = a->a + ii[row + 1];
4028: sum1 = t[row];
4029: sum2 = t[row + 1];
4030: for (n = 0; n < sz - 1; n += 2) {
4031: i1 = idx[0];
4032: i2 = idx[1];
4033: idx += 2;
4034: tmp0 = t[i1];
4035: tmp1 = t[i2];
4036: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4037: v1 += 2;
4038: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4039: v2 += 2;
4040: }
4042: if (n == sz - 1) {
4043: tmp0 = t[*idx];
4044: sum1 -= v1[0] * tmp0;
4045: sum2 -= v2[0] * tmp0;
4046: }
4047: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4048: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4049: ibdiag += 4;
4050: row += 2;
4051: break;
4052: case 3:
4053: v2 = a->a + ii[row + 1];
4054: v3 = a->a + ii[row + 2];
4055: sum1 = t[row];
4056: sum2 = t[row + 1];
4057: sum3 = t[row + 2];
4058: for (n = 0; n < sz - 1; n += 2) {
4059: i1 = idx[0];
4060: i2 = idx[1];
4061: idx += 2;
4062: tmp0 = t[i1];
4063: tmp1 = t[i2];
4064: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4065: v1 += 2;
4066: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4067: v2 += 2;
4068: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4069: v3 += 2;
4070: }
4072: if (n == sz - 1) {
4073: tmp0 = t[*idx];
4074: sum1 -= v1[0] * tmp0;
4075: sum2 -= v2[0] * tmp0;
4076: sum3 -= v3[0] * tmp0;
4077: }
4078: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4079: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4080: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4081: ibdiag += 9;
4082: row += 3;
4083: break;
4084: case 4:
4085: v2 = a->a + ii[row + 1];
4086: v3 = a->a + ii[row + 2];
4087: v4 = a->a + ii[row + 3];
4088: sum1 = t[row];
4089: sum2 = t[row + 1];
4090: sum3 = t[row + 2];
4091: sum4 = t[row + 3];
4092: for (n = 0; n < sz - 1; n += 2) {
4093: i1 = idx[0];
4094: i2 = idx[1];
4095: idx += 2;
4096: tmp0 = t[i1];
4097: tmp1 = t[i2];
4098: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4099: v1 += 2;
4100: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4101: v2 += 2;
4102: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4103: v3 += 2;
4104: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4105: v4 += 2;
4106: }
4108: if (n == sz - 1) {
4109: tmp0 = t[*idx];
4110: sum1 -= v1[0] * tmp0;
4111: sum2 -= v2[0] * tmp0;
4112: sum3 -= v3[0] * tmp0;
4113: sum4 -= v4[0] * tmp0;
4114: }
4115: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4116: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4117: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4118: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4119: ibdiag += 16;
4120: row += 4;
4121: break;
4122: case 5:
4123: v2 = a->a + ii[row + 1];
4124: v3 = a->a + ii[row + 2];
4125: v4 = a->a + ii[row + 3];
4126: v5 = a->a + ii[row + 4];
4127: sum1 = t[row];
4128: sum2 = t[row + 1];
4129: sum3 = t[row + 2];
4130: sum4 = t[row + 3];
4131: sum5 = t[row + 4];
4132: for (n = 0; n < sz - 1; n += 2) {
4133: i1 = idx[0];
4134: i2 = idx[1];
4135: idx += 2;
4136: tmp0 = t[i1];
4137: tmp1 = t[i2];
4138: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4139: v1 += 2;
4140: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4141: v2 += 2;
4142: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4143: v3 += 2;
4144: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4145: v4 += 2;
4146: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4147: v5 += 2;
4148: }
4150: if (n == sz - 1) {
4151: tmp0 = t[*idx];
4152: sum1 -= v1[0] * tmp0;
4153: sum2 -= v2[0] * tmp0;
4154: sum3 -= v3[0] * tmp0;
4155: sum4 -= v4[0] * tmp0;
4156: sum5 -= v5[0] * tmp0;
4157: }
4158: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4159: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4160: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4161: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4162: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4163: ibdiag += 25;
4164: row += 5;
4165: break;
4166: default:
4167: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4168: }
4169: }
4170: PetscLogFlops(a->nz);
4171: }
4172: VecRestoreArray(xx, &x);
4173: VecRestoreArrayRead(bb, &b);
4174: return 0;
4175: }
4177: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4178: {
4179: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4180: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4181: const MatScalar *bdiag = a->inode.bdiag;
4182: const PetscScalar *b;
4183: PetscInt m = a->inode.node_count, cnt = 0, i, row;
4184: const PetscInt *sizes = a->inode.size;
4187: VecGetArray(xx, &x);
4188: VecGetArrayRead(bb, &b);
4189: cnt = 0;
4190: for (i = 0, row = 0; i < m; i++) {
4191: switch (sizes[i]) {
4192: case 1:
4193: x[row] = b[row] * bdiag[cnt++];
4194: row++;
4195: break;
4196: case 2:
4197: x1 = b[row];
4198: x2 = b[row + 1];
4199: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4200: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4201: x[row++] = tmp1;
4202: x[row++] = tmp2;
4203: cnt += 4;
4204: break;
4205: case 3:
4206: x1 = b[row];
4207: x2 = b[row + 1];
4208: x3 = b[row + 2];
4209: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4210: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4211: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4212: x[row++] = tmp1;
4213: x[row++] = tmp2;
4214: x[row++] = tmp3;
4215: cnt += 9;
4216: break;
4217: case 4:
4218: x1 = b[row];
4219: x2 = b[row + 1];
4220: x3 = b[row + 2];
4221: x4 = b[row + 3];
4222: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4223: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4224: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4225: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4226: x[row++] = tmp1;
4227: x[row++] = tmp2;
4228: x[row++] = tmp3;
4229: x[row++] = tmp4;
4230: cnt += 16;
4231: break;
4232: case 5:
4233: x1 = b[row];
4234: x2 = b[row + 1];
4235: x3 = b[row + 2];
4236: x4 = b[row + 3];
4237: x5 = b[row + 4];
4238: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4239: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4240: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4241: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4242: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4243: x[row++] = tmp1;
4244: x[row++] = tmp2;
4245: x[row++] = tmp3;
4246: x[row++] = tmp4;
4247: x[row++] = tmp5;
4248: cnt += 25;
4249: break;
4250: default:
4251: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4252: }
4253: }
4254: PetscLogFlops(2.0 * cnt);
4255: VecRestoreArray(xx, &x);
4256: VecRestoreArrayRead(bb, &b);
4257: return 0;
4258: }
4260: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4261: {
4262: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4264: a->inode.node_count = 0;
4265: a->inode.use = PETSC_FALSE;
4266: a->inode.checked = PETSC_FALSE;
4267: a->inode.mat_nonzerostate = -1;
4268: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4269: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4270: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4271: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4272: A->ops->coloringpatch = NULL;
4273: A->ops->multdiagonalblock = NULL;
4274: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4275: return 0;
4276: }
4278: /*
4279: samestructure indicates that the matrix has not changed its nonzero structure so we
4280: do not need to recompute the inodes
4281: */
4282: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4283: {
4284: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4285: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
4286: PetscBool flag;
4287: const PetscInt *idx, *idy, *ii;
4289: if (!a->inode.use) {
4290: MatSeqAIJ_Inode_ResetOps(A);
4291: PetscFree(a->inode.size);
4292: return 0;
4293: }
4294: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return 0;
4296: m = A->rmap->n;
4297: if (!a->inode.size) PetscMalloc1(m + 1, &a->inode.size);
4298: ns = a->inode.size;
4300: i = 0;
4301: node_count = 0;
4302: idx = a->j;
4303: ii = a->i;
4304: while (i < m) { /* For each row */
4305: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4306: /* Limits the number of elements in a node to 'a->inode.limit' */
4307: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4308: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4309: if (nzy != nzx) break;
4310: idy += nzx; /* Same nonzero pattern */
4311: PetscArraycmp(idx, idy, nzx, &flag);
4312: if (!flag) break;
4313: }
4314: ns[node_count++] = blk_size;
4315: idx += blk_size * nzx;
4316: i = j;
4317: }
4319: /* If not enough inodes found,, do not use inode version of the routines */
4320: if (!m || node_count > .8 * m) {
4321: MatSeqAIJ_Inode_ResetOps(A);
4322: PetscFree(a->inode.size);
4323: PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m);
4324: } else {
4325: if (!A->factortype) {
4326: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4327: if (A->rmap->n == A->cmap->n) {
4328: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4329: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4330: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4331: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4332: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4333: }
4334: } else {
4335: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4336: }
4337: a->inode.node_count = node_count;
4338: PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit);
4339: }
4340: a->inode.checked = PETSC_TRUE;
4341: a->inode.mat_nonzerostate = A->nonzerostate;
4342: return 0;
4343: }
4345: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4346: {
4347: Mat B = *C;
4348: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4349: PetscInt m = A->rmap->n;
4351: c->inode.use = a->inode.use;
4352: c->inode.limit = a->inode.limit;
4353: c->inode.max_limit = a->inode.max_limit;
4354: c->inode.checked = PETSC_FALSE;
4355: c->inode.size = NULL;
4356: c->inode.node_count = 0;
4357: c->inode.ibdiagvalid = PETSC_FALSE;
4358: c->inode.ibdiag = NULL;
4359: c->inode.bdiag = NULL;
4360: c->inode.mat_nonzerostate = -1;
4361: if (a->inode.use) {
4362: if (a->inode.checked && a->inode.size) {
4363: PetscMalloc1(m + 1, &c->inode.size);
4364: PetscArraycpy(c->inode.size, a->inode.size, m + 1);
4366: c->inode.checked = PETSC_TRUE;
4367: c->inode.node_count = a->inode.node_count;
4368: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4369: }
4370: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4371: if (!B->factortype) {
4372: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4373: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4374: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4375: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4376: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4377: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4378: } else {
4379: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4380: }
4381: }
4382: return 0;
4383: }
4385: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4386: {
4387: PetscInt k;
4388: const PetscInt *vi;
4390: vi = aj + ai[row];
4391: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4392: vi = aj + adiag[row];
4393: cols[nzl] = vi[0];
4394: vi = aj + adiag[row + 1] + 1;
4395: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4396: return 0;
4397: }
4398: /*
4399: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4400: Modified from MatSeqAIJCheckInode().
4402: Input Parameters:
4403: . Mat A - ILU or LU matrix factor
4405: */
4406: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4407: {
4408: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4409: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4410: PetscInt *cols1, *cols2, *ns;
4411: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4412: PetscBool flag;
4414: if (!a->inode.use) return 0;
4415: if (a->inode.checked) return 0;
4417: m = A->rmap->n;
4418: if (a->inode.size) ns = a->inode.size;
4419: else PetscMalloc1(m + 1, &ns);
4421: i = 0;
4422: node_count = 0;
4423: PetscMalloc2(m, &cols1, m, &cols2);
4424: while (i < m) { /* For each row */
4425: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4426: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4427: nzx = nzl1 + nzu1 + 1;
4428: MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i);
4430: /* Limits the number of elements in a node to 'a->inode.limit' */
4431: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4432: nzl2 = ai[j + 1] - ai[j];
4433: nzu2 = adiag[j] - adiag[j + 1] - 1;
4434: nzy = nzl2 + nzu2 + 1;
4435: if (nzy != nzx) break;
4436: MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j);
4437: PetscArraycmp(cols1, cols2, nzx, &flag);
4438: if (!flag) break;
4439: }
4440: ns[node_count++] = blk_size;
4441: i = j;
4442: }
4443: PetscFree2(cols1, cols2);
4444: /* If not enough inodes found,, do not use inode version of the routines */
4445: if (!m || node_count > .8 * m) {
4446: PetscFree(ns);
4448: a->inode.node_count = 0;
4449: a->inode.size = NULL;
4450: a->inode.use = PETSC_FALSE;
4452: PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m);
4453: } else {
4454: A->ops->mult = NULL;
4455: A->ops->sor = NULL;
4456: A->ops->multadd = NULL;
4457: A->ops->getrowij = NULL;
4458: A->ops->restorerowij = NULL;
4459: A->ops->getcolumnij = NULL;
4460: A->ops->restorecolumnij = NULL;
4461: A->ops->coloringpatch = NULL;
4462: A->ops->multdiagonalblock = NULL;
4463: a->inode.node_count = node_count;
4464: a->inode.size = ns;
4466: PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit);
4467: }
4468: a->inode.checked = PETSC_TRUE;
4469: return 0;
4470: }
4472: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4473: {
4474: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4476: a->inode.ibdiagvalid = PETSC_FALSE;
4477: return 0;
4478: }
4480: /*
4481: This is really ugly. if inodes are used this replaces the
4482: permutations with ones that correspond to rows/cols of the matrix
4483: rather then inode blocks
4484: */
4485: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4486: {
4487: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4488: return 0;
4489: }
4491: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4492: {
4493: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4494: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4495: const PetscInt *ridx, *cidx;
4496: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4497: PetscInt nslim_col, *ns_col;
4498: IS ris = *rperm, cis = *cperm;
4500: if (!a->inode.size) return 0; /* no inodes so return */
4501: if (a->inode.node_count == m) return 0; /* all inodes are of size 1 */
4503: MatCreateColInode_Private(A, &nslim_col, &ns_col);
4504: PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns);
4505: PetscMalloc2(m, &permr, n, &permc);
4507: ISGetIndices(ris, &ridx);
4508: ISGetIndices(cis, &cidx);
4510: /* Form the inode structure for the rows of permuted matric using inv perm*/
4511: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4513: /* Construct the permutations for rows*/
4514: for (i = 0, row = 0; i < nslim_row; ++i) {
4515: indx = ridx[i];
4516: start_val = tns[indx];
4517: end_val = tns[indx + 1];
4518: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4519: }
4521: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4522: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4524: /* Construct permutations for columns */
4525: for (i = 0, col = 0; i < nslim_col; ++i) {
4526: indx = cidx[i];
4527: start_val = tns[indx];
4528: end_val = tns[indx + 1];
4529: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4530: }
4532: ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm);
4533: ISSetPermutation(*rperm);
4534: ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm);
4535: ISSetPermutation(*cperm);
4537: ISRestoreIndices(ris, &ridx);
4538: ISRestoreIndices(cis, &cidx);
4540: PetscFree(ns_col);
4541: PetscFree2(permr, permc);
4542: ISDestroy(&cis);
4543: ISDestroy(&ris);
4544: PetscFree(tns);
4545: return 0;
4546: }
4548: /*@C
4549: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4551: Not Collective
4553: Input Parameter:
4554: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4556: Output Parameters:
4557: + node_count - no of inodes present in the matrix.
4558: . sizes - an array of size node_count,with sizes of each inode.
4559: - limit - the max size used to generate the inodes.
4561: Level: advanced
4563: Note:
4564: This routine returns some internal storage information
4565: of the matrix, it is intended to be used by advanced users.
4566: It should be called after the matrix is assembled.
4567: The contents of the sizes[] array should not be changed.
4568: NULL may be passed for information not requested.
4570: .seealso: `MatGetInfo()`
4571: @*/
4572: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4573: {
4574: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4577: PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f);
4578: if (f) (*f)(A, node_count, sizes, limit);
4579: return 0;
4580: }
4582: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4583: {
4584: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4586: if (node_count) *node_count = a->inode.node_count;
4587: if (sizes) *sizes = a->inode.size;
4588: if (limit) *limit = a->inode.limit;
4589: return 0;
4590: }