Actual source code: inode.c


  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

  8: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
  9: {
 10:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
 11:   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;

 13:   n = A->cmap->n;
 14:   m = A->rmap->n;
 16:   ns_row = a->inode.size;

 18:   min_mn = (m < n) ? m : n;
 19:   if (!ns) {
 20:     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
 21:       ;
 22:     for (; count + 1 < n; count++, i++)
 23:       ;
 24:     if (count < n) i++;
 25:     *size = i;
 26:     return 0;
 27:   }
 28:   PetscMalloc1(n + 1, &ns_col);

 30:   /* Use the same row structure wherever feasible. */
 31:   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];

 33:   /* if m < n; pad up the remainder with inode_limit */
 34:   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
 35:   /* The last node is the odd ball. padd it up with the remaining rows; */
 36:   if (count < n) {
 37:     ns_col[i] = n - count;
 38:     i++;
 39:   } else if (count > n) {
 40:     /* Adjust for the over estimation */
 41:     ns_col[i - 1] += n - count;
 42:   }
 43:   *size = i;
 44:   *ns   = ns_col;
 45:   return 0;
 46: }

 48: /*
 49:       This builds symmetric version of nonzero structure,
 50: */
 51: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
 52: {
 53:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
 54:   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
 55:   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
 56:   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;

 58:   nslim_row = a->inode.node_count;
 59:   m         = A->rmap->n;
 60:   n         = A->cmap->n;

 64:   /* Use the row_inode as column_inode */
 65:   nslim_col = nslim_row;
 66:   ns_col    = ns_row;

 68:   /* allocate space for reformatted inode structure */
 69:   PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc);
 70:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];

 72:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
 73:     nsz = ns_col[i1];
 74:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
 75:   }
 76:   /* allocate space for row pointers */
 77:   PetscCalloc1(nslim_row + 1, &ia);
 78:   *iia = ia;
 79:   PetscMalloc1(nslim_row + 1, &work);

 81:   /* determine the number of columns in each row */
 82:   ia[0] = oshift;
 83:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
 84:     j    = aj + ai[row] + ishift;
 85:     jmax = aj + ai[row + 1] + ishift;
 86:     if (j == jmax) continue; /* empty row */
 87:     col = *j++ + ishift;
 88:     i2  = tvc[col];
 89:     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
 90:       ia[i1 + 1]++;
 91:       ia[i2 + 1]++;
 92:       i2++; /* Start col of next node */
 93:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
 94:       i2 = tvc[col];
 95:     }
 96:     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
 97:   }

 99:   /* shift ia[i] to point to next row */
100:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
101:     row = ia[i1 - 1];
102:     ia[i1] += row;
103:     work[i1 - 1] = row - oshift;
104:   }

106:   /* allocate space for column pointers */
107:   nz = ia[nslim_row] + (!ishift);
108:   PetscMalloc1(nz, &ja);
109:   *jja = ja;

111:   /* loop over lower triangular part putting into ja */
112:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
113:     j    = aj + ai[row] + ishift;
114:     jmax = aj + ai[row + 1] + ishift;
115:     if (j == jmax) continue; /* empty row */
116:     col = *j++ + ishift;
117:     i2  = tvc[col];
118:     while (i2 < i1 && j < jmax) {
119:       ja[work[i2]++] = i1 + oshift;
120:       ja[work[i1]++] = i2 + oshift;
121:       ++i2;
122:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
123:       i2 = tvc[col];
124:     }
125:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
126:   }
127:   PetscFree(work);
128:   PetscFree2(tns, tvc);
129:   return 0;
130: }

132: /*
133:       This builds nonsymmetric version of nonzero structure,
134: */
135: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
136: {
137:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
138:   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
139:   PetscInt       *tns, *tvc, nsz, i1, i2;
140:   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;

143:   nslim_row = a->inode.node_count;
144:   n         = A->cmap->n;

146:   /* Create The column_inode for this matrix */
147:   MatCreateColInode_Private(A, &nslim_col, &ns_col);

149:   /* allocate space for reformatted column_inode structure */
150:   PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc);
151:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];

153:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
154:     nsz = ns_col[i1];
155:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
156:   }
157:   /* allocate space for row pointers */
158:   PetscCalloc1(nslim_row + 1, &ia);
159:   *iia = ia;
160:   PetscMalloc1(nslim_row + 1, &work);

162:   /* determine the number of columns in each row */
163:   ia[0] = oshift;
164:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
165:     j  = aj + ai[row] + ishift;
166:     nz = ai[row + 1] - ai[row];
167:     if (!nz) continue; /* empty row */
168:     col = *j++ + ishift;
169:     i2  = tvc[col];
170:     while (nz-- > 0) { /* off-diagonal elements */
171:       ia[i1 + 1]++;
172:       i2++; /* Start col of next node */
173:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
174:       if (nz > 0) i2 = tvc[col];
175:     }
176:   }

178:   /* shift ia[i] to point to next row */
179:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
180:     row = ia[i1 - 1];
181:     ia[i1] += row;
182:     work[i1 - 1] = row - oshift;
183:   }

185:   /* allocate space for column pointers */
186:   nz = ia[nslim_row] + (!ishift);
187:   PetscMalloc1(nz, &ja);
188:   *jja = ja;

190:   /* loop over matrix putting into ja */
191:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
192:     j  = aj + ai[row] + ishift;
193:     nz = ai[row + 1] - ai[row];
194:     if (!nz) continue; /* empty row */
195:     col = *j++ + ishift;
196:     i2  = tvc[col];
197:     while (nz-- > 0) {
198:       ja[work[i1]++] = i2 + oshift;
199:       ++i2;
200:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
201:       if (nz > 0) i2 = tvc[col];
202:     }
203:   }
204:   PetscFree(ns_col);
205:   PetscFree(work);
206:   PetscFree2(tns, tvc);
207:   return 0;
208: }

210: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
211: {
212:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

214:   if (n) *n = a->inode.node_count;
215:   if (!ia) return 0;
216:   if (!blockcompressed) {
217:     MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
218:   } else if (symmetric) {
219:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift);
220:   } else {
221:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift);
222:   }
223:   return 0;
224: }

226: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
227: {
228:   if (!ia) return 0;

230:   if (!blockcompressed) {
231:     MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
232:   } else {
233:     PetscFree(*ia);
234:     PetscFree(*ja);
235:   }
236:   return 0;
237: }

239: /* ----------------------------------------------------------- */

241: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
242: {
243:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
244:   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
245:   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;

248:   nslim_row = a->inode.node_count;
249:   n         = A->cmap->n;

251:   /* Create The column_inode for this matrix */
252:   MatCreateColInode_Private(A, &nslim_col, &ns_col);

254:   /* allocate space for reformatted column_inode structure */
255:   PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc);
256:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];

258:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
259:     nsz = ns_col[i1];
260:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
261:   }
262:   /* allocate space for column pointers */
263:   PetscCalloc1(nslim_col + 1, &ia);
264:   *iia = ia;
265:   PetscMalloc1(nslim_col + 1, &work);

267:   /* determine the number of columns in each row */
268:   ia[0] = oshift;
269:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
270:     j   = aj + ai[row] + ishift;
271:     col = *j++ + ishift;
272:     i2  = tvc[col];
273:     nz  = ai[row + 1] - ai[row];
274:     while (nz-- > 0) { /* off-diagonal elements */
275:       /* ia[i1+1]++; */
276:       ia[i2 + 1]++;
277:       i2++;
278:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
279:       if (nz > 0) i2 = tvc[col];
280:     }
281:   }

283:   /* shift ia[i] to point to next col */
284:   for (i1 = 1; i1 < nslim_col + 1; i1++) {
285:     col = ia[i1 - 1];
286:     ia[i1] += col;
287:     work[i1 - 1] = col - oshift;
288:   }

290:   /* allocate space for column pointers */
291:   nz = ia[nslim_col] + (!ishift);
292:   PetscMalloc1(nz, &ja);
293:   *jja = ja;

295:   /* loop over matrix putting into ja */
296:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
297:     j   = aj + ai[row] + ishift;
298:     col = *j++ + ishift;
299:     i2  = tvc[col];
300:     nz  = ai[row + 1] - ai[row];
301:     while (nz-- > 0) {
302:       /* ja[work[i1]++] = i2 + oshift; */
303:       ja[work[i2]++] = i1 + oshift;
304:       i2++;
305:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
306:       if (nz > 0) i2 = tvc[col];
307:     }
308:   }
309:   PetscFree(ns_col);
310:   PetscFree(work);
311:   PetscFree2(tns, tvc);
312:   return 0;
313: }

315: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
316: {
317:   MatCreateColInode_Private(A, n, NULL);
318:   if (!ia) return 0;

320:   if (!blockcompressed) {
321:     MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
322:   } else if (symmetric) {
323:     /* Since the indices are symmetric it doesn't matter */
324:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift);
325:   } else {
326:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift);
327:   }
328:   return 0;
329: }

331: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
332: {
333:   if (!ia) return 0;
334:   if (!blockcompressed) {
335:     MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done);
336:   } else {
337:     PetscFree(*ia);
338:     PetscFree(*ja);
339:   }
340:   return 0;
341: }

343: /* ----------------------------------------------------------- */

345: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
346: {
347:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
348:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
349:   PetscScalar       *y;
350:   const PetscScalar *x;
351:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
352:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
353:   const PetscInt    *idx, *ns, *ii;

355: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
356:   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
357: #endif

360:   node_max = a->inode.node_count;
361:   ns       = a->inode.size; /* Node Size array */
362:   VecGetArrayRead(xx, &x);
363:   VecGetArray(yy, &y);
364:   idx = a->j;
365:   v1  = a->a;
366:   ii  = a->i;

368:   for (i = 0, row = 0; i < node_max; ++i) {
369:     nsz = ns[i];
370:     n   = ii[1] - ii[0];
371:     nonzerorow += (n > 0) * nsz;
372:     ii += nsz;
373:     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
374:     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
375:     sz = n;                                                                /* No of non zeros in this row */
376:                                                                            /* Switch on the size of Node */
377:     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
378:     case 1:
379:       sum1 = 0.;

381:       for (n = 0; n < sz - 1; n += 2) {
382:         i1 = idx[0]; /* The instructions are ordered to */
383:         i2 = idx[1]; /* make the compiler's job easy */
384:         idx += 2;
385:         tmp0 = x[i1];
386:         tmp1 = x[i2];
387:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
388:         v1 += 2;
389:       }

391:       if (n == sz - 1) { /* Take care of the last nonzero  */
392:         tmp0 = x[*idx++];
393:         sum1 += *v1++ * tmp0;
394:       }
395:       y[row++] = sum1;
396:       break;
397:     case 2:
398:       sum1 = 0.;
399:       sum2 = 0.;
400:       v2   = v1 + n;

402:       for (n = 0; n < sz - 1; n += 2) {
403:         i1 = idx[0];
404:         i2 = idx[1];
405:         idx += 2;
406:         tmp0 = x[i1];
407:         tmp1 = x[i2];
408:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
409:         v1 += 2;
410:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
411:         v2 += 2;
412:       }
413:       if (n == sz - 1) {
414:         tmp0 = x[*idx++];
415:         sum1 += *v1++ * tmp0;
416:         sum2 += *v2++ * tmp0;
417:       }
418:       y[row++] = sum1;
419:       y[row++] = sum2;
420:       v1       = v2; /* Since the next block to be processed starts there*/
421:       idx += sz;
422:       break;
423:     case 3:
424:       sum1 = 0.;
425:       sum2 = 0.;
426:       sum3 = 0.;
427:       v2   = v1 + n;
428:       v3   = v2 + n;

430:       for (n = 0; n < sz - 1; n += 2) {
431:         i1 = idx[0];
432:         i2 = idx[1];
433:         idx += 2;
434:         tmp0 = x[i1];
435:         tmp1 = x[i2];
436:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
437:         v1 += 2;
438:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
439:         v2 += 2;
440:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
441:         v3 += 2;
442:       }
443:       if (n == sz - 1) {
444:         tmp0 = x[*idx++];
445:         sum1 += *v1++ * tmp0;
446:         sum2 += *v2++ * tmp0;
447:         sum3 += *v3++ * tmp0;
448:       }
449:       y[row++] = sum1;
450:       y[row++] = sum2;
451:       y[row++] = sum3;
452:       v1       = v3; /* Since the next block to be processed starts there*/
453:       idx += 2 * sz;
454:       break;
455:     case 4:
456:       sum1 = 0.;
457:       sum2 = 0.;
458:       sum3 = 0.;
459:       sum4 = 0.;
460:       v2   = v1 + n;
461:       v3   = v2 + n;
462:       v4   = v3 + n;

464:       for (n = 0; n < sz - 1; n += 2) {
465:         i1 = idx[0];
466:         i2 = idx[1];
467:         idx += 2;
468:         tmp0 = x[i1];
469:         tmp1 = x[i2];
470:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
471:         v1 += 2;
472:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
473:         v2 += 2;
474:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
475:         v3 += 2;
476:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
477:         v4 += 2;
478:       }
479:       if (n == sz - 1) {
480:         tmp0 = x[*idx++];
481:         sum1 += *v1++ * tmp0;
482:         sum2 += *v2++ * tmp0;
483:         sum3 += *v3++ * tmp0;
484:         sum4 += *v4++ * tmp0;
485:       }
486:       y[row++] = sum1;
487:       y[row++] = sum2;
488:       y[row++] = sum3;
489:       y[row++] = sum4;
490:       v1       = v4; /* Since the next block to be processed starts there*/
491:       idx += 3 * sz;
492:       break;
493:     case 5:
494:       sum1 = 0.;
495:       sum2 = 0.;
496:       sum3 = 0.;
497:       sum4 = 0.;
498:       sum5 = 0.;
499:       v2   = v1 + n;
500:       v3   = v2 + n;
501:       v4   = v3 + n;
502:       v5   = v4 + n;

504:       for (n = 0; n < sz - 1; n += 2) {
505:         i1 = idx[0];
506:         i2 = idx[1];
507:         idx += 2;
508:         tmp0 = x[i1];
509:         tmp1 = x[i2];
510:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
511:         v1 += 2;
512:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
513:         v2 += 2;
514:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
515:         v3 += 2;
516:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
517:         v4 += 2;
518:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
519:         v5 += 2;
520:       }
521:       if (n == sz - 1) {
522:         tmp0 = x[*idx++];
523:         sum1 += *v1++ * tmp0;
524:         sum2 += *v2++ * tmp0;
525:         sum3 += *v3++ * tmp0;
526:         sum4 += *v4++ * tmp0;
527:         sum5 += *v5++ * tmp0;
528:       }
529:       y[row++] = sum1;
530:       y[row++] = sum2;
531:       y[row++] = sum3;
532:       y[row++] = sum4;
533:       y[row++] = sum5;
534:       v1       = v5; /* Since the next block to be processed starts there */
535:       idx += 4 * sz;
536:       break;
537:     default:
538:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
539:     }
540:   }
541:   VecRestoreArrayRead(xx, &x);
542:   VecRestoreArray(yy, &y);
543:   PetscLogFlops(2.0 * a->nz - nonzerorow);
544:   return 0;
545: }
546: /* ----------------------------------------------------------- */
547: /* Almost same code as the MatMult_SeqAIJ_Inode() */
548: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
549: {
550:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
551:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
552:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
553:   const PetscScalar *x;
554:   PetscScalar       *y, *z, *zt;
555:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
556:   const PetscInt    *idx, *ns, *ii;

559:   node_max = a->inode.node_count;
560:   ns       = a->inode.size; /* Node Size array */

562:   VecGetArrayRead(xx, &x);
563:   VecGetArrayPair(zz, yy, &z, &y);
564:   zt = z;

566:   idx = a->j;
567:   v1  = a->a;
568:   ii  = a->i;

570:   for (i = 0, row = 0; i < node_max; ++i) {
571:     nsz = ns[i];
572:     n   = ii[1] - ii[0];
573:     ii += nsz;
574:     sz = n;        /* No of non zeros in this row */
575:                    /* Switch on the size of Node */
576:     switch (nsz) { /* Each loop in 'case' is unrolled */
577:     case 1:
578:       sum1 = *zt++;

580:       for (n = 0; n < sz - 1; n += 2) {
581:         i1 = idx[0]; /* The instructions are ordered to */
582:         i2 = idx[1]; /* make the compiler's job easy */
583:         idx += 2;
584:         tmp0 = x[i1];
585:         tmp1 = x[i2];
586:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
587:         v1 += 2;
588:       }

590:       if (n == sz - 1) { /* Take care of the last nonzero  */
591:         tmp0 = x[*idx++];
592:         sum1 += *v1++ * tmp0;
593:       }
594:       y[row++] = sum1;
595:       break;
596:     case 2:
597:       sum1 = *zt++;
598:       sum2 = *zt++;
599:       v2   = v1 + n;

601:       for (n = 0; n < sz - 1; n += 2) {
602:         i1 = idx[0];
603:         i2 = idx[1];
604:         idx += 2;
605:         tmp0 = x[i1];
606:         tmp1 = x[i2];
607:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
608:         v1 += 2;
609:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
610:         v2 += 2;
611:       }
612:       if (n == sz - 1) {
613:         tmp0 = x[*idx++];
614:         sum1 += *v1++ * tmp0;
615:         sum2 += *v2++ * tmp0;
616:       }
617:       y[row++] = sum1;
618:       y[row++] = sum2;
619:       v1       = v2; /* Since the next block to be processed starts there*/
620:       idx += sz;
621:       break;
622:     case 3:
623:       sum1 = *zt++;
624:       sum2 = *zt++;
625:       sum3 = *zt++;
626:       v2   = v1 + n;
627:       v3   = v2 + n;

629:       for (n = 0; n < sz - 1; n += 2) {
630:         i1 = idx[0];
631:         i2 = idx[1];
632:         idx += 2;
633:         tmp0 = x[i1];
634:         tmp1 = x[i2];
635:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
636:         v1 += 2;
637:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
638:         v2 += 2;
639:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
640:         v3 += 2;
641:       }
642:       if (n == sz - 1) {
643:         tmp0 = x[*idx++];
644:         sum1 += *v1++ * tmp0;
645:         sum2 += *v2++ * tmp0;
646:         sum3 += *v3++ * tmp0;
647:       }
648:       y[row++] = sum1;
649:       y[row++] = sum2;
650:       y[row++] = sum3;
651:       v1       = v3; /* Since the next block to be processed starts there*/
652:       idx += 2 * sz;
653:       break;
654:     case 4:
655:       sum1 = *zt++;
656:       sum2 = *zt++;
657:       sum3 = *zt++;
658:       sum4 = *zt++;
659:       v2   = v1 + n;
660:       v3   = v2 + n;
661:       v4   = v3 + n;

663:       for (n = 0; n < sz - 1; n += 2) {
664:         i1 = idx[0];
665:         i2 = idx[1];
666:         idx += 2;
667:         tmp0 = x[i1];
668:         tmp1 = x[i2];
669:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
670:         v1 += 2;
671:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
672:         v2 += 2;
673:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
674:         v3 += 2;
675:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
676:         v4 += 2;
677:       }
678:       if (n == sz - 1) {
679:         tmp0 = x[*idx++];
680:         sum1 += *v1++ * tmp0;
681:         sum2 += *v2++ * tmp0;
682:         sum3 += *v3++ * tmp0;
683:         sum4 += *v4++ * tmp0;
684:       }
685:       y[row++] = sum1;
686:       y[row++] = sum2;
687:       y[row++] = sum3;
688:       y[row++] = sum4;
689:       v1       = v4; /* Since the next block to be processed starts there*/
690:       idx += 3 * sz;
691:       break;
692:     case 5:
693:       sum1 = *zt++;
694:       sum2 = *zt++;
695:       sum3 = *zt++;
696:       sum4 = *zt++;
697:       sum5 = *zt++;
698:       v2   = v1 + n;
699:       v3   = v2 + n;
700:       v4   = v3 + n;
701:       v5   = v4 + n;

703:       for (n = 0; n < sz - 1; n += 2) {
704:         i1 = idx[0];
705:         i2 = idx[1];
706:         idx += 2;
707:         tmp0 = x[i1];
708:         tmp1 = x[i2];
709:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
710:         v1 += 2;
711:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
712:         v2 += 2;
713:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
714:         v3 += 2;
715:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
716:         v4 += 2;
717:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
718:         v5 += 2;
719:       }
720:       if (n == sz - 1) {
721:         tmp0 = x[*idx++];
722:         sum1 += *v1++ * tmp0;
723:         sum2 += *v2++ * tmp0;
724:         sum3 += *v3++ * tmp0;
725:         sum4 += *v4++ * tmp0;
726:         sum5 += *v5++ * tmp0;
727:       }
728:       y[row++] = sum1;
729:       y[row++] = sum2;
730:       y[row++] = sum3;
731:       y[row++] = sum4;
732:       y[row++] = sum5;
733:       v1       = v5; /* Since the next block to be processed starts there */
734:       idx += 4 * sz;
735:       break;
736:     default:
737:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
738:     }
739:   }
740:   VecRestoreArrayRead(xx, &x);
741:   VecRestoreArrayPair(zz, yy, &z, &y);
742:   PetscLogFlops(2.0 * a->nz);
743:   return 0;
744: }

746: /* ----------------------------------------------------------- */
747: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
748: {
749:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
750:   IS                 iscol = a->col, isrow = a->row;
751:   const PetscInt    *r, *c, *rout, *cout;
752:   PetscInt           i, j, n = A->rmap->n, nz;
753:   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
754:   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
755:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
756:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
757:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
758:   const PetscScalar *b;

761:   node_max = a->inode.node_count;
762:   ns       = a->inode.size; /* Node Size array */

764:   VecGetArrayRead(bb, &b);
765:   VecGetArrayWrite(xx, &x);
766:   tmp = a->solve_work;

768:   ISGetIndices(isrow, &rout);
769:   r = rout;
770:   ISGetIndices(iscol, &cout);
771:   c = cout + (n - 1);

773:   /* forward solve the lower triangular */
774:   tmps = tmp;
775:   aa   = a_a;
776:   aj   = a_j;
777:   ad   = a->diag;

779:   for (i = 0, row = 0; i < node_max; ++i) {
780:     nsz = ns[i];
781:     aii = ai[row];
782:     v1  = aa + aii;
783:     vi  = aj + aii;
784:     nz  = ad[row] - aii;
785:     if (i < node_max - 1) {
786:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
787:       * but our indexing to determine it's size could. */
788:       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
789:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
790:       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
791:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
792:     }

794:     switch (nsz) { /* Each loop in 'case' is unrolled */
795:     case 1:
796:       sum1 = b[*r++];
797:       for (j = 0; j < nz - 1; j += 2) {
798:         i0 = vi[0];
799:         i1 = vi[1];
800:         vi += 2;
801:         tmp0 = tmps[i0];
802:         tmp1 = tmps[i1];
803:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
804:         v1 += 2;
805:       }
806:       if (j == nz - 1) {
807:         tmp0 = tmps[*vi++];
808:         sum1 -= *v1++ * tmp0;
809:       }
810:       tmp[row++] = sum1;
811:       break;
812:     case 2:
813:       sum1 = b[*r++];
814:       sum2 = b[*r++];
815:       v2   = aa + ai[row + 1];

817:       for (j = 0; j < nz - 1; j += 2) {
818:         i0 = vi[0];
819:         i1 = vi[1];
820:         vi += 2;
821:         tmp0 = tmps[i0];
822:         tmp1 = tmps[i1];
823:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
824:         v1 += 2;
825:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
826:         v2 += 2;
827:       }
828:       if (j == nz - 1) {
829:         tmp0 = tmps[*vi++];
830:         sum1 -= *v1++ * tmp0;
831:         sum2 -= *v2++ * tmp0;
832:       }
833:       sum2 -= *v2++ * sum1;
834:       tmp[row++] = sum1;
835:       tmp[row++] = sum2;
836:       break;
837:     case 3:
838:       sum1 = b[*r++];
839:       sum2 = b[*r++];
840:       sum3 = b[*r++];
841:       v2   = aa + ai[row + 1];
842:       v3   = aa + ai[row + 2];

844:       for (j = 0; j < nz - 1; j += 2) {
845:         i0 = vi[0];
846:         i1 = vi[1];
847:         vi += 2;
848:         tmp0 = tmps[i0];
849:         tmp1 = tmps[i1];
850:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
851:         v1 += 2;
852:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
853:         v2 += 2;
854:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
855:         v3 += 2;
856:       }
857:       if (j == nz - 1) {
858:         tmp0 = tmps[*vi++];
859:         sum1 -= *v1++ * tmp0;
860:         sum2 -= *v2++ * tmp0;
861:         sum3 -= *v3++ * tmp0;
862:       }
863:       sum2 -= *v2++ * sum1;
864:       sum3 -= *v3++ * sum1;
865:       sum3 -= *v3++ * sum2;

867:       tmp[row++] = sum1;
868:       tmp[row++] = sum2;
869:       tmp[row++] = sum3;
870:       break;

872:     case 4:
873:       sum1 = b[*r++];
874:       sum2 = b[*r++];
875:       sum3 = b[*r++];
876:       sum4 = b[*r++];
877:       v2   = aa + ai[row + 1];
878:       v3   = aa + ai[row + 2];
879:       v4   = aa + ai[row + 3];

881:       for (j = 0; j < nz - 1; j += 2) {
882:         i0 = vi[0];
883:         i1 = vi[1];
884:         vi += 2;
885:         tmp0 = tmps[i0];
886:         tmp1 = tmps[i1];
887:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
888:         v1 += 2;
889:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
890:         v2 += 2;
891:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
892:         v3 += 2;
893:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
894:         v4 += 2;
895:       }
896:       if (j == nz - 1) {
897:         tmp0 = tmps[*vi++];
898:         sum1 -= *v1++ * tmp0;
899:         sum2 -= *v2++ * tmp0;
900:         sum3 -= *v3++ * tmp0;
901:         sum4 -= *v4++ * tmp0;
902:       }
903:       sum2 -= *v2++ * sum1;
904:       sum3 -= *v3++ * sum1;
905:       sum4 -= *v4++ * sum1;
906:       sum3 -= *v3++ * sum2;
907:       sum4 -= *v4++ * sum2;
908:       sum4 -= *v4++ * sum3;

910:       tmp[row++] = sum1;
911:       tmp[row++] = sum2;
912:       tmp[row++] = sum3;
913:       tmp[row++] = sum4;
914:       break;
915:     case 5:
916:       sum1 = b[*r++];
917:       sum2 = b[*r++];
918:       sum3 = b[*r++];
919:       sum4 = b[*r++];
920:       sum5 = b[*r++];
921:       v2   = aa + ai[row + 1];
922:       v3   = aa + ai[row + 2];
923:       v4   = aa + ai[row + 3];
924:       v5   = aa + ai[row + 4];

926:       for (j = 0; j < nz - 1; j += 2) {
927:         i0 = vi[0];
928:         i1 = vi[1];
929:         vi += 2;
930:         tmp0 = tmps[i0];
931:         tmp1 = tmps[i1];
932:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
933:         v1 += 2;
934:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
935:         v2 += 2;
936:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
937:         v3 += 2;
938:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
939:         v4 += 2;
940:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
941:         v5 += 2;
942:       }
943:       if (j == nz - 1) {
944:         tmp0 = tmps[*vi++];
945:         sum1 -= *v1++ * tmp0;
946:         sum2 -= *v2++ * tmp0;
947:         sum3 -= *v3++ * tmp0;
948:         sum4 -= *v4++ * tmp0;
949:         sum5 -= *v5++ * tmp0;
950:       }

952:       sum2 -= *v2++ * sum1;
953:       sum3 -= *v3++ * sum1;
954:       sum4 -= *v4++ * sum1;
955:       sum5 -= *v5++ * sum1;
956:       sum3 -= *v3++ * sum2;
957:       sum4 -= *v4++ * sum2;
958:       sum5 -= *v5++ * sum2;
959:       sum4 -= *v4++ * sum3;
960:       sum5 -= *v5++ * sum3;
961:       sum5 -= *v5++ * sum4;

963:       tmp[row++] = sum1;
964:       tmp[row++] = sum2;
965:       tmp[row++] = sum3;
966:       tmp[row++] = sum4;
967:       tmp[row++] = sum5;
968:       break;
969:     default:
970:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
971:     }
972:   }
973:   /* backward solve the upper triangular */
974:   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
975:     nsz = ns[i];
976:     aii = ai[row + 1] - 1;
977:     v1  = aa + aii;
978:     vi  = aj + aii;
979:     nz  = aii - ad[row];
980:     switch (nsz) { /* Each loop in 'case' is unrolled */
981:     case 1:
982:       sum1 = tmp[row];

984:       for (j = nz; j > 1; j -= 2) {
985:         vi -= 2;
986:         i0   = vi[2];
987:         i1   = vi[1];
988:         tmp0 = tmps[i0];
989:         tmp1 = tmps[i1];
990:         v1 -= 2;
991:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
992:       }
993:       if (j == 1) {
994:         tmp0 = tmps[*vi--];
995:         sum1 -= *v1-- * tmp0;
996:       }
997:       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
998:       row--;
999:       break;
1000:     case 2:
1001:       sum1 = tmp[row];
1002:       sum2 = tmp[row - 1];
1003:       v2   = aa + ai[row] - 1;
1004:       for (j = nz; j > 1; j -= 2) {
1005:         vi -= 2;
1006:         i0   = vi[2];
1007:         i1   = vi[1];
1008:         tmp0 = tmps[i0];
1009:         tmp1 = tmps[i1];
1010:         v1 -= 2;
1011:         v2 -= 2;
1012:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1013:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1014:       }
1015:       if (j == 1) {
1016:         tmp0 = tmps[*vi--];
1017:         sum1 -= *v1-- * tmp0;
1018:         sum2 -= *v2-- * tmp0;
1019:       }

1021:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1022:       row--;
1023:       sum2 -= *v2-- * tmp0;
1024:       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1025:       row--;
1026:       break;
1027:     case 3:
1028:       sum1 = tmp[row];
1029:       sum2 = tmp[row - 1];
1030:       sum3 = tmp[row - 2];
1031:       v2   = aa + ai[row] - 1;
1032:       v3   = aa + ai[row - 1] - 1;
1033:       for (j = nz; j > 1; j -= 2) {
1034:         vi -= 2;
1035:         i0   = vi[2];
1036:         i1   = vi[1];
1037:         tmp0 = tmps[i0];
1038:         tmp1 = tmps[i1];
1039:         v1 -= 2;
1040:         v2 -= 2;
1041:         v3 -= 2;
1042:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1043:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1044:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1045:       }
1046:       if (j == 1) {
1047:         tmp0 = tmps[*vi--];
1048:         sum1 -= *v1-- * tmp0;
1049:         sum2 -= *v2-- * tmp0;
1050:         sum3 -= *v3-- * tmp0;
1051:       }
1052:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1053:       row--;
1054:       sum2 -= *v2-- * tmp0;
1055:       sum3 -= *v3-- * tmp0;
1056:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1057:       row--;
1058:       sum3 -= *v3-- * tmp0;
1059:       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1060:       row--;

1062:       break;
1063:     case 4:
1064:       sum1 = tmp[row];
1065:       sum2 = tmp[row - 1];
1066:       sum3 = tmp[row - 2];
1067:       sum4 = tmp[row - 3];
1068:       v2   = aa + ai[row] - 1;
1069:       v3   = aa + ai[row - 1] - 1;
1070:       v4   = aa + ai[row - 2] - 1;

1072:       for (j = nz; j > 1; j -= 2) {
1073:         vi -= 2;
1074:         i0   = vi[2];
1075:         i1   = vi[1];
1076:         tmp0 = tmps[i0];
1077:         tmp1 = tmps[i1];
1078:         v1 -= 2;
1079:         v2 -= 2;
1080:         v3 -= 2;
1081:         v4 -= 2;
1082:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1083:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1084:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1085:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1086:       }
1087:       if (j == 1) {
1088:         tmp0 = tmps[*vi--];
1089:         sum1 -= *v1-- * tmp0;
1090:         sum2 -= *v2-- * tmp0;
1091:         sum3 -= *v3-- * tmp0;
1092:         sum4 -= *v4-- * tmp0;
1093:       }

1095:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1096:       row--;
1097:       sum2 -= *v2-- * tmp0;
1098:       sum3 -= *v3-- * tmp0;
1099:       sum4 -= *v4-- * tmp0;
1100:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1101:       row--;
1102:       sum3 -= *v3-- * tmp0;
1103:       sum4 -= *v4-- * tmp0;
1104:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1105:       row--;
1106:       sum4 -= *v4-- * tmp0;
1107:       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1108:       row--;
1109:       break;
1110:     case 5:
1111:       sum1 = tmp[row];
1112:       sum2 = tmp[row - 1];
1113:       sum3 = tmp[row - 2];
1114:       sum4 = tmp[row - 3];
1115:       sum5 = tmp[row - 4];
1116:       v2   = aa + ai[row] - 1;
1117:       v3   = aa + ai[row - 1] - 1;
1118:       v4   = aa + ai[row - 2] - 1;
1119:       v5   = aa + ai[row - 3] - 1;
1120:       for (j = nz; j > 1; j -= 2) {
1121:         vi -= 2;
1122:         i0   = vi[2];
1123:         i1   = vi[1];
1124:         tmp0 = tmps[i0];
1125:         tmp1 = tmps[i1];
1126:         v1 -= 2;
1127:         v2 -= 2;
1128:         v3 -= 2;
1129:         v4 -= 2;
1130:         v5 -= 2;
1131:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136:       }
1137:       if (j == 1) {
1138:         tmp0 = tmps[*vi--];
1139:         sum1 -= *v1-- * tmp0;
1140:         sum2 -= *v2-- * tmp0;
1141:         sum3 -= *v3-- * tmp0;
1142:         sum4 -= *v4-- * tmp0;
1143:         sum5 -= *v5-- * tmp0;
1144:       }

1146:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1147:       row--;
1148:       sum2 -= *v2-- * tmp0;
1149:       sum3 -= *v3-- * tmp0;
1150:       sum4 -= *v4-- * tmp0;
1151:       sum5 -= *v5-- * tmp0;
1152:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1153:       row--;
1154:       sum3 -= *v3-- * tmp0;
1155:       sum4 -= *v4-- * tmp0;
1156:       sum5 -= *v5-- * tmp0;
1157:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1158:       row--;
1159:       sum4 -= *v4-- * tmp0;
1160:       sum5 -= *v5-- * tmp0;
1161:       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1162:       row--;
1163:       sum5 -= *v5-- * tmp0;
1164:       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1165:       row--;
1166:       break;
1167:     default:
1168:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1169:     }
1170:   }
1171:   ISRestoreIndices(isrow, &rout);
1172:   ISRestoreIndices(iscol, &cout);
1173:   VecRestoreArrayRead(bb, &b);
1174:   VecRestoreArrayWrite(xx, &x);
1175:   PetscLogFlops(2.0 * a->nz - A->cmap->n);
1176:   return 0;
1177: }

1179: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1180: {
1181:   Mat              C = B;
1182:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1183:   IS               isrow = b->row, isicol = b->icol;
1184:   const PetscInt  *r, *ic, *ics;
1185:   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1186:   PetscInt         i, j, k, nz, nzL, row, *pj;
1187:   const PetscInt  *ajtmp, *bjtmp;
1188:   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1189:   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1190:   FactorShiftCtx   sctx;
1191:   const PetscInt  *ddiag;
1192:   PetscReal        rs;
1193:   MatScalar        d;
1194:   PetscInt         inod, nodesz, node_max, col;
1195:   const PetscInt  *ns;
1196:   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;

1198:   /* MatPivotSetUp(): initialize shift context sctx */
1199:   PetscMemzero(&sctx, sizeof(FactorShiftCtx));

1201:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1202:     ddiag          = a->diag;
1203:     sctx.shift_top = info->zeropivot;
1204:     for (i = 0; i < n; i++) {
1205:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1206:       d  = (aa)[ddiag[i]];
1207:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1208:       v  = aa + ai[i];
1209:       nz = ai[i + 1] - ai[i];
1210:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1211:       if (rs > sctx.shift_top) sctx.shift_top = rs;
1212:     }
1213:     sctx.shift_top *= 1.1;
1214:     sctx.nshift_max = 5;
1215:     sctx.shift_lo   = 0.;
1216:     sctx.shift_hi   = 1.;
1217:   }

1219:   ISGetIndices(isrow, &r);
1220:   ISGetIndices(isicol, &ic);

1222:   PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4);
1223:   ics = ic;

1225:   node_max = a->inode.node_count;
1226:   ns       = a->inode.size;

1229:   /* If max inode size > 4, split it into two inodes.*/
1230:   /* also map the inode sizes according to the ordering */
1231:   PetscMalloc1(n + 1, &tmp_vec1);
1232:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1233:     if (ns[i] > 4) {
1234:       tmp_vec1[j] = 4;
1235:       ++j;
1236:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1237:     } else {
1238:       tmp_vec1[j] = ns[i];
1239:     }
1240:   }
1241:   /* Use the correct node_max */
1242:   node_max = j;

1244:   /* Now reorder the inode info based on mat re-ordering info */
1245:   /* First create a row -> inode_size_array_index map */
1246:   PetscMalloc1(n + 1, &nsmap);
1247:   PetscMalloc1(node_max + 1, &tmp_vec2);
1248:   for (i = 0, row = 0; i < node_max; i++) {
1249:     nodesz = tmp_vec1[i];
1250:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1251:   }
1252:   /* Using nsmap, create a reordered ns structure */
1253:   for (i = 0, j = 0; i < node_max; i++) {
1254:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1255:     tmp_vec2[i] = nodesz;
1256:     j += nodesz;
1257:   }
1258:   PetscFree(nsmap);
1259:   PetscFree(tmp_vec1);

1261:   /* Now use the correct ns */
1262:   ns = tmp_vec2;

1264:   do {
1265:     sctx.newshift = PETSC_FALSE;
1266:     /* Now loop over each block-row, and do the factorization */
1267:     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1268:       nodesz = ns[inod];

1270:       switch (nodesz) {
1271:       case 1:
1272:         /*----------*/
1273:         /* zero rtmp1 */
1274:         /* L part */
1275:         nz    = bi[i + 1] - bi[i];
1276:         bjtmp = bj + bi[i];
1277:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1279:         /* U part */
1280:         nz    = bdiag[i] - bdiag[i + 1];
1281:         bjtmp = bj + bdiag[i + 1] + 1;
1282:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1284:         /* load in initial (unfactored row) */
1285:         nz    = ai[r[i] + 1] - ai[r[i]];
1286:         ajtmp = aj + ai[r[i]];
1287:         v     = aa + ai[r[i]];
1288:         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1290:         /* ZeropivotApply() */
1291:         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

1293:         /* elimination */
1294:         bjtmp = bj + bi[i];
1295:         row   = *bjtmp++;
1296:         nzL   = bi[i + 1] - bi[i];
1297:         for (k = 0; k < nzL; k++) {
1298:           pc = rtmp1 + row;
1299:           if (*pc != 0.0) {
1300:             pv   = b->a + bdiag[row];
1301:             mul1 = *pc * (*pv);
1302:             *pc  = mul1;
1303:             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1304:             pv   = b->a + bdiag[row + 1] + 1;
1305:             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1306:             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1307:             PetscLogFlops(1 + 2.0 * nz);
1308:           }
1309:           row = *bjtmp++;
1310:         }

1312:         /* finished row so stick it into b->a */
1313:         rs = 0.0;
1314:         /* L part */
1315:         pv = b->a + bi[i];
1316:         pj = b->j + bi[i];
1317:         nz = bi[i + 1] - bi[i];
1318:         for (j = 0; j < nz; j++) {
1319:           pv[j] = rtmp1[pj[j]];
1320:           rs += PetscAbsScalar(pv[j]);
1321:         }

1323:         /* U part */
1324:         pv = b->a + bdiag[i + 1] + 1;
1325:         pj = b->j + bdiag[i + 1] + 1;
1326:         nz = bdiag[i] - bdiag[i + 1] - 1;
1327:         for (j = 0; j < nz; j++) {
1328:           pv[j] = rtmp1[pj[j]];
1329:           rs += PetscAbsScalar(pv[j]);
1330:         }

1332:         /* Check zero pivot */
1333:         sctx.rs = rs;
1334:         sctx.pv = rtmp1[i];
1335:         MatPivotCheck(B, A, info, &sctx, i);
1336:         if (sctx.newshift) break;

1338:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1339:         pv  = b->a + bdiag[i];
1340:         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1341:         break;

1343:       case 2:
1344:         /*----------*/
1345:         /* zero rtmp1 and rtmp2 */
1346:         /* L part */
1347:         nz    = bi[i + 1] - bi[i];
1348:         bjtmp = bj + bi[i];
1349:         for (j = 0; j < nz; j++) {
1350:           col        = bjtmp[j];
1351:           rtmp1[col] = 0.0;
1352:           rtmp2[col] = 0.0;
1353:         }

1355:         /* U part */
1356:         nz    = bdiag[i] - bdiag[i + 1];
1357:         bjtmp = bj + bdiag[i + 1] + 1;
1358:         for (j = 0; j < nz; j++) {
1359:           col        = bjtmp[j];
1360:           rtmp1[col] = 0.0;
1361:           rtmp2[col] = 0.0;
1362:         }

1364:         /* load in initial (unfactored row) */
1365:         nz    = ai[r[i] + 1] - ai[r[i]];
1366:         ajtmp = aj + ai[r[i]];
1367:         v1    = aa + ai[r[i]];
1368:         v2    = aa + ai[r[i] + 1];
1369:         for (j = 0; j < nz; j++) {
1370:           col        = ics[ajtmp[j]];
1371:           rtmp1[col] = v1[j];
1372:           rtmp2[col] = v2[j];
1373:         }
1374:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1375:         rtmp1[i] += sctx.shift_amount;
1376:         rtmp2[i + 1] += sctx.shift_amount;

1378:         /* elimination */
1379:         bjtmp = bj + bi[i];
1380:         row   = *bjtmp++; /* pivot row */
1381:         nzL   = bi[i + 1] - bi[i];
1382:         for (k = 0; k < nzL; k++) {
1383:           pc1 = rtmp1 + row;
1384:           pc2 = rtmp2 + row;
1385:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1386:             pv   = b->a + bdiag[row];
1387:             mul1 = *pc1 * (*pv);
1388:             mul2 = *pc2 * (*pv);
1389:             *pc1 = mul1;
1390:             *pc2 = mul2;

1392:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1393:             pv = b->a + bdiag[row + 1] + 1;
1394:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1395:             for (j = 0; j < nz; j++) {
1396:               col = pj[j];
1397:               rtmp1[col] -= mul1 * pv[j];
1398:               rtmp2[col] -= mul2 * pv[j];
1399:             }
1400:             PetscLogFlops(2 + 4.0 * nz);
1401:           }
1402:           row = *bjtmp++;
1403:         }

1405:         /* finished row i; check zero pivot, then stick row i into b->a */
1406:         rs = 0.0;
1407:         /* L part */
1408:         pc1 = b->a + bi[i];
1409:         pj  = b->j + bi[i];
1410:         nz  = bi[i + 1] - bi[i];
1411:         for (j = 0; j < nz; j++) {
1412:           col    = pj[j];
1413:           pc1[j] = rtmp1[col];
1414:           rs += PetscAbsScalar(pc1[j]);
1415:         }
1416:         /* U part */
1417:         pc1 = b->a + bdiag[i + 1] + 1;
1418:         pj  = b->j + bdiag[i + 1] + 1;
1419:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1420:         for (j = 0; j < nz; j++) {
1421:           col    = pj[j];
1422:           pc1[j] = rtmp1[col];
1423:           rs += PetscAbsScalar(pc1[j]);
1424:         }

1426:         sctx.rs = rs;
1427:         sctx.pv = rtmp1[i];
1428:         MatPivotCheck(B, A, info, &sctx, i);
1429:         if (sctx.newshift) break;
1430:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1431:         *pc1 = 1.0 / sctx.pv;

1433:         /* Now take care of diagonal 2x2 block. */
1434:         pc2 = rtmp2 + i;
1435:         if (*pc2 != 0.0) {
1436:           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1437:           *pc2 = mul1;                        /* insert L entry */
1438:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1439:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1440:           for (j = 0; j < nz; j++) {
1441:             col = pj[j];
1442:             rtmp2[col] -= mul1 * rtmp1[col];
1443:           }
1444:           PetscLogFlops(1 + 2.0 * nz);
1445:         }

1447:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1448:         rs = 0.0;
1449:         /* L part */
1450:         pc2 = b->a + bi[i + 1];
1451:         pj  = b->j + bi[i + 1];
1452:         nz  = bi[i + 2] - bi[i + 1];
1453:         for (j = 0; j < nz; j++) {
1454:           col    = pj[j];
1455:           pc2[j] = rtmp2[col];
1456:           rs += PetscAbsScalar(pc2[j]);
1457:         }
1458:         /* U part */
1459:         pc2 = b->a + bdiag[i + 2] + 1;
1460:         pj  = b->j + bdiag[i + 2] + 1;
1461:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1462:         for (j = 0; j < nz; j++) {
1463:           col    = pj[j];
1464:           pc2[j] = rtmp2[col];
1465:           rs += PetscAbsScalar(pc2[j]);
1466:         }

1468:         sctx.rs = rs;
1469:         sctx.pv = rtmp2[i + 1];
1470:         MatPivotCheck(B, A, info, &sctx, i + 1);
1471:         if (sctx.newshift) break;
1472:         pc2  = b->a + bdiag[i + 1];
1473:         *pc2 = 1.0 / sctx.pv;
1474:         break;

1476:       case 3:
1477:         /*----------*/
1478:         /* zero rtmp */
1479:         /* L part */
1480:         nz    = bi[i + 1] - bi[i];
1481:         bjtmp = bj + bi[i];
1482:         for (j = 0; j < nz; j++) {
1483:           col        = bjtmp[j];
1484:           rtmp1[col] = 0.0;
1485:           rtmp2[col] = 0.0;
1486:           rtmp3[col] = 0.0;
1487:         }

1489:         /* U part */
1490:         nz    = bdiag[i] - bdiag[i + 1];
1491:         bjtmp = bj + bdiag[i + 1] + 1;
1492:         for (j = 0; j < nz; j++) {
1493:           col        = bjtmp[j];
1494:           rtmp1[col] = 0.0;
1495:           rtmp2[col] = 0.0;
1496:           rtmp3[col] = 0.0;
1497:         }

1499:         /* load in initial (unfactored row) */
1500:         nz    = ai[r[i] + 1] - ai[r[i]];
1501:         ajtmp = aj + ai[r[i]];
1502:         v1    = aa + ai[r[i]];
1503:         v2    = aa + ai[r[i] + 1];
1504:         v3    = aa + ai[r[i] + 2];
1505:         for (j = 0; j < nz; j++) {
1506:           col        = ics[ajtmp[j]];
1507:           rtmp1[col] = v1[j];
1508:           rtmp2[col] = v2[j];
1509:           rtmp3[col] = v3[j];
1510:         }
1511:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1512:         rtmp1[i] += sctx.shift_amount;
1513:         rtmp2[i + 1] += sctx.shift_amount;
1514:         rtmp3[i + 2] += sctx.shift_amount;

1516:         /* elimination */
1517:         bjtmp = bj + bi[i];
1518:         row   = *bjtmp++; /* pivot row */
1519:         nzL   = bi[i + 1] - bi[i];
1520:         for (k = 0; k < nzL; k++) {
1521:           pc1 = rtmp1 + row;
1522:           pc2 = rtmp2 + row;
1523:           pc3 = rtmp3 + row;
1524:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1525:             pv   = b->a + bdiag[row];
1526:             mul1 = *pc1 * (*pv);
1527:             mul2 = *pc2 * (*pv);
1528:             mul3 = *pc3 * (*pv);
1529:             *pc1 = mul1;
1530:             *pc2 = mul2;
1531:             *pc3 = mul3;

1533:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1534:             pv = b->a + bdiag[row + 1] + 1;
1535:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1536:             for (j = 0; j < nz; j++) {
1537:               col = pj[j];
1538:               rtmp1[col] -= mul1 * pv[j];
1539:               rtmp2[col] -= mul2 * pv[j];
1540:               rtmp3[col] -= mul3 * pv[j];
1541:             }
1542:             PetscLogFlops(3 + 6.0 * nz);
1543:           }
1544:           row = *bjtmp++;
1545:         }

1547:         /* finished row i; check zero pivot, then stick row i into b->a */
1548:         rs = 0.0;
1549:         /* L part */
1550:         pc1 = b->a + bi[i];
1551:         pj  = b->j + bi[i];
1552:         nz  = bi[i + 1] - bi[i];
1553:         for (j = 0; j < nz; j++) {
1554:           col    = pj[j];
1555:           pc1[j] = rtmp1[col];
1556:           rs += PetscAbsScalar(pc1[j]);
1557:         }
1558:         /* U part */
1559:         pc1 = b->a + bdiag[i + 1] + 1;
1560:         pj  = b->j + bdiag[i + 1] + 1;
1561:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1562:         for (j = 0; j < nz; j++) {
1563:           col    = pj[j];
1564:           pc1[j] = rtmp1[col];
1565:           rs += PetscAbsScalar(pc1[j]);
1566:         }

1568:         sctx.rs = rs;
1569:         sctx.pv = rtmp1[i];
1570:         MatPivotCheck(B, A, info, &sctx, i);
1571:         if (sctx.newshift) break;
1572:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1573:         *pc1 = 1.0 / sctx.pv;

1575:         /* Now take care of 1st column of diagonal 3x3 block. */
1576:         pc2 = rtmp2 + i;
1577:         pc3 = rtmp3 + i;
1578:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1579:           mul2 = (*pc2) * (*pc1);
1580:           *pc2 = mul2;
1581:           mul3 = (*pc3) * (*pc1);
1582:           *pc3 = mul3;
1583:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1584:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1585:           for (j = 0; j < nz; j++) {
1586:             col = pj[j];
1587:             rtmp2[col] -= mul2 * rtmp1[col];
1588:             rtmp3[col] -= mul3 * rtmp1[col];
1589:           }
1590:           PetscLogFlops(2 + 4.0 * nz);
1591:         }

1593:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1594:         rs = 0.0;
1595:         /* L part */
1596:         pc2 = b->a + bi[i + 1];
1597:         pj  = b->j + bi[i + 1];
1598:         nz  = bi[i + 2] - bi[i + 1];
1599:         for (j = 0; j < nz; j++) {
1600:           col    = pj[j];
1601:           pc2[j] = rtmp2[col];
1602:           rs += PetscAbsScalar(pc2[j]);
1603:         }
1604:         /* U part */
1605:         pc2 = b->a + bdiag[i + 2] + 1;
1606:         pj  = b->j + bdiag[i + 2] + 1;
1607:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1608:         for (j = 0; j < nz; j++) {
1609:           col    = pj[j];
1610:           pc2[j] = rtmp2[col];
1611:           rs += PetscAbsScalar(pc2[j]);
1612:         }

1614:         sctx.rs = rs;
1615:         sctx.pv = rtmp2[i + 1];
1616:         MatPivotCheck(B, A, info, &sctx, i + 1);
1617:         if (sctx.newshift) break;
1618:         pc2  = b->a + bdiag[i + 1];
1619:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1621:         /* Now take care of 2nd column of diagonal 3x3 block. */
1622:         pc3 = rtmp3 + i + 1;
1623:         if (*pc3 != 0.0) {
1624:           mul3 = (*pc3) * (*pc2);
1625:           *pc3 = mul3;
1626:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1627:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1628:           for (j = 0; j < nz; j++) {
1629:             col = pj[j];
1630:             rtmp3[col] -= mul3 * rtmp2[col];
1631:           }
1632:           PetscLogFlops(1 + 2.0 * nz);
1633:         }

1635:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1636:         rs = 0.0;
1637:         /* L part */
1638:         pc3 = b->a + bi[i + 2];
1639:         pj  = b->j + bi[i + 2];
1640:         nz  = bi[i + 3] - bi[i + 2];
1641:         for (j = 0; j < nz; j++) {
1642:           col    = pj[j];
1643:           pc3[j] = rtmp3[col];
1644:           rs += PetscAbsScalar(pc3[j]);
1645:         }
1646:         /* U part */
1647:         pc3 = b->a + bdiag[i + 3] + 1;
1648:         pj  = b->j + bdiag[i + 3] + 1;
1649:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1650:         for (j = 0; j < nz; j++) {
1651:           col    = pj[j];
1652:           pc3[j] = rtmp3[col];
1653:           rs += PetscAbsScalar(pc3[j]);
1654:         }

1656:         sctx.rs = rs;
1657:         sctx.pv = rtmp3[i + 2];
1658:         MatPivotCheck(B, A, info, &sctx, i + 2);
1659:         if (sctx.newshift) break;
1660:         pc3  = b->a + bdiag[i + 2];
1661:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1662:         break;
1663:       case 4:
1664:         /*----------*/
1665:         /* zero rtmp */
1666:         /* L part */
1667:         nz    = bi[i + 1] - bi[i];
1668:         bjtmp = bj + bi[i];
1669:         for (j = 0; j < nz; j++) {
1670:           col        = bjtmp[j];
1671:           rtmp1[col] = 0.0;
1672:           rtmp2[col] = 0.0;
1673:           rtmp3[col] = 0.0;
1674:           rtmp4[col] = 0.0;
1675:         }

1677:         /* U part */
1678:         nz    = bdiag[i] - bdiag[i + 1];
1679:         bjtmp = bj + bdiag[i + 1] + 1;
1680:         for (j = 0; j < nz; j++) {
1681:           col        = bjtmp[j];
1682:           rtmp1[col] = 0.0;
1683:           rtmp2[col] = 0.0;
1684:           rtmp3[col] = 0.0;
1685:           rtmp4[col] = 0.0;
1686:         }

1688:         /* load in initial (unfactored row) */
1689:         nz    = ai[r[i] + 1] - ai[r[i]];
1690:         ajtmp = aj + ai[r[i]];
1691:         v1    = aa + ai[r[i]];
1692:         v2    = aa + ai[r[i] + 1];
1693:         v3    = aa + ai[r[i] + 2];
1694:         v4    = aa + ai[r[i] + 3];
1695:         for (j = 0; j < nz; j++) {
1696:           col        = ics[ajtmp[j]];
1697:           rtmp1[col] = v1[j];
1698:           rtmp2[col] = v2[j];
1699:           rtmp3[col] = v3[j];
1700:           rtmp4[col] = v4[j];
1701:         }
1702:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1703:         rtmp1[i] += sctx.shift_amount;
1704:         rtmp2[i + 1] += sctx.shift_amount;
1705:         rtmp3[i + 2] += sctx.shift_amount;
1706:         rtmp4[i + 3] += sctx.shift_amount;

1708:         /* elimination */
1709:         bjtmp = bj + bi[i];
1710:         row   = *bjtmp++; /* pivot row */
1711:         nzL   = bi[i + 1] - bi[i];
1712:         for (k = 0; k < nzL; k++) {
1713:           pc1 = rtmp1 + row;
1714:           pc2 = rtmp2 + row;
1715:           pc3 = rtmp3 + row;
1716:           pc4 = rtmp4 + row;
1717:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1718:             pv   = b->a + bdiag[row];
1719:             mul1 = *pc1 * (*pv);
1720:             mul2 = *pc2 * (*pv);
1721:             mul3 = *pc3 * (*pv);
1722:             mul4 = *pc4 * (*pv);
1723:             *pc1 = mul1;
1724:             *pc2 = mul2;
1725:             *pc3 = mul3;
1726:             *pc4 = mul4;

1728:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1729:             pv = b->a + bdiag[row + 1] + 1;
1730:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1731:             for (j = 0; j < nz; j++) {
1732:               col = pj[j];
1733:               rtmp1[col] -= mul1 * pv[j];
1734:               rtmp2[col] -= mul2 * pv[j];
1735:               rtmp3[col] -= mul3 * pv[j];
1736:               rtmp4[col] -= mul4 * pv[j];
1737:             }
1738:             PetscLogFlops(4 + 8.0 * nz);
1739:           }
1740:           row = *bjtmp++;
1741:         }

1743:         /* finished row i; check zero pivot, then stick row i into b->a */
1744:         rs = 0.0;
1745:         /* L part */
1746:         pc1 = b->a + bi[i];
1747:         pj  = b->j + bi[i];
1748:         nz  = bi[i + 1] - bi[i];
1749:         for (j = 0; j < nz; j++) {
1750:           col    = pj[j];
1751:           pc1[j] = rtmp1[col];
1752:           rs += PetscAbsScalar(pc1[j]);
1753:         }
1754:         /* U part */
1755:         pc1 = b->a + bdiag[i + 1] + 1;
1756:         pj  = b->j + bdiag[i + 1] + 1;
1757:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1758:         for (j = 0; j < nz; j++) {
1759:           col    = pj[j];
1760:           pc1[j] = rtmp1[col];
1761:           rs += PetscAbsScalar(pc1[j]);
1762:         }

1764:         sctx.rs = rs;
1765:         sctx.pv = rtmp1[i];
1766:         MatPivotCheck(B, A, info, &sctx, i);
1767:         if (sctx.newshift) break;
1768:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1769:         *pc1 = 1.0 / sctx.pv;

1771:         /* Now take care of 1st column of diagonal 4x4 block. */
1772:         pc2 = rtmp2 + i;
1773:         pc3 = rtmp3 + i;
1774:         pc4 = rtmp4 + i;
1775:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1776:           mul2 = (*pc2) * (*pc1);
1777:           *pc2 = mul2;
1778:           mul3 = (*pc3) * (*pc1);
1779:           *pc3 = mul3;
1780:           mul4 = (*pc4) * (*pc1);
1781:           *pc4 = mul4;
1782:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1783:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1784:           for (j = 0; j < nz; j++) {
1785:             col = pj[j];
1786:             rtmp2[col] -= mul2 * rtmp1[col];
1787:             rtmp3[col] -= mul3 * rtmp1[col];
1788:             rtmp4[col] -= mul4 * rtmp1[col];
1789:           }
1790:           PetscLogFlops(3 + 6.0 * nz);
1791:         }

1793:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1794:         rs = 0.0;
1795:         /* L part */
1796:         pc2 = b->a + bi[i + 1];
1797:         pj  = b->j + bi[i + 1];
1798:         nz  = bi[i + 2] - bi[i + 1];
1799:         for (j = 0; j < nz; j++) {
1800:           col    = pj[j];
1801:           pc2[j] = rtmp2[col];
1802:           rs += PetscAbsScalar(pc2[j]);
1803:         }
1804:         /* U part */
1805:         pc2 = b->a + bdiag[i + 2] + 1;
1806:         pj  = b->j + bdiag[i + 2] + 1;
1807:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1808:         for (j = 0; j < nz; j++) {
1809:           col    = pj[j];
1810:           pc2[j] = rtmp2[col];
1811:           rs += PetscAbsScalar(pc2[j]);
1812:         }

1814:         sctx.rs = rs;
1815:         sctx.pv = rtmp2[i + 1];
1816:         MatPivotCheck(B, A, info, &sctx, i + 1);
1817:         if (sctx.newshift) break;
1818:         pc2  = b->a + bdiag[i + 1];
1819:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1821:         /* Now take care of 2nd column of diagonal 4x4 block. */
1822:         pc3 = rtmp3 + i + 1;
1823:         pc4 = rtmp4 + i + 1;
1824:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1825:           mul3 = (*pc3) * (*pc2);
1826:           *pc3 = mul3;
1827:           mul4 = (*pc4) * (*pc2);
1828:           *pc4 = mul4;
1829:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1830:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1831:           for (j = 0; j < nz; j++) {
1832:             col = pj[j];
1833:             rtmp3[col] -= mul3 * rtmp2[col];
1834:             rtmp4[col] -= mul4 * rtmp2[col];
1835:           }
1836:           PetscLogFlops(4.0 * nz);
1837:         }

1839:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1840:         rs = 0.0;
1841:         /* L part */
1842:         pc3 = b->a + bi[i + 2];
1843:         pj  = b->j + bi[i + 2];
1844:         nz  = bi[i + 3] - bi[i + 2];
1845:         for (j = 0; j < nz; j++) {
1846:           col    = pj[j];
1847:           pc3[j] = rtmp3[col];
1848:           rs += PetscAbsScalar(pc3[j]);
1849:         }
1850:         /* U part */
1851:         pc3 = b->a + bdiag[i + 3] + 1;
1852:         pj  = b->j + bdiag[i + 3] + 1;
1853:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1854:         for (j = 0; j < nz; j++) {
1855:           col    = pj[j];
1856:           pc3[j] = rtmp3[col];
1857:           rs += PetscAbsScalar(pc3[j]);
1858:         }

1860:         sctx.rs = rs;
1861:         sctx.pv = rtmp3[i + 2];
1862:         MatPivotCheck(B, A, info, &sctx, i + 2);
1863:         if (sctx.newshift) break;
1864:         pc3  = b->a + bdiag[i + 2];
1865:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */

1867:         /* Now take care of 3rd column of diagonal 4x4 block. */
1868:         pc4 = rtmp4 + i + 2;
1869:         if (*pc4 != 0.0) {
1870:           mul4 = (*pc4) * (*pc3);
1871:           *pc4 = mul4;
1872:           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1873:           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1874:           for (j = 0; j < nz; j++) {
1875:             col = pj[j];
1876:             rtmp4[col] -= mul4 * rtmp3[col];
1877:           }
1878:           PetscLogFlops(1 + 2.0 * nz);
1879:         }

1881:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1882:         rs = 0.0;
1883:         /* L part */
1884:         pc4 = b->a + bi[i + 3];
1885:         pj  = b->j + bi[i + 3];
1886:         nz  = bi[i + 4] - bi[i + 3];
1887:         for (j = 0; j < nz; j++) {
1888:           col    = pj[j];
1889:           pc4[j] = rtmp4[col];
1890:           rs += PetscAbsScalar(pc4[j]);
1891:         }
1892:         /* U part */
1893:         pc4 = b->a + bdiag[i + 4] + 1;
1894:         pj  = b->j + bdiag[i + 4] + 1;
1895:         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1896:         for (j = 0; j < nz; j++) {
1897:           col    = pj[j];
1898:           pc4[j] = rtmp4[col];
1899:           rs += PetscAbsScalar(pc4[j]);
1900:         }

1902:         sctx.rs = rs;
1903:         sctx.pv = rtmp4[i + 3];
1904:         MatPivotCheck(B, A, info, &sctx, i + 3);
1905:         if (sctx.newshift) break;
1906:         pc4  = b->a + bdiag[i + 3];
1907:         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1908:         break;

1910:       default:
1911:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1912:       }
1913:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1914:       i += nodesz;              /* Update the row */
1915:     }

1917:     /* MatPivotRefine() */
1918:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1919:       /*
1920:        * if no shift in this attempt & shifting & started shifting & can refine,
1921:        * then try lower shift
1922:        */
1923:       sctx.shift_hi       = sctx.shift_fraction;
1924:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1925:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1926:       sctx.newshift       = PETSC_TRUE;
1927:       sctx.nshift++;
1928:     }
1929:   } while (sctx.newshift);

1931:   PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4);
1932:   PetscFree(tmp_vec2);
1933:   ISRestoreIndices(isicol, &ic);
1934:   ISRestoreIndices(isrow, &r);

1936:   if (b->inode.size) {
1937:     C->ops->solve = MatSolve_SeqAIJ_Inode;
1938:   } else {
1939:     C->ops->solve = MatSolve_SeqAIJ;
1940:   }
1941:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1942:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1943:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1944:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1945:   C->assembled              = PETSC_TRUE;
1946:   C->preallocated           = PETSC_TRUE;

1948:   PetscLogFlops(C->cmap->n);

1950:   /* MatShiftView(A,info,&sctx) */
1951:   if (sctx.nshift) {
1952:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1953:       PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
1954:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1955:       PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1956:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1957:       PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount);
1958:     }
1959:   }
1960:   return 0;
1961: }

1963: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1964: {
1965:   Mat              C = B;
1966:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1967:   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1968:   const PetscInt  *r, *ic, *c, *ics;
1969:   PetscInt         n = A->rmap->n, *bi = b->i;
1970:   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1971:   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1972:   PetscInt        *ai = a->i, *aj = a->j;
1973:   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1974:   PetscScalar      mul1, mul2, mul3, tmp;
1975:   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1976:   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1977:   PetscReal        rs = 0.0;
1978:   FactorShiftCtx   sctx;

1980:   sctx.shift_top      = 0;
1981:   sctx.nshift_max     = 0;
1982:   sctx.shift_lo       = 0;
1983:   sctx.shift_hi       = 0;
1984:   sctx.shift_fraction = 0;

1986:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1987:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1988:     sctx.shift_top = 0;
1989:     for (i = 0; i < n; i++) {
1990:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1991:       rs    = 0.0;
1992:       ajtmp = aj + ai[i];
1993:       rtmp1 = aa + ai[i];
1994:       nz    = ai[i + 1] - ai[i];
1995:       for (j = 0; j < nz; j++) {
1996:         if (*ajtmp != i) {
1997:           rs += PetscAbsScalar(*rtmp1++);
1998:         } else {
1999:           rs -= PetscRealPart(*rtmp1++);
2000:         }
2001:         ajtmp++;
2002:       }
2003:       if (rs > sctx.shift_top) sctx.shift_top = rs;
2004:     }
2005:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2006:     sctx.shift_top *= 1.1;
2007:     sctx.nshift_max = 5;
2008:     sctx.shift_lo   = 0.;
2009:     sctx.shift_hi   = 1.;
2010:   }
2011:   sctx.shift_amount = 0;
2012:   sctx.nshift       = 0;

2014:   ISGetIndices(isrow, &r);
2015:   ISGetIndices(iscol, &c);
2016:   ISGetIndices(isicol, &ic);
2017:   PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33);
2018:   ics = ic;

2020:   node_max = a->inode.node_count;
2021:   ns       = a->inode.size;

2024:   /* If max inode size > 3, split it into two inodes.*/
2025:   /* also map the inode sizes according to the ordering */
2026:   PetscMalloc1(n + 1, &tmp_vec1);
2027:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2028:     if (ns[i] > 3) {
2029:       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2030:       ++j;
2031:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2032:     } else {
2033:       tmp_vec1[j] = ns[i];
2034:     }
2035:   }
2036:   /* Use the correct node_max */
2037:   node_max = j;

2039:   /* Now reorder the inode info based on mat re-ordering info */
2040:   /* First create a row -> inode_size_array_index map */
2041:   PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2);
2042:   for (i = 0, row = 0; i < node_max; i++) {
2043:     nodesz = tmp_vec1[i];
2044:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2045:   }
2046:   /* Using nsmap, create a reordered ns structure */
2047:   for (i = 0, j = 0; i < node_max; i++) {
2048:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2049:     tmp_vec2[i] = nodesz;
2050:     j += nodesz;
2051:   }
2052:   PetscFree2(nsmap, tmp_vec1);
2053:   /* Now use the correct ns */
2054:   ns = tmp_vec2;

2056:   do {
2057:     sctx.newshift = PETSC_FALSE;
2058:     /* Now loop over each block-row, and do the factorization */
2059:     for (i = 0, row = 0; i < node_max; i++) {
2060:       nodesz = ns[i];
2061:       nz     = bi[row + 1] - bi[row];
2062:       bjtmp  = bj + bi[row];

2064:       switch (nodesz) {
2065:       case 1:
2066:         for (j = 0; j < nz; j++) {
2067:           idx         = bjtmp[j];
2068:           rtmp11[idx] = 0.0;
2069:         }

2071:         /* load in initial (unfactored row) */
2072:         idx    = r[row];
2073:         nz_tmp = ai[idx + 1] - ai[idx];
2074:         ajtmp  = aj + ai[idx];
2075:         v1     = aa + ai[idx];

2077:         for (j = 0; j < nz_tmp; j++) {
2078:           idx         = ics[ajtmp[j]];
2079:           rtmp11[idx] = v1[j];
2080:         }
2081:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2083:         prow = *bjtmp++;
2084:         while (prow < row) {
2085:           pc1 = rtmp11 + prow;
2086:           if (*pc1 != 0.0) {
2087:             pv     = ba + bd[prow];
2088:             pj     = nbj + bd[prow];
2089:             mul1   = *pc1 * *pv++;
2090:             *pc1   = mul1;
2091:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2092:             PetscLogFlops(1 + 2.0 * nz_tmp);
2093:             for (j = 0; j < nz_tmp; j++) {
2094:               tmp = pv[j];
2095:               idx = pj[j];
2096:               rtmp11[idx] -= mul1 * tmp;
2097:             }
2098:           }
2099:           prow = *bjtmp++;
2100:         }
2101:         pj  = bj + bi[row];
2102:         pc1 = ba + bi[row];

2104:         sctx.pv     = rtmp11[row];
2105:         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2106:         rs          = 0.0;
2107:         for (j = 0; j < nz; j++) {
2108:           idx    = pj[j];
2109:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2110:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2111:         }
2112:         sctx.rs = rs;
2113:         MatPivotCheck(B, A, info, &sctx, row);
2114:         if (sctx.newshift) goto endofwhile;
2115:         break;

2117:       case 2:
2118:         for (j = 0; j < nz; j++) {
2119:           idx         = bjtmp[j];
2120:           rtmp11[idx] = 0.0;
2121:           rtmp22[idx] = 0.0;
2122:         }

2124:         /* load in initial (unfactored row) */
2125:         idx    = r[row];
2126:         nz_tmp = ai[idx + 1] - ai[idx];
2127:         ajtmp  = aj + ai[idx];
2128:         v1     = aa + ai[idx];
2129:         v2     = aa + ai[idx + 1];
2130:         for (j = 0; j < nz_tmp; j++) {
2131:           idx         = ics[ajtmp[j]];
2132:           rtmp11[idx] = v1[j];
2133:           rtmp22[idx] = v2[j];
2134:         }
2135:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2136:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;

2138:         prow = *bjtmp++;
2139:         while (prow < row) {
2140:           pc1 = rtmp11 + prow;
2141:           pc2 = rtmp22 + prow;
2142:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2143:             pv   = ba + bd[prow];
2144:             pj   = nbj + bd[prow];
2145:             mul1 = *pc1 * *pv;
2146:             mul2 = *pc2 * *pv;
2147:             ++pv;
2148:             *pc1 = mul1;
2149:             *pc2 = mul2;

2151:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2152:             for (j = 0; j < nz_tmp; j++) {
2153:               tmp = pv[j];
2154:               idx = pj[j];
2155:               rtmp11[idx] -= mul1 * tmp;
2156:               rtmp22[idx] -= mul2 * tmp;
2157:             }
2158:             PetscLogFlops(2 + 4.0 * nz_tmp);
2159:           }
2160:           prow = *bjtmp++;
2161:         }

2163:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2164:         pc1 = rtmp11 + prow;
2165:         pc2 = rtmp22 + prow;

2167:         sctx.pv = *pc1;
2168:         pj      = bj + bi[prow];
2169:         rs      = 0.0;
2170:         for (j = 0; j < nz; j++) {
2171:           idx = pj[j];
2172:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2173:         }
2174:         sctx.rs = rs;
2175:         MatPivotCheck(B, A, info, &sctx, row);
2176:         if (sctx.newshift) goto endofwhile;

2178:         if (*pc2 != 0.0) {
2179:           pj     = nbj + bd[prow];
2180:           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2181:           *pc2   = mul2;
2182:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2183:           for (j = 0; j < nz_tmp; j++) {
2184:             idx = pj[j];
2185:             tmp = rtmp11[idx];
2186:             rtmp22[idx] -= mul2 * tmp;
2187:           }
2188:           PetscLogFlops(1 + 2.0 * nz_tmp);
2189:         }

2191:         pj  = bj + bi[row];
2192:         pc1 = ba + bi[row];
2193:         pc2 = ba + bi[row + 1];

2195:         sctx.pv         = rtmp22[row + 1];
2196:         rs              = 0.0;
2197:         rtmp11[row]     = 1.0 / rtmp11[row];
2198:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2199:         /* copy row entries from dense representation to sparse */
2200:         for (j = 0; j < nz; j++) {
2201:           idx    = pj[j];
2202:           pc1[j] = rtmp11[idx];
2203:           pc2[j] = rtmp22[idx];
2204:           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2205:         }
2206:         sctx.rs = rs;
2207:         MatPivotCheck(B, A, info, &sctx, row + 1);
2208:         if (sctx.newshift) goto endofwhile;
2209:         break;

2211:       case 3:
2212:         for (j = 0; j < nz; j++) {
2213:           idx         = bjtmp[j];
2214:           rtmp11[idx] = 0.0;
2215:           rtmp22[idx] = 0.0;
2216:           rtmp33[idx] = 0.0;
2217:         }
2218:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2219:         idx    = r[row];
2220:         nz_tmp = ai[idx + 1] - ai[idx];
2221:         ajtmp  = aj + ai[idx];
2222:         v1     = aa + ai[idx];
2223:         v2     = aa + ai[idx + 1];
2224:         v3     = aa + ai[idx + 2];
2225:         for (j = 0; j < nz_tmp; j++) {
2226:           idx         = ics[ajtmp[j]];
2227:           rtmp11[idx] = v1[j];
2228:           rtmp22[idx] = v2[j];
2229:           rtmp33[idx] = v3[j];
2230:         }
2231:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2232:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2233:         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;

2235:         /* loop over all pivot row blocks above this row block */
2236:         prow = *bjtmp++;
2237:         while (prow < row) {
2238:           pc1 = rtmp11 + prow;
2239:           pc2 = rtmp22 + prow;
2240:           pc3 = rtmp33 + prow;
2241:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2242:             pv   = ba + bd[prow];
2243:             pj   = nbj + bd[prow];
2244:             mul1 = *pc1 * *pv;
2245:             mul2 = *pc2 * *pv;
2246:             mul3 = *pc3 * *pv;
2247:             ++pv;
2248:             *pc1 = mul1;
2249:             *pc2 = mul2;
2250:             *pc3 = mul3;

2252:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2253:             /* update this row based on pivot row */
2254:             for (j = 0; j < nz_tmp; j++) {
2255:               tmp = pv[j];
2256:               idx = pj[j];
2257:               rtmp11[idx] -= mul1 * tmp;
2258:               rtmp22[idx] -= mul2 * tmp;
2259:               rtmp33[idx] -= mul3 * tmp;
2260:             }
2261:             PetscLogFlops(3 + 6.0 * nz_tmp);
2262:           }
2263:           prow = *bjtmp++;
2264:         }

2266:         /* Now take care of diagonal 3x3 block in this set of rows */
2267:         /* note: prow = row here */
2268:         pc1 = rtmp11 + prow;
2269:         pc2 = rtmp22 + prow;
2270:         pc3 = rtmp33 + prow;

2272:         sctx.pv = *pc1;
2273:         pj      = bj + bi[prow];
2274:         rs      = 0.0;
2275:         for (j = 0; j < nz; j++) {
2276:           idx = pj[j];
2277:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2278:         }
2279:         sctx.rs = rs;
2280:         MatPivotCheck(B, A, info, &sctx, row);
2281:         if (sctx.newshift) goto endofwhile;

2283:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2284:           mul2   = (*pc2) / (*pc1);
2285:           mul3   = (*pc3) / (*pc1);
2286:           *pc2   = mul2;
2287:           *pc3   = mul3;
2288:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2289:           pj     = nbj + bd[prow];
2290:           for (j = 0; j < nz_tmp; j++) {
2291:             idx = pj[j];
2292:             tmp = rtmp11[idx];
2293:             rtmp22[idx] -= mul2 * tmp;
2294:             rtmp33[idx] -= mul3 * tmp;
2295:           }
2296:           PetscLogFlops(2 + 4.0 * nz_tmp);
2297:         }
2298:         ++prow;

2300:         pc2     = rtmp22 + prow;
2301:         pc3     = rtmp33 + prow;
2302:         sctx.pv = *pc2;
2303:         pj      = bj + bi[prow];
2304:         rs      = 0.0;
2305:         for (j = 0; j < nz; j++) {
2306:           idx = pj[j];
2307:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2308:         }
2309:         sctx.rs = rs;
2310:         MatPivotCheck(B, A, info, &sctx, row + 1);
2311:         if (sctx.newshift) goto endofwhile;

2313:         if (*pc3 != 0.0) {
2314:           mul3   = (*pc3) / (*pc2);
2315:           *pc3   = mul3;
2316:           pj     = nbj + bd[prow];
2317:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2318:           for (j = 0; j < nz_tmp; j++) {
2319:             idx = pj[j];
2320:             tmp = rtmp22[idx];
2321:             rtmp33[idx] -= mul3 * tmp;
2322:           }
2323:           PetscLogFlops(1 + 2.0 * nz_tmp);
2324:         }

2326:         pj  = bj + bi[row];
2327:         pc1 = ba + bi[row];
2328:         pc2 = ba + bi[row + 1];
2329:         pc3 = ba + bi[row + 2];

2331:         sctx.pv         = rtmp33[row + 2];
2332:         rs              = 0.0;
2333:         rtmp11[row]     = 1.0 / rtmp11[row];
2334:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2335:         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2336:         /* copy row entries from dense representation to sparse */
2337:         for (j = 0; j < nz; j++) {
2338:           idx    = pj[j];
2339:           pc1[j] = rtmp11[idx];
2340:           pc2[j] = rtmp22[idx];
2341:           pc3[j] = rtmp33[idx];
2342:           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2343:         }

2345:         sctx.rs = rs;
2346:         MatPivotCheck(B, A, info, &sctx, row + 2);
2347:         if (sctx.newshift) goto endofwhile;
2348:         break;

2350:       default:
2351:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2352:       }
2353:       row += nodesz; /* Update the row */
2354:     }
2355:   endofwhile:;
2356:   } while (sctx.newshift);
2357:   PetscFree3(rtmp11, rtmp22, rtmp33);
2358:   PetscFree(tmp_vec2);
2359:   ISRestoreIndices(isicol, &ic);
2360:   ISRestoreIndices(isrow, &r);
2361:   ISRestoreIndices(iscol, &c);

2363:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2364:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2365:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2366:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2367:   C->assembled              = PETSC_TRUE;
2368:   C->preallocated           = PETSC_TRUE;
2369:   if (sctx.nshift) {
2370:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2371:       PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
2372:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2373:       PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
2374:     }
2375:   }
2376:   PetscLogFlops(C->cmap->n);
2377:   MatSeqAIJCheckInode(C);
2378:   return 0;
2379: }

2381: /* ----------------------------------------------------------- */
2382: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2383: {
2384:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2385:   IS                 iscol = a->col, isrow = a->row;
2386:   const PetscInt    *r, *c, *rout, *cout;
2387:   PetscInt           i, j, n = A->rmap->n;
2388:   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2389:   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2390:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2391:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2392:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2393:   const PetscScalar *b;

2396:   node_max = a->inode.node_count;
2397:   ns       = a->inode.size; /* Node Size array */

2399:   VecGetArrayRead(bb, &b);
2400:   VecGetArrayWrite(xx, &x);
2401:   tmp = a->solve_work;

2403:   ISGetIndices(isrow, &rout);
2404:   r = rout;
2405:   ISGetIndices(iscol, &cout);
2406:   c = cout;

2408:   /* forward solve the lower triangular */
2409:   tmps = tmp;
2410:   aa   = a_a;
2411:   aj   = a_j;
2412:   ad   = a->diag;

2414:   for (i = 0, row = 0; i < node_max; ++i) {
2415:     nsz = ns[i];
2416:     aii = ai[row];
2417:     v1  = aa + aii;
2418:     vi  = aj + aii;
2419:     nz  = ai[row + 1] - ai[row];

2421:     if (i < node_max - 1) {
2422:       /* Prefetch the indices for the next block */
2423:       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2424:       /* Prefetch the data for the next block */
2425:       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2426:     }

2428:     switch (nsz) { /* Each loop in 'case' is unrolled */
2429:     case 1:
2430:       sum1 = b[r[row]];
2431:       for (j = 0; j < nz - 1; j += 2) {
2432:         i0   = vi[j];
2433:         i1   = vi[j + 1];
2434:         tmp0 = tmps[i0];
2435:         tmp1 = tmps[i1];
2436:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2437:       }
2438:       if (j == nz - 1) {
2439:         tmp0 = tmps[vi[j]];
2440:         sum1 -= v1[j] * tmp0;
2441:       }
2442:       tmp[row++] = sum1;
2443:       break;
2444:     case 2:
2445:       sum1 = b[r[row]];
2446:       sum2 = b[r[row + 1]];
2447:       v2   = aa + ai[row + 1];

2449:       for (j = 0; j < nz - 1; j += 2) {
2450:         i0   = vi[j];
2451:         i1   = vi[j + 1];
2452:         tmp0 = tmps[i0];
2453:         tmp1 = tmps[i1];
2454:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2455:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2456:       }
2457:       if (j == nz - 1) {
2458:         tmp0 = tmps[vi[j]];
2459:         sum1 -= v1[j] * tmp0;
2460:         sum2 -= v2[j] * tmp0;
2461:       }
2462:       sum2 -= v2[nz] * sum1;
2463:       tmp[row++] = sum1;
2464:       tmp[row++] = sum2;
2465:       break;
2466:     case 3:
2467:       sum1 = b[r[row]];
2468:       sum2 = b[r[row + 1]];
2469:       sum3 = b[r[row + 2]];
2470:       v2   = aa + ai[row + 1];
2471:       v3   = aa + ai[row + 2];

2473:       for (j = 0; j < nz - 1; j += 2) {
2474:         i0   = vi[j];
2475:         i1   = vi[j + 1];
2476:         tmp0 = tmps[i0];
2477:         tmp1 = tmps[i1];
2478:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2481:       }
2482:       if (j == nz - 1) {
2483:         tmp0 = tmps[vi[j]];
2484:         sum1 -= v1[j] * tmp0;
2485:         sum2 -= v2[j] * tmp0;
2486:         sum3 -= v3[j] * tmp0;
2487:       }
2488:       sum2 -= v2[nz] * sum1;
2489:       sum3 -= v3[nz] * sum1;
2490:       sum3 -= v3[nz + 1] * sum2;
2491:       tmp[row++] = sum1;
2492:       tmp[row++] = sum2;
2493:       tmp[row++] = sum3;
2494:       break;

2496:     case 4:
2497:       sum1 = b[r[row]];
2498:       sum2 = b[r[row + 1]];
2499:       sum3 = b[r[row + 2]];
2500:       sum4 = b[r[row + 3]];
2501:       v2   = aa + ai[row + 1];
2502:       v3   = aa + ai[row + 2];
2503:       v4   = aa + ai[row + 3];

2505:       for (j = 0; j < nz - 1; j += 2) {
2506:         i0   = vi[j];
2507:         i1   = vi[j + 1];
2508:         tmp0 = tmps[i0];
2509:         tmp1 = tmps[i1];
2510:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2511:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2512:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2513:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2514:       }
2515:       if (j == nz - 1) {
2516:         tmp0 = tmps[vi[j]];
2517:         sum1 -= v1[j] * tmp0;
2518:         sum2 -= v2[j] * tmp0;
2519:         sum3 -= v3[j] * tmp0;
2520:         sum4 -= v4[j] * tmp0;
2521:       }
2522:       sum2 -= v2[nz] * sum1;
2523:       sum3 -= v3[nz] * sum1;
2524:       sum4 -= v4[nz] * sum1;
2525:       sum3 -= v3[nz + 1] * sum2;
2526:       sum4 -= v4[nz + 1] * sum2;
2527:       sum4 -= v4[nz + 2] * sum3;

2529:       tmp[row++] = sum1;
2530:       tmp[row++] = sum2;
2531:       tmp[row++] = sum3;
2532:       tmp[row++] = sum4;
2533:       break;
2534:     case 5:
2535:       sum1 = b[r[row]];
2536:       sum2 = b[r[row + 1]];
2537:       sum3 = b[r[row + 2]];
2538:       sum4 = b[r[row + 3]];
2539:       sum5 = b[r[row + 4]];
2540:       v2   = aa + ai[row + 1];
2541:       v3   = aa + ai[row + 2];
2542:       v4   = aa + ai[row + 3];
2543:       v5   = aa + ai[row + 4];

2545:       for (j = 0; j < nz - 1; j += 2) {
2546:         i0   = vi[j];
2547:         i1   = vi[j + 1];
2548:         tmp0 = tmps[i0];
2549:         tmp1 = tmps[i1];
2550:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2551:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2552:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2553:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2554:         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2555:       }
2556:       if (j == nz - 1) {
2557:         tmp0 = tmps[vi[j]];
2558:         sum1 -= v1[j] * tmp0;
2559:         sum2 -= v2[j] * tmp0;
2560:         sum3 -= v3[j] * tmp0;
2561:         sum4 -= v4[j] * tmp0;
2562:         sum5 -= v5[j] * tmp0;
2563:       }

2565:       sum2 -= v2[nz] * sum1;
2566:       sum3 -= v3[nz] * sum1;
2567:       sum4 -= v4[nz] * sum1;
2568:       sum5 -= v5[nz] * sum1;
2569:       sum3 -= v3[nz + 1] * sum2;
2570:       sum4 -= v4[nz + 1] * sum2;
2571:       sum5 -= v5[nz + 1] * sum2;
2572:       sum4 -= v4[nz + 2] * sum3;
2573:       sum5 -= v5[nz + 2] * sum3;
2574:       sum5 -= v5[nz + 3] * sum4;

2576:       tmp[row++] = sum1;
2577:       tmp[row++] = sum2;
2578:       tmp[row++] = sum3;
2579:       tmp[row++] = sum4;
2580:       tmp[row++] = sum5;
2581:       break;
2582:     default:
2583:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2584:     }
2585:   }
2586:   /* backward solve the upper triangular */
2587:   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2588:     nsz = ns[i];
2589:     aii = ad[row + 1] + 1;
2590:     v1  = aa + aii;
2591:     vi  = aj + aii;
2592:     nz  = ad[row] - ad[row + 1] - 1;

2594:     if (i > 0) {
2595:       /* Prefetch the indices for the next block */
2596:       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2597:       /* Prefetch the data for the next block */
2598:       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2599:     }

2601:     switch (nsz) { /* Each loop in 'case' is unrolled */
2602:     case 1:
2603:       sum1 = tmp[row];

2605:       for (j = 0; j < nz - 1; j += 2) {
2606:         i0   = vi[j];
2607:         i1   = vi[j + 1];
2608:         tmp0 = tmps[i0];
2609:         tmp1 = tmps[i1];
2610:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2611:       }
2612:       if (j == nz - 1) {
2613:         tmp0 = tmps[vi[j]];
2614:         sum1 -= v1[j] * tmp0;
2615:       }
2616:       x[c[row]] = tmp[row] = sum1 * v1[nz];
2617:       row--;
2618:       break;
2619:     case 2:
2620:       sum1 = tmp[row];
2621:       sum2 = tmp[row - 1];
2622:       v2   = aa + ad[row] + 1;
2623:       for (j = 0; j < nz - 1; j += 2) {
2624:         i0   = vi[j];
2625:         i1   = vi[j + 1];
2626:         tmp0 = tmps[i0];
2627:         tmp1 = tmps[i1];
2628:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2629:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2630:       }
2631:       if (j == nz - 1) {
2632:         tmp0 = tmps[vi[j]];
2633:         sum1 -= v1[j] * tmp0;
2634:         sum2 -= v2[j + 1] * tmp0;
2635:       }

2637:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2638:       row--;
2639:       sum2 -= v2[0] * tmp0;
2640:       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2641:       row--;
2642:       break;
2643:     case 3:
2644:       sum1 = tmp[row];
2645:       sum2 = tmp[row - 1];
2646:       sum3 = tmp[row - 2];
2647:       v2   = aa + ad[row] + 1;
2648:       v3   = aa + ad[row - 1] + 1;
2649:       for (j = 0; j < nz - 1; j += 2) {
2650:         i0   = vi[j];
2651:         i1   = vi[j + 1];
2652:         tmp0 = tmps[i0];
2653:         tmp1 = tmps[i1];
2654:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2655:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2656:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2657:       }
2658:       if (j == nz - 1) {
2659:         tmp0 = tmps[vi[j]];
2660:         sum1 -= v1[j] * tmp0;
2661:         sum2 -= v2[j + 1] * tmp0;
2662:         sum3 -= v3[j + 2] * tmp0;
2663:       }
2664:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2665:       row--;
2666:       sum2 -= v2[0] * tmp0;
2667:       sum3 -= v3[1] * tmp0;
2668:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2669:       row--;
2670:       sum3 -= v3[0] * tmp0;
2671:       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2672:       row--;

2674:       break;
2675:     case 4:
2676:       sum1 = tmp[row];
2677:       sum2 = tmp[row - 1];
2678:       sum3 = tmp[row - 2];
2679:       sum4 = tmp[row - 3];
2680:       v2   = aa + ad[row] + 1;
2681:       v3   = aa + ad[row - 1] + 1;
2682:       v4   = aa + ad[row - 2] + 1;

2684:       for (j = 0; j < nz - 1; j += 2) {
2685:         i0   = vi[j];
2686:         i1   = vi[j + 1];
2687:         tmp0 = tmps[i0];
2688:         tmp1 = tmps[i1];
2689:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2690:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2691:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2692:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2693:       }
2694:       if (j == nz - 1) {
2695:         tmp0 = tmps[vi[j]];
2696:         sum1 -= v1[j] * tmp0;
2697:         sum2 -= v2[j + 1] * tmp0;
2698:         sum3 -= v3[j + 2] * tmp0;
2699:         sum4 -= v4[j + 3] * tmp0;
2700:       }

2702:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2703:       row--;
2704:       sum2 -= v2[0] * tmp0;
2705:       sum3 -= v3[1] * tmp0;
2706:       sum4 -= v4[2] * tmp0;
2707:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2708:       row--;
2709:       sum3 -= v3[0] * tmp0;
2710:       sum4 -= v4[1] * tmp0;
2711:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2712:       row--;
2713:       sum4 -= v4[0] * tmp0;
2714:       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2715:       row--;
2716:       break;
2717:     case 5:
2718:       sum1 = tmp[row];
2719:       sum2 = tmp[row - 1];
2720:       sum3 = tmp[row - 2];
2721:       sum4 = tmp[row - 3];
2722:       sum5 = tmp[row - 4];
2723:       v2   = aa + ad[row] + 1;
2724:       v3   = aa + ad[row - 1] + 1;
2725:       v4   = aa + ad[row - 2] + 1;
2726:       v5   = aa + ad[row - 3] + 1;
2727:       for (j = 0; j < nz - 1; j += 2) {
2728:         i0   = vi[j];
2729:         i1   = vi[j + 1];
2730:         tmp0 = tmps[i0];
2731:         tmp1 = tmps[i1];
2732:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2733:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2734:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2735:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2736:         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2737:       }
2738:       if (j == nz - 1) {
2739:         tmp0 = tmps[vi[j]];
2740:         sum1 -= v1[j] * tmp0;
2741:         sum2 -= v2[j + 1] * tmp0;
2742:         sum3 -= v3[j + 2] * tmp0;
2743:         sum4 -= v4[j + 3] * tmp0;
2744:         sum5 -= v5[j + 4] * tmp0;
2745:       }

2747:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2748:       row--;
2749:       sum2 -= v2[0] * tmp0;
2750:       sum3 -= v3[1] * tmp0;
2751:       sum4 -= v4[2] * tmp0;
2752:       sum5 -= v5[3] * tmp0;
2753:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2754:       row--;
2755:       sum3 -= v3[0] * tmp0;
2756:       sum4 -= v4[1] * tmp0;
2757:       sum5 -= v5[2] * tmp0;
2758:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2759:       row--;
2760:       sum4 -= v4[0] * tmp0;
2761:       sum5 -= v5[1] * tmp0;
2762:       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2763:       row--;
2764:       sum5 -= v5[0] * tmp0;
2765:       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2766:       row--;
2767:       break;
2768:     default:
2769:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2770:     }
2771:   }
2772:   ISRestoreIndices(isrow, &rout);
2773:   ISRestoreIndices(iscol, &cout);
2774:   VecRestoreArrayRead(bb, &b);
2775:   VecRestoreArrayWrite(xx, &x);
2776:   PetscLogFlops(2.0 * a->nz - A->cmap->n);
2777:   return 0;
2778: }

2780: /*
2781:      Makes a longer coloring[] array and calls the usual code with that
2782: */
2783: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2784: {
2785:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2786:   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2787:   PetscInt        *colorused, i;
2788:   ISColoringValue *newcolor;

2791:   PetscMalloc1(n + 1, &newcolor);
2792:   /* loop over inodes, marking a color for each column*/
2793:   row = 0;
2794:   for (i = 0; i < m; i++) {
2795:     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2796:   }

2798:   /* eliminate unneeded colors */
2799:   PetscCalloc1(5 * ncolors, &colorused);
2800:   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;

2802:   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2803:   ncolors = colorused[5 * ncolors - 1];
2804:   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2805:   PetscFree(colorused);
2806:   ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring);
2807:   PetscFree(coloring);
2808:   return 0;
2809: }

2811: #include <petsc/private/kernels/blockinvert.h>

2813: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2814: {
2815:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2816:   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2817:   MatScalar         *ibdiag, *bdiag, work[25], *t;
2818:   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2819:   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2820:   const PetscScalar *xb, *b;
2821:   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2822:   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2823:   PetscInt           sz, k, ipvt[5];
2824:   PetscBool          allowzeropivot, zeropivotdetected;
2825:   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;

2827:   allowzeropivot = PetscNot(A->erroriffailure);

2832:   if (!a->inode.ibdiagvalid) {
2833:     if (!a->inode.ibdiag) {
2834:       /* calculate space needed for diagonal blocks */
2835:       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2836:       a->inode.bdiagsize = cnt;

2838:       PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work);
2839:     }

2841:     /* copy over the diagonal blocks and invert them */
2842:     ibdiag = a->inode.ibdiag;
2843:     bdiag  = a->inode.bdiag;
2844:     cnt    = 0;
2845:     for (i = 0, row = 0; i < m; i++) {
2846:       for (j = 0; j < sizes[i]; j++) {
2847:         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2848:       }
2849:       PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]);

2851:       switch (sizes[i]) {
2852:       case 1:
2853:         /* Create matrix data structure */
2854:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2855:           if (allowzeropivot) {
2856:             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2857:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2858:             A->factorerror_zeropivot_row   = row;
2859:             PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row);
2860:           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2861:         }
2862:         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2863:         break;
2864:       case 2:
2865:         PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected);
2866:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2867:         break;
2868:       case 3:
2869:         PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected);
2870:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2871:         break;
2872:       case 4:
2873:         PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected);
2874:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2875:         break;
2876:       case 5:
2877:         PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
2878:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2879:         break;
2880:       default:
2881:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2882:       }
2883:       cnt += sizes[i] * sizes[i];
2884:       row += sizes[i];
2885:     }
2886:     a->inode.ibdiagvalid = PETSC_TRUE;
2887:   }
2888:   ibdiag = a->inode.ibdiag;
2889:   bdiag  = a->inode.bdiag;
2890:   t      = a->inode.ssor_work;

2892:   VecGetArray(xx, &x);
2893:   VecGetArrayRead(bb, &b);
2894:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2895:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2896:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2897:       for (i = 0, row = 0; i < m; i++) {
2898:         sz  = diag[row] - ii[row];
2899:         v1  = a->a + ii[row];
2900:         idx = a->j + ii[row];

2902:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2903:         switch (sizes[i]) {
2904:         case 1:

2906:           sum1 = b[row];
2907:           for (n = 0; n < sz - 1; n += 2) {
2908:             i1 = idx[0];
2909:             i2 = idx[1];
2910:             idx += 2;
2911:             tmp0 = x[i1];
2912:             tmp1 = x[i2];
2913:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2914:             v1 += 2;
2915:           }

2917:           if (n == sz - 1) {
2918:             tmp0 = x[*idx];
2919:             sum1 -= *v1 * tmp0;
2920:           }
2921:           t[row]   = sum1;
2922:           x[row++] = sum1 * (*ibdiag++);
2923:           break;
2924:         case 2:
2925:           v2   = a->a + ii[row + 1];
2926:           sum1 = b[row];
2927:           sum2 = b[row + 1];
2928:           for (n = 0; n < sz - 1; n += 2) {
2929:             i1 = idx[0];
2930:             i2 = idx[1];
2931:             idx += 2;
2932:             tmp0 = x[i1];
2933:             tmp1 = x[i2];
2934:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2935:             v1 += 2;
2936:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2937:             v2 += 2;
2938:           }

2940:           if (n == sz - 1) {
2941:             tmp0 = x[*idx];
2942:             sum1 -= v1[0] * tmp0;
2943:             sum2 -= v2[0] * tmp0;
2944:           }
2945:           t[row]     = sum1;
2946:           t[row + 1] = sum2;
2947:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2948:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2949:           ibdiag += 4;
2950:           break;
2951:         case 3:
2952:           v2   = a->a + ii[row + 1];
2953:           v3   = a->a + ii[row + 2];
2954:           sum1 = b[row];
2955:           sum2 = b[row + 1];
2956:           sum3 = b[row + 2];
2957:           for (n = 0; n < sz - 1; n += 2) {
2958:             i1 = idx[0];
2959:             i2 = idx[1];
2960:             idx += 2;
2961:             tmp0 = x[i1];
2962:             tmp1 = x[i2];
2963:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2964:             v1 += 2;
2965:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2966:             v2 += 2;
2967:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2968:             v3 += 2;
2969:           }

2971:           if (n == sz - 1) {
2972:             tmp0 = x[*idx];
2973:             sum1 -= v1[0] * tmp0;
2974:             sum2 -= v2[0] * tmp0;
2975:             sum3 -= v3[0] * tmp0;
2976:           }
2977:           t[row]     = sum1;
2978:           t[row + 1] = sum2;
2979:           t[row + 2] = sum3;
2980:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2981:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2982:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2983:           ibdiag += 9;
2984:           break;
2985:         case 4:
2986:           v2   = a->a + ii[row + 1];
2987:           v3   = a->a + ii[row + 2];
2988:           v4   = a->a + ii[row + 3];
2989:           sum1 = b[row];
2990:           sum2 = b[row + 1];
2991:           sum3 = b[row + 2];
2992:           sum4 = b[row + 3];
2993:           for (n = 0; n < sz - 1; n += 2) {
2994:             i1 = idx[0];
2995:             i2 = idx[1];
2996:             idx += 2;
2997:             tmp0 = x[i1];
2998:             tmp1 = x[i2];
2999:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3000:             v1 += 2;
3001:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3002:             v2 += 2;
3003:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3004:             v3 += 2;
3005:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3006:             v4 += 2;
3007:           }

3009:           if (n == sz - 1) {
3010:             tmp0 = x[*idx];
3011:             sum1 -= v1[0] * tmp0;
3012:             sum2 -= v2[0] * tmp0;
3013:             sum3 -= v3[0] * tmp0;
3014:             sum4 -= v4[0] * tmp0;
3015:           }
3016:           t[row]     = sum1;
3017:           t[row + 1] = sum2;
3018:           t[row + 2] = sum3;
3019:           t[row + 3] = sum4;
3020:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3021:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3022:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3023:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3024:           ibdiag += 16;
3025:           break;
3026:         case 5:
3027:           v2   = a->a + ii[row + 1];
3028:           v3   = a->a + ii[row + 2];
3029:           v4   = a->a + ii[row + 3];
3030:           v5   = a->a + ii[row + 4];
3031:           sum1 = b[row];
3032:           sum2 = b[row + 1];
3033:           sum3 = b[row + 2];
3034:           sum4 = b[row + 3];
3035:           sum5 = b[row + 4];
3036:           for (n = 0; n < sz - 1; n += 2) {
3037:             i1 = idx[0];
3038:             i2 = idx[1];
3039:             idx += 2;
3040:             tmp0 = x[i1];
3041:             tmp1 = x[i2];
3042:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3043:             v1 += 2;
3044:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3045:             v2 += 2;
3046:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3047:             v3 += 2;
3048:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3049:             v4 += 2;
3050:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3051:             v5 += 2;
3052:           }

3054:           if (n == sz - 1) {
3055:             tmp0 = x[*idx];
3056:             sum1 -= v1[0] * tmp0;
3057:             sum2 -= v2[0] * tmp0;
3058:             sum3 -= v3[0] * tmp0;
3059:             sum4 -= v4[0] * tmp0;
3060:             sum5 -= v5[0] * tmp0;
3061:           }
3062:           t[row]     = sum1;
3063:           t[row + 1] = sum2;
3064:           t[row + 2] = sum3;
3065:           t[row + 3] = sum4;
3066:           t[row + 4] = sum5;
3067:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3068:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3069:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3070:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3071:           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3072:           ibdiag += 25;
3073:           break;
3074:         default:
3075:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3076:         }
3077:       }

3079:       xb = t;
3080:       PetscLogFlops(a->nz);
3081:     } else xb = b;
3082:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3083:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3084:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3085:         ibdiag -= sizes[i] * sizes[i];
3086:         sz  = ii[row + 1] - diag[row] - 1;
3087:         v1  = a->a + diag[row] + 1;
3088:         idx = a->j + diag[row] + 1;

3090:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3091:         switch (sizes[i]) {
3092:         case 1:

3094:           sum1 = xb[row];
3095:           for (n = 0; n < sz - 1; n += 2) {
3096:             i1 = idx[0];
3097:             i2 = idx[1];
3098:             idx += 2;
3099:             tmp0 = x[i1];
3100:             tmp1 = x[i2];
3101:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3102:             v1 += 2;
3103:           }

3105:           if (n == sz - 1) {
3106:             tmp0 = x[*idx];
3107:             sum1 -= *v1 * tmp0;
3108:           }
3109:           x[row--] = sum1 * (*ibdiag);
3110:           break;

3112:         case 2:

3114:           sum1 = xb[row];
3115:           sum2 = xb[row - 1];
3116:           /* note that sum1 is associated with the second of the two rows */
3117:           v2 = a->a + diag[row - 1] + 2;
3118:           for (n = 0; n < sz - 1; n += 2) {
3119:             i1 = idx[0];
3120:             i2 = idx[1];
3121:             idx += 2;
3122:             tmp0 = x[i1];
3123:             tmp1 = x[i2];
3124:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3125:             v1 += 2;
3126:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3127:             v2 += 2;
3128:           }

3130:           if (n == sz - 1) {
3131:             tmp0 = x[*idx];
3132:             sum1 -= *v1 * tmp0;
3133:             sum2 -= *v2 * tmp0;
3134:           }
3135:           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3136:           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3137:           break;
3138:         case 3:

3140:           sum1 = xb[row];
3141:           sum2 = xb[row - 1];
3142:           sum3 = xb[row - 2];
3143:           v2   = a->a + diag[row - 1] + 2;
3144:           v3   = a->a + diag[row - 2] + 3;
3145:           for (n = 0; n < sz - 1; n += 2) {
3146:             i1 = idx[0];
3147:             i2 = idx[1];
3148:             idx += 2;
3149:             tmp0 = x[i1];
3150:             tmp1 = x[i2];
3151:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3152:             v1 += 2;
3153:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3154:             v2 += 2;
3155:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3156:             v3 += 2;
3157:           }

3159:           if (n == sz - 1) {
3160:             tmp0 = x[*idx];
3161:             sum1 -= *v1 * tmp0;
3162:             sum2 -= *v2 * tmp0;
3163:             sum3 -= *v3 * tmp0;
3164:           }
3165:           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3166:           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3167:           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3168:           break;
3169:         case 4:

3171:           sum1 = xb[row];
3172:           sum2 = xb[row - 1];
3173:           sum3 = xb[row - 2];
3174:           sum4 = xb[row - 3];
3175:           v2   = a->a + diag[row - 1] + 2;
3176:           v3   = a->a + diag[row - 2] + 3;
3177:           v4   = a->a + diag[row - 3] + 4;
3178:           for (n = 0; n < sz - 1; n += 2) {
3179:             i1 = idx[0];
3180:             i2 = idx[1];
3181:             idx += 2;
3182:             tmp0 = x[i1];
3183:             tmp1 = x[i2];
3184:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3185:             v1 += 2;
3186:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3187:             v2 += 2;
3188:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3189:             v3 += 2;
3190:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3191:             v4 += 2;
3192:           }

3194:           if (n == sz - 1) {
3195:             tmp0 = x[*idx];
3196:             sum1 -= *v1 * tmp0;
3197:             sum2 -= *v2 * tmp0;
3198:             sum3 -= *v3 * tmp0;
3199:             sum4 -= *v4 * tmp0;
3200:           }
3201:           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3202:           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3203:           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3204:           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3205:           break;
3206:         case 5:

3208:           sum1 = xb[row];
3209:           sum2 = xb[row - 1];
3210:           sum3 = xb[row - 2];
3211:           sum4 = xb[row - 3];
3212:           sum5 = xb[row - 4];
3213:           v2   = a->a + diag[row - 1] + 2;
3214:           v3   = a->a + diag[row - 2] + 3;
3215:           v4   = a->a + diag[row - 3] + 4;
3216:           v5   = a->a + diag[row - 4] + 5;
3217:           for (n = 0; n < sz - 1; n += 2) {
3218:             i1 = idx[0];
3219:             i2 = idx[1];
3220:             idx += 2;
3221:             tmp0 = x[i1];
3222:             tmp1 = x[i2];
3223:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3224:             v1 += 2;
3225:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3226:             v2 += 2;
3227:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3228:             v3 += 2;
3229:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3230:             v4 += 2;
3231:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3232:             v5 += 2;
3233:           }

3235:           if (n == sz - 1) {
3236:             tmp0 = x[*idx];
3237:             sum1 -= *v1 * tmp0;
3238:             sum2 -= *v2 * tmp0;
3239:             sum3 -= *v3 * tmp0;
3240:             sum4 -= *v4 * tmp0;
3241:             sum5 -= *v5 * tmp0;
3242:           }
3243:           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3244:           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3245:           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3246:           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3247:           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3248:           break;
3249:         default:
3250:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3251:         }
3252:       }

3254:       PetscLogFlops(a->nz);
3255:     }
3256:     its--;
3257:   }
3258:   while (its--) {
3259:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3260:       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3261:         sz  = diag[row] - ii[row];
3262:         v1  = a->a + ii[row];
3263:         idx = a->j + ii[row];
3264:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3265:         switch (sizes[i]) {
3266:         case 1:
3267:           sum1 = b[row];
3268:           for (n = 0; n < sz - 1; n += 2) {
3269:             i1 = idx[0];
3270:             i2 = idx[1];
3271:             idx += 2;
3272:             tmp0 = x[i1];
3273:             tmp1 = x[i2];
3274:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3275:             v1 += 2;
3276:           }
3277:           if (n == sz - 1) {
3278:             tmp0 = x[*idx++];
3279:             sum1 -= *v1 * tmp0;
3280:             v1++;
3281:           }
3282:           t[row] = sum1;
3283:           sz     = ii[row + 1] - diag[row] - 1;
3284:           idx    = a->j + diag[row] + 1;
3285:           v1 += 1;
3286:           for (n = 0; n < sz - 1; n += 2) {
3287:             i1 = idx[0];
3288:             i2 = idx[1];
3289:             idx += 2;
3290:             tmp0 = x[i1];
3291:             tmp1 = x[i2];
3292:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3293:             v1 += 2;
3294:           }
3295:           if (n == sz - 1) {
3296:             tmp0 = x[*idx++];
3297:             sum1 -= *v1 * tmp0;
3298:           }
3299:           /* in MatSOR_SeqAIJ this line would be
3300:            *
3301:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3302:            *
3303:            * but omega == 1, so this becomes
3304:            *
3305:            * x[row] = sum1*(*ibdiag++);
3306:            *
3307:            */
3308:           x[row] = sum1 * (*ibdiag);
3309:           break;
3310:         case 2:
3311:           v2   = a->a + ii[row + 1];
3312:           sum1 = b[row];
3313:           sum2 = b[row + 1];
3314:           for (n = 0; n < sz - 1; n += 2) {
3315:             i1 = idx[0];
3316:             i2 = idx[1];
3317:             idx += 2;
3318:             tmp0 = x[i1];
3319:             tmp1 = x[i2];
3320:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3321:             v1 += 2;
3322:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3323:             v2 += 2;
3324:           }
3325:           if (n == sz - 1) {
3326:             tmp0 = x[*idx++];
3327:             sum1 -= v1[0] * tmp0;
3328:             sum2 -= v2[0] * tmp0;
3329:             v1++;
3330:             v2++;
3331:           }
3332:           t[row]     = sum1;
3333:           t[row + 1] = sum2;
3334:           sz         = ii[row + 1] - diag[row] - 2;
3335:           idx        = a->j + diag[row] + 2;
3336:           v1 += 2;
3337:           v2 += 2;
3338:           for (n = 0; n < sz - 1; n += 2) {
3339:             i1 = idx[0];
3340:             i2 = idx[1];
3341:             idx += 2;
3342:             tmp0 = x[i1];
3343:             tmp1 = x[i2];
3344:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3345:             v1 += 2;
3346:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3347:             v2 += 2;
3348:           }
3349:           if (n == sz - 1) {
3350:             tmp0 = x[*idx];
3351:             sum1 -= v1[0] * tmp0;
3352:             sum2 -= v2[0] * tmp0;
3353:           }
3354:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3355:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3356:           break;
3357:         case 3:
3358:           v2   = a->a + ii[row + 1];
3359:           v3   = a->a + ii[row + 2];
3360:           sum1 = b[row];
3361:           sum2 = b[row + 1];
3362:           sum3 = b[row + 2];
3363:           for (n = 0; n < sz - 1; n += 2) {
3364:             i1 = idx[0];
3365:             i2 = idx[1];
3366:             idx += 2;
3367:             tmp0 = x[i1];
3368:             tmp1 = x[i2];
3369:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3370:             v1 += 2;
3371:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3372:             v2 += 2;
3373:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3374:             v3 += 2;
3375:           }
3376:           if (n == sz - 1) {
3377:             tmp0 = x[*idx++];
3378:             sum1 -= v1[0] * tmp0;
3379:             sum2 -= v2[0] * tmp0;
3380:             sum3 -= v3[0] * tmp0;
3381:             v1++;
3382:             v2++;
3383:             v3++;
3384:           }
3385:           t[row]     = sum1;
3386:           t[row + 1] = sum2;
3387:           t[row + 2] = sum3;
3388:           sz         = ii[row + 1] - diag[row] - 3;
3389:           idx        = a->j + diag[row] + 3;
3390:           v1 += 3;
3391:           v2 += 3;
3392:           v3 += 3;
3393:           for (n = 0; n < sz - 1; n += 2) {
3394:             i1 = idx[0];
3395:             i2 = idx[1];
3396:             idx += 2;
3397:             tmp0 = x[i1];
3398:             tmp1 = x[i2];
3399:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3400:             v1 += 2;
3401:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3402:             v2 += 2;
3403:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3404:             v3 += 2;
3405:           }
3406:           if (n == sz - 1) {
3407:             tmp0 = x[*idx];
3408:             sum1 -= v1[0] * tmp0;
3409:             sum2 -= v2[0] * tmp0;
3410:             sum3 -= v3[0] * tmp0;
3411:           }
3412:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3413:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3414:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3415:           break;
3416:         case 4:
3417:           v2   = a->a + ii[row + 1];
3418:           v3   = a->a + ii[row + 2];
3419:           v4   = a->a + ii[row + 3];
3420:           sum1 = b[row];
3421:           sum2 = b[row + 1];
3422:           sum3 = b[row + 2];
3423:           sum4 = b[row + 3];
3424:           for (n = 0; n < sz - 1; n += 2) {
3425:             i1 = idx[0];
3426:             i2 = idx[1];
3427:             idx += 2;
3428:             tmp0 = x[i1];
3429:             tmp1 = x[i2];
3430:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3431:             v1 += 2;
3432:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3433:             v2 += 2;
3434:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3435:             v3 += 2;
3436:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3437:             v4 += 2;
3438:           }
3439:           if (n == sz - 1) {
3440:             tmp0 = x[*idx++];
3441:             sum1 -= v1[0] * tmp0;
3442:             sum2 -= v2[0] * tmp0;
3443:             sum3 -= v3[0] * tmp0;
3444:             sum4 -= v4[0] * tmp0;
3445:             v1++;
3446:             v2++;
3447:             v3++;
3448:             v4++;
3449:           }
3450:           t[row]     = sum1;
3451:           t[row + 1] = sum2;
3452:           t[row + 2] = sum3;
3453:           t[row + 3] = sum4;
3454:           sz         = ii[row + 1] - diag[row] - 4;
3455:           idx        = a->j + diag[row] + 4;
3456:           v1 += 4;
3457:           v2 += 4;
3458:           v3 += 4;
3459:           v4 += 4;
3460:           for (n = 0; n < sz - 1; n += 2) {
3461:             i1 = idx[0];
3462:             i2 = idx[1];
3463:             idx += 2;
3464:             tmp0 = x[i1];
3465:             tmp1 = x[i2];
3466:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3467:             v1 += 2;
3468:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3469:             v2 += 2;
3470:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3471:             v3 += 2;
3472:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3473:             v4 += 2;
3474:           }
3475:           if (n == sz - 1) {
3476:             tmp0 = x[*idx];
3477:             sum1 -= v1[0] * tmp0;
3478:             sum2 -= v2[0] * tmp0;
3479:             sum3 -= v3[0] * tmp0;
3480:             sum4 -= v4[0] * tmp0;
3481:           }
3482:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3483:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3484:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3485:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3486:           break;
3487:         case 5:
3488:           v2   = a->a + ii[row + 1];
3489:           v3   = a->a + ii[row + 2];
3490:           v4   = a->a + ii[row + 3];
3491:           v5   = a->a + ii[row + 4];
3492:           sum1 = b[row];
3493:           sum2 = b[row + 1];
3494:           sum3 = b[row + 2];
3495:           sum4 = b[row + 3];
3496:           sum5 = b[row + 4];
3497:           for (n = 0; n < sz - 1; n += 2) {
3498:             i1 = idx[0];
3499:             i2 = idx[1];
3500:             idx += 2;
3501:             tmp0 = x[i1];
3502:             tmp1 = x[i2];
3503:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3504:             v1 += 2;
3505:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3506:             v2 += 2;
3507:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3508:             v3 += 2;
3509:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3510:             v4 += 2;
3511:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3512:             v5 += 2;
3513:           }
3514:           if (n == sz - 1) {
3515:             tmp0 = x[*idx++];
3516:             sum1 -= v1[0] * tmp0;
3517:             sum2 -= v2[0] * tmp0;
3518:             sum3 -= v3[0] * tmp0;
3519:             sum4 -= v4[0] * tmp0;
3520:             sum5 -= v5[0] * tmp0;
3521:             v1++;
3522:             v2++;
3523:             v3++;
3524:             v4++;
3525:             v5++;
3526:           }
3527:           t[row]     = sum1;
3528:           t[row + 1] = sum2;
3529:           t[row + 2] = sum3;
3530:           t[row + 3] = sum4;
3531:           t[row + 4] = sum5;
3532:           sz         = ii[row + 1] - diag[row] - 5;
3533:           idx        = a->j + diag[row] + 5;
3534:           v1 += 5;
3535:           v2 += 5;
3536:           v3 += 5;
3537:           v4 += 5;
3538:           v5 += 5;
3539:           for (n = 0; n < sz - 1; n += 2) {
3540:             i1 = idx[0];
3541:             i2 = idx[1];
3542:             idx += 2;
3543:             tmp0 = x[i1];
3544:             tmp1 = x[i2];
3545:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3546:             v1 += 2;
3547:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3548:             v2 += 2;
3549:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3550:             v3 += 2;
3551:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3552:             v4 += 2;
3553:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3554:             v5 += 2;
3555:           }
3556:           if (n == sz - 1) {
3557:             tmp0 = x[*idx];
3558:             sum1 -= v1[0] * tmp0;
3559:             sum2 -= v2[0] * tmp0;
3560:             sum3 -= v3[0] * tmp0;
3561:             sum4 -= v4[0] * tmp0;
3562:             sum5 -= v5[0] * tmp0;
3563:           }
3564:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3565:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3566:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3567:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3568:           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3569:           break;
3570:         default:
3571:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3572:         }
3573:       }
3574:       xb = t;
3575:       PetscLogFlops(2.0 * a->nz); /* undercounts diag inverse */
3576:     } else xb = b;

3578:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3579:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3580:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3581:         ibdiag -= sizes[i] * sizes[i];

3583:         /* set RHS */
3584:         if (xb == b) {
3585:           /* whole (old way) */
3586:           sz  = ii[row + 1] - ii[row];
3587:           idx = a->j + ii[row];
3588:           switch (sizes[i]) {
3589:           case 5:
3590:             v5 = a->a + ii[row - 4]; /* fall through */
3591:           case 4:
3592:             v4 = a->a + ii[row - 3]; /* fall through */
3593:           case 3:
3594:             v3 = a->a + ii[row - 2]; /* fall through */
3595:           case 2:
3596:             v2 = a->a + ii[row - 1]; /* fall through */
3597:           case 1:
3598:             v1 = a->a + ii[row];
3599:             break;
3600:           default:
3601:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3602:           }
3603:         } else {
3604:           /* upper, no diag */
3605:           sz  = ii[row + 1] - diag[row] - 1;
3606:           idx = a->j + diag[row] + 1;
3607:           switch (sizes[i]) {
3608:           case 5:
3609:             v5 = a->a + diag[row - 4] + 5; /* fall through */
3610:           case 4:
3611:             v4 = a->a + diag[row - 3] + 4; /* fall through */
3612:           case 3:
3613:             v3 = a->a + diag[row - 2] + 3; /* fall through */
3614:           case 2:
3615:             v2 = a->a + diag[row - 1] + 2; /* fall through */
3616:           case 1:
3617:             v1 = a->a + diag[row] + 1;
3618:           }
3619:         }
3620:         /* set sum */
3621:         switch (sizes[i]) {
3622:         case 5:
3623:           sum5 = xb[row - 4]; /* fall through */
3624:         case 4:
3625:           sum4 = xb[row - 3]; /* fall through */
3626:         case 3:
3627:           sum3 = xb[row - 2]; /* fall through */
3628:         case 2:
3629:           sum2 = xb[row - 1]; /* fall through */
3630:         case 1:
3631:           /* note that sum1 is associated with the last row */
3632:           sum1 = xb[row];
3633:         }
3634:         /* do sums */
3635:         for (n = 0; n < sz - 1; n += 2) {
3636:           i1 = idx[0];
3637:           i2 = idx[1];
3638:           idx += 2;
3639:           tmp0 = x[i1];
3640:           tmp1 = x[i2];
3641:           switch (sizes[i]) {
3642:           case 5:
3643:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3644:             v5 += 2; /* fall through */
3645:           case 4:
3646:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3647:             v4 += 2; /* fall through */
3648:           case 3:
3649:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3650:             v3 += 2; /* fall through */
3651:           case 2:
3652:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3653:             v2 += 2; /* fall through */
3654:           case 1:
3655:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3656:             v1 += 2;
3657:           }
3658:         }
3659:         /* ragged edge */
3660:         if (n == sz - 1) {
3661:           tmp0 = x[*idx];
3662:           switch (sizes[i]) {
3663:           case 5:
3664:             sum5 -= *v5 * tmp0; /* fall through */
3665:           case 4:
3666:             sum4 -= *v4 * tmp0; /* fall through */
3667:           case 3:
3668:             sum3 -= *v3 * tmp0; /* fall through */
3669:           case 2:
3670:             sum2 -= *v2 * tmp0; /* fall through */
3671:           case 1:
3672:             sum1 -= *v1 * tmp0;
3673:           }
3674:         }
3675:         /* update */
3676:         if (xb == b) {
3677:           /* whole (old way) w/ diag */
3678:           switch (sizes[i]) {
3679:           case 5:
3680:             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3681:             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3682:             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3683:             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3684:             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3685:             break;
3686:           case 4:
3687:             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3688:             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3689:             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3690:             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3691:             break;
3692:           case 3:
3693:             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3694:             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3695:             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3696:             break;
3697:           case 2:
3698:             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3699:             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3700:             break;
3701:           case 1:
3702:             x[row--] += sum1 * (*ibdiag);
3703:             break;
3704:           }
3705:         } else {
3706:           /* no diag so set =  */
3707:           switch (sizes[i]) {
3708:           case 5:
3709:             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3710:             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3711:             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3712:             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3713:             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3714:             break;
3715:           case 4:
3716:             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3717:             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3718:             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3719:             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3720:             break;
3721:           case 3:
3722:             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3723:             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3724:             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3725:             break;
3726:           case 2:
3727:             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3728:             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3729:             break;
3730:           case 1:
3731:             x[row--] = sum1 * (*ibdiag);
3732:             break;
3733:           }
3734:         }
3735:       }
3736:       if (xb == b) {
3737:         PetscLogFlops(2.0 * a->nz);
3738:       } else {
3739:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3740:       }
3741:     }
3742:   }
3743:   if (flag & SOR_EISENSTAT) {
3744:     /*
3745:           Apply  (U + D)^-1  where D is now the block diagonal
3746:     */
3747:     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3748:     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3749:       ibdiag -= sizes[i] * sizes[i];
3750:       sz  = ii[row + 1] - diag[row] - 1;
3751:       v1  = a->a + diag[row] + 1;
3752:       idx = a->j + diag[row] + 1;
3753:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3754:       switch (sizes[i]) {
3755:       case 1:

3757:         sum1 = b[row];
3758:         for (n = 0; n < sz - 1; n += 2) {
3759:           i1 = idx[0];
3760:           i2 = idx[1];
3761:           idx += 2;
3762:           tmp0 = x[i1];
3763:           tmp1 = x[i2];
3764:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3765:           v1 += 2;
3766:         }

3768:         if (n == sz - 1) {
3769:           tmp0 = x[*idx];
3770:           sum1 -= *v1 * tmp0;
3771:         }
3772:         x[row] = sum1 * (*ibdiag);
3773:         row--;
3774:         break;

3776:       case 2:

3778:         sum1 = b[row];
3779:         sum2 = b[row - 1];
3780:         /* note that sum1 is associated with the second of the two rows */
3781:         v2 = a->a + diag[row - 1] + 2;
3782:         for (n = 0; n < sz - 1; n += 2) {
3783:           i1 = idx[0];
3784:           i2 = idx[1];
3785:           idx += 2;
3786:           tmp0 = x[i1];
3787:           tmp1 = x[i2];
3788:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3789:           v1 += 2;
3790:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3791:           v2 += 2;
3792:         }

3794:         if (n == sz - 1) {
3795:           tmp0 = x[*idx];
3796:           sum1 -= *v1 * tmp0;
3797:           sum2 -= *v2 * tmp0;
3798:         }
3799:         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3800:         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3801:         row -= 2;
3802:         break;
3803:       case 3:

3805:         sum1 = b[row];
3806:         sum2 = b[row - 1];
3807:         sum3 = b[row - 2];
3808:         v2   = a->a + diag[row - 1] + 2;
3809:         v3   = a->a + diag[row - 2] + 3;
3810:         for (n = 0; n < sz - 1; n += 2) {
3811:           i1 = idx[0];
3812:           i2 = idx[1];
3813:           idx += 2;
3814:           tmp0 = x[i1];
3815:           tmp1 = x[i2];
3816:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3817:           v1 += 2;
3818:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3819:           v2 += 2;
3820:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3821:           v3 += 2;
3822:         }

3824:         if (n == sz - 1) {
3825:           tmp0 = x[*idx];
3826:           sum1 -= *v1 * tmp0;
3827:           sum2 -= *v2 * tmp0;
3828:           sum3 -= *v3 * tmp0;
3829:         }
3830:         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3831:         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3832:         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3833:         row -= 3;
3834:         break;
3835:       case 4:

3837:         sum1 = b[row];
3838:         sum2 = b[row - 1];
3839:         sum3 = b[row - 2];
3840:         sum4 = b[row - 3];
3841:         v2   = a->a + diag[row - 1] + 2;
3842:         v3   = a->a + diag[row - 2] + 3;
3843:         v4   = a->a + diag[row - 3] + 4;
3844:         for (n = 0; n < sz - 1; n += 2) {
3845:           i1 = idx[0];
3846:           i2 = idx[1];
3847:           idx += 2;
3848:           tmp0 = x[i1];
3849:           tmp1 = x[i2];
3850:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3851:           v1 += 2;
3852:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3853:           v2 += 2;
3854:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3855:           v3 += 2;
3856:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3857:           v4 += 2;
3858:         }

3860:         if (n == sz - 1) {
3861:           tmp0 = x[*idx];
3862:           sum1 -= *v1 * tmp0;
3863:           sum2 -= *v2 * tmp0;
3864:           sum3 -= *v3 * tmp0;
3865:           sum4 -= *v4 * tmp0;
3866:         }
3867:         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3868:         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3869:         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3870:         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3871:         row -= 4;
3872:         break;
3873:       case 5:

3875:         sum1 = b[row];
3876:         sum2 = b[row - 1];
3877:         sum3 = b[row - 2];
3878:         sum4 = b[row - 3];
3879:         sum5 = b[row - 4];
3880:         v2   = a->a + diag[row - 1] + 2;
3881:         v3   = a->a + diag[row - 2] + 3;
3882:         v4   = a->a + diag[row - 3] + 4;
3883:         v5   = a->a + diag[row - 4] + 5;
3884:         for (n = 0; n < sz - 1; n += 2) {
3885:           i1 = idx[0];
3886:           i2 = idx[1];
3887:           idx += 2;
3888:           tmp0 = x[i1];
3889:           tmp1 = x[i2];
3890:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3891:           v1 += 2;
3892:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3893:           v2 += 2;
3894:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3895:           v3 += 2;
3896:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3897:           v4 += 2;
3898:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3899:           v5 += 2;
3900:         }

3902:         if (n == sz - 1) {
3903:           tmp0 = x[*idx];
3904:           sum1 -= *v1 * tmp0;
3905:           sum2 -= *v2 * tmp0;
3906:           sum3 -= *v3 * tmp0;
3907:           sum4 -= *v4 * tmp0;
3908:           sum5 -= *v5 * tmp0;
3909:         }
3910:         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3911:         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3912:         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3913:         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3914:         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3915:         row -= 5;
3916:         break;
3917:       default:
3918:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3919:       }
3920:     }
3921:     PetscLogFlops(a->nz);

3923:     /*
3924:            t = b - D x    where D is the block diagonal
3925:     */
3926:     cnt = 0;
3927:     for (i = 0, row = 0; i < m; i++) {
3928:       switch (sizes[i]) {
3929:       case 1:
3930:         t[row] = b[row] - bdiag[cnt++] * x[row];
3931:         row++;
3932:         break;
3933:       case 2:
3934:         x1         = x[row];
3935:         x2         = x[row + 1];
3936:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3937:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3938:         t[row]     = b[row] - tmp1;
3939:         t[row + 1] = b[row + 1] - tmp2;
3940:         row += 2;
3941:         cnt += 4;
3942:         break;
3943:       case 3:
3944:         x1         = x[row];
3945:         x2         = x[row + 1];
3946:         x3         = x[row + 2];
3947:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3948:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3949:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3950:         t[row]     = b[row] - tmp1;
3951:         t[row + 1] = b[row + 1] - tmp2;
3952:         t[row + 2] = b[row + 2] - tmp3;
3953:         row += 3;
3954:         cnt += 9;
3955:         break;
3956:       case 4:
3957:         x1         = x[row];
3958:         x2         = x[row + 1];
3959:         x3         = x[row + 2];
3960:         x4         = x[row + 3];
3961:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3962:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3963:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3964:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3965:         t[row]     = b[row] - tmp1;
3966:         t[row + 1] = b[row + 1] - tmp2;
3967:         t[row + 2] = b[row + 2] - tmp3;
3968:         t[row + 3] = b[row + 3] - tmp4;
3969:         row += 4;
3970:         cnt += 16;
3971:         break;
3972:       case 5:
3973:         x1         = x[row];
3974:         x2         = x[row + 1];
3975:         x3         = x[row + 2];
3976:         x4         = x[row + 3];
3977:         x5         = x[row + 4];
3978:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3979:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3980:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3981:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3982:         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3983:         t[row]     = b[row] - tmp1;
3984:         t[row + 1] = b[row + 1] - tmp2;
3985:         t[row + 2] = b[row + 2] - tmp3;
3986:         t[row + 3] = b[row + 3] - tmp4;
3987:         t[row + 4] = b[row + 4] - tmp5;
3988:         row += 5;
3989:         cnt += 25;
3990:         break;
3991:       default:
3992:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3993:       }
3994:     }
3995:     PetscLogFlops(m);

3997:     /*
3998:           Apply (L + D)^-1 where D is the block diagonal
3999:     */
4000:     for (i = 0, row = 0; i < m; i++) {
4001:       sz  = diag[row] - ii[row];
4002:       v1  = a->a + ii[row];
4003:       idx = a->j + ii[row];
4004:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4005:       switch (sizes[i]) {
4006:       case 1:

4008:         sum1 = t[row];
4009:         for (n = 0; n < sz - 1; n += 2) {
4010:           i1 = idx[0];
4011:           i2 = idx[1];
4012:           idx += 2;
4013:           tmp0 = t[i1];
4014:           tmp1 = t[i2];
4015:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4016:           v1 += 2;
4017:         }

4019:         if (n == sz - 1) {
4020:           tmp0 = t[*idx];
4021:           sum1 -= *v1 * tmp0;
4022:         }
4023:         x[row] += t[row] = sum1 * (*ibdiag++);
4024:         row++;
4025:         break;
4026:       case 2:
4027:         v2   = a->a + ii[row + 1];
4028:         sum1 = t[row];
4029:         sum2 = t[row + 1];
4030:         for (n = 0; n < sz - 1; n += 2) {
4031:           i1 = idx[0];
4032:           i2 = idx[1];
4033:           idx += 2;
4034:           tmp0 = t[i1];
4035:           tmp1 = t[i2];
4036:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4037:           v1 += 2;
4038:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4039:           v2 += 2;
4040:         }

4042:         if (n == sz - 1) {
4043:           tmp0 = t[*idx];
4044:           sum1 -= v1[0] * tmp0;
4045:           sum2 -= v2[0] * tmp0;
4046:         }
4047:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4048:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4049:         ibdiag += 4;
4050:         row += 2;
4051:         break;
4052:       case 3:
4053:         v2   = a->a + ii[row + 1];
4054:         v3   = a->a + ii[row + 2];
4055:         sum1 = t[row];
4056:         sum2 = t[row + 1];
4057:         sum3 = t[row + 2];
4058:         for (n = 0; n < sz - 1; n += 2) {
4059:           i1 = idx[0];
4060:           i2 = idx[1];
4061:           idx += 2;
4062:           tmp0 = t[i1];
4063:           tmp1 = t[i2];
4064:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4065:           v1 += 2;
4066:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4067:           v2 += 2;
4068:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4069:           v3 += 2;
4070:         }

4072:         if (n == sz - 1) {
4073:           tmp0 = t[*idx];
4074:           sum1 -= v1[0] * tmp0;
4075:           sum2 -= v2[0] * tmp0;
4076:           sum3 -= v3[0] * tmp0;
4077:         }
4078:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4079:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4080:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4081:         ibdiag += 9;
4082:         row += 3;
4083:         break;
4084:       case 4:
4085:         v2   = a->a + ii[row + 1];
4086:         v3   = a->a + ii[row + 2];
4087:         v4   = a->a + ii[row + 3];
4088:         sum1 = t[row];
4089:         sum2 = t[row + 1];
4090:         sum3 = t[row + 2];
4091:         sum4 = t[row + 3];
4092:         for (n = 0; n < sz - 1; n += 2) {
4093:           i1 = idx[0];
4094:           i2 = idx[1];
4095:           idx += 2;
4096:           tmp0 = t[i1];
4097:           tmp1 = t[i2];
4098:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4099:           v1 += 2;
4100:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4101:           v2 += 2;
4102:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4103:           v3 += 2;
4104:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4105:           v4 += 2;
4106:         }

4108:         if (n == sz - 1) {
4109:           tmp0 = t[*idx];
4110:           sum1 -= v1[0] * tmp0;
4111:           sum2 -= v2[0] * tmp0;
4112:           sum3 -= v3[0] * tmp0;
4113:           sum4 -= v4[0] * tmp0;
4114:         }
4115:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4116:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4117:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4118:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4119:         ibdiag += 16;
4120:         row += 4;
4121:         break;
4122:       case 5:
4123:         v2   = a->a + ii[row + 1];
4124:         v3   = a->a + ii[row + 2];
4125:         v4   = a->a + ii[row + 3];
4126:         v5   = a->a + ii[row + 4];
4127:         sum1 = t[row];
4128:         sum2 = t[row + 1];
4129:         sum3 = t[row + 2];
4130:         sum4 = t[row + 3];
4131:         sum5 = t[row + 4];
4132:         for (n = 0; n < sz - 1; n += 2) {
4133:           i1 = idx[0];
4134:           i2 = idx[1];
4135:           idx += 2;
4136:           tmp0 = t[i1];
4137:           tmp1 = t[i2];
4138:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4139:           v1 += 2;
4140:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4141:           v2 += 2;
4142:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4143:           v3 += 2;
4144:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4145:           v4 += 2;
4146:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4147:           v5 += 2;
4148:         }

4150:         if (n == sz - 1) {
4151:           tmp0 = t[*idx];
4152:           sum1 -= v1[0] * tmp0;
4153:           sum2 -= v2[0] * tmp0;
4154:           sum3 -= v3[0] * tmp0;
4155:           sum4 -= v4[0] * tmp0;
4156:           sum5 -= v5[0] * tmp0;
4157:         }
4158:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4159:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4160:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4161:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4162:         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4163:         ibdiag += 25;
4164:         row += 5;
4165:         break;
4166:       default:
4167:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4168:       }
4169:     }
4170:     PetscLogFlops(a->nz);
4171:   }
4172:   VecRestoreArray(xx, &x);
4173:   VecRestoreArrayRead(bb, &b);
4174:   return 0;
4175: }

4177: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4178: {
4179:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4180:   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4181:   const MatScalar   *bdiag = a->inode.bdiag;
4182:   const PetscScalar *b;
4183:   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4184:   const PetscInt    *sizes = a->inode.size;

4187:   VecGetArray(xx, &x);
4188:   VecGetArrayRead(bb, &b);
4189:   cnt = 0;
4190:   for (i = 0, row = 0; i < m; i++) {
4191:     switch (sizes[i]) {
4192:     case 1:
4193:       x[row] = b[row] * bdiag[cnt++];
4194:       row++;
4195:       break;
4196:     case 2:
4197:       x1       = b[row];
4198:       x2       = b[row + 1];
4199:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4200:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4201:       x[row++] = tmp1;
4202:       x[row++] = tmp2;
4203:       cnt += 4;
4204:       break;
4205:     case 3:
4206:       x1       = b[row];
4207:       x2       = b[row + 1];
4208:       x3       = b[row + 2];
4209:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4210:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4211:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4212:       x[row++] = tmp1;
4213:       x[row++] = tmp2;
4214:       x[row++] = tmp3;
4215:       cnt += 9;
4216:       break;
4217:     case 4:
4218:       x1       = b[row];
4219:       x2       = b[row + 1];
4220:       x3       = b[row + 2];
4221:       x4       = b[row + 3];
4222:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4223:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4224:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4225:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4226:       x[row++] = tmp1;
4227:       x[row++] = tmp2;
4228:       x[row++] = tmp3;
4229:       x[row++] = tmp4;
4230:       cnt += 16;
4231:       break;
4232:     case 5:
4233:       x1       = b[row];
4234:       x2       = b[row + 1];
4235:       x3       = b[row + 2];
4236:       x4       = b[row + 3];
4237:       x5       = b[row + 4];
4238:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4239:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4240:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4241:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4242:       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4243:       x[row++] = tmp1;
4244:       x[row++] = tmp2;
4245:       x[row++] = tmp3;
4246:       x[row++] = tmp4;
4247:       x[row++] = tmp5;
4248:       cnt += 25;
4249:       break;
4250:     default:
4251:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4252:     }
4253:   }
4254:   PetscLogFlops(2.0 * cnt);
4255:   VecRestoreArray(xx, &x);
4256:   VecRestoreArrayRead(bb, &b);
4257:   return 0;
4258: }

4260: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4261: {
4262:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4264:   a->inode.node_count       = 0;
4265:   a->inode.use              = PETSC_FALSE;
4266:   a->inode.checked          = PETSC_FALSE;
4267:   a->inode.mat_nonzerostate = -1;
4268:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4269:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4270:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4271:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4272:   A->ops->coloringpatch     = NULL;
4273:   A->ops->multdiagonalblock = NULL;
4274:   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4275:   return 0;
4276: }

4278: /*
4279:     samestructure indicates that the matrix has not changed its nonzero structure so we
4280:     do not need to recompute the inodes
4281: */
4282: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4283: {
4284:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4285:   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4286:   PetscBool       flag;
4287:   const PetscInt *idx, *idy, *ii;

4289:   if (!a->inode.use) {
4290:     MatSeqAIJ_Inode_ResetOps(A);
4291:     PetscFree(a->inode.size);
4292:     return 0;
4293:   }
4294:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return 0;

4296:   m = A->rmap->n;
4297:   if (!a->inode.size) PetscMalloc1(m + 1, &a->inode.size);
4298:   ns = a->inode.size;

4300:   i          = 0;
4301:   node_count = 0;
4302:   idx        = a->j;
4303:   ii         = a->i;
4304:   while (i < m) {            /* For each row */
4305:     nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4306:     /* Limits the number of elements in a node to 'a->inode.limit' */
4307:     for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4308:       nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4309:       if (nzy != nzx) break;
4310:       idy += nzx; /* Same nonzero pattern */
4311:       PetscArraycmp(idx, idy, nzx, &flag);
4312:       if (!flag) break;
4313:     }
4314:     ns[node_count++] = blk_size;
4315:     idx += blk_size * nzx;
4316:     i = j;
4317:   }

4319:   /* If not enough inodes found,, do not use inode version of the routines */
4320:   if (!m || node_count > .8 * m) {
4321:     MatSeqAIJ_Inode_ResetOps(A);
4322:     PetscFree(a->inode.size);
4323:     PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m);
4324:   } else {
4325:     if (!A->factortype) {
4326:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4327:       if (A->rmap->n == A->cmap->n) {
4328:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4329:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4330:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4331:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4332:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4333:       }
4334:     } else {
4335:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4336:     }
4337:     a->inode.node_count = node_count;
4338:     PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit);
4339:   }
4340:   a->inode.checked          = PETSC_TRUE;
4341:   a->inode.mat_nonzerostate = A->nonzerostate;
4342:   return 0;
4343: }

4345: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4346: {
4347:   Mat         B = *C;
4348:   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4349:   PetscInt    m = A->rmap->n;

4351:   c->inode.use              = a->inode.use;
4352:   c->inode.limit            = a->inode.limit;
4353:   c->inode.max_limit        = a->inode.max_limit;
4354:   c->inode.checked          = PETSC_FALSE;
4355:   c->inode.size             = NULL;
4356:   c->inode.node_count       = 0;
4357:   c->inode.ibdiagvalid      = PETSC_FALSE;
4358:   c->inode.ibdiag           = NULL;
4359:   c->inode.bdiag            = NULL;
4360:   c->inode.mat_nonzerostate = -1;
4361:   if (a->inode.use) {
4362:     if (a->inode.checked && a->inode.size) {
4363:       PetscMalloc1(m + 1, &c->inode.size);
4364:       PetscArraycpy(c->inode.size, a->inode.size, m + 1);

4366:       c->inode.checked          = PETSC_TRUE;
4367:       c->inode.node_count       = a->inode.node_count;
4368:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4369:     }
4370:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4371:     if (!B->factortype) {
4372:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4373:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4374:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4375:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4376:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4377:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4378:     } else {
4379:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4380:     }
4381:   }
4382:   return 0;
4383: }

4385: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4386: {
4387:   PetscInt        k;
4388:   const PetscInt *vi;

4390:   vi = aj + ai[row];
4391:   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4392:   vi        = aj + adiag[row];
4393:   cols[nzl] = vi[0];
4394:   vi        = aj + adiag[row + 1] + 1;
4395:   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4396:   return 0;
4397: }
4398: /*
4399:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4400:    Modified from MatSeqAIJCheckInode().

4402:    Input Parameters:
4403: .  Mat A - ILU or LU matrix factor

4405: */
4406: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4407: {
4408:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4409:   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4410:   PetscInt       *cols1, *cols2, *ns;
4411:   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4412:   PetscBool       flag;

4414:   if (!a->inode.use) return 0;
4415:   if (a->inode.checked) return 0;

4417:   m = A->rmap->n;
4418:   if (a->inode.size) ns = a->inode.size;
4419:   else PetscMalloc1(m + 1, &ns);

4421:   i          = 0;
4422:   node_count = 0;
4423:   PetscMalloc2(m, &cols1, m, &cols2);
4424:   while (i < m) {                       /* For each row */
4425:     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4426:     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4427:     nzx  = nzl1 + nzu1 + 1;
4428:     MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i);

4430:     /* Limits the number of elements in a node to 'a->inode.limit' */
4431:     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4432:       nzl2 = ai[j + 1] - ai[j];
4433:       nzu2 = adiag[j] - adiag[j + 1] - 1;
4434:       nzy  = nzl2 + nzu2 + 1;
4435:       if (nzy != nzx) break;
4436:       MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j);
4437:       PetscArraycmp(cols1, cols2, nzx, &flag);
4438:       if (!flag) break;
4439:     }
4440:     ns[node_count++] = blk_size;
4441:     i                = j;
4442:   }
4443:   PetscFree2(cols1, cols2);
4444:   /* If not enough inodes found,, do not use inode version of the routines */
4445:   if (!m || node_count > .8 * m) {
4446:     PetscFree(ns);

4448:     a->inode.node_count = 0;
4449:     a->inode.size       = NULL;
4450:     a->inode.use        = PETSC_FALSE;

4452:     PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m);
4453:   } else {
4454:     A->ops->mult              = NULL;
4455:     A->ops->sor               = NULL;
4456:     A->ops->multadd           = NULL;
4457:     A->ops->getrowij          = NULL;
4458:     A->ops->restorerowij      = NULL;
4459:     A->ops->getcolumnij       = NULL;
4460:     A->ops->restorecolumnij   = NULL;
4461:     A->ops->coloringpatch     = NULL;
4462:     A->ops->multdiagonalblock = NULL;
4463:     a->inode.node_count       = node_count;
4464:     a->inode.size             = ns;

4466:     PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit);
4467:   }
4468:   a->inode.checked = PETSC_TRUE;
4469:   return 0;
4470: }

4472: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4473: {
4474:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4476:   a->inode.ibdiagvalid = PETSC_FALSE;
4477:   return 0;
4478: }

4480: /*
4481:      This is really ugly. if inodes are used this replaces the
4482:   permutations with ones that correspond to rows/cols of the matrix
4483:   rather then inode blocks
4484: */
4485: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4486: {
4487:   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4488:   return 0;
4489: }

4491: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4492: {
4493:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4494:   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4495:   const PetscInt *ridx, *cidx;
4496:   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4497:   PetscInt        nslim_col, *ns_col;
4498:   IS              ris = *rperm, cis = *cperm;

4500:   if (!a->inode.size) return 0;           /* no inodes so return */
4501:   if (a->inode.node_count == m) return 0; /* all inodes are of size 1 */

4503:   MatCreateColInode_Private(A, &nslim_col, &ns_col);
4504:   PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns);
4505:   PetscMalloc2(m, &permr, n, &permc);

4507:   ISGetIndices(ris, &ridx);
4508:   ISGetIndices(cis, &cidx);

4510:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4511:   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];

4513:   /* Construct the permutations for rows*/
4514:   for (i = 0, row = 0; i < nslim_row; ++i) {
4515:     indx      = ridx[i];
4516:     start_val = tns[indx];
4517:     end_val   = tns[indx + 1];
4518:     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4519:   }

4521:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4522:   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];

4524:   /* Construct permutations for columns */
4525:   for (i = 0, col = 0; i < nslim_col; ++i) {
4526:     indx      = cidx[i];
4527:     start_val = tns[indx];
4528:     end_val   = tns[indx + 1];
4529:     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4530:   }

4532:   ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm);
4533:   ISSetPermutation(*rperm);
4534:   ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm);
4535:   ISSetPermutation(*cperm);

4537:   ISRestoreIndices(ris, &ridx);
4538:   ISRestoreIndices(cis, &cidx);

4540:   PetscFree(ns_col);
4541:   PetscFree2(permr, permc);
4542:   ISDestroy(&cis);
4543:   ISDestroy(&ris);
4544:   PetscFree(tns);
4545:   return 0;
4546: }

4548: /*@C
4549:    MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes

4551:    Not Collective

4553:    Input Parameter:
4554: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`

4556:    Output Parameters:
4557: +  node_count - no of inodes present in the matrix.
4558: .  sizes      - an array of size node_count,with sizes of each inode.
4559: -  limit      - the max size used to generate the inodes.

4561:    Level: advanced

4563:    Note:
4564:     This routine returns some internal storage information
4565:    of the matrix, it is intended to be used by advanced users.
4566:    It should be called after the matrix is assembled.
4567:    The contents of the sizes[] array should not be changed.
4568:    NULL may be passed for information not requested.

4570: .seealso: `MatGetInfo()`
4571: @*/
4572: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4573: {
4574:   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);

4577:   PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f);
4578:   if (f) (*f)(A, node_count, sizes, limit);
4579:   return 0;
4580: }

4582: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4583: {
4584:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4586:   if (node_count) *node_count = a->inode.node_count;
4587:   if (sizes) *sizes = a->inode.size;
4588:   if (limit) *limit = a->inode.limit;
4589:   return 0;
4590: }