Actual source code: baijfact13.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: /*
  9:       Version for when blocks are 3 by 3
 10: */
 11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 14:   IS              isrow = b->row, isicol = b->icol;
 15:   const PetscInt *r, *ic;
 16:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
 17:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
 18:   PetscInt       *diag_offset = b->diag, idx, *pj;
 19:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
 20:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
 21:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
 22:   MatScalar      *ba = b->a, *aa = a->a;
 23:   PetscReal       shift = info->shiftamount;
 24:   PetscBool       allowzeropivot, zeropivotdetected;

 26:   ISGetIndices(isrow, &r);
 27:   ISGetIndices(isicol, &ic);
 28:   PetscMalloc1(9 * (n + 1), &rtmp);
 29:   allowzeropivot = PetscNot(A->erroriffailure);

 31:   for (i = 0; i < n; i++) {
 32:     nz    = bi[i + 1] - bi[i];
 33:     ajtmp = bj + bi[i];
 34:     for (j = 0; j < nz; j++) {
 35:       x    = rtmp + 9 * ajtmp[j];
 36:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
 37:     }
 38:     /* load in initial (unfactored row) */
 39:     idx      = r[i];
 40:     nz       = ai[idx + 1] - ai[idx];
 41:     ajtmpold = aj + ai[idx];
 42:     v        = aa + 9 * ai[idx];
 43:     for (j = 0; j < nz; j++) {
 44:       x    = rtmp + 9 * ic[ajtmpold[j]];
 45:       x[0] = v[0];
 46:       x[1] = v[1];
 47:       x[2] = v[2];
 48:       x[3] = v[3];
 49:       x[4] = v[4];
 50:       x[5] = v[5];
 51:       x[6] = v[6];
 52:       x[7] = v[7];
 53:       x[8] = v[8];
 54:       v += 9;
 55:     }
 56:     row = *ajtmp++;
 57:     while (row < i) {
 58:       pc = rtmp + 9 * row;
 59:       p1 = pc[0];
 60:       p2 = pc[1];
 61:       p3 = pc[2];
 62:       p4 = pc[3];
 63:       p5 = pc[4];
 64:       p6 = pc[5];
 65:       p7 = pc[6];
 66:       p8 = pc[7];
 67:       p9 = pc[8];
 68:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
 69:         pv    = ba + 9 * diag_offset[row];
 70:         pj    = bj + diag_offset[row] + 1;
 71:         x1    = pv[0];
 72:         x2    = pv[1];
 73:         x3    = pv[2];
 74:         x4    = pv[3];
 75:         x5    = pv[4];
 76:         x6    = pv[5];
 77:         x7    = pv[6];
 78:         x8    = pv[7];
 79:         x9    = pv[8];
 80:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
 81:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
 82:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

 84:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
 85:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
 86:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

 88:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
 89:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
 90:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
 91:         nz         = bi[row + 1] - diag_offset[row] - 1;
 92:         pv += 9;
 93:         for (j = 0; j < nz; j++) {
 94:           x1 = pv[0];
 95:           x2 = pv[1];
 96:           x3 = pv[2];
 97:           x4 = pv[3];
 98:           x5 = pv[4];
 99:           x6 = pv[5];
100:           x7 = pv[6];
101:           x8 = pv[7];
102:           x9 = pv[8];
103:           x  = rtmp + 9 * pj[j];
104:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
105:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
106:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

108:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
109:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
110:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

112:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
113:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
114:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
115:           pv += 9;
116:         }
117:         PetscLogFlops(54.0 * nz + 36.0);
118:       }
119:       row = *ajtmp++;
120:     }
121:     /* finished row so stick it into b->a */
122:     pv = ba + 9 * bi[i];
123:     pj = bj + bi[i];
124:     nz = bi[i + 1] - bi[i];
125:     for (j = 0; j < nz; j++) {
126:       x     = rtmp + 9 * pj[j];
127:       pv[0] = x[0];
128:       pv[1] = x[1];
129:       pv[2] = x[2];
130:       pv[3] = x[3];
131:       pv[4] = x[4];
132:       pv[5] = x[5];
133:       pv[6] = x[6];
134:       pv[7] = x[7];
135:       pv[8] = x[8];
136:       pv += 9;
137:     }
138:     /* invert diagonal block */
139:     w = ba + 9 * diag_offset[i];
140:     PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected);
141:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
142:   }

144:   PetscFree(rtmp);
145:   ISRestoreIndices(isicol, &ic);
146:   ISRestoreIndices(isrow, &r);

148:   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
149:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
150:   C->assembled           = PETSC_TRUE;

152:   PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs); /* from inverting diagonal blocks */
153:   return 0;
154: }

156: /* MatLUFactorNumeric_SeqBAIJ_3 -
157:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
158:        PetscKernel_A_gets_A_times_B()
159:        PetscKernel_A_gets_A_minus_B_times_C()
160:        PetscKernel_A_gets_inverse_A()
161: */
162: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
163: {
164:   Mat             C = B;
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
166:   IS              isrow = b->row, isicol = b->icol;
167:   const PetscInt *r, *ic;
168:   PetscInt        i, j, k, nz, nzL, row;
169:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
170:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
171:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
172:   PetscInt        flg;
173:   PetscReal       shift = info->shiftamount;
174:   PetscBool       allowzeropivot, zeropivotdetected;

176:   ISGetIndices(isrow, &r);
177:   ISGetIndices(isicol, &ic);
178:   allowzeropivot = PetscNot(A->erroriffailure);

180:   /* generate work space needed by the factorization */
181:   PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
182:   PetscArrayzero(rtmp, bs2 * n);

184:   for (i = 0; i < n; i++) {
185:     /* zero rtmp */
186:     /* L part */
187:     nz    = bi[i + 1] - bi[i];
188:     bjtmp = bj + bi[i];
189:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

191:     /* U part */
192:     nz    = bdiag[i] - bdiag[i + 1];
193:     bjtmp = bj + bdiag[i + 1] + 1;
194:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

196:     /* load in initial (unfactored row) */
197:     nz    = ai[r[i] + 1] - ai[r[i]];
198:     ajtmp = aj + ai[r[i]];
199:     v     = aa + bs2 * ai[r[i]];
200:     for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2);

202:     /* elimination */
203:     bjtmp = bj + bi[i];
204:     nzL   = bi[i + 1] - bi[i];
205:     for (k = 0; k < nzL; k++) {
206:       row = bjtmp[k];
207:       pc  = rtmp + bs2 * row;
208:       for (flg = 0, j = 0; j < bs2; j++) {
209:         if (pc[j] != 0.0) {
210:           flg = 1;
211:           break;
212:         }
213:       }
214:       if (flg) {
215:         pv = b->a + bs2 * bdiag[row];
216:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
217:         PetscKernel_A_gets_A_times_B_3(pc, pv, mwork);

219:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
220:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
221:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
222:         for (j = 0; j < nz; j++) {
223:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
224:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
225:           v = rtmp + bs2 * pj[j];
226:           PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv);
227:           pv += bs2;
228:         }
229:         PetscLogFlops(54.0 * nz + 45); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
230:       }
231:     }

233:     /* finished row so stick it into b->a */
234:     /* L part */
235:     pv = b->a + bs2 * bi[i];
236:     pj = b->j + bi[i];
237:     nz = bi[i + 1] - bi[i];
238:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);

240:     /* Mark diagonal and invert diagonal for simpler triangular solves */
241:     pv = b->a + bs2 * bdiag[i];
242:     pj = b->j + bdiag[i];
243:     PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
244:     PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected);
245:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

247:     /* U part */
248:     pj = b->j + bdiag[i + 1] + 1;
249:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
250:     nz = bdiag[i] - bdiag[i + 1] - 1;
251:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
252:   }

254:   PetscFree2(rtmp, mwork);
255:   ISRestoreIndices(isicol, &ic);
256:   ISRestoreIndices(isrow, &r);

258:   C->ops->solve          = MatSolve_SeqBAIJ_3;
259:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
260:   C->assembled           = PETSC_TRUE;

262:   PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n); /* from inverting diagonal blocks */
263:   return 0;
264: }

266: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
267: {
268:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
269:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
270:   PetscInt    *ajtmpold, *ajtmp, nz, row;
271:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
272:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
273:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
274:   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
275:   MatScalar   *ba = b->a, *aa = a->a;
276:   PetscReal    shift = info->shiftamount;
277:   PetscBool    allowzeropivot, zeropivotdetected;

279:   PetscMalloc1(9 * (n + 1), &rtmp);
280:   allowzeropivot = PetscNot(A->erroriffailure);

282:   for (i = 0; i < n; i++) {
283:     nz    = bi[i + 1] - bi[i];
284:     ajtmp = bj + bi[i];
285:     for (j = 0; j < nz; j++) {
286:       x    = rtmp + 9 * ajtmp[j];
287:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
288:     }
289:     /* load in initial (unfactored row) */
290:     nz       = ai[i + 1] - ai[i];
291:     ajtmpold = aj + ai[i];
292:     v        = aa + 9 * ai[i];
293:     for (j = 0; j < nz; j++) {
294:       x    = rtmp + 9 * ajtmpold[j];
295:       x[0] = v[0];
296:       x[1] = v[1];
297:       x[2] = v[2];
298:       x[3] = v[3];
299:       x[4] = v[4];
300:       x[5] = v[5];
301:       x[6] = v[6];
302:       x[7] = v[7];
303:       x[8] = v[8];
304:       v += 9;
305:     }
306:     row = *ajtmp++;
307:     while (row < i) {
308:       pc = rtmp + 9 * row;
309:       p1 = pc[0];
310:       p2 = pc[1];
311:       p3 = pc[2];
312:       p4 = pc[3];
313:       p5 = pc[4];
314:       p6 = pc[5];
315:       p7 = pc[6];
316:       p8 = pc[7];
317:       p9 = pc[8];
318:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
319:         pv    = ba + 9 * diag_offset[row];
320:         pj    = bj + diag_offset[row] + 1;
321:         x1    = pv[0];
322:         x2    = pv[1];
323:         x3    = pv[2];
324:         x4    = pv[3];
325:         x5    = pv[4];
326:         x6    = pv[5];
327:         x7    = pv[6];
328:         x8    = pv[7];
329:         x9    = pv[8];
330:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
331:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
332:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

334:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
335:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
336:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

338:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
339:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
340:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;

342:         nz = bi[row + 1] - diag_offset[row] - 1;
343:         pv += 9;
344:         for (j = 0; j < nz; j++) {
345:           x1 = pv[0];
346:           x2 = pv[1];
347:           x3 = pv[2];
348:           x4 = pv[3];
349:           x5 = pv[4];
350:           x6 = pv[5];
351:           x7 = pv[6];
352:           x8 = pv[7];
353:           x9 = pv[8];
354:           x  = rtmp + 9 * pj[j];
355:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
356:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
357:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

359:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
360:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
361:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

363:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
364:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
365:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
366:           pv += 9;
367:         }
368:         PetscLogFlops(54.0 * nz + 36.0);
369:       }
370:       row = *ajtmp++;
371:     }
372:     /* finished row so stick it into b->a */
373:     pv = ba + 9 * bi[i];
374:     pj = bj + bi[i];
375:     nz = bi[i + 1] - bi[i];
376:     for (j = 0; j < nz; j++) {
377:       x     = rtmp + 9 * pj[j];
378:       pv[0] = x[0];
379:       pv[1] = x[1];
380:       pv[2] = x[2];
381:       pv[3] = x[3];
382:       pv[4] = x[4];
383:       pv[5] = x[5];
384:       pv[6] = x[6];
385:       pv[7] = x[7];
386:       pv[8] = x[8];
387:       pv += 9;
388:     }
389:     /* invert diagonal block */
390:     w = ba + 9 * diag_offset[i];
391:     PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected);
392:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
393:   }

395:   PetscFree(rtmp);

397:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
398:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
399:   C->assembled           = PETSC_TRUE;

401:   PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs); /* from inverting diagonal blocks */
402:   return 0;
403: }

405: /*
406:   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
407:     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
408: */
409: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
410: {
411:   Mat             C = B;
412:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
413:   PetscInt        i, j, k, nz, nzL, row;
414:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
415:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
416:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
417:   PetscInt        flg;
418:   PetscReal       shift = info->shiftamount;
419:   PetscBool       allowzeropivot, zeropivotdetected;

421:   allowzeropivot = PetscNot(A->erroriffailure);

423:   /* generate work space needed by the factorization */
424:   PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
425:   PetscArrayzero(rtmp, bs2 * n);

427:   for (i = 0; i < n; i++) {
428:     /* zero rtmp */
429:     /* L part */
430:     nz    = bi[i + 1] - bi[i];
431:     bjtmp = bj + bi[i];
432:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

434:     /* U part */
435:     nz    = bdiag[i] - bdiag[i + 1];
436:     bjtmp = bj + bdiag[i + 1] + 1;
437:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

439:     /* load in initial (unfactored row) */
440:     nz    = ai[i + 1] - ai[i];
441:     ajtmp = aj + ai[i];
442:     v     = aa + bs2 * ai[i];
443:     for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2);

445:     /* elimination */
446:     bjtmp = bj + bi[i];
447:     nzL   = bi[i + 1] - bi[i];
448:     for (k = 0; k < nzL; k++) {
449:       row = bjtmp[k];
450:       pc  = rtmp + bs2 * row;
451:       for (flg = 0, j = 0; j < bs2; j++) {
452:         if (pc[j] != 0.0) {
453:           flg = 1;
454:           break;
455:         }
456:       }
457:       if (flg) {
458:         pv = b->a + bs2 * bdiag[row];
459:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
460:         PetscKernel_A_gets_A_times_B_3(pc, pv, mwork);

462:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
463:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
464:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
465:         for (j = 0; j < nz; j++) {
466:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
467:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
468:           v = rtmp + bs2 * pj[j];
469:           PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv);
470:           pv += bs2;
471:         }
472:         PetscLogFlops(54.0 * nz + 45); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
473:       }
474:     }

476:     /* finished row so stick it into b->a */
477:     /* L part */
478:     pv = b->a + bs2 * bi[i];
479:     pj = b->j + bi[i];
480:     nz = bi[i + 1] - bi[i];
481:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);

483:     /* Mark diagonal and invert diagonal for simpler triangular solves */
484:     pv = b->a + bs2 * bdiag[i];
485:     pj = b->j + bdiag[i];
486:     PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
487:     PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected);
488:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

490:     /* U part */
491:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
492:     pj = b->j + bdiag[i + 1] + 1;
493:     nz = bdiag[i] - bdiag[i + 1] - 1;
494:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
495:   }
496:   PetscFree2(rtmp, mwork);

498:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
499:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
500:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
501:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
502:   C->assembled           = PETSC_TRUE;

504:   PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n); /* from inverting diagonal blocks */
505:   return 0;
506: }