Actual source code: sbaijfact2.c


  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
 13:   IS                 isrow = a->row;
 14:   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
 15:   const PetscInt    *r;
 16:   PetscInt           nz, *vj, k, idx, k1;
 17:   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
 18:   const MatScalar   *aa = a->a, *v, *diag;
 19:   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
 20:   const PetscScalar *b;

 22:   VecGetArrayRead(bb, &b);
 23:   VecGetArray(xx, &x);
 24:   t = a->solve_work;
 25:   ISGetIndices(isrow, &r);
 26:   PetscMalloc1(bs, &xk_tmp);

 28:   /* solve U^T * D * y = b by forward substitution */
 29:   xk = t;
 30:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
 31:     idx = bs * r[k];
 32:     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
 33:   }
 34:   for (k = 0; k < mbs; k++) {
 35:     v  = aa + bs2 * ai[k];
 36:     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
 37:     PetscArraycpy(xk_tmp, xk, bs); /* xk_tmp <- xk */
 38:     nz = ai[k + 1] - ai[k];
 39:     vj = aj + ai[k];
 40:     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
 41:     while (nz--) {
 42:       /* x(:) += U(k,:)^T*(Dk*xk) */
 43:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
 44:       vj++;
 45:       xj = t + (*vj) * bs;
 46:       v += bs2;
 47:     }
 48:     /* xk = inv(Dk)*(Dk*xk) */
 49:     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
 50:     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
 51:   }

 53:   /* solve U*x = y by back substitution */
 54:   for (k = mbs - 1; k >= 0; k--) {
 55:     v  = aa + bs2 * ai[k];
 56:     xk = t + k * bs; /* xk */
 57:     nz = ai[k + 1] - ai[k];
 58:     vj = aj + ai[k];
 59:     xj = t + (*vj) * bs;
 60:     while (nz--) {
 61:       /* xk += U(k,:)*x(:) */
 62:       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
 63:       vj++;
 64:       v += bs2;
 65:       xj = t + (*vj) * bs;
 66:     }
 67:     idx = bs * r[k];
 68:     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
 69:   }

 71:   PetscFree(xk_tmp);
 72:   ISRestoreIndices(isrow, &r);
 73:   VecRestoreArrayRead(bb, &b);
 74:   VecRestoreArray(xx, &x);
 75:   PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs);
 76:   return 0;
 77: }

 79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 80: {
 81:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
 82: }

 84: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 85: {
 86:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
 87: }

 89: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
 90: {
 91:   PetscInt         nz, k;
 92:   const PetscInt  *vj, bs2 = bs * bs;
 93:   const MatScalar *v, *diag;
 94:   PetscScalar     *xk, *xj, *xk_tmp;

 96:   PetscMalloc1(bs, &xk_tmp);
 97:   for (k = 0; k < mbs; k++) {
 98:     v  = aa + bs2 * ai[k];
 99:     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
100:     PetscArraycpy(xk_tmp, xk, bs); /* xk_tmp <- xk */
101:     nz = ai[k + 1] - ai[k];
102:     vj = aj + ai[k];
103:     xj = x + (*vj) * bs; /* *vj-th block of x, *vj>k */
104:     while (nz--) {
105:       /* x(:) += U(k,:)^T*(Dk*xk) */
106:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
107:       vj++;
108:       xj = x + (*vj) * bs;
109:       v += bs2;
110:     }
111:     /* xk = inv(Dk)*(Dk*xk) */
112:     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
113:     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
114:   }
115:   PetscFree(xk_tmp);
116:   return 0;
117: }

119: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
120: {
121:   PetscInt         nz, k;
122:   const PetscInt  *vj, bs2 = bs * bs;
123:   const MatScalar *v;
124:   PetscScalar     *xk, *xj;

126:   for (k = mbs - 1; k >= 0; k--) {
127:     v  = aa + bs2 * ai[k];
128:     xk = x + k * bs; /* xk */
129:     nz = ai[k + 1] - ai[k];
130:     vj = aj + ai[k];
131:     xj = x + (*vj) * bs;
132:     while (nz--) {
133:       /* xk += U(k,:)*x(:) */
134:       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
135:       vj++;
136:       v += bs2;
137:       xj = x + (*vj) * bs;
138:     }
139:   }
140:   return 0;
141: }

143: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
144: {
145:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
146:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
147:   PetscInt           bs = A->rmap->bs;
148:   const MatScalar   *aa = a->a;
149:   PetscScalar       *x;
150:   const PetscScalar *b;

152:   VecGetArrayRead(bb, &b);
153:   VecGetArray(xx, &x);

155:   /* solve U^T * D * y = b by forward substitution */
156:   PetscArraycpy(x, b, bs * mbs); /* x <- b */
157:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);

159:   /* solve U*x = y by back substitution */
160:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);

162:   VecRestoreArrayRead(bb, &b);
163:   VecRestoreArray(xx, &x);
164:   PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs);
165:   return 0;
166: }

168: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
169: {
170:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
171:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
172:   PetscInt           bs = A->rmap->bs;
173:   const MatScalar   *aa = a->a;
174:   const PetscScalar *b;
175:   PetscScalar       *x;

177:   VecGetArrayRead(bb, &b);
178:   VecGetArray(xx, &x);
179:   PetscArraycpy(x, b, bs * mbs); /* x <- b */
180:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);
181:   VecRestoreArrayRead(bb, &b);
182:   VecRestoreArray(xx, &x);
183:   PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs);
184:   return 0;
185: }

187: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
188: {
189:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
190:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
191:   PetscInt           bs = A->rmap->bs;
192:   const MatScalar   *aa = a->a;
193:   const PetscScalar *b;
194:   PetscScalar       *x;

196:   VecGetArrayRead(bb, &b);
197:   VecGetArray(xx, &x);
198:   PetscArraycpy(x, b, bs * mbs);
199:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);
200:   VecRestoreArrayRead(bb, &b);
201:   VecRestoreArray(xx, &x);
202:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
203:   return 0;
204: }

206: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
207: {
208:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
209:   IS                 isrow = a->row;
210:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
211:   PetscInt           nz, k, idx;
212:   const MatScalar   *aa = a->a, *v, *d;
213:   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
214:   const PetscScalar *b;

216:   VecGetArrayRead(bb, &b);
217:   VecGetArray(xx, &x);
218:   t = a->solve_work;
219:   ISGetIndices(isrow, &r);

221:   /* solve U^T * D * y = b by forward substitution */
222:   tp = t;
223:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
224:     idx   = 7 * r[k];
225:     tp[0] = b[idx];
226:     tp[1] = b[idx + 1];
227:     tp[2] = b[idx + 2];
228:     tp[3] = b[idx + 3];
229:     tp[4] = b[idx + 4];
230:     tp[5] = b[idx + 5];
231:     tp[6] = b[idx + 6];
232:     tp += 7;
233:   }

235:   for (k = 0; k < mbs; k++) {
236:     v  = aa + 49 * ai[k];
237:     vj = aj + ai[k];
238:     tp = t + k * 7;
239:     x0 = tp[0];
240:     x1 = tp[1];
241:     x2 = tp[2];
242:     x3 = tp[3];
243:     x4 = tp[4];
244:     x5 = tp[5];
245:     x6 = tp[6];
246:     nz = ai[k + 1] - ai[k];
247:     tp = t + (*vj) * 7;
248:     while (nz--) {
249:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
250:       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
251:       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
252:       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
253:       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
254:       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
255:       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
256:       vj++;
257:       tp = t + (*vj) * 7;
258:       v += 49;
259:     }

261:     /* xk = inv(Dk)*(Dk*xk) */
262:     d     = aa + k * 49; /* ptr to inv(Dk) */
263:     tp    = t + k * 7;
264:     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
265:     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
266:     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
267:     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
268:     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
269:     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
270:     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
271:   }

273:   /* solve U*x = y by back substitution */
274:   for (k = mbs - 1; k >= 0; k--) {
275:     v  = aa + 49 * ai[k];
276:     vj = aj + ai[k];
277:     tp = t + k * 7;
278:     x0 = tp[0];
279:     x1 = tp[1];
280:     x2 = tp[2];
281:     x3 = tp[3];
282:     x4 = tp[4];
283:     x5 = tp[5];
284:     x6 = tp[6]; /* xk */
285:     nz = ai[k + 1] - ai[k];

287:     tp = t + (*vj) * 7;
288:     while (nz--) {
289:       /* xk += U(k,:)*x(:) */
290:       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
291:       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
292:       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
293:       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
294:       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
295:       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
296:       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
297:       vj++;
298:       tp = t + (*vj) * 7;
299:       v += 49;
300:     }
301:     tp         = t + k * 7;
302:     tp[0]      = x0;
303:     tp[1]      = x1;
304:     tp[2]      = x2;
305:     tp[3]      = x3;
306:     tp[4]      = x4;
307:     tp[5]      = x5;
308:     tp[6]      = x6;
309:     idx        = 7 * r[k];
310:     x[idx]     = x0;
311:     x[idx + 1] = x1;
312:     x[idx + 2] = x2;
313:     x[idx + 3] = x3;
314:     x[idx + 4] = x4;
315:     x[idx + 5] = x5;
316:     x[idx + 6] = x6;
317:   }

319:   ISRestoreIndices(isrow, &r);
320:   VecRestoreArrayRead(bb, &b);
321:   VecRestoreArray(xx, &x);
322:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
323:   return 0;
324: }

326: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
327: {
328:   const MatScalar *v, *d;
329:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
330:   PetscInt         nz, k;
331:   const PetscInt  *vj;

333:   for (k = 0; k < mbs; k++) {
334:     v  = aa + 49 * ai[k];
335:     xp = x + k * 7;
336:     x0 = xp[0];
337:     x1 = xp[1];
338:     x2 = xp[2];
339:     x3 = xp[3];
340:     x4 = xp[4];
341:     x5 = xp[5];
342:     x6 = xp[6]; /* Dk*xk = k-th block of x */
343:     nz = ai[k + 1] - ai[k];
344:     vj = aj + ai[k];
345:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
346:     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
347:     while (nz--) {
348:       xp = x + (*vj) * 7;
349:       /* x(:) += U(k,:)^T*(Dk*xk) */
350:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
351:       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
352:       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
353:       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
354:       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
355:       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
356:       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
357:       vj++;
358:       v += 49;
359:     }
360:     /* xk = inv(Dk)*(Dk*xk) */
361:     d     = aa + k * 49; /* ptr to inv(Dk) */
362:     xp    = x + k * 7;
363:     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
364:     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
365:     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
366:     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
367:     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
368:     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
369:     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
370:   }
371:   return 0;
372: }

374: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
375: {
376:   const MatScalar *v;
377:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
378:   PetscInt         nz, k;
379:   const PetscInt  *vj;

381:   for (k = mbs - 1; k >= 0; k--) {
382:     v  = aa + 49 * ai[k];
383:     xp = x + k * 7;
384:     x0 = xp[0];
385:     x1 = xp[1];
386:     x2 = xp[2];
387:     x3 = xp[3];
388:     x4 = xp[4];
389:     x5 = xp[5];
390:     x6 = xp[6]; /* xk */
391:     nz = ai[k + 1] - ai[k];
392:     vj = aj + ai[k];
393:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
394:     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
395:     while (nz--) {
396:       xp = x + (*vj) * 7;
397:       /* xk += U(k,:)*x(:) */
398:       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
399:       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
400:       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
401:       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
402:       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
403:       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
404:       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
405:       vj++;
406:       v += 49;
407:     }
408:     xp    = x + k * 7;
409:     xp[0] = x0;
410:     xp[1] = x1;
411:     xp[2] = x2;
412:     xp[3] = x3;
413:     xp[4] = x4;
414:     xp[5] = x5;
415:     xp[6] = x6;
416:   }
417:   return 0;
418: }

420: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
421: {
422:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
423:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
424:   const MatScalar   *aa = a->a;
425:   PetscScalar       *x;
426:   const PetscScalar *b;

428:   VecGetArrayRead(bb, &b);
429:   VecGetArray(xx, &x);

431:   /* solve U^T * D * y = b by forward substitution */
432:   PetscArraycpy(x, b, 7 * mbs); /* x <- b */
433:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);

435:   /* solve U*x = y by back substitution */
436:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);

438:   VecRestoreArrayRead(bb, &b);
439:   VecRestoreArray(xx, &x);
440:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
441:   return 0;
442: }

444: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
445: {
446:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
447:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
448:   const MatScalar   *aa = a->a;
449:   PetscScalar       *x;
450:   const PetscScalar *b;

452:   VecGetArrayRead(bb, &b);
453:   VecGetArray(xx, &x);
454:   PetscArraycpy(x, b, 7 * mbs);
455:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);
456:   VecRestoreArrayRead(bb, &b);
457:   VecRestoreArray(xx, &x);
458:   PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
459:   return 0;
460: }

462: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
463: {
464:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
465:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
466:   const MatScalar   *aa = a->a;
467:   PetscScalar       *x;
468:   const PetscScalar *b;

470:   VecGetArrayRead(bb, &b);
471:   VecGetArray(xx, &x);
472:   PetscArraycpy(x, b, 7 * mbs);
473:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);
474:   VecRestoreArrayRead(bb, &b);
475:   VecRestoreArray(xx, &x);
476:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
477:   return 0;
478: }

480: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
481: {
482:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
483:   IS                 isrow = a->row;
484:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
485:   PetscInt           nz, k, idx;
486:   const MatScalar   *aa = a->a, *v, *d;
487:   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
488:   const PetscScalar *b;

490:   VecGetArrayRead(bb, &b);
491:   VecGetArray(xx, &x);
492:   t = a->solve_work;
493:   ISGetIndices(isrow, &r);

495:   /* solve U^T * D * y = b by forward substitution */
496:   tp = t;
497:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
498:     idx   = 6 * r[k];
499:     tp[0] = b[idx];
500:     tp[1] = b[idx + 1];
501:     tp[2] = b[idx + 2];
502:     tp[3] = b[idx + 3];
503:     tp[4] = b[idx + 4];
504:     tp[5] = b[idx + 5];
505:     tp += 6;
506:   }

508:   for (k = 0; k < mbs; k++) {
509:     v  = aa + 36 * ai[k];
510:     vj = aj + ai[k];
511:     tp = t + k * 6;
512:     x0 = tp[0];
513:     x1 = tp[1];
514:     x2 = tp[2];
515:     x3 = tp[3];
516:     x4 = tp[4];
517:     x5 = tp[5];
518:     nz = ai[k + 1] - ai[k];
519:     tp = t + (*vj) * 6;
520:     while (nz--) {
521:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
522:       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
523:       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
524:       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
525:       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
526:       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
527:       vj++;
528:       tp = t + (*vj) * 6;
529:       v += 36;
530:     }

532:     /* xk = inv(Dk)*(Dk*xk) */
533:     d     = aa + k * 36; /* ptr to inv(Dk) */
534:     tp    = t + k * 6;
535:     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
536:     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
537:     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
538:     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
539:     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
540:     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
541:   }

543:   /* solve U*x = y by back substitution */
544:   for (k = mbs - 1; k >= 0; k--) {
545:     v  = aa + 36 * ai[k];
546:     vj = aj + ai[k];
547:     tp = t + k * 6;
548:     x0 = tp[0];
549:     x1 = tp[1];
550:     x2 = tp[2];
551:     x3 = tp[3];
552:     x4 = tp[4];
553:     x5 = tp[5]; /* xk */
554:     nz = ai[k + 1] - ai[k];

556:     tp = t + (*vj) * 6;
557:     while (nz--) {
558:       /* xk += U(k,:)*x(:) */
559:       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
560:       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
561:       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
562:       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
563:       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
564:       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
565:       vj++;
566:       tp = t + (*vj) * 6;
567:       v += 36;
568:     }
569:     tp         = t + k * 6;
570:     tp[0]      = x0;
571:     tp[1]      = x1;
572:     tp[2]      = x2;
573:     tp[3]      = x3;
574:     tp[4]      = x4;
575:     tp[5]      = x5;
576:     idx        = 6 * r[k];
577:     x[idx]     = x0;
578:     x[idx + 1] = x1;
579:     x[idx + 2] = x2;
580:     x[idx + 3] = x3;
581:     x[idx + 4] = x4;
582:     x[idx + 5] = x5;
583:   }

585:   ISRestoreIndices(isrow, &r);
586:   VecRestoreArrayRead(bb, &b);
587:   VecRestoreArray(xx, &x);
588:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
589:   return 0;
590: }

592: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
593: {
594:   const MatScalar *v, *d;
595:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
596:   PetscInt         nz, k;
597:   const PetscInt  *vj;

599:   for (k = 0; k < mbs; k++) {
600:     v  = aa + 36 * ai[k];
601:     xp = x + k * 6;
602:     x0 = xp[0];
603:     x1 = xp[1];
604:     x2 = xp[2];
605:     x3 = xp[3];
606:     x4 = xp[4];
607:     x5 = xp[5]; /* Dk*xk = k-th block of x */
608:     nz = ai[k + 1] - ai[k];
609:     vj = aj + ai[k];
610:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
611:     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
612:     while (nz--) {
613:       xp = x + (*vj) * 6;
614:       /* x(:) += U(k,:)^T*(Dk*xk) */
615:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
616:       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
617:       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
618:       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
619:       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
620:       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
621:       vj++;
622:       v += 36;
623:     }
624:     /* xk = inv(Dk)*(Dk*xk) */
625:     d     = aa + k * 36; /* ptr to inv(Dk) */
626:     xp    = x + k * 6;
627:     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
628:     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
629:     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
630:     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
631:     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
632:     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
633:   }
634:   return 0;
635: }
636: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
637: {
638:   const MatScalar *v;
639:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
640:   PetscInt         nz, k;
641:   const PetscInt  *vj;

643:   for (k = mbs - 1; k >= 0; k--) {
644:     v  = aa + 36 * ai[k];
645:     xp = x + k * 6;
646:     x0 = xp[0];
647:     x1 = xp[1];
648:     x2 = xp[2];
649:     x3 = xp[3];
650:     x4 = xp[4];
651:     x5 = xp[5]; /* xk */
652:     nz = ai[k + 1] - ai[k];
653:     vj = aj + ai[k];
654:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
655:     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
656:     while (nz--) {
657:       xp = x + (*vj) * 6;
658:       /* xk += U(k,:)*x(:) */
659:       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
660:       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
661:       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
662:       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
663:       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
664:       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
665:       vj++;
666:       v += 36;
667:     }
668:     xp    = x + k * 6;
669:     xp[0] = x0;
670:     xp[1] = x1;
671:     xp[2] = x2;
672:     xp[3] = x3;
673:     xp[4] = x4;
674:     xp[5] = x5;
675:   }
676:   return 0;
677: }

679: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
680: {
681:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
682:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
683:   const MatScalar   *aa = a->a;
684:   PetscScalar       *x;
685:   const PetscScalar *b;

687:   VecGetArrayRead(bb, &b);
688:   VecGetArray(xx, &x);

690:   /* solve U^T * D * y = b by forward substitution */
691:   PetscArraycpy(x, b, 6 * mbs); /* x <- b */
692:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);

694:   /* solve U*x = y by back substitution */
695:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);

697:   VecRestoreArrayRead(bb, &b);
698:   VecRestoreArray(xx, &x);
699:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
700:   return 0;
701: }

703: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
704: {
705:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
706:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
707:   const MatScalar   *aa = a->a;
708:   PetscScalar       *x;
709:   const PetscScalar *b;

711:   VecGetArrayRead(bb, &b);
712:   VecGetArray(xx, &x);
713:   PetscArraycpy(x, b, 6 * mbs); /* x <- b */
714:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);
715:   VecRestoreArrayRead(bb, &b);
716:   VecRestoreArray(xx, &x);
717:   PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
718:   return 0;
719: }

721: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
722: {
723:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
724:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
725:   const MatScalar   *aa = a->a;
726:   PetscScalar       *x;
727:   const PetscScalar *b;

729:   VecGetArrayRead(bb, &b);
730:   VecGetArray(xx, &x);
731:   PetscArraycpy(x, b, 6 * mbs); /* x <- b */
732:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);
733:   VecRestoreArrayRead(bb, &b);
734:   VecRestoreArray(xx, &x);
735:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
736:   return 0;
737: }

739: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
740: {
741:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
742:   IS                 isrow = a->row;
743:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
744:   const PetscInt    *r, *vj;
745:   PetscInt           nz, k, idx;
746:   const MatScalar   *aa = a->a, *v, *diag;
747:   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
748:   const PetscScalar *b;

750:   VecGetArrayRead(bb, &b);
751:   VecGetArray(xx, &x);
752:   t = a->solve_work;
753:   ISGetIndices(isrow, &r);

755:   /* solve U^T * D * y = b by forward substitution */
756:   tp = t;
757:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
758:     idx   = 5 * r[k];
759:     tp[0] = b[idx];
760:     tp[1] = b[idx + 1];
761:     tp[2] = b[idx + 2];
762:     tp[3] = b[idx + 3];
763:     tp[4] = b[idx + 4];
764:     tp += 5;
765:   }

767:   for (k = 0; k < mbs; k++) {
768:     v  = aa + 25 * ai[k];
769:     vj = aj + ai[k];
770:     tp = t + k * 5;
771:     x0 = tp[0];
772:     x1 = tp[1];
773:     x2 = tp[2];
774:     x3 = tp[3];
775:     x4 = tp[4];
776:     nz = ai[k + 1] - ai[k];

778:     tp = t + (*vj) * 5;
779:     while (nz--) {
780:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
781:       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
782:       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
783:       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
784:       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
785:       vj++;
786:       tp = t + (*vj) * 5;
787:       v += 25;
788:     }

790:     /* xk = inv(Dk)*(Dk*xk) */
791:     diag  = aa + k * 25; /* ptr to inv(Dk) */
792:     tp    = t + k * 5;
793:     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
794:     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
795:     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
796:     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
797:     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
798:   }

800:   /* solve U*x = y by back substitution */
801:   for (k = mbs - 1; k >= 0; k--) {
802:     v  = aa + 25 * ai[k];
803:     vj = aj + ai[k];
804:     tp = t + k * 5;
805:     x0 = tp[0];
806:     x1 = tp[1];
807:     x2 = tp[2];
808:     x3 = tp[3];
809:     x4 = tp[4]; /* xk */
810:     nz = ai[k + 1] - ai[k];

812:     tp = t + (*vj) * 5;
813:     while (nz--) {
814:       /* xk += U(k,:)*x(:) */
815:       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
816:       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
817:       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
818:       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
819:       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
820:       vj++;
821:       tp = t + (*vj) * 5;
822:       v += 25;
823:     }
824:     tp         = t + k * 5;
825:     tp[0]      = x0;
826:     tp[1]      = x1;
827:     tp[2]      = x2;
828:     tp[3]      = x3;
829:     tp[4]      = x4;
830:     idx        = 5 * r[k];
831:     x[idx]     = x0;
832:     x[idx + 1] = x1;
833:     x[idx + 2] = x2;
834:     x[idx + 3] = x3;
835:     x[idx + 4] = x4;
836:   }

838:   ISRestoreIndices(isrow, &r);
839:   VecRestoreArrayRead(bb, &b);
840:   VecRestoreArray(xx, &x);
841:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
842:   return 0;
843: }

845: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
846: {
847:   const MatScalar *v, *diag;
848:   PetscScalar     *xp, x0, x1, x2, x3, x4;
849:   PetscInt         nz, k;
850:   const PetscInt  *vj;

852:   for (k = 0; k < mbs; k++) {
853:     v  = aa + 25 * ai[k];
854:     xp = x + k * 5;
855:     x0 = xp[0];
856:     x1 = xp[1];
857:     x2 = xp[2];
858:     x3 = xp[3];
859:     x4 = xp[4]; /* Dk*xk = k-th block of x */
860:     nz = ai[k + 1] - ai[k];
861:     vj = aj + ai[k];
862:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
863:     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864:     while (nz--) {
865:       xp = x + (*vj) * 5;
866:       /* x(:) += U(k,:)^T*(Dk*xk) */
867:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
868:       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
869:       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
870:       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
871:       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
872:       vj++;
873:       v += 25;
874:     }
875:     /* xk = inv(Dk)*(Dk*xk) */
876:     diag  = aa + k * 25; /* ptr to inv(Dk) */
877:     xp    = x + k * 5;
878:     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
879:     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
880:     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
881:     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
882:     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
883:   }
884:   return 0;
885: }

887: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
888: {
889:   const MatScalar *v;
890:   PetscScalar     *xp, x0, x1, x2, x3, x4;
891:   PetscInt         nz, k;
892:   const PetscInt  *vj;

894:   for (k = mbs - 1; k >= 0; k--) {
895:     v  = aa + 25 * ai[k];
896:     xp = x + k * 5;
897:     x0 = xp[0];
898:     x1 = xp[1];
899:     x2 = xp[2];
900:     x3 = xp[3];
901:     x4 = xp[4]; /* xk */
902:     nz = ai[k + 1] - ai[k];
903:     vj = aj + ai[k];
904:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
905:     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
906:     while (nz--) {
907:       xp = x + (*vj) * 5;
908:       /* xk += U(k,:)*x(:) */
909:       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
910:       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
911:       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
912:       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
913:       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
914:       vj++;
915:       v += 25;
916:     }
917:     xp    = x + k * 5;
918:     xp[0] = x0;
919:     xp[1] = x1;
920:     xp[2] = x2;
921:     xp[3] = x3;
922:     xp[4] = x4;
923:   }
924:   return 0;
925: }

927: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
928: {
929:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
930:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
931:   const MatScalar   *aa = a->a;
932:   PetscScalar       *x;
933:   const PetscScalar *b;

935:   VecGetArrayRead(bb, &b);
936:   VecGetArray(xx, &x);

938:   /* solve U^T * D * y = b by forward substitution */
939:   PetscArraycpy(x, b, 5 * mbs); /* x <- b */
940:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);

942:   /* solve U*x = y by back substitution */
943:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);

945:   VecRestoreArrayRead(bb, &b);
946:   VecRestoreArray(xx, &x);
947:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
948:   return 0;
949: }

951: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
952: {
953:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
954:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
955:   const MatScalar   *aa = a->a;
956:   PetscScalar       *x;
957:   const PetscScalar *b;

959:   VecGetArrayRead(bb, &b);
960:   VecGetArray(xx, &x);
961:   PetscArraycpy(x, b, 5 * mbs); /* x <- b */
962:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);
963:   VecRestoreArrayRead(bb, &b);
964:   VecRestoreArray(xx, &x);
965:   PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
966:   return 0;
967: }

969: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
970: {
971:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
972:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
973:   const MatScalar   *aa = a->a;
974:   PetscScalar       *x;
975:   const PetscScalar *b;

977:   VecGetArrayRead(bb, &b);
978:   VecGetArray(xx, &x);
979:   PetscArraycpy(x, b, 5 * mbs);
980:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);
981:   VecRestoreArrayRead(bb, &b);
982:   VecRestoreArray(xx, &x);
983:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
984:   return 0;
985: }

987: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
988: {
989:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
990:   IS                 isrow = a->row;
991:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
992:   const PetscInt    *r, *vj;
993:   PetscInt           nz, k, idx;
994:   const MatScalar   *aa = a->a, *v, *diag;
995:   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
996:   const PetscScalar *b;

998:   VecGetArrayRead(bb, &b);
999:   VecGetArray(xx, &x);
1000:   t = a->solve_work;
1001:   ISGetIndices(isrow, &r);

1003:   /* solve U^T * D * y = b by forward substitution */
1004:   tp = t;
1005:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1006:     idx   = 4 * r[k];
1007:     tp[0] = b[idx];
1008:     tp[1] = b[idx + 1];
1009:     tp[2] = b[idx + 2];
1010:     tp[3] = b[idx + 3];
1011:     tp += 4;
1012:   }

1014:   for (k = 0; k < mbs; k++) {
1015:     v  = aa + 16 * ai[k];
1016:     vj = aj + ai[k];
1017:     tp = t + k * 4;
1018:     x0 = tp[0];
1019:     x1 = tp[1];
1020:     x2 = tp[2];
1021:     x3 = tp[3];
1022:     nz = ai[k + 1] - ai[k];

1024:     tp = t + (*vj) * 4;
1025:     while (nz--) {
1026:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1027:       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1028:       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1029:       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1030:       vj++;
1031:       tp = t + (*vj) * 4;
1032:       v += 16;
1033:     }

1035:     /* xk = inv(Dk)*(Dk*xk) */
1036:     diag  = aa + k * 16; /* ptr to inv(Dk) */
1037:     tp    = t + k * 4;
1038:     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1039:     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1040:     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1041:     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1042:   }

1044:   /* solve U*x = y by back substitution */
1045:   for (k = mbs - 1; k >= 0; k--) {
1046:     v  = aa + 16 * ai[k];
1047:     vj = aj + ai[k];
1048:     tp = t + k * 4;
1049:     x0 = tp[0];
1050:     x1 = tp[1];
1051:     x2 = tp[2];
1052:     x3 = tp[3]; /* xk */
1053:     nz = ai[k + 1] - ai[k];

1055:     tp = t + (*vj) * 4;
1056:     while (nz--) {
1057:       /* xk += U(k,:)*x(:) */
1058:       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1059:       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1060:       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1061:       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1062:       vj++;
1063:       tp = t + (*vj) * 4;
1064:       v += 16;
1065:     }
1066:     tp         = t + k * 4;
1067:     tp[0]      = x0;
1068:     tp[1]      = x1;
1069:     tp[2]      = x2;
1070:     tp[3]      = x3;
1071:     idx        = 4 * r[k];
1072:     x[idx]     = x0;
1073:     x[idx + 1] = x1;
1074:     x[idx + 2] = x2;
1075:     x[idx + 3] = x3;
1076:   }

1078:   ISRestoreIndices(isrow, &r);
1079:   VecRestoreArrayRead(bb, &b);
1080:   VecRestoreArray(xx, &x);
1081:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1082:   return 0;
1083: }

1085: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1086: {
1087:   const MatScalar *v, *diag;
1088:   PetscScalar     *xp, x0, x1, x2, x3;
1089:   PetscInt         nz, k;
1090:   const PetscInt  *vj;

1092:   for (k = 0; k < mbs; k++) {
1093:     v  = aa + 16 * ai[k];
1094:     xp = x + k * 4;
1095:     x0 = xp[0];
1096:     x1 = xp[1];
1097:     x2 = xp[2];
1098:     x3 = xp[3]; /* Dk*xk = k-th block of x */
1099:     nz = ai[k + 1] - ai[k];
1100:     vj = aj + ai[k];
1101:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1102:     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103:     while (nz--) {
1104:       xp = x + (*vj) * 4;
1105:       /* x(:) += U(k,:)^T*(Dk*xk) */
1106:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1107:       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1108:       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1109:       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1110:       vj++;
1111:       v += 16;
1112:     }
1113:     /* xk = inv(Dk)*(Dk*xk) */
1114:     diag  = aa + k * 16; /* ptr to inv(Dk) */
1115:     xp    = x + k * 4;
1116:     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1117:     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1118:     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1119:     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1120:   }
1121:   return 0;
1122: }

1124: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1125: {
1126:   const MatScalar *v;
1127:   PetscScalar     *xp, x0, x1, x2, x3;
1128:   PetscInt         nz, k;
1129:   const PetscInt  *vj;

1131:   for (k = mbs - 1; k >= 0; k--) {
1132:     v  = aa + 16 * ai[k];
1133:     xp = x + k * 4;
1134:     x0 = xp[0];
1135:     x1 = xp[1];
1136:     x2 = xp[2];
1137:     x3 = xp[3]; /* xk */
1138:     nz = ai[k + 1] - ai[k];
1139:     vj = aj + ai[k];
1140:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1141:     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1142:     while (nz--) {
1143:       xp = x + (*vj) * 4;
1144:       /* xk += U(k,:)*x(:) */
1145:       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1146:       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1147:       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1148:       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1149:       vj++;
1150:       v += 16;
1151:     }
1152:     xp    = x + k * 4;
1153:     xp[0] = x0;
1154:     xp[1] = x1;
1155:     xp[2] = x2;
1156:     xp[3] = x3;
1157:   }
1158:   return 0;
1159: }

1161: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1162: {
1163:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1164:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1165:   const MatScalar   *aa = a->a;
1166:   PetscScalar       *x;
1167:   const PetscScalar *b;

1169:   VecGetArrayRead(bb, &b);
1170:   VecGetArray(xx, &x);

1172:   /* solve U^T * D * y = b by forward substitution */
1173:   PetscArraycpy(x, b, 4 * mbs); /* x <- b */
1174:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);

1176:   /* solve U*x = y by back substitution */
1177:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1178:   VecRestoreArrayRead(bb, &b);
1179:   VecRestoreArray(xx, &x);
1180:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1181:   return 0;
1182: }

1184: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1185: {
1186:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1187:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1188:   const MatScalar   *aa = a->a;
1189:   PetscScalar       *x;
1190:   const PetscScalar *b;

1192:   VecGetArrayRead(bb, &b);
1193:   VecGetArray(xx, &x);
1194:   PetscArraycpy(x, b, 4 * mbs); /* x <- b */
1195:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1196:   VecRestoreArrayRead(bb, &b);
1197:   VecRestoreArray(xx, &x);
1198:   PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
1199:   return 0;
1200: }

1202: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1203: {
1204:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1205:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1206:   const MatScalar   *aa = a->a;
1207:   PetscScalar       *x;
1208:   const PetscScalar *b;

1210:   VecGetArrayRead(bb, &b);
1211:   VecGetArray(xx, &x);
1212:   PetscArraycpy(x, b, 4 * mbs);
1213:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1214:   VecRestoreArrayRead(bb, &b);
1215:   VecRestoreArray(xx, &x);
1216:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
1217:   return 0;
1218: }

1220: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1221: {
1222:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1223:   IS                 isrow = a->row;
1224:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1225:   const PetscInt    *r;
1226:   PetscInt           nz, k, idx;
1227:   const PetscInt    *vj;
1228:   const MatScalar   *aa = a->a, *v, *diag;
1229:   PetscScalar       *x, x0, x1, x2, *t, *tp;
1230:   const PetscScalar *b;

1232:   VecGetArrayRead(bb, &b);
1233:   VecGetArray(xx, &x);
1234:   t = a->solve_work;
1235:   ISGetIndices(isrow, &r);

1237:   /* solve U^T * D * y = b by forward substitution */
1238:   tp = t;
1239:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1240:     idx   = 3 * r[k];
1241:     tp[0] = b[idx];
1242:     tp[1] = b[idx + 1];
1243:     tp[2] = b[idx + 2];
1244:     tp += 3;
1245:   }

1247:   for (k = 0; k < mbs; k++) {
1248:     v  = aa + 9 * ai[k];
1249:     vj = aj + ai[k];
1250:     tp = t + k * 3;
1251:     x0 = tp[0];
1252:     x1 = tp[1];
1253:     x2 = tp[2];
1254:     nz = ai[k + 1] - ai[k];

1256:     tp = t + (*vj) * 3;
1257:     while (nz--) {
1258:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1259:       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1260:       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1261:       vj++;
1262:       tp = t + (*vj) * 3;
1263:       v += 9;
1264:     }

1266:     /* xk = inv(Dk)*(Dk*xk) */
1267:     diag  = aa + k * 9; /* ptr to inv(Dk) */
1268:     tp    = t + k * 3;
1269:     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1270:     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1271:     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1272:   }

1274:   /* solve U*x = y by back substitution */
1275:   for (k = mbs - 1; k >= 0; k--) {
1276:     v  = aa + 9 * ai[k];
1277:     vj = aj + ai[k];
1278:     tp = t + k * 3;
1279:     x0 = tp[0];
1280:     x1 = tp[1];
1281:     x2 = tp[2]; /* xk */
1282:     nz = ai[k + 1] - ai[k];

1284:     tp = t + (*vj) * 3;
1285:     while (nz--) {
1286:       /* xk += U(k,:)*x(:) */
1287:       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1288:       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1289:       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1290:       vj++;
1291:       tp = t + (*vj) * 3;
1292:       v += 9;
1293:     }
1294:     tp         = t + k * 3;
1295:     tp[0]      = x0;
1296:     tp[1]      = x1;
1297:     tp[2]      = x2;
1298:     idx        = 3 * r[k];
1299:     x[idx]     = x0;
1300:     x[idx + 1] = x1;
1301:     x[idx + 2] = x2;
1302:   }

1304:   ISRestoreIndices(isrow, &r);
1305:   VecRestoreArrayRead(bb, &b);
1306:   VecRestoreArray(xx, &x);
1307:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1308:   return 0;
1309: }

1311: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1312: {
1313:   const MatScalar *v, *diag;
1314:   PetscScalar     *xp, x0, x1, x2;
1315:   PetscInt         nz, k;
1316:   const PetscInt  *vj;

1318:   for (k = 0; k < mbs; k++) {
1319:     v  = aa + 9 * ai[k];
1320:     xp = x + k * 3;
1321:     x0 = xp[0];
1322:     x1 = xp[1];
1323:     x2 = xp[2]; /* Dk*xk = k-th block of x */
1324:     nz = ai[k + 1] - ai[k];
1325:     vj = aj + ai[k];
1326:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1327:     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1328:     while (nz--) {
1329:       xp = x + (*vj) * 3;
1330:       /* x(:) += U(k,:)^T*(Dk*xk) */
1331:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1332:       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1333:       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1334:       vj++;
1335:       v += 9;
1336:     }
1337:     /* xk = inv(Dk)*(Dk*xk) */
1338:     diag  = aa + k * 9; /* ptr to inv(Dk) */
1339:     xp    = x + k * 3;
1340:     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1341:     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1342:     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1343:   }
1344:   return 0;
1345: }

1347: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1348: {
1349:   const MatScalar *v;
1350:   PetscScalar     *xp, x0, x1, x2;
1351:   PetscInt         nz, k;
1352:   const PetscInt  *vj;

1354:   for (k = mbs - 1; k >= 0; k--) {
1355:     v  = aa + 9 * ai[k];
1356:     xp = x + k * 3;
1357:     x0 = xp[0];
1358:     x1 = xp[1];
1359:     x2 = xp[2]; /* xk */
1360:     nz = ai[k + 1] - ai[k];
1361:     vj = aj + ai[k];
1362:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1363:     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1364:     while (nz--) {
1365:       xp = x + (*vj) * 3;
1366:       /* xk += U(k,:)*x(:) */
1367:       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1368:       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1369:       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1370:       vj++;
1371:       v += 9;
1372:     }
1373:     xp    = x + k * 3;
1374:     xp[0] = x0;
1375:     xp[1] = x1;
1376:     xp[2] = x2;
1377:   }
1378:   return 0;
1379: }

1381: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1382: {
1383:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1384:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1385:   const MatScalar   *aa = a->a;
1386:   PetscScalar       *x;
1387:   const PetscScalar *b;

1389:   VecGetArrayRead(bb, &b);
1390:   VecGetArray(xx, &x);

1392:   /* solve U^T * D * y = b by forward substitution */
1393:   PetscArraycpy(x, b, 3 * mbs);
1394:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);

1396:   /* solve U*x = y by back substitution */
1397:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);

1399:   VecRestoreArrayRead(bb, &b);
1400:   VecRestoreArray(xx, &x);
1401:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1402:   return 0;
1403: }

1405: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1406: {
1407:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1408:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1409:   const MatScalar   *aa = a->a;
1410:   PetscScalar       *x;
1411:   const PetscScalar *b;

1413:   VecGetArrayRead(bb, &b);
1414:   VecGetArray(xx, &x);
1415:   PetscArraycpy(x, b, 3 * mbs);
1416:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);
1417:   VecRestoreArrayRead(bb, &b);
1418:   VecRestoreArray(xx, &x);
1419:   PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
1420:   return 0;
1421: }

1423: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1424: {
1425:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1426:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1427:   const MatScalar   *aa = a->a;
1428:   PetscScalar       *x;
1429:   const PetscScalar *b;

1431:   VecGetArrayRead(bb, &b);
1432:   VecGetArray(xx, &x);
1433:   PetscArraycpy(x, b, 3 * mbs);
1434:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);
1435:   VecRestoreArrayRead(bb, &b);
1436:   VecRestoreArray(xx, &x);
1437:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
1438:   return 0;
1439: }

1441: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1442: {
1443:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1444:   IS                 isrow = a->row;
1445:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1446:   const PetscInt    *r, *vj;
1447:   PetscInt           nz, k, k2, idx;
1448:   const MatScalar   *aa = a->a, *v, *diag;
1449:   PetscScalar       *x, x0, x1, *t;
1450:   const PetscScalar *b;

1452:   VecGetArrayRead(bb, &b);
1453:   VecGetArray(xx, &x);
1454:   t = a->solve_work;
1455:   ISGetIndices(isrow, &r);

1457:   /* solve U^T * D * y = perm(b) by forward substitution */
1458:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1459:     idx          = 2 * r[k];
1460:     t[k * 2]     = b[idx];
1461:     t[k * 2 + 1] = b[idx + 1];
1462:   }
1463:   for (k = 0; k < mbs; k++) {
1464:     v  = aa + 4 * ai[k];
1465:     vj = aj + ai[k];
1466:     k2 = k * 2;
1467:     x0 = t[k2];
1468:     x1 = t[k2 + 1];
1469:     nz = ai[k + 1] - ai[k];
1470:     while (nz--) {
1471:       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1472:       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1473:       vj++;
1474:       v += 4;
1475:     }
1476:     diag      = aa + k * 4; /* ptr to inv(Dk) */
1477:     t[k2]     = diag[0] * x0 + diag[2] * x1;
1478:     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1479:   }

1481:   /* solve U*x = y by back substitution */
1482:   for (k = mbs - 1; k >= 0; k--) {
1483:     v  = aa + 4 * ai[k];
1484:     vj = aj + ai[k];
1485:     k2 = k * 2;
1486:     x0 = t[k2];
1487:     x1 = t[k2 + 1];
1488:     nz = ai[k + 1] - ai[k];
1489:     while (nz--) {
1490:       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1491:       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1492:       vj++;
1493:       v += 4;
1494:     }
1495:     t[k2]      = x0;
1496:     t[k2 + 1]  = x1;
1497:     idx        = 2 * r[k];
1498:     x[idx]     = x0;
1499:     x[idx + 1] = x1;
1500:   }

1502:   ISRestoreIndices(isrow, &r);
1503:   VecRestoreArrayRead(bb, &b);
1504:   VecRestoreArray(xx, &x);
1505:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1506:   return 0;
1507: }

1509: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1510: {
1511:   const MatScalar *v, *diag;
1512:   PetscScalar      x0, x1;
1513:   PetscInt         nz, k, k2;
1514:   const PetscInt  *vj;

1516:   for (k = 0; k < mbs; k++) {
1517:     v  = aa + 4 * ai[k];
1518:     vj = aj + ai[k];
1519:     k2 = k * 2;
1520:     x0 = x[k2];
1521:     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1522:     nz = ai[k + 1] - ai[k];
1523:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1524:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1525:     while (nz--) {
1526:       /* x(:) += U(k,:)^T*(Dk*xk) */
1527:       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1528:       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1529:       vj++;
1530:       v += 4;
1531:     }
1532:     /* xk = inv(Dk)*(Dk*xk) */
1533:     diag      = aa + k * 4; /* ptr to inv(Dk) */
1534:     x[k2]     = diag[0] * x0 + diag[2] * x1;
1535:     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1536:   }
1537:   return 0;
1538: }

1540: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1541: {
1542:   const MatScalar *v;
1543:   PetscScalar      x0, x1;
1544:   PetscInt         nz, k, k2;
1545:   const PetscInt  *vj;

1547:   for (k = mbs - 1; k >= 0; k--) {
1548:     v  = aa + 4 * ai[k];
1549:     vj = aj + ai[k];
1550:     k2 = k * 2;
1551:     x0 = x[k2];
1552:     x1 = x[k2 + 1]; /* xk */
1553:     nz = ai[k + 1] - ai[k];
1554:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1555:     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1556:     while (nz--) {
1557:       /* xk += U(k,:)*x(:) */
1558:       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1559:       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1560:       vj++;
1561:       v += 4;
1562:     }
1563:     x[k2]     = x0;
1564:     x[k2 + 1] = x1;
1565:   }
1566:   return 0;
1567: }

1569: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1570: {
1571:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1572:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1573:   const MatScalar   *aa = a->a;
1574:   PetscScalar       *x;
1575:   const PetscScalar *b;

1577:   VecGetArrayRead(bb, &b);
1578:   VecGetArray(xx, &x);

1580:   /* solve U^T * D * y = b by forward substitution */
1581:   PetscArraycpy(x, b, 2 * mbs);
1582:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);

1584:   /* solve U*x = y by back substitution */
1585:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);

1587:   VecRestoreArrayRead(bb, &b);
1588:   VecRestoreArray(xx, &x);
1589:   PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1590:   return 0;
1591: }

1593: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1594: {
1595:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1596:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1597:   const MatScalar   *aa = a->a;
1598:   PetscScalar       *x;
1599:   const PetscScalar *b;

1601:   VecGetArrayRead(bb, &b);
1602:   VecGetArray(xx, &x);
1603:   PetscArraycpy(x, b, 2 * mbs);
1604:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);
1605:   VecRestoreArrayRead(bb, &b);
1606:   VecRestoreArray(xx, &x);
1607:   PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
1608:   return 0;
1609: }

1611: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1612: {
1613:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1614:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1615:   const MatScalar   *aa = a->a;
1616:   PetscScalar       *x;
1617:   const PetscScalar *b;

1619:   VecGetArrayRead(bb, &b);
1620:   VecGetArray(xx, &x);
1621:   PetscArraycpy(x, b, 2 * mbs);
1622:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);
1623:   VecRestoreArrayRead(bb, &b);
1624:   VecRestoreArray(xx, &x);
1625:   PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
1626:   return 0;
1627: }

1629: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1630: {
1631:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1632:   IS                 isrow = a->row;
1633:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1634:   const MatScalar   *aa = a->a, *v;
1635:   const PetscScalar *b;
1636:   PetscScalar       *x, xk, *t;
1637:   PetscInt           nz, k, j;

1639:   VecGetArrayRead(bb, &b);
1640:   VecGetArray(xx, &x);
1641:   t = a->solve_work;
1642:   ISGetIndices(isrow, &rp);

1644:   /* solve U^T*D*y = perm(b) by forward substitution */
1645:   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1646:   for (k = 0; k < mbs; k++) {
1647:     v  = aa + ai[k];
1648:     vj = aj + ai[k];
1649:     xk = t[k];
1650:     nz = ai[k + 1] - ai[k] - 1;
1651:     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1652:     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1653:   }

1655:   /* solve U*perm(x) = y by back substitution */
1656:   for (k = mbs - 1; k >= 0; k--) {
1657:     v  = aa + adiag[k] - 1;
1658:     vj = aj + adiag[k] - 1;
1659:     nz = ai[k + 1] - ai[k] - 1;
1660:     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1661:     x[rp[k]] = t[k];
1662:   }

1664:   ISRestoreIndices(isrow, &rp);
1665:   VecRestoreArrayRead(bb, &b);
1666:   VecRestoreArray(xx, &x);
1667:   PetscLogFlops(4.0 * a->nz - 3.0 * mbs);
1668:   return 0;
1669: }

1671: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1672: {
1673:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1674:   IS                 isrow = a->row;
1675:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1676:   const MatScalar   *aa = a->a, *v;
1677:   PetscScalar       *x, xk, *t;
1678:   const PetscScalar *b;
1679:   PetscInt           nz, k;

1681:   VecGetArrayRead(bb, &b);
1682:   VecGetArray(xx, &x);
1683:   t = a->solve_work;
1684:   ISGetIndices(isrow, &rp);

1686:   /* solve U^T*D*y = perm(b) by forward substitution */
1687:   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1688:   for (k = 0; k < mbs; k++) {
1689:     v  = aa + ai[k] + 1;
1690:     vj = aj + ai[k] + 1;
1691:     xk = t[k];
1692:     nz = ai[k + 1] - ai[k] - 1;
1693:     while (nz--) t[*vj++] += (*v++) * xk;
1694:     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1695:   }

1697:   /* solve U*perm(x) = y by back substitution */
1698:   for (k = mbs - 1; k >= 0; k--) {
1699:     v  = aa + ai[k] + 1;
1700:     vj = aj + ai[k] + 1;
1701:     nz = ai[k + 1] - ai[k] - 1;
1702:     while (nz--) t[k] += (*v++) * t[*vj++];
1703:     x[rp[k]] = t[k];
1704:   }

1706:   ISRestoreIndices(isrow, &rp);
1707:   VecRestoreArrayRead(bb, &b);
1708:   VecRestoreArray(xx, &x);
1709:   PetscLogFlops(4.0 * a->nz - 3 * mbs);
1710:   return 0;
1711: }

1713: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1714: {
1715:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1716:   IS                 isrow = a->row;
1717:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1718:   const MatScalar   *aa = a->a, *v;
1719:   PetscReal          diagk;
1720:   PetscScalar       *x, xk;
1721:   const PetscScalar *b;
1722:   PetscInt           nz, k;

1724:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1725:   VecGetArrayRead(bb, &b);
1726:   VecGetArray(xx, &x);
1727:   ISGetIndices(isrow, &rp);

1729:   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1730:   for (k = 0; k < mbs; k++) {
1731:     v  = aa + ai[k];
1732:     vj = aj + ai[k];
1733:     xk = x[k];
1734:     nz = ai[k + 1] - ai[k] - 1;
1735:     while (nz--) x[*vj++] += (*v++) * xk;

1737:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1739:     x[k] = xk * PetscSqrtReal(diagk);
1740:   }
1741:   ISRestoreIndices(isrow, &rp);
1742:   VecRestoreArrayRead(bb, &b);
1743:   VecRestoreArray(xx, &x);
1744:   PetscLogFlops(2.0 * a->nz - mbs);
1745:   return 0;
1746: }

1748: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1749: {
1750:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1751:   IS                 isrow = a->row;
1752:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1753:   const MatScalar   *aa = a->a, *v;
1754:   PetscReal          diagk;
1755:   PetscScalar       *x, xk;
1756:   const PetscScalar *b;
1757:   PetscInt           nz, k;

1759:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1760:   VecGetArrayRead(bb, &b);
1761:   VecGetArray(xx, &x);
1762:   ISGetIndices(isrow, &rp);

1764:   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1765:   for (k = 0; k < mbs; k++) {
1766:     v  = aa + ai[k] + 1;
1767:     vj = aj + ai[k] + 1;
1768:     xk = x[k];
1769:     nz = ai[k + 1] - ai[k] - 1;
1770:     while (nz--) x[*vj++] += (*v++) * xk;

1772:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1774:     x[k] = xk * PetscSqrtReal(diagk);
1775:   }
1776:   ISRestoreIndices(isrow, &rp);
1777:   VecRestoreArrayRead(bb, &b);
1778:   VecRestoreArray(xx, &x);
1779:   PetscLogFlops(2.0 * a->nz - mbs);
1780:   return 0;
1781: }

1783: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1784: {
1785:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1786:   IS                 isrow = a->row;
1787:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1788:   const MatScalar   *aa = a->a, *v;
1789:   PetscReal          diagk;
1790:   PetscScalar       *x, *t;
1791:   const PetscScalar *b;
1792:   PetscInt           nz, k;

1794:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1795:   VecGetArrayRead(bb, &b);
1796:   VecGetArray(xx, &x);
1797:   t = a->solve_work;
1798:   ISGetIndices(isrow, &rp);

1800:   for (k = mbs - 1; k >= 0; k--) {
1801:     v     = aa + ai[k];
1802:     vj    = aj + ai[k];
1803:     diagk = PetscRealPart(aa[adiag[k]]);
1805:     t[k] = b[k] * PetscSqrtReal(diagk);
1806:     nz   = ai[k + 1] - ai[k] - 1;
1807:     while (nz--) t[k] += (*v++) * t[*vj++];
1808:     x[rp[k]] = t[k];
1809:   }
1810:   ISRestoreIndices(isrow, &rp);
1811:   VecRestoreArrayRead(bb, &b);
1812:   VecRestoreArray(xx, &x);
1813:   PetscLogFlops(2.0 * a->nz - mbs);
1814:   return 0;
1815: }

1817: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1818: {
1819:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1820:   IS                 isrow = a->row;
1821:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1822:   const MatScalar   *aa = a->a, *v;
1823:   PetscReal          diagk;
1824:   PetscScalar       *x, *t;
1825:   const PetscScalar *b;
1826:   PetscInt           nz, k;

1828:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1829:   VecGetArrayRead(bb, &b);
1830:   VecGetArray(xx, &x);
1831:   t = a->solve_work;
1832:   ISGetIndices(isrow, &rp);

1834:   for (k = mbs - 1; k >= 0; k--) {
1835:     v     = aa + ai[k] + 1;
1836:     vj    = aj + ai[k] + 1;
1837:     diagk = PetscRealPart(aa[ai[k]]);
1839:     t[k] = b[k] * PetscSqrtReal(diagk);
1840:     nz   = ai[k + 1] - ai[k] - 1;
1841:     while (nz--) t[k] += (*v++) * t[*vj++];
1842:     x[rp[k]] = t[k];
1843:   }
1844:   ISRestoreIndices(isrow, &rp);
1845:   VecRestoreArrayRead(bb, &b);
1846:   VecRestoreArray(xx, &x);
1847:   PetscLogFlops(2.0 * a->nz - mbs);
1848:   return 0;
1849: }

1851: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1852: {
1853:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1855:   if (A->rmap->bs == 1) {
1856:     MatSolve_SeqSBAIJ_1(A, bb->v, xx->v);
1857:   } else {
1858:     IS                 isrow = a->row;
1859:     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1860:     const MatScalar   *aa = a->a, *v;
1861:     PetscScalar       *x, *t;
1862:     const PetscScalar *b;
1863:     PetscInt           nz, k, n, i, j;

1865:     if (bb->n > a->solves_work_n) {
1866:       PetscFree(a->solves_work);
1867:       PetscMalloc1(bb->n * A->rmap->N, &a->solves_work);
1868:       a->solves_work_n = bb->n;
1869:     }
1870:     n = bb->n;
1871:     VecGetArrayRead(bb->v, &b);
1872:     VecGetArray(xx->v, &x);
1873:     t = a->solves_work;

1875:     ISGetIndices(isrow, &rp);

1877:     /* solve U^T*D*y = perm(b) by forward substitution */
1878:     for (k = 0; k < mbs; k++) {
1879:       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1880:     }
1881:     for (k = 0; k < mbs; k++) {
1882:       v  = aa + ai[k];
1883:       vj = aj + ai[k];
1884:       nz = ai[k + 1] - ai[k] - 1;
1885:       for (j = 0; j < nz; j++) {
1886:         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1887:         v++;
1888:         vj++;
1889:       }
1890:       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1891:     }

1893:     /* solve U*perm(x) = y by back substitution */
1894:     for (k = mbs - 1; k >= 0; k--) {
1895:       v  = aa + ai[k] - 1;
1896:       vj = aj + ai[k] - 1;
1897:       nz = ai[k + 1] - ai[k] - 1;
1898:       for (j = 0; j < nz; j++) {
1899:         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1900:         v++;
1901:         vj++;
1902:       }
1903:       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1904:     }

1906:     ISRestoreIndices(isrow, &rp);
1907:     VecRestoreArrayRead(bb->v, &b);
1908:     VecRestoreArray(xx->v, &x);
1909:     PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs));
1910:   }
1911:   return 0;
1912: }

1914: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1915: {
1916:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1918:   if (A->rmap->bs == 1) {
1919:     MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v);
1920:   } else {
1921:     IS                 isrow = a->row;
1922:     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1923:     const MatScalar   *aa = a->a, *v;
1924:     PetscScalar       *x, *t;
1925:     const PetscScalar *b;
1926:     PetscInt           nz, k, n, i;

1928:     if (bb->n > a->solves_work_n) {
1929:       PetscFree(a->solves_work);
1930:       PetscMalloc1(bb->n * A->rmap->N, &a->solves_work);
1931:       a->solves_work_n = bb->n;
1932:     }
1933:     n = bb->n;
1934:     VecGetArrayRead(bb->v, &b);
1935:     VecGetArray(xx->v, &x);
1936:     t = a->solves_work;

1938:     ISGetIndices(isrow, &rp);

1940:     /* solve U^T*D*y = perm(b) by forward substitution */
1941:     for (k = 0; k < mbs; k++) {
1942:       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1943:     }
1944:     for (k = 0; k < mbs; k++) {
1945:       v  = aa + ai[k];
1946:       vj = aj + ai[k];
1947:       nz = ai[k + 1] - ai[k];
1948:       while (nz--) {
1949:         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1950:         v++;
1951:         vj++;
1952:       }
1953:       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1954:     }

1956:     /* solve U*perm(x) = y by back substitution */
1957:     for (k = mbs - 1; k >= 0; k--) {
1958:       v  = aa + ai[k];
1959:       vj = aj + ai[k];
1960:       nz = ai[k + 1] - ai[k];
1961:       while (nz--) {
1962:         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1963:         v++;
1964:         vj++;
1965:       }
1966:       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1967:     }

1969:     ISRestoreIndices(isrow, &rp);
1970:     VecRestoreArrayRead(bb->v, &b);
1971:     VecRestoreArray(xx->v, &x);
1972:     PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs));
1973:   }
1974:   return 0;
1975: }

1977: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
1978: {
1979:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1980:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
1981:   const MatScalar   *aa = a->a, *v;
1982:   const PetscScalar *b;
1983:   PetscScalar       *x, xi;
1984:   PetscInt           nz, i, j;

1986:   VecGetArrayRead(bb, &b);
1987:   VecGetArray(xx, &x);
1988:   /* solve U^T*D*y = b by forward substitution */
1989:   PetscArraycpy(x, b, mbs);
1990:   for (i = 0; i < mbs; i++) {
1991:     v  = aa + ai[i];
1992:     vj = aj + ai[i];
1993:     xi = x[i];
1994:     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
1995:     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
1996:     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1997:   }
1998:   /* solve U*x = y by backward substitution */
1999:   for (i = mbs - 2; i >= 0; i--) {
2000:     xi = x[i];
2001:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2002:     vj = aj + adiag[i] - 1;
2003:     nz = ai[i + 1] - ai[i] - 1;
2004:     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2005:     x[i] = xi;
2006:   }
2007:   VecRestoreArrayRead(bb, &b);
2008:   VecRestoreArray(xx, &x);
2009:   PetscLogFlops(4.0 * a->nz - 3 * mbs);
2010:   return 0;
2011: }

2013: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2014: {
2015:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2016:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2017:   const MatScalar   *aa = a->a, *v;
2018:   const PetscScalar *b;
2019:   PetscScalar       *x, xi;
2020:   PetscInt           nz, i, j, neq, ldb, ldx;
2021:   PetscBool          isdense;

2023:   if (!mbs) return 0;
2024:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense);
2026:   if (X != B) {
2027:     PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense);
2029:   }
2030:   MatDenseGetArrayRead(B, &b);
2031:   MatDenseGetLDA(B, &ldb);
2032:   MatDenseGetArray(X, &x);
2033:   MatDenseGetLDA(X, &ldx);
2034:   for (neq = 0; neq < B->cmap->n; neq++) {
2035:     /* solve U^T*D*y = b by forward substitution */
2036:     PetscArraycpy(x, b, mbs);
2037:     for (i = 0; i < mbs; i++) {
2038:       v  = aa + ai[i];
2039:       vj = aj + ai[i];
2040:       xi = x[i];
2041:       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2042:       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2043:       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2044:     }
2045:     /* solve U*x = y by backward substitution */
2046:     for (i = mbs - 2; i >= 0; i--) {
2047:       xi = x[i];
2048:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2049:       vj = aj + adiag[i] - 1;
2050:       nz = ai[i + 1] - ai[i] - 1;
2051:       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2052:       x[i] = xi;
2053:     }
2054:     b += ldb;
2055:     x += ldx;
2056:   }
2057:   MatDenseRestoreArrayRead(B, &b);
2058:   MatDenseRestoreArray(X, &x);
2059:   PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs));
2060:   return 0;
2061: }

2063: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2064: {
2065:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2066:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2067:   const MatScalar   *aa = a->a, *v;
2068:   PetscScalar       *x, xk;
2069:   const PetscScalar *b;
2070:   PetscInt           nz, k;

2072:   VecGetArrayRead(bb, &b);
2073:   VecGetArray(xx, &x);

2075:   /* solve U^T*D*y = b by forward substitution */
2076:   PetscArraycpy(x, b, mbs);
2077:   for (k = 0; k < mbs; k++) {
2078:     v  = aa + ai[k] + 1;
2079:     vj = aj + ai[k] + 1;
2080:     xk = x[k];
2081:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2082:     while (nz--) x[*vj++] += (*v++) * xk;
2083:     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2084:   }

2086:   /* solve U*x = y by back substitution */
2087:   for (k = mbs - 2; k >= 0; k--) {
2088:     v  = aa + ai[k] + 1;
2089:     vj = aj + ai[k] + 1;
2090:     xk = x[k];
2091:     nz = ai[k + 1] - ai[k] - 1;
2092:     while (nz--) xk += (*v++) * x[*vj++];
2093:     x[k] = xk;
2094:   }

2096:   VecRestoreArrayRead(bb, &b);
2097:   VecRestoreArray(xx, &x);
2098:   PetscLogFlops(4.0 * a->nz - 3 * mbs);
2099:   return 0;
2100: }

2102: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2103: {
2104:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2105:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2106:   const MatScalar   *aa = a->a, *v;
2107:   PetscReal          diagk;
2108:   PetscScalar       *x;
2109:   const PetscScalar *b;
2110:   PetscInt           nz, k;

2112:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2113:   VecGetArrayRead(bb, &b);
2114:   VecGetArray(xx, &x);
2115:   PetscArraycpy(x, b, mbs);
2116:   for (k = 0; k < mbs; k++) {
2117:     v  = aa + ai[k];
2118:     vj = aj + ai[k];
2119:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2120:     while (nz--) x[*vj++] += (*v++) * x[k];
2121:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2123:     x[k] *= PetscSqrtReal(diagk);
2124:   }
2125:   VecRestoreArrayRead(bb, &b);
2126:   VecRestoreArray(xx, &x);
2127:   PetscLogFlops(2.0 * a->nz - mbs);
2128:   return 0;
2129: }

2131: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2132: {
2133:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2134:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2135:   const MatScalar   *aa = a->a, *v;
2136:   PetscReal          diagk;
2137:   PetscScalar       *x;
2138:   const PetscScalar *b;
2139:   PetscInt           nz, k;

2141:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2142:   VecGetArrayRead(bb, &b);
2143:   VecGetArray(xx, &x);
2144:   PetscArraycpy(x, b, mbs);
2145:   for (k = 0; k < mbs; k++) {
2146:     v  = aa + ai[k] + 1;
2147:     vj = aj + ai[k] + 1;
2148:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2149:     while (nz--) x[*vj++] += (*v++) * x[k];
2150:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2152:     x[k] *= PetscSqrtReal(diagk);
2153:   }
2154:   VecRestoreArrayRead(bb, &b);
2155:   VecRestoreArray(xx, &x);
2156:   PetscLogFlops(2.0 * a->nz - mbs);
2157:   return 0;
2158: }

2160: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2161: {
2162:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2163:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2164:   const MatScalar   *aa = a->a, *v;
2165:   PetscReal          diagk;
2166:   PetscScalar       *x;
2167:   const PetscScalar *b;
2168:   PetscInt           nz, k;

2170:   /* solve D^(1/2)*U*x = b by back substitution */
2171:   VecGetArrayRead(bb, &b);
2172:   VecGetArray(xx, &x);

2174:   for (k = mbs - 1; k >= 0; k--) {
2175:     v     = aa + ai[k];
2176:     vj    = aj + ai[k];
2177:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2179:     x[k] = PetscSqrtReal(diagk) * b[k];
2180:     nz   = ai[k + 1] - ai[k] - 1;
2181:     while (nz--) x[k] += (*v++) * x[*vj++];
2182:   }
2183:   VecRestoreArrayRead(bb, &b);
2184:   VecRestoreArray(xx, &x);
2185:   PetscLogFlops(2.0 * a->nz - mbs);
2186:   return 0;
2187: }

2189: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2190: {
2191:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2192:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2193:   const MatScalar   *aa = a->a, *v;
2194:   PetscReal          diagk;
2195:   PetscScalar       *x;
2196:   const PetscScalar *b;
2197:   PetscInt           nz, k;

2199:   /* solve D^(1/2)*U*x = b by back substitution */
2200:   VecGetArrayRead(bb, &b);
2201:   VecGetArray(xx, &x);

2203:   for (k = mbs - 1; k >= 0; k--) {
2204:     v     = aa + ai[k] + 1;
2205:     vj    = aj + ai[k] + 1;
2206:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2208:     x[k] = PetscSqrtReal(diagk) * b[k];
2209:     nz   = ai[k + 1] - ai[k] - 1;
2210:     while (nz--) x[k] += (*v++) * x[*vj++];
2211:   }
2212:   VecRestoreArrayRead(bb, &b);
2213:   VecRestoreArray(xx, &x);
2214:   PetscLogFlops(2.0 * a->nz - mbs);
2215:   return 0;
2216: }

2218: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2219: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2220: {
2221:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2222:   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2223:   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2224:   PetscInt        m, reallocs = 0, *levtmp;
2225:   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2226:   PetscInt        incrlev, *lev, shift, prow, nz;
2227:   PetscReal       f = info->fill, levels = info->levels;
2228:   PetscBool       perm_identity;

2230:   /* check whether perm is the identity mapping */
2231:   ISIdentity(perm, &perm_identity);

2233:   if (perm_identity) {
2234:     a->permute = PETSC_FALSE;
2235:     ai         = a->i;
2236:     aj         = a->j;
2237:   } else { /*  non-trivial permutation */
2238:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2239:   }

2241:   /* initialization */
2242:   ISGetIndices(perm, &rip);
2243:   umax = (PetscInt)(f * ai[mbs] + 1);
2244:   PetscMalloc1(umax, &lev);
2245:   umax += mbs + 1;
2246:   shift = mbs + 1;
2247:   PetscMalloc1(mbs + 1, &iu);
2248:   PetscMalloc1(umax, &ju);
2249:   iu[0] = mbs + 1;
2250:   juidx = mbs + 1;
2251:   /* prowl: linked list for pivot row */
2252:   PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp);
2253:   /* q: linked list for col index */

2255:   for (i = 0; i < mbs; i++) {
2256:     prowl[i] = mbs;
2257:     q[i]     = 0;
2258:   }

2260:   /* for each row k */
2261:   for (k = 0; k < mbs; k++) {
2262:     nzk  = 0;
2263:     q[k] = mbs;
2264:     /* copy current row into linked list */
2265:     nz = ai[rip[k] + 1] - ai[rip[k]];
2266:     j  = ai[rip[k]];
2267:     while (nz--) {
2268:       vj = rip[aj[j++]];
2269:       if (vj > k) {
2270:         qm = k;
2271:         do {
2272:           m  = qm;
2273:           qm = q[m];
2274:         } while (qm < vj);
2276:         nzk++;
2277:         q[m]       = vj;
2278:         q[vj]      = qm;
2279:         levtmp[vj] = 0; /* initialize lev for nonzero element */
2280:       }
2281:     }

2283:     /* modify nonzero structure of k-th row by computing fill-in
2284:        for each row prow to be merged in */
2285:     prow = k;
2286:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2288:     while (prow < k) {
2289:       /* merge row prow into k-th row */
2290:       jmin = iu[prow] + 1;
2291:       jmax = iu[prow + 1];
2292:       qm   = k;
2293:       for (j = jmin; j < jmax; j++) {
2294:         incrlev = lev[j - shift] + 1;
2295:         if (incrlev > levels) continue;
2296:         vj = ju[j];
2297:         do {
2298:           m  = qm;
2299:           qm = q[m];
2300:         } while (qm < vj);
2301:         if (qm != vj) { /* a fill */
2302:           nzk++;
2303:           q[m]       = vj;
2304:           q[vj]      = qm;
2305:           qm         = vj;
2306:           levtmp[vj] = incrlev;
2307:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2308:       }
2309:       prow = prowl[prow]; /* next pivot row */
2310:     }

2312:     /* add k to row list for first nonzero element in k-th row */
2313:     if (nzk > 1) {
2314:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2315:       prowl[k] = prowl[i];
2316:       prowl[i] = k;
2317:     }
2318:     iu[k + 1] = iu[k] + nzk;

2320:     /* allocate more space to ju and lev if needed */
2321:     if (iu[k + 1] > umax) {
2322:       /* estimate how much additional space we will need */
2323:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2324:       /* just double the memory each time */
2325:       maxadd = umax;
2326:       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2327:       umax += maxadd;

2329:       /* allocate a longer ju */
2330:       PetscMalloc1(umax, &jutmp);
2331:       PetscArraycpy(jutmp, ju, iu[k]);
2332:       PetscFree(ju);
2333:       ju = jutmp;

2335:       PetscMalloc1(umax, &jutmp);
2336:       PetscArraycpy(jutmp, lev, iu[k] - shift);
2337:       PetscFree(lev);
2338:       lev = jutmp;
2339:       reallocs += 2; /* count how many times we realloc */
2340:     }

2342:     /* save nonzero structure of k-th row in ju */
2343:     i = k;
2344:     while (nzk--) {
2345:       i                  = q[i];
2346:       ju[juidx]          = i;
2347:       lev[juidx - shift] = levtmp[i];
2348:       juidx++;
2349:     }
2350:   }

2352: #if defined(PETSC_USE_INFO)
2353:   if (ai[mbs] != 0) {
2354:     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2355:     PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af);
2356:     PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
2357:     PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af);
2358:     PetscInfo(A, "for best performance.\n");
2359:   } else {
2360:     PetscInfo(A, "Empty matrix\n");
2361:   }
2362: #endif

2364:   ISRestoreIndices(perm, &rip);
2365:   PetscFree3(prowl, q, levtmp);
2366:   PetscFree(lev);

2368:   /* put together the new matrix */
2369:   MatSeqSBAIJSetPreallocation(B, bs, 0, NULL);

2371:   b = (Mat_SeqSBAIJ *)(B)->data;
2372:   PetscFree2(b->imax, b->ilen);

2374:   b->singlemalloc = PETSC_FALSE;
2375:   b->free_a       = PETSC_TRUE;
2376:   b->free_ij      = PETSC_TRUE;
2377:   /* the next line frees the default space generated by the Create() */
2378:   PetscFree3(b->a, b->j, b->i);
2379:   PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a);
2380:   b->j    = ju;
2381:   b->i    = iu;
2382:   b->diag = NULL;
2383:   b->ilen = NULL;
2384:   b->imax = NULL;

2386:   ISDestroy(&b->row);
2387:   ISDestroy(&b->icol);
2388:   b->row  = perm;
2389:   b->icol = perm;
2390:   PetscObjectReference((PetscObject)perm);
2391:   PetscObjectReference((PetscObject)perm);
2392:   PetscMalloc1(bs * mbs + bs, &b->solve_work);
2393:   /* In b structure:  Free imax, ilen, old a, old j.
2394:      Allocate idnew, solve_work, new a, new j */
2395:   b->maxnz = b->nz = iu[mbs];

2397:   (B)->info.factor_mallocs   = reallocs;
2398:   (B)->info.fill_ratio_given = f;
2399:   if (ai[mbs] != 0) {
2400:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2401:   } else {
2402:     (B)->info.fill_ratio_needed = 0.0;
2403:   }
2404:   MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity);
2405:   return 0;
2406: }

2408: /*
2409:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2410: */
2411: #include <petscbt.h>
2412: #include <../src/mat/utils/freespace.h>
2413: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2414: {
2415:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2416:   PetscBool          perm_identity, free_ij = PETSC_TRUE, missing;
2417:   PetscInt           bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2418:   const PetscInt    *rip;
2419:   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2420:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2421:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2422:   PetscReal          fill = info->fill, levels = info->levels;
2423:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2424:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2425:   PetscBT            lnkbt;

2428:   MatMissingDiagonal(A, &missing, &d);
2430:   if (bs > 1) {
2431:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info);
2432:     return 0;
2433:   }

2435:   /* check whether perm is the identity mapping */
2436:   ISIdentity(perm, &perm_identity);
2438:   a->permute = PETSC_FALSE;

2440:   PetscMalloc1(am + 1, &ui);
2441:   PetscMalloc1(am + 1, &udiag);
2442:   ui[0] = 0;

2444:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2445:   if (!levels) {
2446:     /* reuse the column pointers and row offsets for memory savings */
2447:     for (i = 0; i < am; i++) {
2448:       ncols     = ai[i + 1] - ai[i];
2449:       ui[i + 1] = ui[i] + ncols;
2450:       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2451:     }
2452:     PetscMalloc1(ui[am] + 1, &uj);
2453:     cols = uj;
2454:     for (i = 0; i < am; i++) {
2455:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2456:       ncols = ai[i + 1] - ai[i] - 1;
2457:       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2458:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2459:     }
2460:   } else { /* case: levels>0 */
2461:     ISGetIndices(perm, &rip);

2463:     /* initialization */
2464:     /* jl: linked list for storing indices of the pivot rows
2465:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2466:     PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl);
2467:     for (i = 0; i < am; i++) {
2468:       jl[i] = am;
2469:       il[i] = 0;
2470:     }

2472:     /* create and initialize a linked list for storing column indices of the active row k */
2473:     nlnk = am + 1;
2474:     PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt);

2476:     /* initial FreeSpace size is fill*(ai[am]+1) */
2477:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space);

2479:     current_space = free_space;

2481:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl);

2483:     current_space_lvl = free_space_lvl;

2485:     for (k = 0; k < am; k++) { /* for each active row k */
2486:       /* initialize lnk by the column indices of row k */
2487:       nzk   = 0;
2488:       ncols = ai[k + 1] - ai[k];
2490:       cols = aj + ai[k];
2491:       PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt);
2492:       nzk += nlnk;

2494:       /* update lnk by computing fill-in for each pivot row to be merged in */
2495:       prow = jl[k]; /* 1st pivot row */

2497:       while (prow < k) {
2498:         nextprow = jl[prow];

2500:         /* merge prow into k-th row */
2501:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2502:         jmax  = ui[prow + 1];
2503:         ncols = jmax - jmin;
2504:         i     = jmin - ui[prow];

2506:         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2507:         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2508:         j    = *(uj - 1);
2509:         PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j);
2510:         nzk += nlnk;

2512:         /* update il and jl for prow */
2513:         if (jmin < jmax) {
2514:           il[prow] = jmin;

2516:           j        = *cols;
2517:           jl[prow] = jl[j];
2518:           jl[j]    = prow;
2519:         }
2520:         prow = nextprow;
2521:       }

2523:       /* if free space is not available, make more free space */
2524:       if (current_space->local_remaining < nzk) {
2525:         i = am - k + 1;                                    /* num of unfactored rows */
2526:         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2527:         PetscFreeSpaceGet(i, &current_space);
2528:         PetscFreeSpaceGet(i, &current_space_lvl);
2529:         reallocs++;
2530:       }

2532:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2534:       PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt);

2536:       /* add the k-th row into il and jl */
2537:       if (nzk > 1) {
2538:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2539:         jl[k] = jl[i];
2540:         jl[i] = k;
2541:         il[k] = ui[k] + 1;
2542:       }
2543:       uj_ptr[k]     = current_space->array;
2544:       uj_lvl_ptr[k] = current_space_lvl->array;

2546:       current_space->array += nzk;
2547:       current_space->local_used += nzk;
2548:       current_space->local_remaining -= nzk;
2549:       current_space_lvl->array += nzk;
2550:       current_space_lvl->local_used += nzk;
2551:       current_space_lvl->local_remaining -= nzk;

2553:       ui[k + 1] = ui[k] + nzk;
2554:     }

2556:     ISRestoreIndices(perm, &rip);
2557:     PetscFree4(uj_ptr, uj_lvl_ptr, il, jl);

2559:     /* destroy list of free space and other temporary array(s) */
2560:     PetscMalloc1(ui[am] + 1, &uj);
2561:     PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag); /* store matrix factor  */
2562:     PetscIncompleteLLDestroy(lnk, lnkbt);
2563:     PetscFreeSpaceDestroy(free_space_lvl);

2565:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2567:   /* put together the new matrix in MATSEQSBAIJ format */
2568:   MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL);

2570:   b = (Mat_SeqSBAIJ *)(fact)->data;
2571:   PetscFree2(b->imax, b->ilen);

2573:   b->singlemalloc = PETSC_FALSE;
2574:   b->free_a       = PETSC_TRUE;
2575:   b->free_ij      = free_ij;

2577:   PetscMalloc1(ui[am] + 1, &b->a);

2579:   b->j         = uj;
2580:   b->i         = ui;
2581:   b->diag      = udiag;
2582:   b->free_diag = PETSC_TRUE;
2583:   b->ilen      = NULL;
2584:   b->imax      = NULL;
2585:   b->row       = perm;
2586:   b->col       = perm;

2588:   PetscObjectReference((PetscObject)perm);
2589:   PetscObjectReference((PetscObject)perm);

2591:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2593:   PetscMalloc1(am + 1, &b->solve_work);

2595:   b->maxnz = b->nz = ui[am];

2597:   fact->info.factor_mallocs   = reallocs;
2598:   fact->info.fill_ratio_given = fill;
2599:   if (ai[am] != 0) {
2600:     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2601:   } else {
2602:     fact->info.fill_ratio_needed = 0.0;
2603:   }
2604: #if defined(PETSC_USE_INFO)
2605:   if (ai[am] != 0) {
2606:     PetscReal af = fact->info.fill_ratio_needed;
2607:     PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
2608:     PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
2609:     PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
2610:   } else {
2611:     PetscInfo(A, "Empty matrix\n");
2612:   }
2613: #endif
2614:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2615:   return 0;
2616: }

2618: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2619: {
2620:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2621:   Mat_SeqSBAIJ      *b;
2622:   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2623:   PetscInt           bs = A->rmap->bs, am = a->mbs;
2624:   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2625:   PetscInt           reallocs = 0, i, *ui;
2626:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2627:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2628:   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2629:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2630:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2631:   PetscBT            lnkbt;

2633:   /*
2634:    This code originally uses Modified Sparse Row (MSR) storage
2635:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2636:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2637:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2638:    thus the original code in MSR format is still used for these cases.
2639:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2640:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2641:   */
2642:   if (bs > 1) {
2643:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info);
2644:     return 0;
2645:   }

2647:   /* check whether perm is the identity mapping */
2648:   ISIdentity(perm, &perm_identity);
2650:   a->permute = PETSC_FALSE;

2652:   /* special case that simply copies fill pattern */
2653:   if (!levels) {
2654:     /* reuse the column pointers and row offsets for memory savings */
2655:     ui           = a->i;
2656:     uj           = a->j;
2657:     free_ij      = PETSC_FALSE;
2658:     ratio_needed = 1.0;
2659:   } else { /* case: levels>0 */
2660:     ISGetIndices(perm, &rip);

2662:     /* initialization */
2663:     PetscMalloc1(am + 1, &ui);
2664:     ui[0] = 0;

2666:     /* jl: linked list for storing indices of the pivot rows
2667:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2668:     PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl);
2669:     for (i = 0; i < am; i++) {
2670:       jl[i] = am;
2671:       il[i] = 0;
2672:     }

2674:     /* create and initialize a linked list for storing column indices of the active row k */
2675:     nlnk = am + 1;
2676:     PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt);

2678:     /* initial FreeSpace size is fill*(ai[am]+1) */
2679:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space);

2681:     current_space = free_space;

2683:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl);

2685:     current_space_lvl = free_space_lvl;

2687:     for (k = 0; k < am; k++) { /* for each active row k */
2688:       /* initialize lnk by the column indices of row rip[k] */
2689:       nzk   = 0;
2690:       ncols = ai[rip[k] + 1] - ai[rip[k]];
2691:       cols  = aj + ai[rip[k]];
2692:       PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt);
2693:       nzk += nlnk;

2695:       /* update lnk by computing fill-in for each pivot row to be merged in */
2696:       prow = jl[k]; /* 1st pivot row */

2698:       while (prow < k) {
2699:         nextprow = jl[prow];

2701:         /* merge prow into k-th row */
2702:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2703:         jmax     = ui[prow + 1];
2704:         ncols    = jmax - jmin;
2705:         i        = jmin - ui[prow];
2706:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2707:         j        = *(uj_lvl_ptr[prow] + i - 1);
2708:         cols_lvl = uj_lvl_ptr[prow] + i;
2709:         PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j);
2710:         nzk += nlnk;

2712:         /* update il and jl for prow */
2713:         if (jmin < jmax) {
2714:           il[prow] = jmin;
2715:           j        = *cols;
2716:           jl[prow] = jl[j];
2717:           jl[j]    = prow;
2718:         }
2719:         prow = nextprow;
2720:       }

2722:       /* if free space is not available, make more free space */
2723:       if (current_space->local_remaining < nzk) {
2724:         i = am - k + 1;                                                             /* num of unfactored rows */
2725:         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2726:         PetscFreeSpaceGet(i, &current_space);
2727:         PetscFreeSpaceGet(i, &current_space_lvl);
2728:         reallocs++;
2729:       }

2731:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2732:       PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt);

2734:       /* add the k-th row into il and jl */
2735:       if (nzk - 1 > 0) {
2736:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2737:         jl[k] = jl[i];
2738:         jl[i] = k;
2739:         il[k] = ui[k] + 1;
2740:       }
2741:       uj_ptr[k]     = current_space->array;
2742:       uj_lvl_ptr[k] = current_space_lvl->array;

2744:       current_space->array += nzk;
2745:       current_space->local_used += nzk;
2746:       current_space->local_remaining -= nzk;
2747:       current_space_lvl->array += nzk;
2748:       current_space_lvl->local_used += nzk;
2749:       current_space_lvl->local_remaining -= nzk;

2751:       ui[k + 1] = ui[k] + nzk;
2752:     }

2754:     ISRestoreIndices(perm, &rip);
2755:     PetscFree4(uj_ptr, uj_lvl_ptr, il, jl);

2757:     /* destroy list of free space and other temporary array(s) */
2758:     PetscMalloc1(ui[am] + 1, &uj);
2759:     PetscFreeSpaceContiguous(&free_space, uj);
2760:     PetscIncompleteLLDestroy(lnk, lnkbt);
2761:     PetscFreeSpaceDestroy(free_space_lvl);
2762:     if (ai[am] != 0) {
2763:       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2764:     } else {
2765:       ratio_needed = 0.0;
2766:     }
2767:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2769:   /* put together the new matrix in MATSEQSBAIJ format */
2770:   MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL);

2772:   b = (Mat_SeqSBAIJ *)(fact)->data;

2774:   PetscFree2(b->imax, b->ilen);

2776:   b->singlemalloc = PETSC_FALSE;
2777:   b->free_a       = PETSC_TRUE;
2778:   b->free_ij      = free_ij;

2780:   PetscMalloc1(ui[am] + 1, &b->a);

2782:   b->j             = uj;
2783:   b->i             = ui;
2784:   b->diag          = NULL;
2785:   b->ilen          = NULL;
2786:   b->imax          = NULL;
2787:   b->row           = perm;
2788:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2790:   PetscObjectReference((PetscObject)perm);
2791:   b->icol = perm;
2792:   PetscObjectReference((PetscObject)perm);
2793:   PetscMalloc1(am + 1, &b->solve_work);

2795:   b->maxnz = b->nz = ui[am];

2797:   fact->info.factor_mallocs    = reallocs;
2798:   fact->info.fill_ratio_given  = fill;
2799:   fact->info.fill_ratio_needed = ratio_needed;
2800: #if defined(PETSC_USE_INFO)
2801:   if (ai[am] != 0) {
2802:     PetscReal af = fact->info.fill_ratio_needed;
2803:     PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
2804:     PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
2805:     PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
2806:   } else {
2807:     PetscInfo(A, "Empty matrix\n");
2808:   }
2809: #endif
2810:   if (perm_identity) {
2811:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2812:   } else {
2813:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2814:   }
2815:   return 0;
2816: }