Actual source code: baijsolv.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  7:   IS                 iscol = a->col, isrow = a->row;
  8:   const PetscInt    *r, *c, *rout, *cout;
  9:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
 10:   PetscInt           i, nz;
 11:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
 12:   const MatScalar   *aa = a->a, *v;
 13:   PetscScalar       *x, *s, *t, *ls;
 14:   const PetscScalar *b;

 16:   VecGetArrayRead(bb, &b);
 17:   VecGetArray(xx, &x);
 18:   t = a->solve_work;

 20:   ISGetIndices(isrow, &rout);
 21:   r = rout;
 22:   ISGetIndices(iscol, &cout);
 23:   c = cout + (n - 1);

 25:   /* forward solve the lower triangular */
 26:   PetscArraycpy(t, b + bs * (*r++), bs);
 27:   for (i = 1; i < n; i++) {
 28:     v  = aa + bs2 * ai[i];
 29:     vi = aj + ai[i];
 30:     nz = a->diag[i] - ai[i];
 31:     s  = t + bs * i;
 32:     PetscArraycpy(s, b + bs * (*r++), bs);
 33:     while (nz--) {
 34:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
 35:       v += bs2;
 36:     }
 37:   }
 38:   /* backward solve the upper triangular */
 39:   ls = a->solve_work + A->cmap->n;
 40:   for (i = n - 1; i >= 0; i--) {
 41:     v  = aa + bs2 * (a->diag[i] + 1);
 42:     vi = aj + a->diag[i] + 1;
 43:     nz = ai[i + 1] - a->diag[i] - 1;
 44:     PetscArraycpy(ls, t + i * bs, bs);
 45:     while (nz--) {
 46:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
 47:       v += bs2;
 48:     }
 49:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
 50:     PetscArraycpy(x + bs * (*c--), t + i * bs, bs);
 51:   }

 53:   ISRestoreIndices(isrow, &rout);
 54:   ISRestoreIndices(iscol, &cout);
 55:   VecRestoreArrayRead(bb, &b);
 56:   VecRestoreArray(xx, &x);
 57:   PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
 58:   return 0;
 59: }

 61: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
 62: {
 63:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
 64:   IS                 iscol = a->col, isrow = a->row;
 65:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
 66:   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
 67:   PetscInt           i, nz, idx, idt, idc;
 68:   const MatScalar   *aa = a->a, *v;
 69:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
 70:   const PetscScalar *b;

 72:   VecGetArrayRead(bb, &b);
 73:   VecGetArray(xx, &x);
 74:   t = a->solve_work;

 76:   ISGetIndices(isrow, &rout);
 77:   r = rout;
 78:   ISGetIndices(iscol, &cout);
 79:   c = cout + (n - 1);

 81:   /* forward solve the lower triangular */
 82:   idx  = 7 * (*r++);
 83:   t[0] = b[idx];
 84:   t[1] = b[1 + idx];
 85:   t[2] = b[2 + idx];
 86:   t[3] = b[3 + idx];
 87:   t[4] = b[4 + idx];
 88:   t[5] = b[5 + idx];
 89:   t[6] = b[6 + idx];

 91:   for (i = 1; i < n; i++) {
 92:     v   = aa + 49 * ai[i];
 93:     vi  = aj + ai[i];
 94:     nz  = diag[i] - ai[i];
 95:     idx = 7 * (*r++);
 96:     s1  = b[idx];
 97:     s2  = b[1 + idx];
 98:     s3  = b[2 + idx];
 99:     s4  = b[3 + idx];
100:     s5  = b[4 + idx];
101:     s6  = b[5 + idx];
102:     s7  = b[6 + idx];
103:     while (nz--) {
104:       idx = 7 * (*vi++);
105:       x1  = t[idx];
106:       x2  = t[1 + idx];
107:       x3  = t[2 + idx];
108:       x4  = t[3 + idx];
109:       x5  = t[4 + idx];
110:       x6  = t[5 + idx];
111:       x7  = t[6 + idx];
112:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
113:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
114:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
115:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
116:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
117:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
118:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
119:       v += 49;
120:     }
121:     idx        = 7 * i;
122:     t[idx]     = s1;
123:     t[1 + idx] = s2;
124:     t[2 + idx] = s3;
125:     t[3 + idx] = s4;
126:     t[4 + idx] = s5;
127:     t[5 + idx] = s6;
128:     t[6 + idx] = s7;
129:   }
130:   /* backward solve the upper triangular */
131:   for (i = n - 1; i >= 0; i--) {
132:     v   = aa + 49 * diag[i] + 49;
133:     vi  = aj + diag[i] + 1;
134:     nz  = ai[i + 1] - diag[i] - 1;
135:     idt = 7 * i;
136:     s1  = t[idt];
137:     s2  = t[1 + idt];
138:     s3  = t[2 + idt];
139:     s4  = t[3 + idt];
140:     s5  = t[4 + idt];
141:     s6  = t[5 + idt];
142:     s7  = t[6 + idt];
143:     while (nz--) {
144:       idx = 7 * (*vi++);
145:       x1  = t[idx];
146:       x2  = t[1 + idx];
147:       x3  = t[2 + idx];
148:       x4  = t[3 + idx];
149:       x5  = t[4 + idx];
150:       x6  = t[5 + idx];
151:       x7  = t[6 + idx];
152:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
153:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
154:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
155:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
156:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
157:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
158:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
159:       v += 49;
160:     }
161:     idc    = 7 * (*c--);
162:     v      = aa + 49 * diag[i];
163:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
164:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
165:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
166:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
167:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
168:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
169:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
170:   }

172:   ISRestoreIndices(isrow, &rout);
173:   ISRestoreIndices(iscol, &cout);
174:   VecRestoreArrayRead(bb, &b);
175:   VecRestoreArray(xx, &x);
176:   PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n);
177:   return 0;
178: }

180: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
181: {
182:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
183:   IS                 iscol = a->col, isrow = a->row;
184:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
185:   const PetscInt     n = a->mbs, *rout, *cout, *vi;
186:   PetscInt           i, nz, idx, idt, idc, m;
187:   const MatScalar   *aa = a->a, *v;
188:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
189:   const PetscScalar *b;

191:   VecGetArrayRead(bb, &b);
192:   VecGetArray(xx, &x);
193:   t = a->solve_work;

195:   ISGetIndices(isrow, &rout);
196:   r = rout;
197:   ISGetIndices(iscol, &cout);
198:   c = cout;

200:   /* forward solve the lower triangular */
201:   idx  = 7 * r[0];
202:   t[0] = b[idx];
203:   t[1] = b[1 + idx];
204:   t[2] = b[2 + idx];
205:   t[3] = b[3 + idx];
206:   t[4] = b[4 + idx];
207:   t[5] = b[5 + idx];
208:   t[6] = b[6 + idx];

210:   for (i = 1; i < n; i++) {
211:     v   = aa + 49 * ai[i];
212:     vi  = aj + ai[i];
213:     nz  = ai[i + 1] - ai[i];
214:     idx = 7 * r[i];
215:     s1  = b[idx];
216:     s2  = b[1 + idx];
217:     s3  = b[2 + idx];
218:     s4  = b[3 + idx];
219:     s5  = b[4 + idx];
220:     s6  = b[5 + idx];
221:     s7  = b[6 + idx];
222:     for (m = 0; m < nz; m++) {
223:       idx = 7 * vi[m];
224:       x1  = t[idx];
225:       x2  = t[1 + idx];
226:       x3  = t[2 + idx];
227:       x4  = t[3 + idx];
228:       x5  = t[4 + idx];
229:       x6  = t[5 + idx];
230:       x7  = t[6 + idx];
231:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
232:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
233:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
234:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
235:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
236:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
237:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
238:       v += 49;
239:     }
240:     idx        = 7 * i;
241:     t[idx]     = s1;
242:     t[1 + idx] = s2;
243:     t[2 + idx] = s3;
244:     t[3 + idx] = s4;
245:     t[4 + idx] = s5;
246:     t[5 + idx] = s6;
247:     t[6 + idx] = s7;
248:   }
249:   /* backward solve the upper triangular */
250:   for (i = n - 1; i >= 0; i--) {
251:     v   = aa + 49 * (adiag[i + 1] + 1);
252:     vi  = aj + adiag[i + 1] + 1;
253:     nz  = adiag[i] - adiag[i + 1] - 1;
254:     idt = 7 * i;
255:     s1  = t[idt];
256:     s2  = t[1 + idt];
257:     s3  = t[2 + idt];
258:     s4  = t[3 + idt];
259:     s5  = t[4 + idt];
260:     s6  = t[5 + idt];
261:     s7  = t[6 + idt];
262:     for (m = 0; m < nz; m++) {
263:       idx = 7 * vi[m];
264:       x1  = t[idx];
265:       x2  = t[1 + idx];
266:       x3  = t[2 + idx];
267:       x4  = t[3 + idx];
268:       x5  = t[4 + idx];
269:       x6  = t[5 + idx];
270:       x7  = t[6 + idx];
271:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
272:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
273:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
274:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
275:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
276:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
277:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
278:       v += 49;
279:     }
280:     idc    = 7 * c[i];
281:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
282:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
283:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
284:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
285:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
286:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
287:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
288:   }

290:   ISRestoreIndices(isrow, &rout);
291:   ISRestoreIndices(iscol, &cout);
292:   VecRestoreArrayRead(bb, &b);
293:   VecRestoreArray(xx, &x);
294:   PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n);
295:   return 0;
296: }

298: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
299: {
300:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
301:   IS                 iscol = a->col, isrow = a->row;
302:   const PetscInt    *r, *c, *rout, *cout;
303:   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
304:   PetscInt           i, nz, idx, idt, idc;
305:   const MatScalar   *aa = a->a, *v;
306:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
307:   const PetscScalar *b;

309:   VecGetArrayRead(bb, &b);
310:   VecGetArray(xx, &x);
311:   t = a->solve_work;

313:   ISGetIndices(isrow, &rout);
314:   r = rout;
315:   ISGetIndices(iscol, &cout);
316:   c = cout + (n - 1);

318:   /* forward solve the lower triangular */
319:   idx  = 6 * (*r++);
320:   t[0] = b[idx];
321:   t[1] = b[1 + idx];
322:   t[2] = b[2 + idx];
323:   t[3] = b[3 + idx];
324:   t[4] = b[4 + idx];
325:   t[5] = b[5 + idx];
326:   for (i = 1; i < n; i++) {
327:     v   = aa + 36 * ai[i];
328:     vi  = aj + ai[i];
329:     nz  = diag[i] - ai[i];
330:     idx = 6 * (*r++);
331:     s1  = b[idx];
332:     s2  = b[1 + idx];
333:     s3  = b[2 + idx];
334:     s4  = b[3 + idx];
335:     s5  = b[4 + idx];
336:     s6  = b[5 + idx];
337:     while (nz--) {
338:       idx = 6 * (*vi++);
339:       x1  = t[idx];
340:       x2  = t[1 + idx];
341:       x3  = t[2 + idx];
342:       x4  = t[3 + idx];
343:       x5  = t[4 + idx];
344:       x6  = t[5 + idx];
345:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
346:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
347:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
348:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
349:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
350:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
351:       v += 36;
352:     }
353:     idx        = 6 * i;
354:     t[idx]     = s1;
355:     t[1 + idx] = s2;
356:     t[2 + idx] = s3;
357:     t[3 + idx] = s4;
358:     t[4 + idx] = s5;
359:     t[5 + idx] = s6;
360:   }
361:   /* backward solve the upper triangular */
362:   for (i = n - 1; i >= 0; i--) {
363:     v   = aa + 36 * diag[i] + 36;
364:     vi  = aj + diag[i] + 1;
365:     nz  = ai[i + 1] - diag[i] - 1;
366:     idt = 6 * i;
367:     s1  = t[idt];
368:     s2  = t[1 + idt];
369:     s3  = t[2 + idt];
370:     s4  = t[3 + idt];
371:     s5  = t[4 + idt];
372:     s6  = t[5 + idt];
373:     while (nz--) {
374:       idx = 6 * (*vi++);
375:       x1  = t[idx];
376:       x2  = t[1 + idx];
377:       x3  = t[2 + idx];
378:       x4  = t[3 + idx];
379:       x5  = t[4 + idx];
380:       x6  = t[5 + idx];
381:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
382:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
383:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
384:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
385:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
386:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
387:       v += 36;
388:     }
389:     idc    = 6 * (*c--);
390:     v      = aa + 36 * diag[i];
391:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
392:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
393:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
394:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
395:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
396:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
397:   }

399:   ISRestoreIndices(isrow, &rout);
400:   ISRestoreIndices(iscol, &cout);
401:   VecRestoreArrayRead(bb, &b);
402:   VecRestoreArray(xx, &x);
403:   PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n);
404:   return 0;
405: }

407: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
408: {
409:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
410:   IS                 iscol = a->col, isrow = a->row;
411:   const PetscInt    *r, *c, *rout, *cout;
412:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
413:   PetscInt           i, nz, idx, idt, idc, m;
414:   const MatScalar   *aa = a->a, *v;
415:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
416:   const PetscScalar *b;

418:   VecGetArrayRead(bb, &b);
419:   VecGetArray(xx, &x);
420:   t = a->solve_work;

422:   ISGetIndices(isrow, &rout);
423:   r = rout;
424:   ISGetIndices(iscol, &cout);
425:   c = cout;

427:   /* forward solve the lower triangular */
428:   idx  = 6 * r[0];
429:   t[0] = b[idx];
430:   t[1] = b[1 + idx];
431:   t[2] = b[2 + idx];
432:   t[3] = b[3 + idx];
433:   t[4] = b[4 + idx];
434:   t[5] = b[5 + idx];
435:   for (i = 1; i < n; i++) {
436:     v   = aa + 36 * ai[i];
437:     vi  = aj + ai[i];
438:     nz  = ai[i + 1] - ai[i];
439:     idx = 6 * r[i];
440:     s1  = b[idx];
441:     s2  = b[1 + idx];
442:     s3  = b[2 + idx];
443:     s4  = b[3 + idx];
444:     s5  = b[4 + idx];
445:     s6  = b[5 + idx];
446:     for (m = 0; m < nz; m++) {
447:       idx = 6 * vi[m];
448:       x1  = t[idx];
449:       x2  = t[1 + idx];
450:       x3  = t[2 + idx];
451:       x4  = t[3 + idx];
452:       x5  = t[4 + idx];
453:       x6  = t[5 + idx];
454:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
455:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
456:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
457:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
458:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
459:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
460:       v += 36;
461:     }
462:     idx        = 6 * i;
463:     t[idx]     = s1;
464:     t[1 + idx] = s2;
465:     t[2 + idx] = s3;
466:     t[3 + idx] = s4;
467:     t[4 + idx] = s5;
468:     t[5 + idx] = s6;
469:   }
470:   /* backward solve the upper triangular */
471:   for (i = n - 1; i >= 0; i--) {
472:     v   = aa + 36 * (adiag[i + 1] + 1);
473:     vi  = aj + adiag[i + 1] + 1;
474:     nz  = adiag[i] - adiag[i + 1] - 1;
475:     idt = 6 * i;
476:     s1  = t[idt];
477:     s2  = t[1 + idt];
478:     s3  = t[2 + idt];
479:     s4  = t[3 + idt];
480:     s5  = t[4 + idt];
481:     s6  = t[5 + idt];
482:     for (m = 0; m < nz; m++) {
483:       idx = 6 * vi[m];
484:       x1  = t[idx];
485:       x2  = t[1 + idx];
486:       x3  = t[2 + idx];
487:       x4  = t[3 + idx];
488:       x5  = t[4 + idx];
489:       x6  = t[5 + idx];
490:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
491:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
492:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
493:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
494:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
495:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
496:       v += 36;
497:     }
498:     idc    = 6 * c[i];
499:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
500:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
501:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
502:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
503:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
504:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
505:   }

507:   ISRestoreIndices(isrow, &rout);
508:   ISRestoreIndices(iscol, &cout);
509:   VecRestoreArrayRead(bb, &b);
510:   VecRestoreArray(xx, &x);
511:   PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n);
512:   return 0;
513: }

515: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
516: {
517:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
518:   IS                 iscol = a->col, isrow = a->row;
519:   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
520:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
521:   PetscInt           i, nz, idx, idt, idc;
522:   const MatScalar   *aa = a->a, *v;
523:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
524:   const PetscScalar *b;

526:   VecGetArrayRead(bb, &b);
527:   VecGetArray(xx, &x);
528:   t = a->solve_work;

530:   ISGetIndices(isrow, &rout);
531:   r = rout;
532:   ISGetIndices(iscol, &cout);
533:   c = cout + (n - 1);

535:   /* forward solve the lower triangular */
536:   idx  = 5 * (*r++);
537:   t[0] = b[idx];
538:   t[1] = b[1 + idx];
539:   t[2] = b[2 + idx];
540:   t[3] = b[3 + idx];
541:   t[4] = b[4 + idx];
542:   for (i = 1; i < n; i++) {
543:     v   = aa + 25 * ai[i];
544:     vi  = aj + ai[i];
545:     nz  = diag[i] - ai[i];
546:     idx = 5 * (*r++);
547:     s1  = b[idx];
548:     s2  = b[1 + idx];
549:     s3  = b[2 + idx];
550:     s4  = b[3 + idx];
551:     s5  = b[4 + idx];
552:     while (nz--) {
553:       idx = 5 * (*vi++);
554:       x1  = t[idx];
555:       x2  = t[1 + idx];
556:       x3  = t[2 + idx];
557:       x4  = t[3 + idx];
558:       x5  = t[4 + idx];
559:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
560:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
561:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
562:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
563:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
564:       v += 25;
565:     }
566:     idx        = 5 * i;
567:     t[idx]     = s1;
568:     t[1 + idx] = s2;
569:     t[2 + idx] = s3;
570:     t[3 + idx] = s4;
571:     t[4 + idx] = s5;
572:   }
573:   /* backward solve the upper triangular */
574:   for (i = n - 1; i >= 0; i--) {
575:     v   = aa + 25 * diag[i] + 25;
576:     vi  = aj + diag[i] + 1;
577:     nz  = ai[i + 1] - diag[i] - 1;
578:     idt = 5 * i;
579:     s1  = t[idt];
580:     s2  = t[1 + idt];
581:     s3  = t[2 + idt];
582:     s4  = t[3 + idt];
583:     s5  = t[4 + idt];
584:     while (nz--) {
585:       idx = 5 * (*vi++);
586:       x1  = t[idx];
587:       x2  = t[1 + idx];
588:       x3  = t[2 + idx];
589:       x4  = t[3 + idx];
590:       x5  = t[4 + idx];
591:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
592:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
593:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
594:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
595:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
596:       v += 25;
597:     }
598:     idc    = 5 * (*c--);
599:     v      = aa + 25 * diag[i];
600:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
601:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
602:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
603:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
604:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
605:   }

607:   ISRestoreIndices(isrow, &rout);
608:   ISRestoreIndices(iscol, &cout);
609:   VecRestoreArrayRead(bb, &b);
610:   VecRestoreArray(xx, &x);
611:   PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n);
612:   return 0;
613: }

615: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
616: {
617:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
618:   IS                 iscol = a->col, isrow = a->row;
619:   const PetscInt    *r, *c, *rout, *cout;
620:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
621:   PetscInt           i, nz, idx, idt, idc, m;
622:   const MatScalar   *aa = a->a, *v;
623:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
624:   const PetscScalar *b;

626:   VecGetArrayRead(bb, &b);
627:   VecGetArray(xx, &x);
628:   t = a->solve_work;

630:   ISGetIndices(isrow, &rout);
631:   r = rout;
632:   ISGetIndices(iscol, &cout);
633:   c = cout;

635:   /* forward solve the lower triangular */
636:   idx  = 5 * r[0];
637:   t[0] = b[idx];
638:   t[1] = b[1 + idx];
639:   t[2] = b[2 + idx];
640:   t[3] = b[3 + idx];
641:   t[4] = b[4 + idx];
642:   for (i = 1; i < n; i++) {
643:     v   = aa + 25 * ai[i];
644:     vi  = aj + ai[i];
645:     nz  = ai[i + 1] - ai[i];
646:     idx = 5 * r[i];
647:     s1  = b[idx];
648:     s2  = b[1 + idx];
649:     s3  = b[2 + idx];
650:     s4  = b[3 + idx];
651:     s5  = b[4 + idx];
652:     for (m = 0; m < nz; m++) {
653:       idx = 5 * vi[m];
654:       x1  = t[idx];
655:       x2  = t[1 + idx];
656:       x3  = t[2 + idx];
657:       x4  = t[3 + idx];
658:       x5  = t[4 + idx];
659:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
660:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
661:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
662:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
663:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
664:       v += 25;
665:     }
666:     idx        = 5 * i;
667:     t[idx]     = s1;
668:     t[1 + idx] = s2;
669:     t[2 + idx] = s3;
670:     t[3 + idx] = s4;
671:     t[4 + idx] = s5;
672:   }
673:   /* backward solve the upper triangular */
674:   for (i = n - 1; i >= 0; i--) {
675:     v   = aa + 25 * (adiag[i + 1] + 1);
676:     vi  = aj + adiag[i + 1] + 1;
677:     nz  = adiag[i] - adiag[i + 1] - 1;
678:     idt = 5 * i;
679:     s1  = t[idt];
680:     s2  = t[1 + idt];
681:     s3  = t[2 + idt];
682:     s4  = t[3 + idt];
683:     s5  = t[4 + idt];
684:     for (m = 0; m < nz; m++) {
685:       idx = 5 * vi[m];
686:       x1  = t[idx];
687:       x2  = t[1 + idx];
688:       x3  = t[2 + idx];
689:       x4  = t[3 + idx];
690:       x5  = t[4 + idx];
691:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
692:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
693:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
694:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
695:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
696:       v += 25;
697:     }
698:     idc    = 5 * c[i];
699:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
700:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
701:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
702:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
703:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
704:   }

706:   ISRestoreIndices(isrow, &rout);
707:   ISRestoreIndices(iscol, &cout);
708:   VecRestoreArrayRead(bb, &b);
709:   VecRestoreArray(xx, &x);
710:   PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n);
711:   return 0;
712: }

714: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
715: {
716:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
717:   IS                 iscol = a->col, isrow = a->row;
718:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
719:   PetscInt           i, nz, idx, idt, idc;
720:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
721:   const MatScalar   *aa = a->a, *v;
722:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
723:   const PetscScalar *b;

725:   VecGetArrayRead(bb, &b);
726:   VecGetArray(xx, &x);
727:   t = a->solve_work;

729:   ISGetIndices(isrow, &rout);
730:   r = rout;
731:   ISGetIndices(iscol, &cout);
732:   c = cout + (n - 1);

734:   /* forward solve the lower triangular */
735:   idx  = 4 * (*r++);
736:   t[0] = b[idx];
737:   t[1] = b[1 + idx];
738:   t[2] = b[2 + idx];
739:   t[3] = b[3 + idx];
740:   for (i = 1; i < n; i++) {
741:     v   = aa + 16 * ai[i];
742:     vi  = aj + ai[i];
743:     nz  = diag[i] - ai[i];
744:     idx = 4 * (*r++);
745:     s1  = b[idx];
746:     s2  = b[1 + idx];
747:     s3  = b[2 + idx];
748:     s4  = b[3 + idx];
749:     while (nz--) {
750:       idx = 4 * (*vi++);
751:       x1  = t[idx];
752:       x2  = t[1 + idx];
753:       x3  = t[2 + idx];
754:       x4  = t[3 + idx];
755:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
756:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
757:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
758:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
759:       v += 16;
760:     }
761:     idx        = 4 * i;
762:     t[idx]     = s1;
763:     t[1 + idx] = s2;
764:     t[2 + idx] = s3;
765:     t[3 + idx] = s4;
766:   }
767:   /* backward solve the upper triangular */
768:   for (i = n - 1; i >= 0; i--) {
769:     v   = aa + 16 * diag[i] + 16;
770:     vi  = aj + diag[i] + 1;
771:     nz  = ai[i + 1] - diag[i] - 1;
772:     idt = 4 * i;
773:     s1  = t[idt];
774:     s2  = t[1 + idt];
775:     s3  = t[2 + idt];
776:     s4  = t[3 + idt];
777:     while (nz--) {
778:       idx = 4 * (*vi++);
779:       x1  = t[idx];
780:       x2  = t[1 + idx];
781:       x3  = t[2 + idx];
782:       x4  = t[3 + idx];
783:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
784:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
785:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
786:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
787:       v += 16;
788:     }
789:     idc    = 4 * (*c--);
790:     v      = aa + 16 * diag[i];
791:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
792:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
793:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
794:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
795:   }

797:   ISRestoreIndices(isrow, &rout);
798:   ISRestoreIndices(iscol, &cout);
799:   VecRestoreArrayRead(bb, &b);
800:   VecRestoreArray(xx, &x);
801:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
802:   return 0;
803: }

805: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
806: {
807:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
808:   IS                 iscol = a->col, isrow = a->row;
809:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
810:   PetscInt           i, nz, idx, idt, idc, m;
811:   const PetscInt    *r, *c, *rout, *cout;
812:   const MatScalar   *aa = a->a, *v;
813:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
814:   const PetscScalar *b;

816:   VecGetArrayRead(bb, &b);
817:   VecGetArray(xx, &x);
818:   t = a->solve_work;

820:   ISGetIndices(isrow, &rout);
821:   r = rout;
822:   ISGetIndices(iscol, &cout);
823:   c = cout;

825:   /* forward solve the lower triangular */
826:   idx  = 4 * r[0];
827:   t[0] = b[idx];
828:   t[1] = b[1 + idx];
829:   t[2] = b[2 + idx];
830:   t[3] = b[3 + idx];
831:   for (i = 1; i < n; i++) {
832:     v   = aa + 16 * ai[i];
833:     vi  = aj + ai[i];
834:     nz  = ai[i + 1] - ai[i];
835:     idx = 4 * r[i];
836:     s1  = b[idx];
837:     s2  = b[1 + idx];
838:     s3  = b[2 + idx];
839:     s4  = b[3 + idx];
840:     for (m = 0; m < nz; m++) {
841:       idx = 4 * vi[m];
842:       x1  = t[idx];
843:       x2  = t[1 + idx];
844:       x3  = t[2 + idx];
845:       x4  = t[3 + idx];
846:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
847:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
848:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
849:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
850:       v += 16;
851:     }
852:     idx        = 4 * i;
853:     t[idx]     = s1;
854:     t[1 + idx] = s2;
855:     t[2 + idx] = s3;
856:     t[3 + idx] = s4;
857:   }
858:   /* backward solve the upper triangular */
859:   for (i = n - 1; i >= 0; i--) {
860:     v   = aa + 16 * (adiag[i + 1] + 1);
861:     vi  = aj + adiag[i + 1] + 1;
862:     nz  = adiag[i] - adiag[i + 1] - 1;
863:     idt = 4 * i;
864:     s1  = t[idt];
865:     s2  = t[1 + idt];
866:     s3  = t[2 + idt];
867:     s4  = t[3 + idt];
868:     for (m = 0; m < nz; m++) {
869:       idx = 4 * vi[m];
870:       x1  = t[idx];
871:       x2  = t[1 + idx];
872:       x3  = t[2 + idx];
873:       x4  = t[3 + idx];
874:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
875:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
876:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
877:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
878:       v += 16;
879:     }
880:     idc    = 4 * c[i];
881:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
882:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
883:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
884:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
885:   }

887:   ISRestoreIndices(isrow, &rout);
888:   ISRestoreIndices(iscol, &cout);
889:   VecRestoreArrayRead(bb, &b);
890:   VecRestoreArray(xx, &x);
891:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
892:   return 0;
893: }

895: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
896: {
897:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
898:   IS                 iscol = a->col, isrow = a->row;
899:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
900:   PetscInt           i, nz, idx, idt, idc;
901:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
902:   const MatScalar   *aa = a->a, *v;
903:   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
904:   PetscScalar       *x;
905:   const PetscScalar *b;

907:   VecGetArrayRead(bb, &b);
908:   VecGetArray(xx, &x);
909:   t = (MatScalar *)a->solve_work;

911:   ISGetIndices(isrow, &rout);
912:   r = rout;
913:   ISGetIndices(iscol, &cout);
914:   c = cout + (n - 1);

916:   /* forward solve the lower triangular */
917:   idx  = 4 * (*r++);
918:   t[0] = (MatScalar)b[idx];
919:   t[1] = (MatScalar)b[1 + idx];
920:   t[2] = (MatScalar)b[2 + idx];
921:   t[3] = (MatScalar)b[3 + idx];
922:   for (i = 1; i < n; i++) {
923:     v   = aa + 16 * ai[i];
924:     vi  = aj + ai[i];
925:     nz  = diag[i] - ai[i];
926:     idx = 4 * (*r++);
927:     s1  = (MatScalar)b[idx];
928:     s2  = (MatScalar)b[1 + idx];
929:     s3  = (MatScalar)b[2 + idx];
930:     s4  = (MatScalar)b[3 + idx];
931:     while (nz--) {
932:       idx = 4 * (*vi++);
933:       x1  = t[idx];
934:       x2  = t[1 + idx];
935:       x3  = t[2 + idx];
936:       x4  = t[3 + idx];
937:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
938:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
939:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
940:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
941:       v += 16;
942:     }
943:     idx        = 4 * i;
944:     t[idx]     = s1;
945:     t[1 + idx] = s2;
946:     t[2 + idx] = s3;
947:     t[3 + idx] = s4;
948:   }
949:   /* backward solve the upper triangular */
950:   for (i = n - 1; i >= 0; i--) {
951:     v   = aa + 16 * diag[i] + 16;
952:     vi  = aj + diag[i] + 1;
953:     nz  = ai[i + 1] - diag[i] - 1;
954:     idt = 4 * i;
955:     s1  = t[idt];
956:     s2  = t[1 + idt];
957:     s3  = t[2 + idt];
958:     s4  = t[3 + idt];
959:     while (nz--) {
960:       idx = 4 * (*vi++);
961:       x1  = t[idx];
962:       x2  = t[1 + idx];
963:       x3  = t[2 + idx];
964:       x4  = t[3 + idx];
965:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
966:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
967:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
968:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
969:       v += 16;
970:     }
971:     idc        = 4 * (*c--);
972:     v          = aa + 16 * diag[i];
973:     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
974:     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
975:     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
976:     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
977:     x[idc]     = (PetscScalar)t[idt];
978:     x[1 + idc] = (PetscScalar)t[1 + idt];
979:     x[2 + idc] = (PetscScalar)t[2 + idt];
980:     x[3 + idc] = (PetscScalar)t[3 + idt];
981:   }

983:   ISRestoreIndices(isrow, &rout);
984:   ISRestoreIndices(iscol, &cout);
985:   VecRestoreArrayRead(bb, &b);
986:   VecRestoreArray(xx, &x);
987:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
988:   return 0;
989: }

991: #if defined(PETSC_HAVE_SSE)

993:   #include PETSC_HAVE_SSE

995: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
996: {
997:   /*
998:      Note: This code uses demotion of double
999:      to float when performing the mixed-mode computation.
1000:      This may not be numerically reasonable for all applications.
1001:   */
1002:   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
1003:   IS              iscol = a->col, isrow = a->row;
1004:   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1005:   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1006:   MatScalar      *aa = a->a, *v;
1007:   PetscScalar    *x, *b, *t;

1009:   /* Make space in temp stack for 16 Byte Aligned arrays */
1010:   float         ssealignedspace[11], *tmps, *tmpx;
1011:   unsigned long offset;

1013:   SSE_SCOPE_BEGIN;

1015:   offset = (unsigned long)ssealignedspace % 16;
1016:   if (offset) offset = (16 - offset) / 4;
1017:   tmps = &ssealignedspace[offset];
1018:   tmpx = &ssealignedspace[offset + 4];
1019:   PREFETCH_NTA(aa + 16 * ai[1]);

1021:   VecGetArray(bb, &b);
1022:   VecGetArray(xx, &x);
1023:   t = a->solve_work;

1025:   ISGetIndices(isrow, &rout);
1026:   r = rout;
1027:   ISGetIndices(iscol, &cout);
1028:   c = cout + (n - 1);

1030:   /* forward solve the lower triangular */
1031:   idx  = 4 * (*r++);
1032:   t[0] = b[idx];
1033:   t[1] = b[1 + idx];
1034:   t[2] = b[2 + idx];
1035:   t[3] = b[3 + idx];
1036:   v    = aa + 16 * ai[1];

1038:   for (i = 1; i < n;) {
1039:     PREFETCH_NTA(&v[8]);
1040:     vi  = aj + ai[i];
1041:     nz  = diag[i] - ai[i];
1042:     idx = 4 * (*r++);

1044:     /* Demote sum from double to float */
1045:     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1046:     LOAD_PS(tmps, XMM7);

1048:     while (nz--) {
1049:       PREFETCH_NTA(&v[16]);
1050:       idx = 4 * (*vi++);

1052:       /* Demote solution (so far) from double to float */
1053:       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);

1055:       /* 4x4 Matrix-Vector product with negative accumulation: */
1056:       SSE_INLINE_BEGIN_2(tmpx, v)
1057:       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

1059:       /* First Column */
1060:       SSE_COPY_PS(XMM0, XMM6)
1061:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1062:       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1063:       SSE_SUB_PS(XMM7, XMM0)

1065:       /* Second Column */
1066:       SSE_COPY_PS(XMM1, XMM6)
1067:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1068:       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1069:       SSE_SUB_PS(XMM7, XMM1)

1071:       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

1073:       /* Third Column */
1074:       SSE_COPY_PS(XMM2, XMM6)
1075:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1076:       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1077:       SSE_SUB_PS(XMM7, XMM2)

1079:       /* Fourth Column */
1080:       SSE_COPY_PS(XMM3, XMM6)
1081:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1082:       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1083:       SSE_SUB_PS(XMM7, XMM3)
1084:       SSE_INLINE_END_2

1086:       v += 16;
1087:     }
1088:     idx = 4 * i;
1089:     v   = aa + 16 * ai[++i];
1090:     PREFETCH_NTA(v);
1091:     STORE_PS(tmps, XMM7);

1093:     /* Promote result from float to double */
1094:     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1095:   }
1096:   /* backward solve the upper triangular */
1097:   idt  = 4 * (n - 1);
1098:   ai16 = 16 * diag[n - 1];
1099:   v    = aa + ai16 + 16;
1100:   for (i = n - 1; i >= 0;) {
1101:     PREFETCH_NTA(&v[8]);
1102:     vi = aj + diag[i] + 1;
1103:     nz = ai[i + 1] - diag[i] - 1;

1105:     /* Demote accumulator from double to float */
1106:     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1107:     LOAD_PS(tmps, XMM7);

1109:     while (nz--) {
1110:       PREFETCH_NTA(&v[16]);
1111:       idx = 4 * (*vi++);

1113:       /* Demote solution (so far) from double to float */
1114:       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);

1116:       /* 4x4 Matrix-Vector Product with negative accumulation: */
1117:       SSE_INLINE_BEGIN_2(tmpx, v)
1118:       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

1120:       /* First Column */
1121:       SSE_COPY_PS(XMM0, XMM6)
1122:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1123:       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1124:       SSE_SUB_PS(XMM7, XMM0)

1126:       /* Second Column */
1127:       SSE_COPY_PS(XMM1, XMM6)
1128:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1129:       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1130:       SSE_SUB_PS(XMM7, XMM1)

1132:       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

1134:       /* Third Column */
1135:       SSE_COPY_PS(XMM2, XMM6)
1136:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1137:       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1138:       SSE_SUB_PS(XMM7, XMM2)

1140:       /* Fourth Column */
1141:       SSE_COPY_PS(XMM3, XMM6)
1142:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1143:       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1144:       SSE_SUB_PS(XMM7, XMM3)
1145:       SSE_INLINE_END_2
1146:       v += 16;
1147:     }
1148:     v    = aa + ai16;
1149:     ai16 = 16 * diag[--i];
1150:     PREFETCH_NTA(aa + ai16 + 16);
1151:     /*
1152:        Scale the result by the diagonal 4x4 block,
1153:        which was inverted as part of the factorization
1154:     */
1155:     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1156:     /* First Column */
1157:     SSE_COPY_PS(XMM0, XMM7)
1158:     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1159:     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

1161:     /* Second Column */
1162:     SSE_COPY_PS(XMM1, XMM7)
1163:     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1164:     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1165:     SSE_ADD_PS(XMM0, XMM1)

1167:     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

1169:     /* Third Column */
1170:     SSE_COPY_PS(XMM2, XMM7)
1171:     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1172:     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1173:     SSE_ADD_PS(XMM0, XMM2)

1175:     /* Fourth Column */
1176:     SSE_COPY_PS(XMM3, XMM7)
1177:     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1178:     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1179:     SSE_ADD_PS(XMM0, XMM3)

1181:     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1182:     SSE_INLINE_END_3

1184:     /* Promote solution from float to double */
1185:     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);

1187:     /* Apply reordering to t and stream into x.    */
1188:     /* This way, x doesn't pollute the cache.      */
1189:     /* Be careful with size: 2 doubles = 4 floats! */
1190:     idc = 4 * (*c--);
1191:     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1192:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1193:     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1194:     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1195:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1196:     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1197:     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1198:     SSE_INLINE_END_2
1199:     v = aa + ai16 + 16;
1200:     idt -= 4;
1201:   }

1203:   ISRestoreIndices(isrow, &rout);
1204:   ISRestoreIndices(iscol, &cout);
1205:   VecRestoreArray(bb, &b);
1206:   VecRestoreArray(xx, &x);
1207:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
1208:   SSE_SCOPE_END;
1209:   return 0;
1210: }

1212: #endif

1214: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1215: {
1216:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1217:   IS                 iscol = a->col, isrow = a->row;
1218:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1219:   PetscInt           i, nz, idx, idt, idc;
1220:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1221:   const MatScalar   *aa = a->a, *v;
1222:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1223:   const PetscScalar *b;

1225:   VecGetArrayRead(bb, &b);
1226:   VecGetArray(xx, &x);
1227:   t = a->solve_work;

1229:   ISGetIndices(isrow, &rout);
1230:   r = rout;
1231:   ISGetIndices(iscol, &cout);
1232:   c = cout + (n - 1);

1234:   /* forward solve the lower triangular */
1235:   idx  = 3 * (*r++);
1236:   t[0] = b[idx];
1237:   t[1] = b[1 + idx];
1238:   t[2] = b[2 + idx];
1239:   for (i = 1; i < n; i++) {
1240:     v   = aa + 9 * ai[i];
1241:     vi  = aj + ai[i];
1242:     nz  = diag[i] - ai[i];
1243:     idx = 3 * (*r++);
1244:     s1  = b[idx];
1245:     s2  = b[1 + idx];
1246:     s3  = b[2 + idx];
1247:     while (nz--) {
1248:       idx = 3 * (*vi++);
1249:       x1  = t[idx];
1250:       x2  = t[1 + idx];
1251:       x3  = t[2 + idx];
1252:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1253:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1254:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1255:       v += 9;
1256:     }
1257:     idx        = 3 * i;
1258:     t[idx]     = s1;
1259:     t[1 + idx] = s2;
1260:     t[2 + idx] = s3;
1261:   }
1262:   /* backward solve the upper triangular */
1263:   for (i = n - 1; i >= 0; i--) {
1264:     v   = aa + 9 * diag[i] + 9;
1265:     vi  = aj + diag[i] + 1;
1266:     nz  = ai[i + 1] - diag[i] - 1;
1267:     idt = 3 * i;
1268:     s1  = t[idt];
1269:     s2  = t[1 + idt];
1270:     s3  = t[2 + idt];
1271:     while (nz--) {
1272:       idx = 3 * (*vi++);
1273:       x1  = t[idx];
1274:       x2  = t[1 + idx];
1275:       x3  = t[2 + idx];
1276:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1277:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1278:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1279:       v += 9;
1280:     }
1281:     idc    = 3 * (*c--);
1282:     v      = aa + 9 * diag[i];
1283:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1284:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1285:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1286:   }
1287:   ISRestoreIndices(isrow, &rout);
1288:   ISRestoreIndices(iscol, &cout);
1289:   VecRestoreArrayRead(bb, &b);
1290:   VecRestoreArray(xx, &x);
1291:   PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n);
1292:   return 0;
1293: }

1295: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1296: {
1297:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1298:   IS                 iscol = a->col, isrow = a->row;
1299:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1300:   PetscInt           i, nz, idx, idt, idc, m;
1301:   const PetscInt    *r, *c, *rout, *cout;
1302:   const MatScalar   *aa = a->a, *v;
1303:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1304:   const PetscScalar *b;

1306:   VecGetArrayRead(bb, &b);
1307:   VecGetArray(xx, &x);
1308:   t = a->solve_work;

1310:   ISGetIndices(isrow, &rout);
1311:   r = rout;
1312:   ISGetIndices(iscol, &cout);
1313:   c = cout;

1315:   /* forward solve the lower triangular */
1316:   idx  = 3 * r[0];
1317:   t[0] = b[idx];
1318:   t[1] = b[1 + idx];
1319:   t[2] = b[2 + idx];
1320:   for (i = 1; i < n; i++) {
1321:     v   = aa + 9 * ai[i];
1322:     vi  = aj + ai[i];
1323:     nz  = ai[i + 1] - ai[i];
1324:     idx = 3 * r[i];
1325:     s1  = b[idx];
1326:     s2  = b[1 + idx];
1327:     s3  = b[2 + idx];
1328:     for (m = 0; m < nz; m++) {
1329:       idx = 3 * vi[m];
1330:       x1  = t[idx];
1331:       x2  = t[1 + idx];
1332:       x3  = t[2 + idx];
1333:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1334:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1335:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1336:       v += 9;
1337:     }
1338:     idx        = 3 * i;
1339:     t[idx]     = s1;
1340:     t[1 + idx] = s2;
1341:     t[2 + idx] = s3;
1342:   }
1343:   /* backward solve the upper triangular */
1344:   for (i = n - 1; i >= 0; i--) {
1345:     v   = aa + 9 * (adiag[i + 1] + 1);
1346:     vi  = aj + adiag[i + 1] + 1;
1347:     nz  = adiag[i] - adiag[i + 1] - 1;
1348:     idt = 3 * i;
1349:     s1  = t[idt];
1350:     s2  = t[1 + idt];
1351:     s3  = t[2 + idt];
1352:     for (m = 0; m < nz; m++) {
1353:       idx = 3 * vi[m];
1354:       x1  = t[idx];
1355:       x2  = t[1 + idx];
1356:       x3  = t[2 + idx];
1357:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1358:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1359:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1360:       v += 9;
1361:     }
1362:     idc    = 3 * c[i];
1363:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1364:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1365:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1366:   }
1367:   ISRestoreIndices(isrow, &rout);
1368:   ISRestoreIndices(iscol, &cout);
1369:   VecRestoreArrayRead(bb, &b);
1370:   VecRestoreArray(xx, &x);
1371:   PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n);
1372:   return 0;
1373: }

1375: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1376: {
1377:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1378:   IS                 iscol = a->col, isrow = a->row;
1379:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1380:   PetscInt           i, nz, idx, idt, idc;
1381:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1382:   const MatScalar   *aa = a->a, *v;
1383:   PetscScalar       *x, s1, s2, x1, x2, *t;
1384:   const PetscScalar *b;

1386:   VecGetArrayRead(bb, &b);
1387:   VecGetArray(xx, &x);
1388:   t = a->solve_work;

1390:   ISGetIndices(isrow, &rout);
1391:   r = rout;
1392:   ISGetIndices(iscol, &cout);
1393:   c = cout + (n - 1);

1395:   /* forward solve the lower triangular */
1396:   idx  = 2 * (*r++);
1397:   t[0] = b[idx];
1398:   t[1] = b[1 + idx];
1399:   for (i = 1; i < n; i++) {
1400:     v   = aa + 4 * ai[i];
1401:     vi  = aj + ai[i];
1402:     nz  = diag[i] - ai[i];
1403:     idx = 2 * (*r++);
1404:     s1  = b[idx];
1405:     s2  = b[1 + idx];
1406:     while (nz--) {
1407:       idx = 2 * (*vi++);
1408:       x1  = t[idx];
1409:       x2  = t[1 + idx];
1410:       s1 -= v[0] * x1 + v[2] * x2;
1411:       s2 -= v[1] * x1 + v[3] * x2;
1412:       v += 4;
1413:     }
1414:     idx        = 2 * i;
1415:     t[idx]     = s1;
1416:     t[1 + idx] = s2;
1417:   }
1418:   /* backward solve the upper triangular */
1419:   for (i = n - 1; i >= 0; i--) {
1420:     v   = aa + 4 * diag[i] + 4;
1421:     vi  = aj + diag[i] + 1;
1422:     nz  = ai[i + 1] - diag[i] - 1;
1423:     idt = 2 * i;
1424:     s1  = t[idt];
1425:     s2  = t[1 + idt];
1426:     while (nz--) {
1427:       idx = 2 * (*vi++);
1428:       x1  = t[idx];
1429:       x2  = t[1 + idx];
1430:       s1 -= v[0] * x1 + v[2] * x2;
1431:       s2 -= v[1] * x1 + v[3] * x2;
1432:       v += 4;
1433:     }
1434:     idc    = 2 * (*c--);
1435:     v      = aa + 4 * diag[i];
1436:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1437:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1438:   }
1439:   ISRestoreIndices(isrow, &rout);
1440:   ISRestoreIndices(iscol, &cout);
1441:   VecRestoreArrayRead(bb, &b);
1442:   VecRestoreArray(xx, &x);
1443:   PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n);
1444:   return 0;
1445: }

1447: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1448: {
1449:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1450:   IS                 iscol = a->col, isrow = a->row;
1451:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1452:   PetscInt           i, nz, idx, jdx, idt, idc, m;
1453:   const PetscInt    *r, *c, *rout, *cout;
1454:   const MatScalar   *aa = a->a, *v;
1455:   PetscScalar       *x, s1, s2, x1, x2, *t;
1456:   const PetscScalar *b;

1458:   VecGetArrayRead(bb, &b);
1459:   VecGetArray(xx, &x);
1460:   t = a->solve_work;

1462:   ISGetIndices(isrow, &rout);
1463:   r = rout;
1464:   ISGetIndices(iscol, &cout);
1465:   c = cout;

1467:   /* forward solve the lower triangular */
1468:   idx  = 2 * r[0];
1469:   t[0] = b[idx];
1470:   t[1] = b[1 + idx];
1471:   for (i = 1; i < n; i++) {
1472:     v   = aa + 4 * ai[i];
1473:     vi  = aj + ai[i];
1474:     nz  = ai[i + 1] - ai[i];
1475:     idx = 2 * r[i];
1476:     s1  = b[idx];
1477:     s2  = b[1 + idx];
1478:     for (m = 0; m < nz; m++) {
1479:       jdx = 2 * vi[m];
1480:       x1  = t[jdx];
1481:       x2  = t[1 + jdx];
1482:       s1 -= v[0] * x1 + v[2] * x2;
1483:       s2 -= v[1] * x1 + v[3] * x2;
1484:       v += 4;
1485:     }
1486:     idx        = 2 * i;
1487:     t[idx]     = s1;
1488:     t[1 + idx] = s2;
1489:   }
1490:   /* backward solve the upper triangular */
1491:   for (i = n - 1; i >= 0; i--) {
1492:     v   = aa + 4 * (adiag[i + 1] + 1);
1493:     vi  = aj + adiag[i + 1] + 1;
1494:     nz  = adiag[i] - adiag[i + 1] - 1;
1495:     idt = 2 * i;
1496:     s1  = t[idt];
1497:     s2  = t[1 + idt];
1498:     for (m = 0; m < nz; m++) {
1499:       idx = 2 * vi[m];
1500:       x1  = t[idx];
1501:       x2  = t[1 + idx];
1502:       s1 -= v[0] * x1 + v[2] * x2;
1503:       s2 -= v[1] * x1 + v[3] * x2;
1504:       v += 4;
1505:     }
1506:     idc    = 2 * c[i];
1507:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1508:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1509:   }
1510:   ISRestoreIndices(isrow, &rout);
1511:   ISRestoreIndices(iscol, &cout);
1512:   VecRestoreArrayRead(bb, &b);
1513:   VecRestoreArray(xx, &x);
1514:   PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n);
1515:   return 0;
1516: }

1518: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1519: {
1520:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1521:   IS                 iscol = a->col, isrow = a->row;
1522:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1523:   PetscInt           i, nz;
1524:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1525:   const MatScalar   *aa = a->a, *v;
1526:   PetscScalar       *x, s1, *t;
1527:   const PetscScalar *b;

1529:   if (!n) return 0;

1531:   VecGetArrayRead(bb, &b);
1532:   VecGetArray(xx, &x);
1533:   t = a->solve_work;

1535:   ISGetIndices(isrow, &rout);
1536:   r = rout;
1537:   ISGetIndices(iscol, &cout);
1538:   c = cout + (n - 1);

1540:   /* forward solve the lower triangular */
1541:   t[0] = b[*r++];
1542:   for (i = 1; i < n; i++) {
1543:     v  = aa + ai[i];
1544:     vi = aj + ai[i];
1545:     nz = diag[i] - ai[i];
1546:     s1 = b[*r++];
1547:     while (nz--) s1 -= (*v++) * t[*vi++];
1548:     t[i] = s1;
1549:   }
1550:   /* backward solve the upper triangular */
1551:   for (i = n - 1; i >= 0; i--) {
1552:     v  = aa + diag[i] + 1;
1553:     vi = aj + diag[i] + 1;
1554:     nz = ai[i + 1] - diag[i] - 1;
1555:     s1 = t[i];
1556:     while (nz--) s1 -= (*v++) * t[*vi++];
1557:     x[*c--] = t[i] = aa[diag[i]] * s1;
1558:   }

1560:   ISRestoreIndices(isrow, &rout);
1561:   ISRestoreIndices(iscol, &cout);
1562:   VecRestoreArrayRead(bb, &b);
1563:   VecRestoreArray(xx, &x);
1564:   PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n);
1565:   return 0;
1566: }

1568: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1569: {
1570:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1571:   IS                 iscol = a->col, isrow = a->row;
1572:   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1573:   const PetscInt    *rout, *cout, *r, *c;
1574:   PetscScalar       *x, *tmp, sum;
1575:   const PetscScalar *b;
1576:   const MatScalar   *aa = a->a, *v;

1578:   if (!n) return 0;

1580:   VecGetArrayRead(bb, &b);
1581:   VecGetArray(xx, &x);
1582:   tmp = a->solve_work;

1584:   ISGetIndices(isrow, &rout);
1585:   r = rout;
1586:   ISGetIndices(iscol, &cout);
1587:   c = cout;

1589:   /* forward solve the lower triangular */
1590:   tmp[0] = b[r[0]];
1591:   v      = aa;
1592:   vi     = aj;
1593:   for (i = 1; i < n; i++) {
1594:     nz  = ai[i + 1] - ai[i];
1595:     sum = b[r[i]];
1596:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1597:     tmp[i] = sum;
1598:     v += nz;
1599:     vi += nz;
1600:   }

1602:   /* backward solve the upper triangular */
1603:   for (i = n - 1; i >= 0; i--) {
1604:     v   = aa + adiag[i + 1] + 1;
1605:     vi  = aj + adiag[i + 1] + 1;
1606:     nz  = adiag[i] - adiag[i + 1] - 1;
1607:     sum = tmp[i];
1608:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1609:     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1610:   }

1612:   ISRestoreIndices(isrow, &rout);
1613:   ISRestoreIndices(iscol, &cout);
1614:   VecRestoreArrayRead(bb, &b);
1615:   VecRestoreArray(xx, &x);
1616:   PetscLogFlops(2.0 * a->nz - A->cmap->n);
1617:   return 0;
1618: }