Actual source code: baijsolv.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5: {
6: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
7: IS iscol = a->col, isrow = a->row;
8: const PetscInt *r, *c, *rout, *cout;
9: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10: PetscInt i, nz;
11: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
12: const MatScalar *aa = a->a, *v;
13: PetscScalar *x, *s, *t, *ls;
14: const PetscScalar *b;
16: VecGetArrayRead(bb, &b);
17: VecGetArray(xx, &x);
18: t = a->solve_work;
20: ISGetIndices(isrow, &rout);
21: r = rout;
22: ISGetIndices(iscol, &cout);
23: c = cout + (n - 1);
25: /* forward solve the lower triangular */
26: PetscArraycpy(t, b + bs * (*r++), bs);
27: for (i = 1; i < n; i++) {
28: v = aa + bs2 * ai[i];
29: vi = aj + ai[i];
30: nz = a->diag[i] - ai[i];
31: s = t + bs * i;
32: PetscArraycpy(s, b + bs * (*r++), bs);
33: while (nz--) {
34: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
35: v += bs2;
36: }
37: }
38: /* backward solve the upper triangular */
39: ls = a->solve_work + A->cmap->n;
40: for (i = n - 1; i >= 0; i--) {
41: v = aa + bs2 * (a->diag[i] + 1);
42: vi = aj + a->diag[i] + 1;
43: nz = ai[i + 1] - a->diag[i] - 1;
44: PetscArraycpy(ls, t + i * bs, bs);
45: while (nz--) {
46: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
47: v += bs2;
48: }
49: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
50: PetscArraycpy(x + bs * (*c--), t + i * bs, bs);
51: }
53: ISRestoreIndices(isrow, &rout);
54: ISRestoreIndices(iscol, &cout);
55: VecRestoreArrayRead(bb, &b);
56: VecRestoreArray(xx, &x);
57: PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
58: return 0;
59: }
61: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
62: {
63: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
64: IS iscol = a->col, isrow = a->row;
65: const PetscInt *r, *c, *ai = a->i, *aj = a->j;
66: const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
67: PetscInt i, nz, idx, idt, idc;
68: const MatScalar *aa = a->a, *v;
69: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
70: const PetscScalar *b;
72: VecGetArrayRead(bb, &b);
73: VecGetArray(xx, &x);
74: t = a->solve_work;
76: ISGetIndices(isrow, &rout);
77: r = rout;
78: ISGetIndices(iscol, &cout);
79: c = cout + (n - 1);
81: /* forward solve the lower triangular */
82: idx = 7 * (*r++);
83: t[0] = b[idx];
84: t[1] = b[1 + idx];
85: t[2] = b[2 + idx];
86: t[3] = b[3 + idx];
87: t[4] = b[4 + idx];
88: t[5] = b[5 + idx];
89: t[6] = b[6 + idx];
91: for (i = 1; i < n; i++) {
92: v = aa + 49 * ai[i];
93: vi = aj + ai[i];
94: nz = diag[i] - ai[i];
95: idx = 7 * (*r++);
96: s1 = b[idx];
97: s2 = b[1 + idx];
98: s3 = b[2 + idx];
99: s4 = b[3 + idx];
100: s5 = b[4 + idx];
101: s6 = b[5 + idx];
102: s7 = b[6 + idx];
103: while (nz--) {
104: idx = 7 * (*vi++);
105: x1 = t[idx];
106: x2 = t[1 + idx];
107: x3 = t[2 + idx];
108: x4 = t[3 + idx];
109: x5 = t[4 + idx];
110: x6 = t[5 + idx];
111: x7 = t[6 + idx];
112: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
113: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
114: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
115: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
116: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
117: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
118: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
119: v += 49;
120: }
121: idx = 7 * i;
122: t[idx] = s1;
123: t[1 + idx] = s2;
124: t[2 + idx] = s3;
125: t[3 + idx] = s4;
126: t[4 + idx] = s5;
127: t[5 + idx] = s6;
128: t[6 + idx] = s7;
129: }
130: /* backward solve the upper triangular */
131: for (i = n - 1; i >= 0; i--) {
132: v = aa + 49 * diag[i] + 49;
133: vi = aj + diag[i] + 1;
134: nz = ai[i + 1] - diag[i] - 1;
135: idt = 7 * i;
136: s1 = t[idt];
137: s2 = t[1 + idt];
138: s3 = t[2 + idt];
139: s4 = t[3 + idt];
140: s5 = t[4 + idt];
141: s6 = t[5 + idt];
142: s7 = t[6 + idt];
143: while (nz--) {
144: idx = 7 * (*vi++);
145: x1 = t[idx];
146: x2 = t[1 + idx];
147: x3 = t[2 + idx];
148: x4 = t[3 + idx];
149: x5 = t[4 + idx];
150: x6 = t[5 + idx];
151: x7 = t[6 + idx];
152: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
153: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
154: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
155: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
156: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
157: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
158: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
159: v += 49;
160: }
161: idc = 7 * (*c--);
162: v = aa + 49 * diag[i];
163: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
164: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
165: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
166: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
167: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
168: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
169: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
170: }
172: ISRestoreIndices(isrow, &rout);
173: ISRestoreIndices(iscol, &cout);
174: VecRestoreArrayRead(bb, &b);
175: VecRestoreArray(xx, &x);
176: PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n);
177: return 0;
178: }
180: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
181: {
182: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
183: IS iscol = a->col, isrow = a->row;
184: const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
185: const PetscInt n = a->mbs, *rout, *cout, *vi;
186: PetscInt i, nz, idx, idt, idc, m;
187: const MatScalar *aa = a->a, *v;
188: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
189: const PetscScalar *b;
191: VecGetArrayRead(bb, &b);
192: VecGetArray(xx, &x);
193: t = a->solve_work;
195: ISGetIndices(isrow, &rout);
196: r = rout;
197: ISGetIndices(iscol, &cout);
198: c = cout;
200: /* forward solve the lower triangular */
201: idx = 7 * r[0];
202: t[0] = b[idx];
203: t[1] = b[1 + idx];
204: t[2] = b[2 + idx];
205: t[3] = b[3 + idx];
206: t[4] = b[4 + idx];
207: t[5] = b[5 + idx];
208: t[6] = b[6 + idx];
210: for (i = 1; i < n; i++) {
211: v = aa + 49 * ai[i];
212: vi = aj + ai[i];
213: nz = ai[i + 1] - ai[i];
214: idx = 7 * r[i];
215: s1 = b[idx];
216: s2 = b[1 + idx];
217: s3 = b[2 + idx];
218: s4 = b[3 + idx];
219: s5 = b[4 + idx];
220: s6 = b[5 + idx];
221: s7 = b[6 + idx];
222: for (m = 0; m < nz; m++) {
223: idx = 7 * vi[m];
224: x1 = t[idx];
225: x2 = t[1 + idx];
226: x3 = t[2 + idx];
227: x4 = t[3 + idx];
228: x5 = t[4 + idx];
229: x6 = t[5 + idx];
230: x7 = t[6 + idx];
231: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
232: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
233: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
234: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
235: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
236: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
237: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
238: v += 49;
239: }
240: idx = 7 * i;
241: t[idx] = s1;
242: t[1 + idx] = s2;
243: t[2 + idx] = s3;
244: t[3 + idx] = s4;
245: t[4 + idx] = s5;
246: t[5 + idx] = s6;
247: t[6 + idx] = s7;
248: }
249: /* backward solve the upper triangular */
250: for (i = n - 1; i >= 0; i--) {
251: v = aa + 49 * (adiag[i + 1] + 1);
252: vi = aj + adiag[i + 1] + 1;
253: nz = adiag[i] - adiag[i + 1] - 1;
254: idt = 7 * i;
255: s1 = t[idt];
256: s2 = t[1 + idt];
257: s3 = t[2 + idt];
258: s4 = t[3 + idt];
259: s5 = t[4 + idt];
260: s6 = t[5 + idt];
261: s7 = t[6 + idt];
262: for (m = 0; m < nz; m++) {
263: idx = 7 * vi[m];
264: x1 = t[idx];
265: x2 = t[1 + idx];
266: x3 = t[2 + idx];
267: x4 = t[3 + idx];
268: x5 = t[4 + idx];
269: x6 = t[5 + idx];
270: x7 = t[6 + idx];
271: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
272: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
273: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
274: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
275: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
276: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
277: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
278: v += 49;
279: }
280: idc = 7 * c[i];
281: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
282: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
283: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
284: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
285: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
286: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
287: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
288: }
290: ISRestoreIndices(isrow, &rout);
291: ISRestoreIndices(iscol, &cout);
292: VecRestoreArrayRead(bb, &b);
293: VecRestoreArray(xx, &x);
294: PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n);
295: return 0;
296: }
298: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
299: {
300: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
301: IS iscol = a->col, isrow = a->row;
302: const PetscInt *r, *c, *rout, *cout;
303: const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
304: PetscInt i, nz, idx, idt, idc;
305: const MatScalar *aa = a->a, *v;
306: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
307: const PetscScalar *b;
309: VecGetArrayRead(bb, &b);
310: VecGetArray(xx, &x);
311: t = a->solve_work;
313: ISGetIndices(isrow, &rout);
314: r = rout;
315: ISGetIndices(iscol, &cout);
316: c = cout + (n - 1);
318: /* forward solve the lower triangular */
319: idx = 6 * (*r++);
320: t[0] = b[idx];
321: t[1] = b[1 + idx];
322: t[2] = b[2 + idx];
323: t[3] = b[3 + idx];
324: t[4] = b[4 + idx];
325: t[5] = b[5 + idx];
326: for (i = 1; i < n; i++) {
327: v = aa + 36 * ai[i];
328: vi = aj + ai[i];
329: nz = diag[i] - ai[i];
330: idx = 6 * (*r++);
331: s1 = b[idx];
332: s2 = b[1 + idx];
333: s3 = b[2 + idx];
334: s4 = b[3 + idx];
335: s5 = b[4 + idx];
336: s6 = b[5 + idx];
337: while (nz--) {
338: idx = 6 * (*vi++);
339: x1 = t[idx];
340: x2 = t[1 + idx];
341: x3 = t[2 + idx];
342: x4 = t[3 + idx];
343: x5 = t[4 + idx];
344: x6 = t[5 + idx];
345: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
346: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
347: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
348: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
349: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
350: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
351: v += 36;
352: }
353: idx = 6 * i;
354: t[idx] = s1;
355: t[1 + idx] = s2;
356: t[2 + idx] = s3;
357: t[3 + idx] = s4;
358: t[4 + idx] = s5;
359: t[5 + idx] = s6;
360: }
361: /* backward solve the upper triangular */
362: for (i = n - 1; i >= 0; i--) {
363: v = aa + 36 * diag[i] + 36;
364: vi = aj + diag[i] + 1;
365: nz = ai[i + 1] - diag[i] - 1;
366: idt = 6 * i;
367: s1 = t[idt];
368: s2 = t[1 + idt];
369: s3 = t[2 + idt];
370: s4 = t[3 + idt];
371: s5 = t[4 + idt];
372: s6 = t[5 + idt];
373: while (nz--) {
374: idx = 6 * (*vi++);
375: x1 = t[idx];
376: x2 = t[1 + idx];
377: x3 = t[2 + idx];
378: x4 = t[3 + idx];
379: x5 = t[4 + idx];
380: x6 = t[5 + idx];
381: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
382: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
383: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
384: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
385: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
386: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
387: v += 36;
388: }
389: idc = 6 * (*c--);
390: v = aa + 36 * diag[i];
391: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
392: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
393: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
394: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
395: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
396: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
397: }
399: ISRestoreIndices(isrow, &rout);
400: ISRestoreIndices(iscol, &cout);
401: VecRestoreArrayRead(bb, &b);
402: VecRestoreArray(xx, &x);
403: PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n);
404: return 0;
405: }
407: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
408: {
409: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
410: IS iscol = a->col, isrow = a->row;
411: const PetscInt *r, *c, *rout, *cout;
412: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
413: PetscInt i, nz, idx, idt, idc, m;
414: const MatScalar *aa = a->a, *v;
415: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
416: const PetscScalar *b;
418: VecGetArrayRead(bb, &b);
419: VecGetArray(xx, &x);
420: t = a->solve_work;
422: ISGetIndices(isrow, &rout);
423: r = rout;
424: ISGetIndices(iscol, &cout);
425: c = cout;
427: /* forward solve the lower triangular */
428: idx = 6 * r[0];
429: t[0] = b[idx];
430: t[1] = b[1 + idx];
431: t[2] = b[2 + idx];
432: t[3] = b[3 + idx];
433: t[4] = b[4 + idx];
434: t[5] = b[5 + idx];
435: for (i = 1; i < n; i++) {
436: v = aa + 36 * ai[i];
437: vi = aj + ai[i];
438: nz = ai[i + 1] - ai[i];
439: idx = 6 * r[i];
440: s1 = b[idx];
441: s2 = b[1 + idx];
442: s3 = b[2 + idx];
443: s4 = b[3 + idx];
444: s5 = b[4 + idx];
445: s6 = b[5 + idx];
446: for (m = 0; m < nz; m++) {
447: idx = 6 * vi[m];
448: x1 = t[idx];
449: x2 = t[1 + idx];
450: x3 = t[2 + idx];
451: x4 = t[3 + idx];
452: x5 = t[4 + idx];
453: x6 = t[5 + idx];
454: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
455: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
456: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
457: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
458: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
459: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
460: v += 36;
461: }
462: idx = 6 * i;
463: t[idx] = s1;
464: t[1 + idx] = s2;
465: t[2 + idx] = s3;
466: t[3 + idx] = s4;
467: t[4 + idx] = s5;
468: t[5 + idx] = s6;
469: }
470: /* backward solve the upper triangular */
471: for (i = n - 1; i >= 0; i--) {
472: v = aa + 36 * (adiag[i + 1] + 1);
473: vi = aj + adiag[i + 1] + 1;
474: nz = adiag[i] - adiag[i + 1] - 1;
475: idt = 6 * i;
476: s1 = t[idt];
477: s2 = t[1 + idt];
478: s3 = t[2 + idt];
479: s4 = t[3 + idt];
480: s5 = t[4 + idt];
481: s6 = t[5 + idt];
482: for (m = 0; m < nz; m++) {
483: idx = 6 * vi[m];
484: x1 = t[idx];
485: x2 = t[1 + idx];
486: x3 = t[2 + idx];
487: x4 = t[3 + idx];
488: x5 = t[4 + idx];
489: x6 = t[5 + idx];
490: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
491: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
492: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
493: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
494: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
495: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
496: v += 36;
497: }
498: idc = 6 * c[i];
499: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
500: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
501: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
502: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
503: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
504: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
505: }
507: ISRestoreIndices(isrow, &rout);
508: ISRestoreIndices(iscol, &cout);
509: VecRestoreArrayRead(bb, &b);
510: VecRestoreArray(xx, &x);
511: PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n);
512: return 0;
513: }
515: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
516: {
517: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
518: IS iscol = a->col, isrow = a->row;
519: const PetscInt *r, *c, *rout, *cout, *diag = a->diag;
520: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
521: PetscInt i, nz, idx, idt, idc;
522: const MatScalar *aa = a->a, *v;
523: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
524: const PetscScalar *b;
526: VecGetArrayRead(bb, &b);
527: VecGetArray(xx, &x);
528: t = a->solve_work;
530: ISGetIndices(isrow, &rout);
531: r = rout;
532: ISGetIndices(iscol, &cout);
533: c = cout + (n - 1);
535: /* forward solve the lower triangular */
536: idx = 5 * (*r++);
537: t[0] = b[idx];
538: t[1] = b[1 + idx];
539: t[2] = b[2 + idx];
540: t[3] = b[3 + idx];
541: t[4] = b[4 + idx];
542: for (i = 1; i < n; i++) {
543: v = aa + 25 * ai[i];
544: vi = aj + ai[i];
545: nz = diag[i] - ai[i];
546: idx = 5 * (*r++);
547: s1 = b[idx];
548: s2 = b[1 + idx];
549: s3 = b[2 + idx];
550: s4 = b[3 + idx];
551: s5 = b[4 + idx];
552: while (nz--) {
553: idx = 5 * (*vi++);
554: x1 = t[idx];
555: x2 = t[1 + idx];
556: x3 = t[2 + idx];
557: x4 = t[3 + idx];
558: x5 = t[4 + idx];
559: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
560: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
561: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
562: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
563: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
564: v += 25;
565: }
566: idx = 5 * i;
567: t[idx] = s1;
568: t[1 + idx] = s2;
569: t[2 + idx] = s3;
570: t[3 + idx] = s4;
571: t[4 + idx] = s5;
572: }
573: /* backward solve the upper triangular */
574: for (i = n - 1; i >= 0; i--) {
575: v = aa + 25 * diag[i] + 25;
576: vi = aj + diag[i] + 1;
577: nz = ai[i + 1] - diag[i] - 1;
578: idt = 5 * i;
579: s1 = t[idt];
580: s2 = t[1 + idt];
581: s3 = t[2 + idt];
582: s4 = t[3 + idt];
583: s5 = t[4 + idt];
584: while (nz--) {
585: idx = 5 * (*vi++);
586: x1 = t[idx];
587: x2 = t[1 + idx];
588: x3 = t[2 + idx];
589: x4 = t[3 + idx];
590: x5 = t[4 + idx];
591: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
592: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
593: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
594: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
595: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
596: v += 25;
597: }
598: idc = 5 * (*c--);
599: v = aa + 25 * diag[i];
600: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
601: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
602: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
603: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
604: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
605: }
607: ISRestoreIndices(isrow, &rout);
608: ISRestoreIndices(iscol, &cout);
609: VecRestoreArrayRead(bb, &b);
610: VecRestoreArray(xx, &x);
611: PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n);
612: return 0;
613: }
615: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
616: {
617: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
618: IS iscol = a->col, isrow = a->row;
619: const PetscInt *r, *c, *rout, *cout;
620: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
621: PetscInt i, nz, idx, idt, idc, m;
622: const MatScalar *aa = a->a, *v;
623: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
624: const PetscScalar *b;
626: VecGetArrayRead(bb, &b);
627: VecGetArray(xx, &x);
628: t = a->solve_work;
630: ISGetIndices(isrow, &rout);
631: r = rout;
632: ISGetIndices(iscol, &cout);
633: c = cout;
635: /* forward solve the lower triangular */
636: idx = 5 * r[0];
637: t[0] = b[idx];
638: t[1] = b[1 + idx];
639: t[2] = b[2 + idx];
640: t[3] = b[3 + idx];
641: t[4] = b[4 + idx];
642: for (i = 1; i < n; i++) {
643: v = aa + 25 * ai[i];
644: vi = aj + ai[i];
645: nz = ai[i + 1] - ai[i];
646: idx = 5 * r[i];
647: s1 = b[idx];
648: s2 = b[1 + idx];
649: s3 = b[2 + idx];
650: s4 = b[3 + idx];
651: s5 = b[4 + idx];
652: for (m = 0; m < nz; m++) {
653: idx = 5 * vi[m];
654: x1 = t[idx];
655: x2 = t[1 + idx];
656: x3 = t[2 + idx];
657: x4 = t[3 + idx];
658: x5 = t[4 + idx];
659: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
660: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
661: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
662: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
663: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
664: v += 25;
665: }
666: idx = 5 * i;
667: t[idx] = s1;
668: t[1 + idx] = s2;
669: t[2 + idx] = s3;
670: t[3 + idx] = s4;
671: t[4 + idx] = s5;
672: }
673: /* backward solve the upper triangular */
674: for (i = n - 1; i >= 0; i--) {
675: v = aa + 25 * (adiag[i + 1] + 1);
676: vi = aj + adiag[i + 1] + 1;
677: nz = adiag[i] - adiag[i + 1] - 1;
678: idt = 5 * i;
679: s1 = t[idt];
680: s2 = t[1 + idt];
681: s3 = t[2 + idt];
682: s4 = t[3 + idt];
683: s5 = t[4 + idt];
684: for (m = 0; m < nz; m++) {
685: idx = 5 * vi[m];
686: x1 = t[idx];
687: x2 = t[1 + idx];
688: x3 = t[2 + idx];
689: x4 = t[3 + idx];
690: x5 = t[4 + idx];
691: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
692: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
693: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
694: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
695: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
696: v += 25;
697: }
698: idc = 5 * c[i];
699: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
700: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
701: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
702: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
703: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
704: }
706: ISRestoreIndices(isrow, &rout);
707: ISRestoreIndices(iscol, &cout);
708: VecRestoreArrayRead(bb, &b);
709: VecRestoreArray(xx, &x);
710: PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n);
711: return 0;
712: }
714: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
715: {
716: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
717: IS iscol = a->col, isrow = a->row;
718: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
719: PetscInt i, nz, idx, idt, idc;
720: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
721: const MatScalar *aa = a->a, *v;
722: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
723: const PetscScalar *b;
725: VecGetArrayRead(bb, &b);
726: VecGetArray(xx, &x);
727: t = a->solve_work;
729: ISGetIndices(isrow, &rout);
730: r = rout;
731: ISGetIndices(iscol, &cout);
732: c = cout + (n - 1);
734: /* forward solve the lower triangular */
735: idx = 4 * (*r++);
736: t[0] = b[idx];
737: t[1] = b[1 + idx];
738: t[2] = b[2 + idx];
739: t[3] = b[3 + idx];
740: for (i = 1; i < n; i++) {
741: v = aa + 16 * ai[i];
742: vi = aj + ai[i];
743: nz = diag[i] - ai[i];
744: idx = 4 * (*r++);
745: s1 = b[idx];
746: s2 = b[1 + idx];
747: s3 = b[2 + idx];
748: s4 = b[3 + idx];
749: while (nz--) {
750: idx = 4 * (*vi++);
751: x1 = t[idx];
752: x2 = t[1 + idx];
753: x3 = t[2 + idx];
754: x4 = t[3 + idx];
755: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
756: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
757: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
758: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
759: v += 16;
760: }
761: idx = 4 * i;
762: t[idx] = s1;
763: t[1 + idx] = s2;
764: t[2 + idx] = s3;
765: t[3 + idx] = s4;
766: }
767: /* backward solve the upper triangular */
768: for (i = n - 1; i >= 0; i--) {
769: v = aa + 16 * diag[i] + 16;
770: vi = aj + diag[i] + 1;
771: nz = ai[i + 1] - diag[i] - 1;
772: idt = 4 * i;
773: s1 = t[idt];
774: s2 = t[1 + idt];
775: s3 = t[2 + idt];
776: s4 = t[3 + idt];
777: while (nz--) {
778: idx = 4 * (*vi++);
779: x1 = t[idx];
780: x2 = t[1 + idx];
781: x3 = t[2 + idx];
782: x4 = t[3 + idx];
783: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
784: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
785: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
786: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
787: v += 16;
788: }
789: idc = 4 * (*c--);
790: v = aa + 16 * diag[i];
791: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
792: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
793: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
794: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
795: }
797: ISRestoreIndices(isrow, &rout);
798: ISRestoreIndices(iscol, &cout);
799: VecRestoreArrayRead(bb, &b);
800: VecRestoreArray(xx, &x);
801: PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
802: return 0;
803: }
805: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
806: {
807: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
808: IS iscol = a->col, isrow = a->row;
809: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
810: PetscInt i, nz, idx, idt, idc, m;
811: const PetscInt *r, *c, *rout, *cout;
812: const MatScalar *aa = a->a, *v;
813: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
814: const PetscScalar *b;
816: VecGetArrayRead(bb, &b);
817: VecGetArray(xx, &x);
818: t = a->solve_work;
820: ISGetIndices(isrow, &rout);
821: r = rout;
822: ISGetIndices(iscol, &cout);
823: c = cout;
825: /* forward solve the lower triangular */
826: idx = 4 * r[0];
827: t[0] = b[idx];
828: t[1] = b[1 + idx];
829: t[2] = b[2 + idx];
830: t[3] = b[3 + idx];
831: for (i = 1; i < n; i++) {
832: v = aa + 16 * ai[i];
833: vi = aj + ai[i];
834: nz = ai[i + 1] - ai[i];
835: idx = 4 * r[i];
836: s1 = b[idx];
837: s2 = b[1 + idx];
838: s3 = b[2 + idx];
839: s4 = b[3 + idx];
840: for (m = 0; m < nz; m++) {
841: idx = 4 * vi[m];
842: x1 = t[idx];
843: x2 = t[1 + idx];
844: x3 = t[2 + idx];
845: x4 = t[3 + idx];
846: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
847: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
848: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
849: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
850: v += 16;
851: }
852: idx = 4 * i;
853: t[idx] = s1;
854: t[1 + idx] = s2;
855: t[2 + idx] = s3;
856: t[3 + idx] = s4;
857: }
858: /* backward solve the upper triangular */
859: for (i = n - 1; i >= 0; i--) {
860: v = aa + 16 * (adiag[i + 1] + 1);
861: vi = aj + adiag[i + 1] + 1;
862: nz = adiag[i] - adiag[i + 1] - 1;
863: idt = 4 * i;
864: s1 = t[idt];
865: s2 = t[1 + idt];
866: s3 = t[2 + idt];
867: s4 = t[3 + idt];
868: for (m = 0; m < nz; m++) {
869: idx = 4 * vi[m];
870: x1 = t[idx];
871: x2 = t[1 + idx];
872: x3 = t[2 + idx];
873: x4 = t[3 + idx];
874: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
875: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
876: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
877: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
878: v += 16;
879: }
880: idc = 4 * c[i];
881: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
882: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
883: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
884: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
885: }
887: ISRestoreIndices(isrow, &rout);
888: ISRestoreIndices(iscol, &cout);
889: VecRestoreArrayRead(bb, &b);
890: VecRestoreArray(xx, &x);
891: PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
892: return 0;
893: }
895: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
896: {
897: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
898: IS iscol = a->col, isrow = a->row;
899: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
900: PetscInt i, nz, idx, idt, idc;
901: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
902: const MatScalar *aa = a->a, *v;
903: MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t;
904: PetscScalar *x;
905: const PetscScalar *b;
907: VecGetArrayRead(bb, &b);
908: VecGetArray(xx, &x);
909: t = (MatScalar *)a->solve_work;
911: ISGetIndices(isrow, &rout);
912: r = rout;
913: ISGetIndices(iscol, &cout);
914: c = cout + (n - 1);
916: /* forward solve the lower triangular */
917: idx = 4 * (*r++);
918: t[0] = (MatScalar)b[idx];
919: t[1] = (MatScalar)b[1 + idx];
920: t[2] = (MatScalar)b[2 + idx];
921: t[3] = (MatScalar)b[3 + idx];
922: for (i = 1; i < n; i++) {
923: v = aa + 16 * ai[i];
924: vi = aj + ai[i];
925: nz = diag[i] - ai[i];
926: idx = 4 * (*r++);
927: s1 = (MatScalar)b[idx];
928: s2 = (MatScalar)b[1 + idx];
929: s3 = (MatScalar)b[2 + idx];
930: s4 = (MatScalar)b[3 + idx];
931: while (nz--) {
932: idx = 4 * (*vi++);
933: x1 = t[idx];
934: x2 = t[1 + idx];
935: x3 = t[2 + idx];
936: x4 = t[3 + idx];
937: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
938: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
939: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
940: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
941: v += 16;
942: }
943: idx = 4 * i;
944: t[idx] = s1;
945: t[1 + idx] = s2;
946: t[2 + idx] = s3;
947: t[3 + idx] = s4;
948: }
949: /* backward solve the upper triangular */
950: for (i = n - 1; i >= 0; i--) {
951: v = aa + 16 * diag[i] + 16;
952: vi = aj + diag[i] + 1;
953: nz = ai[i + 1] - diag[i] - 1;
954: idt = 4 * i;
955: s1 = t[idt];
956: s2 = t[1 + idt];
957: s3 = t[2 + idt];
958: s4 = t[3 + idt];
959: while (nz--) {
960: idx = 4 * (*vi++);
961: x1 = t[idx];
962: x2 = t[1 + idx];
963: x3 = t[2 + idx];
964: x4 = t[3 + idx];
965: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
966: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
967: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
968: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
969: v += 16;
970: }
971: idc = 4 * (*c--);
972: v = aa + 16 * diag[i];
973: t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
974: t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
975: t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
976: t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
977: x[idc] = (PetscScalar)t[idt];
978: x[1 + idc] = (PetscScalar)t[1 + idt];
979: x[2 + idc] = (PetscScalar)t[2 + idt];
980: x[3 + idc] = (PetscScalar)t[3 + idt];
981: }
983: ISRestoreIndices(isrow, &rout);
984: ISRestoreIndices(iscol, &cout);
985: VecRestoreArrayRead(bb, &b);
986: VecRestoreArray(xx, &x);
987: PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
988: return 0;
989: }
991: #if defined(PETSC_HAVE_SSE)
993: #include PETSC_HAVE_SSE
995: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
996: {
997: /*
998: Note: This code uses demotion of double
999: to float when performing the mixed-mode computation.
1000: This may not be numerically reasonable for all applications.
1001: */
1002: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1003: IS iscol = a->col, isrow = a->row;
1004: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1005: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1006: MatScalar *aa = a->a, *v;
1007: PetscScalar *x, *b, *t;
1009: /* Make space in temp stack for 16 Byte Aligned arrays */
1010: float ssealignedspace[11], *tmps, *tmpx;
1011: unsigned long offset;
1013: SSE_SCOPE_BEGIN;
1015: offset = (unsigned long)ssealignedspace % 16;
1016: if (offset) offset = (16 - offset) / 4;
1017: tmps = &ssealignedspace[offset];
1018: tmpx = &ssealignedspace[offset + 4];
1019: PREFETCH_NTA(aa + 16 * ai[1]);
1021: VecGetArray(bb, &b);
1022: VecGetArray(xx, &x);
1023: t = a->solve_work;
1025: ISGetIndices(isrow, &rout);
1026: r = rout;
1027: ISGetIndices(iscol, &cout);
1028: c = cout + (n - 1);
1030: /* forward solve the lower triangular */
1031: idx = 4 * (*r++);
1032: t[0] = b[idx];
1033: t[1] = b[1 + idx];
1034: t[2] = b[2 + idx];
1035: t[3] = b[3 + idx];
1036: v = aa + 16 * ai[1];
1038: for (i = 1; i < n;) {
1039: PREFETCH_NTA(&v[8]);
1040: vi = aj + ai[i];
1041: nz = diag[i] - ai[i];
1042: idx = 4 * (*r++);
1044: /* Demote sum from double to float */
1045: CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1046: LOAD_PS(tmps, XMM7);
1048: while (nz--) {
1049: PREFETCH_NTA(&v[16]);
1050: idx = 4 * (*vi++);
1052: /* Demote solution (so far) from double to float */
1053: CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
1055: /* 4x4 Matrix-Vector product with negative accumulation: */
1056: SSE_INLINE_BEGIN_2(tmpx, v)
1057: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1059: /* First Column */
1060: SSE_COPY_PS(XMM0, XMM6)
1061: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1062: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1063: SSE_SUB_PS(XMM7, XMM0)
1065: /* Second Column */
1066: SSE_COPY_PS(XMM1, XMM6)
1067: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1068: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1069: SSE_SUB_PS(XMM7, XMM1)
1071: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1073: /* Third Column */
1074: SSE_COPY_PS(XMM2, XMM6)
1075: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1076: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1077: SSE_SUB_PS(XMM7, XMM2)
1079: /* Fourth Column */
1080: SSE_COPY_PS(XMM3, XMM6)
1081: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1082: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1083: SSE_SUB_PS(XMM7, XMM3)
1084: SSE_INLINE_END_2
1086: v += 16;
1087: }
1088: idx = 4 * i;
1089: v = aa + 16 * ai[++i];
1090: PREFETCH_NTA(v);
1091: STORE_PS(tmps, XMM7);
1093: /* Promote result from float to double */
1094: CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1095: }
1096: /* backward solve the upper triangular */
1097: idt = 4 * (n - 1);
1098: ai16 = 16 * diag[n - 1];
1099: v = aa + ai16 + 16;
1100: for (i = n - 1; i >= 0;) {
1101: PREFETCH_NTA(&v[8]);
1102: vi = aj + diag[i] + 1;
1103: nz = ai[i + 1] - diag[i] - 1;
1105: /* Demote accumulator from double to float */
1106: CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1107: LOAD_PS(tmps, XMM7);
1109: while (nz--) {
1110: PREFETCH_NTA(&v[16]);
1111: idx = 4 * (*vi++);
1113: /* Demote solution (so far) from double to float */
1114: CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1116: /* 4x4 Matrix-Vector Product with negative accumulation: */
1117: SSE_INLINE_BEGIN_2(tmpx, v)
1118: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1120: /* First Column */
1121: SSE_COPY_PS(XMM0, XMM6)
1122: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1123: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1124: SSE_SUB_PS(XMM7, XMM0)
1126: /* Second Column */
1127: SSE_COPY_PS(XMM1, XMM6)
1128: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1129: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1130: SSE_SUB_PS(XMM7, XMM1)
1132: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1134: /* Third Column */
1135: SSE_COPY_PS(XMM2, XMM6)
1136: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1137: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1138: SSE_SUB_PS(XMM7, XMM2)
1140: /* Fourth Column */
1141: SSE_COPY_PS(XMM3, XMM6)
1142: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1143: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1144: SSE_SUB_PS(XMM7, XMM3)
1145: SSE_INLINE_END_2
1146: v += 16;
1147: }
1148: v = aa + ai16;
1149: ai16 = 16 * diag[--i];
1150: PREFETCH_NTA(aa + ai16 + 16);
1151: /*
1152: Scale the result by the diagonal 4x4 block,
1153: which was inverted as part of the factorization
1154: */
1155: SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1156: /* First Column */
1157: SSE_COPY_PS(XMM0, XMM7)
1158: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1159: SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1161: /* Second Column */
1162: SSE_COPY_PS(XMM1, XMM7)
1163: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1164: SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1165: SSE_ADD_PS(XMM0, XMM1)
1167: SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1169: /* Third Column */
1170: SSE_COPY_PS(XMM2, XMM7)
1171: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1172: SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1173: SSE_ADD_PS(XMM0, XMM2)
1175: /* Fourth Column */
1176: SSE_COPY_PS(XMM3, XMM7)
1177: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1178: SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1179: SSE_ADD_PS(XMM0, XMM3)
1181: SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1182: SSE_INLINE_END_3
1184: /* Promote solution from float to double */
1185: CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1187: /* Apply reordering to t and stream into x. */
1188: /* This way, x doesn't pollute the cache. */
1189: /* Be careful with size: 2 doubles = 4 floats! */
1190: idc = 4 * (*c--);
1191: SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1192: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1193: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1194: SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1195: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1196: SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1197: SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1198: SSE_INLINE_END_2
1199: v = aa + ai16 + 16;
1200: idt -= 4;
1201: }
1203: ISRestoreIndices(isrow, &rout);
1204: ISRestoreIndices(iscol, &cout);
1205: VecRestoreArray(bb, &b);
1206: VecRestoreArray(xx, &x);
1207: PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
1208: SSE_SCOPE_END;
1209: return 0;
1210: }
1212: #endif
1214: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1215: {
1216: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1217: IS iscol = a->col, isrow = a->row;
1218: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1219: PetscInt i, nz, idx, idt, idc;
1220: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1221: const MatScalar *aa = a->a, *v;
1222: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
1223: const PetscScalar *b;
1225: VecGetArrayRead(bb, &b);
1226: VecGetArray(xx, &x);
1227: t = a->solve_work;
1229: ISGetIndices(isrow, &rout);
1230: r = rout;
1231: ISGetIndices(iscol, &cout);
1232: c = cout + (n - 1);
1234: /* forward solve the lower triangular */
1235: idx = 3 * (*r++);
1236: t[0] = b[idx];
1237: t[1] = b[1 + idx];
1238: t[2] = b[2 + idx];
1239: for (i = 1; i < n; i++) {
1240: v = aa + 9 * ai[i];
1241: vi = aj + ai[i];
1242: nz = diag[i] - ai[i];
1243: idx = 3 * (*r++);
1244: s1 = b[idx];
1245: s2 = b[1 + idx];
1246: s3 = b[2 + idx];
1247: while (nz--) {
1248: idx = 3 * (*vi++);
1249: x1 = t[idx];
1250: x2 = t[1 + idx];
1251: x3 = t[2 + idx];
1252: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1253: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1254: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1255: v += 9;
1256: }
1257: idx = 3 * i;
1258: t[idx] = s1;
1259: t[1 + idx] = s2;
1260: t[2 + idx] = s3;
1261: }
1262: /* backward solve the upper triangular */
1263: for (i = n - 1; i >= 0; i--) {
1264: v = aa + 9 * diag[i] + 9;
1265: vi = aj + diag[i] + 1;
1266: nz = ai[i + 1] - diag[i] - 1;
1267: idt = 3 * i;
1268: s1 = t[idt];
1269: s2 = t[1 + idt];
1270: s3 = t[2 + idt];
1271: while (nz--) {
1272: idx = 3 * (*vi++);
1273: x1 = t[idx];
1274: x2 = t[1 + idx];
1275: x3 = t[2 + idx];
1276: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1277: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1278: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1279: v += 9;
1280: }
1281: idc = 3 * (*c--);
1282: v = aa + 9 * diag[i];
1283: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1284: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1285: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1286: }
1287: ISRestoreIndices(isrow, &rout);
1288: ISRestoreIndices(iscol, &cout);
1289: VecRestoreArrayRead(bb, &b);
1290: VecRestoreArray(xx, &x);
1291: PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n);
1292: return 0;
1293: }
1295: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1296: {
1297: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1298: IS iscol = a->col, isrow = a->row;
1299: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1300: PetscInt i, nz, idx, idt, idc, m;
1301: const PetscInt *r, *c, *rout, *cout;
1302: const MatScalar *aa = a->a, *v;
1303: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
1304: const PetscScalar *b;
1306: VecGetArrayRead(bb, &b);
1307: VecGetArray(xx, &x);
1308: t = a->solve_work;
1310: ISGetIndices(isrow, &rout);
1311: r = rout;
1312: ISGetIndices(iscol, &cout);
1313: c = cout;
1315: /* forward solve the lower triangular */
1316: idx = 3 * r[0];
1317: t[0] = b[idx];
1318: t[1] = b[1 + idx];
1319: t[2] = b[2 + idx];
1320: for (i = 1; i < n; i++) {
1321: v = aa + 9 * ai[i];
1322: vi = aj + ai[i];
1323: nz = ai[i + 1] - ai[i];
1324: idx = 3 * r[i];
1325: s1 = b[idx];
1326: s2 = b[1 + idx];
1327: s3 = b[2 + idx];
1328: for (m = 0; m < nz; m++) {
1329: idx = 3 * vi[m];
1330: x1 = t[idx];
1331: x2 = t[1 + idx];
1332: x3 = t[2 + idx];
1333: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1334: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1335: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1336: v += 9;
1337: }
1338: idx = 3 * i;
1339: t[idx] = s1;
1340: t[1 + idx] = s2;
1341: t[2 + idx] = s3;
1342: }
1343: /* backward solve the upper triangular */
1344: for (i = n - 1; i >= 0; i--) {
1345: v = aa + 9 * (adiag[i + 1] + 1);
1346: vi = aj + adiag[i + 1] + 1;
1347: nz = adiag[i] - adiag[i + 1] - 1;
1348: idt = 3 * i;
1349: s1 = t[idt];
1350: s2 = t[1 + idt];
1351: s3 = t[2 + idt];
1352: for (m = 0; m < nz; m++) {
1353: idx = 3 * vi[m];
1354: x1 = t[idx];
1355: x2 = t[1 + idx];
1356: x3 = t[2 + idx];
1357: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1358: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1359: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1360: v += 9;
1361: }
1362: idc = 3 * c[i];
1363: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1364: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1365: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1366: }
1367: ISRestoreIndices(isrow, &rout);
1368: ISRestoreIndices(iscol, &cout);
1369: VecRestoreArrayRead(bb, &b);
1370: VecRestoreArray(xx, &x);
1371: PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n);
1372: return 0;
1373: }
1375: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1376: {
1377: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1378: IS iscol = a->col, isrow = a->row;
1379: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1380: PetscInt i, nz, idx, idt, idc;
1381: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1382: const MatScalar *aa = a->a, *v;
1383: PetscScalar *x, s1, s2, x1, x2, *t;
1384: const PetscScalar *b;
1386: VecGetArrayRead(bb, &b);
1387: VecGetArray(xx, &x);
1388: t = a->solve_work;
1390: ISGetIndices(isrow, &rout);
1391: r = rout;
1392: ISGetIndices(iscol, &cout);
1393: c = cout + (n - 1);
1395: /* forward solve the lower triangular */
1396: idx = 2 * (*r++);
1397: t[0] = b[idx];
1398: t[1] = b[1 + idx];
1399: for (i = 1; i < n; i++) {
1400: v = aa + 4 * ai[i];
1401: vi = aj + ai[i];
1402: nz = diag[i] - ai[i];
1403: idx = 2 * (*r++);
1404: s1 = b[idx];
1405: s2 = b[1 + idx];
1406: while (nz--) {
1407: idx = 2 * (*vi++);
1408: x1 = t[idx];
1409: x2 = t[1 + idx];
1410: s1 -= v[0] * x1 + v[2] * x2;
1411: s2 -= v[1] * x1 + v[3] * x2;
1412: v += 4;
1413: }
1414: idx = 2 * i;
1415: t[idx] = s1;
1416: t[1 + idx] = s2;
1417: }
1418: /* backward solve the upper triangular */
1419: for (i = n - 1; i >= 0; i--) {
1420: v = aa + 4 * diag[i] + 4;
1421: vi = aj + diag[i] + 1;
1422: nz = ai[i + 1] - diag[i] - 1;
1423: idt = 2 * i;
1424: s1 = t[idt];
1425: s2 = t[1 + idt];
1426: while (nz--) {
1427: idx = 2 * (*vi++);
1428: x1 = t[idx];
1429: x2 = t[1 + idx];
1430: s1 -= v[0] * x1 + v[2] * x2;
1431: s2 -= v[1] * x1 + v[3] * x2;
1432: v += 4;
1433: }
1434: idc = 2 * (*c--);
1435: v = aa + 4 * diag[i];
1436: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1437: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1438: }
1439: ISRestoreIndices(isrow, &rout);
1440: ISRestoreIndices(iscol, &cout);
1441: VecRestoreArrayRead(bb, &b);
1442: VecRestoreArray(xx, &x);
1443: PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n);
1444: return 0;
1445: }
1447: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1448: {
1449: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1450: IS iscol = a->col, isrow = a->row;
1451: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1452: PetscInt i, nz, idx, jdx, idt, idc, m;
1453: const PetscInt *r, *c, *rout, *cout;
1454: const MatScalar *aa = a->a, *v;
1455: PetscScalar *x, s1, s2, x1, x2, *t;
1456: const PetscScalar *b;
1458: VecGetArrayRead(bb, &b);
1459: VecGetArray(xx, &x);
1460: t = a->solve_work;
1462: ISGetIndices(isrow, &rout);
1463: r = rout;
1464: ISGetIndices(iscol, &cout);
1465: c = cout;
1467: /* forward solve the lower triangular */
1468: idx = 2 * r[0];
1469: t[0] = b[idx];
1470: t[1] = b[1 + idx];
1471: for (i = 1; i < n; i++) {
1472: v = aa + 4 * ai[i];
1473: vi = aj + ai[i];
1474: nz = ai[i + 1] - ai[i];
1475: idx = 2 * r[i];
1476: s1 = b[idx];
1477: s2 = b[1 + idx];
1478: for (m = 0; m < nz; m++) {
1479: jdx = 2 * vi[m];
1480: x1 = t[jdx];
1481: x2 = t[1 + jdx];
1482: s1 -= v[0] * x1 + v[2] * x2;
1483: s2 -= v[1] * x1 + v[3] * x2;
1484: v += 4;
1485: }
1486: idx = 2 * i;
1487: t[idx] = s1;
1488: t[1 + idx] = s2;
1489: }
1490: /* backward solve the upper triangular */
1491: for (i = n - 1; i >= 0; i--) {
1492: v = aa + 4 * (adiag[i + 1] + 1);
1493: vi = aj + adiag[i + 1] + 1;
1494: nz = adiag[i] - adiag[i + 1] - 1;
1495: idt = 2 * i;
1496: s1 = t[idt];
1497: s2 = t[1 + idt];
1498: for (m = 0; m < nz; m++) {
1499: idx = 2 * vi[m];
1500: x1 = t[idx];
1501: x2 = t[1 + idx];
1502: s1 -= v[0] * x1 + v[2] * x2;
1503: s2 -= v[1] * x1 + v[3] * x2;
1504: v += 4;
1505: }
1506: idc = 2 * c[i];
1507: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1508: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1509: }
1510: ISRestoreIndices(isrow, &rout);
1511: ISRestoreIndices(iscol, &cout);
1512: VecRestoreArrayRead(bb, &b);
1513: VecRestoreArray(xx, &x);
1514: PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n);
1515: return 0;
1516: }
1518: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1519: {
1520: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1521: IS iscol = a->col, isrow = a->row;
1522: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1523: PetscInt i, nz;
1524: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1525: const MatScalar *aa = a->a, *v;
1526: PetscScalar *x, s1, *t;
1527: const PetscScalar *b;
1529: if (!n) return 0;
1531: VecGetArrayRead(bb, &b);
1532: VecGetArray(xx, &x);
1533: t = a->solve_work;
1535: ISGetIndices(isrow, &rout);
1536: r = rout;
1537: ISGetIndices(iscol, &cout);
1538: c = cout + (n - 1);
1540: /* forward solve the lower triangular */
1541: t[0] = b[*r++];
1542: for (i = 1; i < n; i++) {
1543: v = aa + ai[i];
1544: vi = aj + ai[i];
1545: nz = diag[i] - ai[i];
1546: s1 = b[*r++];
1547: while (nz--) s1 -= (*v++) * t[*vi++];
1548: t[i] = s1;
1549: }
1550: /* backward solve the upper triangular */
1551: for (i = n - 1; i >= 0; i--) {
1552: v = aa + diag[i] + 1;
1553: vi = aj + diag[i] + 1;
1554: nz = ai[i + 1] - diag[i] - 1;
1555: s1 = t[i];
1556: while (nz--) s1 -= (*v++) * t[*vi++];
1557: x[*c--] = t[i] = aa[diag[i]] * s1;
1558: }
1560: ISRestoreIndices(isrow, &rout);
1561: ISRestoreIndices(iscol, &cout);
1562: VecRestoreArrayRead(bb, &b);
1563: VecRestoreArray(xx, &x);
1564: PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n);
1565: return 0;
1566: }
1568: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1569: {
1570: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1571: IS iscol = a->col, isrow = a->row;
1572: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1573: const PetscInt *rout, *cout, *r, *c;
1574: PetscScalar *x, *tmp, sum;
1575: const PetscScalar *b;
1576: const MatScalar *aa = a->a, *v;
1578: if (!n) return 0;
1580: VecGetArrayRead(bb, &b);
1581: VecGetArray(xx, &x);
1582: tmp = a->solve_work;
1584: ISGetIndices(isrow, &rout);
1585: r = rout;
1586: ISGetIndices(iscol, &cout);
1587: c = cout;
1589: /* forward solve the lower triangular */
1590: tmp[0] = b[r[0]];
1591: v = aa;
1592: vi = aj;
1593: for (i = 1; i < n; i++) {
1594: nz = ai[i + 1] - ai[i];
1595: sum = b[r[i]];
1596: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1597: tmp[i] = sum;
1598: v += nz;
1599: vi += nz;
1600: }
1602: /* backward solve the upper triangular */
1603: for (i = n - 1; i >= 0; i--) {
1604: v = aa + adiag[i + 1] + 1;
1605: vi = aj + adiag[i + 1] + 1;
1606: nz = adiag[i] - adiag[i + 1] - 1;
1607: sum = tmp[i];
1608: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1609: x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1610: }
1612: ISRestoreIndices(isrow, &rout);
1613: ISRestoreIndices(iscol, &cout);
1614: VecRestoreArrayRead(bb, &b);
1615: VecRestoreArray(xx, &x);
1616: PetscLogFlops(2.0 * a->nz - A->cmap->n);
1617: return 0;
1618: }