Actual source code: baijsolvnat14.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /* Block operations are done by accessing one column at at time */
5: /* Default MatSolve for block size 14 */
7: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A, Vec bb, Vec xx)
8: {
9: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
10: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2;
11: PetscInt i, k, nz, idx, idt, m;
12: const MatScalar *aa = a->a, *v;
13: PetscScalar s[14];
14: PetscScalar *x, xv;
15: const PetscScalar *b;
17: VecGetArrayRead(bb, &b);
18: VecGetArray(xx, &x);
20: /* forward solve the lower triangular */
21: for (i = 0; i < n; i++) {
22: v = aa + bs2 * ai[i];
23: vi = aj + ai[i];
24: nz = ai[i + 1] - ai[i];
25: idt = bs * i;
26: x[idt] = b[idt];
27: x[1 + idt] = b[1 + idt];
28: x[2 + idt] = b[2 + idt];
29: x[3 + idt] = b[3 + idt];
30: x[4 + idt] = b[4 + idt];
31: x[5 + idt] = b[5 + idt];
32: x[6 + idt] = b[6 + idt];
33: x[7 + idt] = b[7 + idt];
34: x[8 + idt] = b[8 + idt];
35: x[9 + idt] = b[9 + idt];
36: x[10 + idt] = b[10 + idt];
37: x[11 + idt] = b[11 + idt];
38: x[12 + idt] = b[12 + idt];
39: x[13 + idt] = b[13 + idt];
40: for (m = 0; m < nz; m++) {
41: idx = bs * vi[m];
42: for (k = 0; k < bs; k++) {
43: xv = x[k + idx];
44: x[idt] -= v[0] * xv;
45: x[1 + idt] -= v[1] * xv;
46: x[2 + idt] -= v[2] * xv;
47: x[3 + idt] -= v[3] * xv;
48: x[4 + idt] -= v[4] * xv;
49: x[5 + idt] -= v[5] * xv;
50: x[6 + idt] -= v[6] * xv;
51: x[7 + idt] -= v[7] * xv;
52: x[8 + idt] -= v[8] * xv;
53: x[9 + idt] -= v[9] * xv;
54: x[10 + idt] -= v[10] * xv;
55: x[11 + idt] -= v[11] * xv;
56: x[12 + idt] -= v[12] * xv;
57: x[13 + idt] -= v[13] * xv;
58: v += 14;
59: }
60: }
61: }
62: /* backward solve the upper triangular */
63: for (i = n - 1; i >= 0; i--) {
64: v = aa + bs2 * (adiag[i + 1] + 1);
65: vi = aj + adiag[i + 1] + 1;
66: nz = adiag[i] - adiag[i + 1] - 1;
67: idt = bs * i;
68: s[0] = x[idt];
69: s[1] = x[1 + idt];
70: s[2] = x[2 + idt];
71: s[3] = x[3 + idt];
72: s[4] = x[4 + idt];
73: s[5] = x[5 + idt];
74: s[6] = x[6 + idt];
75: s[7] = x[7 + idt];
76: s[8] = x[8 + idt];
77: s[9] = x[9 + idt];
78: s[10] = x[10 + idt];
79: s[11] = x[11 + idt];
80: s[12] = x[12 + idt];
81: s[13] = x[13 + idt];
83: for (m = 0; m < nz; m++) {
84: idx = bs * vi[m];
85: for (k = 0; k < bs; k++) {
86: xv = x[k + idx];
87: s[0] -= v[0] * xv;
88: s[1] -= v[1] * xv;
89: s[2] -= v[2] * xv;
90: s[3] -= v[3] * xv;
91: s[4] -= v[4] * xv;
92: s[5] -= v[5] * xv;
93: s[6] -= v[6] * xv;
94: s[7] -= v[7] * xv;
95: s[8] -= v[8] * xv;
96: s[9] -= v[9] * xv;
97: s[10] -= v[10] * xv;
98: s[11] -= v[11] * xv;
99: s[12] -= v[12] * xv;
100: s[13] -= v[13] * xv;
101: v += 14;
102: }
103: }
104: PetscArrayzero(x + idt, bs);
105: for (k = 0; k < bs; k++) {
106: x[idt] += v[0] * s[k];
107: x[1 + idt] += v[1] * s[k];
108: x[2 + idt] += v[2] * s[k];
109: x[3 + idt] += v[3] * s[k];
110: x[4 + idt] += v[4] * s[k];
111: x[5 + idt] += v[5] * s[k];
112: x[6 + idt] += v[6] * s[k];
113: x[7 + idt] += v[7] * s[k];
114: x[8 + idt] += v[8] * s[k];
115: x[9 + idt] += v[9] * s[k];
116: x[10 + idt] += v[10] * s[k];
117: x[11 + idt] += v[11] * s[k];
118: x[12 + idt] += v[12] * s[k];
119: x[13 + idt] += v[13] * s[k];
120: v += 14;
121: }
122: }
123: VecRestoreArrayRead(bb, &b);
124: VecRestoreArray(xx, &x);
125: PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n);
126: return 0;
127: }
129: /* Block operations are done by accessing one column at at time */
130: /* Default MatSolve for block size 13 */
132: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A, Vec bb, Vec xx)
133: {
134: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
135: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2;
136: PetscInt i, k, nz, idx, idt, m;
137: const MatScalar *aa = a->a, *v;
138: PetscScalar s[13];
139: PetscScalar *x, xv;
140: const PetscScalar *b;
142: VecGetArrayRead(bb, &b);
143: VecGetArray(xx, &x);
145: /* forward solve the lower triangular */
146: for (i = 0; i < n; i++) {
147: v = aa + bs2 * ai[i];
148: vi = aj + ai[i];
149: nz = ai[i + 1] - ai[i];
150: idt = bs * i;
151: x[idt] = b[idt];
152: x[1 + idt] = b[1 + idt];
153: x[2 + idt] = b[2 + idt];
154: x[3 + idt] = b[3 + idt];
155: x[4 + idt] = b[4 + idt];
156: x[5 + idt] = b[5 + idt];
157: x[6 + idt] = b[6 + idt];
158: x[7 + idt] = b[7 + idt];
159: x[8 + idt] = b[8 + idt];
160: x[9 + idt] = b[9 + idt];
161: x[10 + idt] = b[10 + idt];
162: x[11 + idt] = b[11 + idt];
163: x[12 + idt] = b[12 + idt];
164: for (m = 0; m < nz; m++) {
165: idx = bs * vi[m];
166: for (k = 0; k < bs; k++) {
167: xv = x[k + idx];
168: x[idt] -= v[0] * xv;
169: x[1 + idt] -= v[1] * xv;
170: x[2 + idt] -= v[2] * xv;
171: x[3 + idt] -= v[3] * xv;
172: x[4 + idt] -= v[4] * xv;
173: x[5 + idt] -= v[5] * xv;
174: x[6 + idt] -= v[6] * xv;
175: x[7 + idt] -= v[7] * xv;
176: x[8 + idt] -= v[8] * xv;
177: x[9 + idt] -= v[9] * xv;
178: x[10 + idt] -= v[10] * xv;
179: x[11 + idt] -= v[11] * xv;
180: x[12 + idt] -= v[12] * xv;
181: v += 13;
182: }
183: }
184: }
185: /* backward solve the upper triangular */
186: for (i = n - 1; i >= 0; i--) {
187: v = aa + bs2 * (adiag[i + 1] + 1);
188: vi = aj + adiag[i + 1] + 1;
189: nz = adiag[i] - adiag[i + 1] - 1;
190: idt = bs * i;
191: s[0] = x[idt];
192: s[1] = x[1 + idt];
193: s[2] = x[2 + idt];
194: s[3] = x[3 + idt];
195: s[4] = x[4 + idt];
196: s[5] = x[5 + idt];
197: s[6] = x[6 + idt];
198: s[7] = x[7 + idt];
199: s[8] = x[8 + idt];
200: s[9] = x[9 + idt];
201: s[10] = x[10 + idt];
202: s[11] = x[11 + idt];
203: s[12] = x[12 + idt];
205: for (m = 0; m < nz; m++) {
206: idx = bs * vi[m];
207: for (k = 0; k < bs; k++) {
208: xv = x[k + idx];
209: s[0] -= v[0] * xv;
210: s[1] -= v[1] * xv;
211: s[2] -= v[2] * xv;
212: s[3] -= v[3] * xv;
213: s[4] -= v[4] * xv;
214: s[5] -= v[5] * xv;
215: s[6] -= v[6] * xv;
216: s[7] -= v[7] * xv;
217: s[8] -= v[8] * xv;
218: s[9] -= v[9] * xv;
219: s[10] -= v[10] * xv;
220: s[11] -= v[11] * xv;
221: s[12] -= v[12] * xv;
222: v += 13;
223: }
224: }
225: PetscArrayzero(x + idt, bs);
226: for (k = 0; k < bs; k++) {
227: x[idt] += v[0] * s[k];
228: x[1 + idt] += v[1] * s[k];
229: x[2 + idt] += v[2] * s[k];
230: x[3 + idt] += v[3] * s[k];
231: x[4 + idt] += v[4] * s[k];
232: x[5 + idt] += v[5] * s[k];
233: x[6 + idt] += v[6] * s[k];
234: x[7 + idt] += v[7] * s[k];
235: x[8 + idt] += v[8] * s[k];
236: x[9 + idt] += v[9] * s[k];
237: x[10 + idt] += v[10] * s[k];
238: x[11 + idt] += v[11] * s[k];
239: x[12 + idt] += v[12] * s[k];
240: v += 13;
241: }
242: }
243: VecRestoreArrayRead(bb, &b);
244: VecRestoreArray(xx, &x);
245: PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n);
246: return 0;
247: }
249: /* Block operations are done by accessing one column at at time */
250: /* Default MatSolve for block size 12 */
252: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A, Vec bb, Vec xx)
253: {
254: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
255: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2;
256: PetscInt i, k, nz, idx, idt, m;
257: const MatScalar *aa = a->a, *v;
258: PetscScalar s[12];
259: PetscScalar *x, xv;
260: const PetscScalar *b;
262: VecGetArrayRead(bb, &b);
263: VecGetArray(xx, &x);
265: /* forward solve the lower triangular */
266: for (i = 0; i < n; i++) {
267: v = aa + bs2 * ai[i];
268: vi = aj + ai[i];
269: nz = ai[i + 1] - ai[i];
270: idt = bs * i;
271: x[idt] = b[idt];
272: x[1 + idt] = b[1 + idt];
273: x[2 + idt] = b[2 + idt];
274: x[3 + idt] = b[3 + idt];
275: x[4 + idt] = b[4 + idt];
276: x[5 + idt] = b[5 + idt];
277: x[6 + idt] = b[6 + idt];
278: x[7 + idt] = b[7 + idt];
279: x[8 + idt] = b[8 + idt];
280: x[9 + idt] = b[9 + idt];
281: x[10 + idt] = b[10 + idt];
282: x[11 + idt] = b[11 + idt];
283: for (m = 0; m < nz; m++) {
284: idx = bs * vi[m];
285: for (k = 0; k < bs; k++) {
286: xv = x[k + idx];
287: x[idt] -= v[0] * xv;
288: x[1 + idt] -= v[1] * xv;
289: x[2 + idt] -= v[2] * xv;
290: x[3 + idt] -= v[3] * xv;
291: x[4 + idt] -= v[4] * xv;
292: x[5 + idt] -= v[5] * xv;
293: x[6 + idt] -= v[6] * xv;
294: x[7 + idt] -= v[7] * xv;
295: x[8 + idt] -= v[8] * xv;
296: x[9 + idt] -= v[9] * xv;
297: x[10 + idt] -= v[10] * xv;
298: x[11 + idt] -= v[11] * xv;
299: v += 12;
300: }
301: }
302: }
303: /* backward solve the upper triangular */
304: for (i = n - 1; i >= 0; i--) {
305: v = aa + bs2 * (adiag[i + 1] + 1);
306: vi = aj + adiag[i + 1] + 1;
307: nz = adiag[i] - adiag[i + 1] - 1;
308: idt = bs * i;
309: s[0] = x[idt];
310: s[1] = x[1 + idt];
311: s[2] = x[2 + idt];
312: s[3] = x[3 + idt];
313: s[4] = x[4 + idt];
314: s[5] = x[5 + idt];
315: s[6] = x[6 + idt];
316: s[7] = x[7 + idt];
317: s[8] = x[8 + idt];
318: s[9] = x[9 + idt];
319: s[10] = x[10 + idt];
320: s[11] = x[11 + idt];
322: for (m = 0; m < nz; m++) {
323: idx = bs * vi[m];
324: for (k = 0; k < bs; k++) {
325: xv = x[k + idx];
326: s[0] -= v[0] * xv;
327: s[1] -= v[1] * xv;
328: s[2] -= v[2] * xv;
329: s[3] -= v[3] * xv;
330: s[4] -= v[4] * xv;
331: s[5] -= v[5] * xv;
332: s[6] -= v[6] * xv;
333: s[7] -= v[7] * xv;
334: s[8] -= v[8] * xv;
335: s[9] -= v[9] * xv;
336: s[10] -= v[10] * xv;
337: s[11] -= v[11] * xv;
338: v += 12;
339: }
340: }
341: PetscArrayzero(x + idt, bs);
342: for (k = 0; k < bs; k++) {
343: x[idt] += v[0] * s[k];
344: x[1 + idt] += v[1] * s[k];
345: x[2 + idt] += v[2] * s[k];
346: x[3 + idt] += v[3] * s[k];
347: x[4 + idt] += v[4] * s[k];
348: x[5 + idt] += v[5] * s[k];
349: x[6 + idt] += v[6] * s[k];
350: x[7 + idt] += v[7] * s[k];
351: x[8 + idt] += v[8] * s[k];
352: x[9 + idt] += v[9] * s[k];
353: x[10 + idt] += v[10] * s[k];
354: x[11 + idt] += v[11] * s[k];
355: v += 12;
356: }
357: }
358: VecRestoreArrayRead(bb, &b);
359: VecRestoreArray(xx, &x);
360: PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n);
361: return 0;
362: }