Actual source code: baijsolvnat6.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
5: {
6: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
7: PetscInt i, nz, idx, idt, jdx;
8: const PetscInt *diag = a->diag, *vi, n = a->mbs, *ai = a->i, *aj = a->j;
9: const MatScalar *aa = a->a, *v;
10: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6;
11: const PetscScalar *b;
13: VecGetArrayRead(bb, &b);
14: VecGetArray(xx, &x);
15: /* forward solve the lower triangular */
16: idx = 0;
17: x[0] = b[idx];
18: x[1] = b[1 + idx];
19: x[2] = b[2 + idx];
20: x[3] = b[3 + idx];
21: x[4] = b[4 + idx];
22: x[5] = b[5 + idx];
23: for (i = 1; i < n; i++) {
24: v = aa + 36 * ai[i];
25: vi = aj + ai[i];
26: nz = diag[i] - ai[i];
27: idx = 6 * i;
28: s1 = b[idx];
29: s2 = b[1 + idx];
30: s3 = b[2 + idx];
31: s4 = b[3 + idx];
32: s5 = b[4 + idx];
33: s6 = b[5 + idx];
34: while (nz--) {
35: jdx = 6 * (*vi++);
36: x1 = x[jdx];
37: x2 = x[1 + jdx];
38: x3 = x[2 + jdx];
39: x4 = x[3 + jdx];
40: x5 = x[4 + jdx];
41: x6 = x[5 + jdx];
42: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
43: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
44: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
45: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
46: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
47: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
48: v += 36;
49: }
50: x[idx] = s1;
51: x[1 + idx] = s2;
52: x[2 + idx] = s3;
53: x[3 + idx] = s4;
54: x[4 + idx] = s5;
55: x[5 + idx] = s6;
56: }
57: /* backward solve the upper triangular */
58: for (i = n - 1; i >= 0; i--) {
59: v = aa + 36 * diag[i] + 36;
60: vi = aj + diag[i] + 1;
61: nz = ai[i + 1] - diag[i] - 1;
62: idt = 6 * i;
63: s1 = x[idt];
64: s2 = x[1 + idt];
65: s3 = x[2 + idt];
66: s4 = x[3 + idt];
67: s5 = x[4 + idt];
68: s6 = x[5 + idt];
69: while (nz--) {
70: idx = 6 * (*vi++);
71: x1 = x[idx];
72: x2 = x[1 + idx];
73: x3 = x[2 + idx];
74: x4 = x[3 + idx];
75: x5 = x[4 + idx];
76: x6 = x[5 + idx];
77: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
78: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
79: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
80: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
81: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
82: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
83: v += 36;
84: }
85: v = aa + 36 * diag[i];
86: x[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
87: x[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
88: x[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
89: x[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
90: x[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
91: x[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
92: }
94: VecRestoreArrayRead(bb, &b);
95: VecRestoreArray(xx, &x);
96: PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n);
97: return 0;
98: }
100: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A, Vec bb, Vec xx)
101: {
102: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
103: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
104: PetscInt i, k, nz, idx, jdx, idt;
105: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
106: const MatScalar *aa = a->a, *v;
107: PetscScalar *x;
108: const PetscScalar *b;
109: PetscScalar s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6;
111: VecGetArrayRead(bb, &b);
112: VecGetArray(xx, &x);
113: /* forward solve the lower triangular */
114: idx = 0;
115: x[0] = b[idx];
116: x[1] = b[1 + idx];
117: x[2] = b[2 + idx];
118: x[3] = b[3 + idx];
119: x[4] = b[4 + idx];
120: x[5] = b[5 + idx];
121: for (i = 1; i < n; i++) {
122: v = aa + bs2 * ai[i];
123: vi = aj + ai[i];
124: nz = ai[i + 1] - ai[i];
125: idx = bs * i;
126: s1 = b[idx];
127: s2 = b[1 + idx];
128: s3 = b[2 + idx];
129: s4 = b[3 + idx];
130: s5 = b[4 + idx];
131: s6 = b[5 + idx];
132: for (k = 0; k < nz; k++) {
133: jdx = bs * vi[k];
134: x1 = x[jdx];
135: x2 = x[1 + jdx];
136: x3 = x[2 + jdx];
137: x4 = x[3 + jdx];
138: x5 = x[4 + jdx];
139: x6 = x[5 + jdx];
140: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
141: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
142: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
143: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
144: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
145: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
146: v += bs2;
147: }
149: x[idx] = s1;
150: x[1 + idx] = s2;
151: x[2 + idx] = s3;
152: x[3 + idx] = s4;
153: x[4 + idx] = s5;
154: x[5 + idx] = s6;
155: }
157: /* backward solve the upper triangular */
158: for (i = n - 1; i >= 0; i--) {
159: v = aa + bs2 * (adiag[i + 1] + 1);
160: vi = aj + adiag[i + 1] + 1;
161: nz = adiag[i] - adiag[i + 1] - 1;
162: idt = bs * i;
163: s1 = x[idt];
164: s2 = x[1 + idt];
165: s3 = x[2 + idt];
166: s4 = x[3 + idt];
167: s5 = x[4 + idt];
168: s6 = x[5 + idt];
169: for (k = 0; k < nz; k++) {
170: idx = bs * vi[k];
171: x1 = x[idx];
172: x2 = x[1 + idx];
173: x3 = x[2 + idx];
174: x4 = x[3 + idx];
175: x5 = x[4 + idx];
176: x6 = x[5 + idx];
177: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
178: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
179: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
180: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
181: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
182: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
183: v += bs2;
184: }
185: /* x = inv_diagonal*x */
186: x[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
187: x[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
188: x[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
189: x[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
190: x[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
191: x[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
192: }
194: VecRestoreArrayRead(bb, &b);
195: VecRestoreArray(xx, &x);
196: PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n);
197: return 0;
198: }