Actual source code: sbaijfact2.c
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
11: {
12: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
13: IS isrow = a->row;
14: PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
15: const PetscInt *r;
16: PetscInt nz, *vj, k, idx, k1;
17: PetscInt bs = A->rmap->bs, bs2 = a->bs2;
18: const MatScalar *aa = a->a, *v, *diag;
19: PetscScalar *x, *xk, *xj, *xk_tmp, *t;
20: const PetscScalar *b;
22: VecGetArrayRead(bb, &b);
23: VecGetArray(xx, &x);
24: t = a->solve_work;
25: ISGetIndices(isrow, &r);
26: PetscMalloc1(bs, &xk_tmp);
28: /* solve U^T * D * y = b by forward substitution */
29: xk = t;
30: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
31: idx = bs * r[k];
32: for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
33: }
34: for (k = 0; k < mbs; k++) {
35: v = aa + bs2 * ai[k];
36: xk = t + k * bs; /* Dk*xk = k-th block of x */
37: PetscArraycpy(xk_tmp, xk, bs); /* xk_tmp <- xk */
38: nz = ai[k + 1] - ai[k];
39: vj = aj + ai[k];
40: xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
41: while (nz--) {
42: /* x(:) += U(k,:)^T*(Dk*xk) */
43: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
44: vj++;
45: xj = t + (*vj) * bs;
46: v += bs2;
47: }
48: /* xk = inv(Dk)*(Dk*xk) */
49: diag = aa + k * bs2; /* ptr to inv(Dk) */
50: PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
51: }
53: /* solve U*x = y by back substitution */
54: for (k = mbs - 1; k >= 0; k--) {
55: v = aa + bs2 * ai[k];
56: xk = t + k * bs; /* xk */
57: nz = ai[k + 1] - ai[k];
58: vj = aj + ai[k];
59: xj = t + (*vj) * bs;
60: while (nz--) {
61: /* xk += U(k,:)*x(:) */
62: PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
63: vj++;
64: v += bs2;
65: xj = t + (*vj) * bs;
66: }
67: idx = bs * r[k];
68: for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
69: }
71: PetscFree(xk_tmp);
72: ISRestoreIndices(isrow, &r);
73: VecRestoreArrayRead(bb, &b);
74: VecRestoreArray(xx, &x);
75: PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs);
76: return 0;
77: }
79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
80: {
81: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
82: }
84: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
85: {
86: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
87: }
89: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
90: {
91: PetscInt nz, k;
92: const PetscInt *vj, bs2 = bs * bs;
93: const MatScalar *v, *diag;
94: PetscScalar *xk, *xj, *xk_tmp;
96: PetscMalloc1(bs, &xk_tmp);
97: for (k = 0; k < mbs; k++) {
98: v = aa + bs2 * ai[k];
99: xk = x + k * bs; /* Dk*xk = k-th block of x */
100: PetscArraycpy(xk_tmp, xk, bs); /* xk_tmp <- xk */
101: nz = ai[k + 1] - ai[k];
102: vj = aj + ai[k];
103: xj = x + (*vj) * bs; /* *vj-th block of x, *vj>k */
104: while (nz--) {
105: /* x(:) += U(k,:)^T*(Dk*xk) */
106: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
107: vj++;
108: xj = x + (*vj) * bs;
109: v += bs2;
110: }
111: /* xk = inv(Dk)*(Dk*xk) */
112: diag = aa + k * bs2; /* ptr to inv(Dk) */
113: PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
114: }
115: PetscFree(xk_tmp);
116: return 0;
117: }
119: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
120: {
121: PetscInt nz, k;
122: const PetscInt *vj, bs2 = bs * bs;
123: const MatScalar *v;
124: PetscScalar *xk, *xj;
126: for (k = mbs - 1; k >= 0; k--) {
127: v = aa + bs2 * ai[k];
128: xk = x + k * bs; /* xk */
129: nz = ai[k + 1] - ai[k];
130: vj = aj + ai[k];
131: xj = x + (*vj) * bs;
132: while (nz--) {
133: /* xk += U(k,:)*x(:) */
134: PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
135: vj++;
136: v += bs2;
137: xj = x + (*vj) * bs;
138: }
139: }
140: return 0;
141: }
143: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
144: {
145: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
146: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
147: PetscInt bs = A->rmap->bs;
148: const MatScalar *aa = a->a;
149: PetscScalar *x;
150: const PetscScalar *b;
152: VecGetArrayRead(bb, &b);
153: VecGetArray(xx, &x);
155: /* solve U^T * D * y = b by forward substitution */
156: PetscArraycpy(x, b, bs * mbs); /* x <- b */
157: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);
159: /* solve U*x = y by back substitution */
160: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);
162: VecRestoreArrayRead(bb, &b);
163: VecRestoreArray(xx, &x);
164: PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs);
165: return 0;
166: }
168: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
169: {
170: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
171: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
172: PetscInt bs = A->rmap->bs;
173: const MatScalar *aa = a->a;
174: const PetscScalar *b;
175: PetscScalar *x;
177: VecGetArrayRead(bb, &b);
178: VecGetArray(xx, &x);
179: PetscArraycpy(x, b, bs * mbs); /* x <- b */
180: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);
181: VecRestoreArrayRead(bb, &b);
182: VecRestoreArray(xx, &x);
183: PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs);
184: return 0;
185: }
187: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
188: {
189: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
190: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
191: PetscInt bs = A->rmap->bs;
192: const MatScalar *aa = a->a;
193: const PetscScalar *b;
194: PetscScalar *x;
196: VecGetArrayRead(bb, &b);
197: VecGetArray(xx, &x);
198: PetscArraycpy(x, b, bs * mbs);
199: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x);
200: VecRestoreArrayRead(bb, &b);
201: VecRestoreArray(xx, &x);
202: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
203: return 0;
204: }
206: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
207: {
208: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
209: IS isrow = a->row;
210: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
211: PetscInt nz, k, idx;
212: const MatScalar *aa = a->a, *v, *d;
213: PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
214: const PetscScalar *b;
216: VecGetArrayRead(bb, &b);
217: VecGetArray(xx, &x);
218: t = a->solve_work;
219: ISGetIndices(isrow, &r);
221: /* solve U^T * D * y = b by forward substitution */
222: tp = t;
223: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
224: idx = 7 * r[k];
225: tp[0] = b[idx];
226: tp[1] = b[idx + 1];
227: tp[2] = b[idx + 2];
228: tp[3] = b[idx + 3];
229: tp[4] = b[idx + 4];
230: tp[5] = b[idx + 5];
231: tp[6] = b[idx + 6];
232: tp += 7;
233: }
235: for (k = 0; k < mbs; k++) {
236: v = aa + 49 * ai[k];
237: vj = aj + ai[k];
238: tp = t + k * 7;
239: x0 = tp[0];
240: x1 = tp[1];
241: x2 = tp[2];
242: x3 = tp[3];
243: x4 = tp[4];
244: x5 = tp[5];
245: x6 = tp[6];
246: nz = ai[k + 1] - ai[k];
247: tp = t + (*vj) * 7;
248: while (nz--) {
249: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
250: tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
251: tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
252: tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
253: tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
254: tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
255: tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
256: vj++;
257: tp = t + (*vj) * 7;
258: v += 49;
259: }
261: /* xk = inv(Dk)*(Dk*xk) */
262: d = aa + k * 49; /* ptr to inv(Dk) */
263: tp = t + k * 7;
264: tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
265: tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
266: tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
267: tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
268: tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
269: tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
270: tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
271: }
273: /* solve U*x = y by back substitution */
274: for (k = mbs - 1; k >= 0; k--) {
275: v = aa + 49 * ai[k];
276: vj = aj + ai[k];
277: tp = t + k * 7;
278: x0 = tp[0];
279: x1 = tp[1];
280: x2 = tp[2];
281: x3 = tp[3];
282: x4 = tp[4];
283: x5 = tp[5];
284: x6 = tp[6]; /* xk */
285: nz = ai[k + 1] - ai[k];
287: tp = t + (*vj) * 7;
288: while (nz--) {
289: /* xk += U(k,:)*x(:) */
290: x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
291: x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
292: x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
293: x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
294: x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
295: x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
296: x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
297: vj++;
298: tp = t + (*vj) * 7;
299: v += 49;
300: }
301: tp = t + k * 7;
302: tp[0] = x0;
303: tp[1] = x1;
304: tp[2] = x2;
305: tp[3] = x3;
306: tp[4] = x4;
307: tp[5] = x5;
308: tp[6] = x6;
309: idx = 7 * r[k];
310: x[idx] = x0;
311: x[idx + 1] = x1;
312: x[idx + 2] = x2;
313: x[idx + 3] = x3;
314: x[idx + 4] = x4;
315: x[idx + 5] = x5;
316: x[idx + 6] = x6;
317: }
319: ISRestoreIndices(isrow, &r);
320: VecRestoreArrayRead(bb, &b);
321: VecRestoreArray(xx, &x);
322: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
323: return 0;
324: }
326: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
327: {
328: const MatScalar *v, *d;
329: PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
330: PetscInt nz, k;
331: const PetscInt *vj;
333: for (k = 0; k < mbs; k++) {
334: v = aa + 49 * ai[k];
335: xp = x + k * 7;
336: x0 = xp[0];
337: x1 = xp[1];
338: x2 = xp[2];
339: x3 = xp[3];
340: x4 = xp[4];
341: x5 = xp[5];
342: x6 = xp[6]; /* Dk*xk = k-th block of x */
343: nz = ai[k + 1] - ai[k];
344: vj = aj + ai[k];
345: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
346: PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
347: while (nz--) {
348: xp = x + (*vj) * 7;
349: /* x(:) += U(k,:)^T*(Dk*xk) */
350: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
351: xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
352: xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
353: xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
354: xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
355: xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
356: xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
357: vj++;
358: v += 49;
359: }
360: /* xk = inv(Dk)*(Dk*xk) */
361: d = aa + k * 49; /* ptr to inv(Dk) */
362: xp = x + k * 7;
363: xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
364: xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
365: xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
366: xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
367: xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
368: xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
369: xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
370: }
371: return 0;
372: }
374: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
375: {
376: const MatScalar *v;
377: PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
378: PetscInt nz, k;
379: const PetscInt *vj;
381: for (k = mbs - 1; k >= 0; k--) {
382: v = aa + 49 * ai[k];
383: xp = x + k * 7;
384: x0 = xp[0];
385: x1 = xp[1];
386: x2 = xp[2];
387: x3 = xp[3];
388: x4 = xp[4];
389: x5 = xp[5];
390: x6 = xp[6]; /* xk */
391: nz = ai[k + 1] - ai[k];
392: vj = aj + ai[k];
393: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
394: PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
395: while (nz--) {
396: xp = x + (*vj) * 7;
397: /* xk += U(k,:)*x(:) */
398: x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
399: x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
400: x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
401: x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
402: x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
403: x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
404: x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
405: vj++;
406: v += 49;
407: }
408: xp = x + k * 7;
409: xp[0] = x0;
410: xp[1] = x1;
411: xp[2] = x2;
412: xp[3] = x3;
413: xp[4] = x4;
414: xp[5] = x5;
415: xp[6] = x6;
416: }
417: return 0;
418: }
420: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
421: {
422: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
423: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
424: const MatScalar *aa = a->a;
425: PetscScalar *x;
426: const PetscScalar *b;
428: VecGetArrayRead(bb, &b);
429: VecGetArray(xx, &x);
431: /* solve U^T * D * y = b by forward substitution */
432: PetscArraycpy(x, b, 7 * mbs); /* x <- b */
433: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);
435: /* solve U*x = y by back substitution */
436: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);
438: VecRestoreArrayRead(bb, &b);
439: VecRestoreArray(xx, &x);
440: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
441: return 0;
442: }
444: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
445: {
446: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
447: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
448: const MatScalar *aa = a->a;
449: PetscScalar *x;
450: const PetscScalar *b;
452: VecGetArrayRead(bb, &b);
453: VecGetArray(xx, &x);
454: PetscArraycpy(x, b, 7 * mbs);
455: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);
456: VecRestoreArrayRead(bb, &b);
457: VecRestoreArray(xx, &x);
458: PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
459: return 0;
460: }
462: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
463: {
464: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
465: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
466: const MatScalar *aa = a->a;
467: PetscScalar *x;
468: const PetscScalar *b;
470: VecGetArrayRead(bb, &b);
471: VecGetArray(xx, &x);
472: PetscArraycpy(x, b, 7 * mbs);
473: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x);
474: VecRestoreArrayRead(bb, &b);
475: VecRestoreArray(xx, &x);
476: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
477: return 0;
478: }
480: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
481: {
482: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
483: IS isrow = a->row;
484: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
485: PetscInt nz, k, idx;
486: const MatScalar *aa = a->a, *v, *d;
487: PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp;
488: const PetscScalar *b;
490: VecGetArrayRead(bb, &b);
491: VecGetArray(xx, &x);
492: t = a->solve_work;
493: ISGetIndices(isrow, &r);
495: /* solve U^T * D * y = b by forward substitution */
496: tp = t;
497: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
498: idx = 6 * r[k];
499: tp[0] = b[idx];
500: tp[1] = b[idx + 1];
501: tp[2] = b[idx + 2];
502: tp[3] = b[idx + 3];
503: tp[4] = b[idx + 4];
504: tp[5] = b[idx + 5];
505: tp += 6;
506: }
508: for (k = 0; k < mbs; k++) {
509: v = aa + 36 * ai[k];
510: vj = aj + ai[k];
511: tp = t + k * 6;
512: x0 = tp[0];
513: x1 = tp[1];
514: x2 = tp[2];
515: x3 = tp[3];
516: x4 = tp[4];
517: x5 = tp[5];
518: nz = ai[k + 1] - ai[k];
519: tp = t + (*vj) * 6;
520: while (nz--) {
521: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
522: tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
523: tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
524: tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
525: tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
526: tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
527: vj++;
528: tp = t + (*vj) * 6;
529: v += 36;
530: }
532: /* xk = inv(Dk)*(Dk*xk) */
533: d = aa + k * 36; /* ptr to inv(Dk) */
534: tp = t + k * 6;
535: tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
536: tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
537: tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
538: tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
539: tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
540: tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
541: }
543: /* solve U*x = y by back substitution */
544: for (k = mbs - 1; k >= 0; k--) {
545: v = aa + 36 * ai[k];
546: vj = aj + ai[k];
547: tp = t + k * 6;
548: x0 = tp[0];
549: x1 = tp[1];
550: x2 = tp[2];
551: x3 = tp[3];
552: x4 = tp[4];
553: x5 = tp[5]; /* xk */
554: nz = ai[k + 1] - ai[k];
556: tp = t + (*vj) * 6;
557: while (nz--) {
558: /* xk += U(k,:)*x(:) */
559: x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
560: x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
561: x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
562: x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
563: x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
564: x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
565: vj++;
566: tp = t + (*vj) * 6;
567: v += 36;
568: }
569: tp = t + k * 6;
570: tp[0] = x0;
571: tp[1] = x1;
572: tp[2] = x2;
573: tp[3] = x3;
574: tp[4] = x4;
575: tp[5] = x5;
576: idx = 6 * r[k];
577: x[idx] = x0;
578: x[idx + 1] = x1;
579: x[idx + 2] = x2;
580: x[idx + 3] = x3;
581: x[idx + 4] = x4;
582: x[idx + 5] = x5;
583: }
585: ISRestoreIndices(isrow, &r);
586: VecRestoreArrayRead(bb, &b);
587: VecRestoreArray(xx, &x);
588: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
589: return 0;
590: }
592: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
593: {
594: const MatScalar *v, *d;
595: PetscScalar *xp, x0, x1, x2, x3, x4, x5;
596: PetscInt nz, k;
597: const PetscInt *vj;
599: for (k = 0; k < mbs; k++) {
600: v = aa + 36 * ai[k];
601: xp = x + k * 6;
602: x0 = xp[0];
603: x1 = xp[1];
604: x2 = xp[2];
605: x3 = xp[3];
606: x4 = xp[4];
607: x5 = xp[5]; /* Dk*xk = k-th block of x */
608: nz = ai[k + 1] - ai[k];
609: vj = aj + ai[k];
610: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
611: PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
612: while (nz--) {
613: xp = x + (*vj) * 6;
614: /* x(:) += U(k,:)^T*(Dk*xk) */
615: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
616: xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
617: xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
618: xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
619: xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
620: xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
621: vj++;
622: v += 36;
623: }
624: /* xk = inv(Dk)*(Dk*xk) */
625: d = aa + k * 36; /* ptr to inv(Dk) */
626: xp = x + k * 6;
627: xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
628: xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
629: xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
630: xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
631: xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
632: xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
633: }
634: return 0;
635: }
636: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
637: {
638: const MatScalar *v;
639: PetscScalar *xp, x0, x1, x2, x3, x4, x5;
640: PetscInt nz, k;
641: const PetscInt *vj;
643: for (k = mbs - 1; k >= 0; k--) {
644: v = aa + 36 * ai[k];
645: xp = x + k * 6;
646: x0 = xp[0];
647: x1 = xp[1];
648: x2 = xp[2];
649: x3 = xp[3];
650: x4 = xp[4];
651: x5 = xp[5]; /* xk */
652: nz = ai[k + 1] - ai[k];
653: vj = aj + ai[k];
654: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
655: PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
656: while (nz--) {
657: xp = x + (*vj) * 6;
658: /* xk += U(k,:)*x(:) */
659: x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
660: x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
661: x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
662: x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
663: x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
664: x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
665: vj++;
666: v += 36;
667: }
668: xp = x + k * 6;
669: xp[0] = x0;
670: xp[1] = x1;
671: xp[2] = x2;
672: xp[3] = x3;
673: xp[4] = x4;
674: xp[5] = x5;
675: }
676: return 0;
677: }
679: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
680: {
681: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
682: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
683: const MatScalar *aa = a->a;
684: PetscScalar *x;
685: const PetscScalar *b;
687: VecGetArrayRead(bb, &b);
688: VecGetArray(xx, &x);
690: /* solve U^T * D * y = b by forward substitution */
691: PetscArraycpy(x, b, 6 * mbs); /* x <- b */
692: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);
694: /* solve U*x = y by back substitution */
695: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);
697: VecRestoreArrayRead(bb, &b);
698: VecRestoreArray(xx, &x);
699: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
700: return 0;
701: }
703: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
704: {
705: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
706: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
707: const MatScalar *aa = a->a;
708: PetscScalar *x;
709: const PetscScalar *b;
711: VecGetArrayRead(bb, &b);
712: VecGetArray(xx, &x);
713: PetscArraycpy(x, b, 6 * mbs); /* x <- b */
714: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);
715: VecRestoreArrayRead(bb, &b);
716: VecRestoreArray(xx, &x);
717: PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
718: return 0;
719: }
721: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
722: {
723: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
724: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
725: const MatScalar *aa = a->a;
726: PetscScalar *x;
727: const PetscScalar *b;
729: VecGetArrayRead(bb, &b);
730: VecGetArray(xx, &x);
731: PetscArraycpy(x, b, 6 * mbs); /* x <- b */
732: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x);
733: VecRestoreArrayRead(bb, &b);
734: VecRestoreArray(xx, &x);
735: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
736: return 0;
737: }
739: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
740: {
741: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
742: IS isrow = a->row;
743: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
744: const PetscInt *r, *vj;
745: PetscInt nz, k, idx;
746: const MatScalar *aa = a->a, *v, *diag;
747: PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp;
748: const PetscScalar *b;
750: VecGetArrayRead(bb, &b);
751: VecGetArray(xx, &x);
752: t = a->solve_work;
753: ISGetIndices(isrow, &r);
755: /* solve U^T * D * y = b by forward substitution */
756: tp = t;
757: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
758: idx = 5 * r[k];
759: tp[0] = b[idx];
760: tp[1] = b[idx + 1];
761: tp[2] = b[idx + 2];
762: tp[3] = b[idx + 3];
763: tp[4] = b[idx + 4];
764: tp += 5;
765: }
767: for (k = 0; k < mbs; k++) {
768: v = aa + 25 * ai[k];
769: vj = aj + ai[k];
770: tp = t + k * 5;
771: x0 = tp[0];
772: x1 = tp[1];
773: x2 = tp[2];
774: x3 = tp[3];
775: x4 = tp[4];
776: nz = ai[k + 1] - ai[k];
778: tp = t + (*vj) * 5;
779: while (nz--) {
780: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
781: tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
782: tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
783: tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
784: tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
785: vj++;
786: tp = t + (*vj) * 5;
787: v += 25;
788: }
790: /* xk = inv(Dk)*(Dk*xk) */
791: diag = aa + k * 25; /* ptr to inv(Dk) */
792: tp = t + k * 5;
793: tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
794: tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
795: tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
796: tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
797: tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
798: }
800: /* solve U*x = y by back substitution */
801: for (k = mbs - 1; k >= 0; k--) {
802: v = aa + 25 * ai[k];
803: vj = aj + ai[k];
804: tp = t + k * 5;
805: x0 = tp[0];
806: x1 = tp[1];
807: x2 = tp[2];
808: x3 = tp[3];
809: x4 = tp[4]; /* xk */
810: nz = ai[k + 1] - ai[k];
812: tp = t + (*vj) * 5;
813: while (nz--) {
814: /* xk += U(k,:)*x(:) */
815: x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
816: x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
817: x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
818: x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
819: x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
820: vj++;
821: tp = t + (*vj) * 5;
822: v += 25;
823: }
824: tp = t + k * 5;
825: tp[0] = x0;
826: tp[1] = x1;
827: tp[2] = x2;
828: tp[3] = x3;
829: tp[4] = x4;
830: idx = 5 * r[k];
831: x[idx] = x0;
832: x[idx + 1] = x1;
833: x[idx + 2] = x2;
834: x[idx + 3] = x3;
835: x[idx + 4] = x4;
836: }
838: ISRestoreIndices(isrow, &r);
839: VecRestoreArrayRead(bb, &b);
840: VecRestoreArray(xx, &x);
841: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
842: return 0;
843: }
845: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
846: {
847: const MatScalar *v, *diag;
848: PetscScalar *xp, x0, x1, x2, x3, x4;
849: PetscInt nz, k;
850: const PetscInt *vj;
852: for (k = 0; k < mbs; k++) {
853: v = aa + 25 * ai[k];
854: xp = x + k * 5;
855: x0 = xp[0];
856: x1 = xp[1];
857: x2 = xp[2];
858: x3 = xp[3];
859: x4 = xp[4]; /* Dk*xk = k-th block of x */
860: nz = ai[k + 1] - ai[k];
861: vj = aj + ai[k];
862: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
863: PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864: while (nz--) {
865: xp = x + (*vj) * 5;
866: /* x(:) += U(k,:)^T*(Dk*xk) */
867: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
868: xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
869: xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
870: xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
871: xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
872: vj++;
873: v += 25;
874: }
875: /* xk = inv(Dk)*(Dk*xk) */
876: diag = aa + k * 25; /* ptr to inv(Dk) */
877: xp = x + k * 5;
878: xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
879: xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
880: xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
881: xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
882: xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
883: }
884: return 0;
885: }
887: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
888: {
889: const MatScalar *v;
890: PetscScalar *xp, x0, x1, x2, x3, x4;
891: PetscInt nz, k;
892: const PetscInt *vj;
894: for (k = mbs - 1; k >= 0; k--) {
895: v = aa + 25 * ai[k];
896: xp = x + k * 5;
897: x0 = xp[0];
898: x1 = xp[1];
899: x2 = xp[2];
900: x3 = xp[3];
901: x4 = xp[4]; /* xk */
902: nz = ai[k + 1] - ai[k];
903: vj = aj + ai[k];
904: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
905: PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
906: while (nz--) {
907: xp = x + (*vj) * 5;
908: /* xk += U(k,:)*x(:) */
909: x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
910: x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
911: x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
912: x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
913: x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
914: vj++;
915: v += 25;
916: }
917: xp = x + k * 5;
918: xp[0] = x0;
919: xp[1] = x1;
920: xp[2] = x2;
921: xp[3] = x3;
922: xp[4] = x4;
923: }
924: return 0;
925: }
927: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
928: {
929: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
930: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
931: const MatScalar *aa = a->a;
932: PetscScalar *x;
933: const PetscScalar *b;
935: VecGetArrayRead(bb, &b);
936: VecGetArray(xx, &x);
938: /* solve U^T * D * y = b by forward substitution */
939: PetscArraycpy(x, b, 5 * mbs); /* x <- b */
940: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);
942: /* solve U*x = y by back substitution */
943: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);
945: VecRestoreArrayRead(bb, &b);
946: VecRestoreArray(xx, &x);
947: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
948: return 0;
949: }
951: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
952: {
953: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
954: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
955: const MatScalar *aa = a->a;
956: PetscScalar *x;
957: const PetscScalar *b;
959: VecGetArrayRead(bb, &b);
960: VecGetArray(xx, &x);
961: PetscArraycpy(x, b, 5 * mbs); /* x <- b */
962: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);
963: VecRestoreArrayRead(bb, &b);
964: VecRestoreArray(xx, &x);
965: PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
966: return 0;
967: }
969: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
970: {
971: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
972: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
973: const MatScalar *aa = a->a;
974: PetscScalar *x;
975: const PetscScalar *b;
977: VecGetArrayRead(bb, &b);
978: VecGetArray(xx, &x);
979: PetscArraycpy(x, b, 5 * mbs);
980: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x);
981: VecRestoreArrayRead(bb, &b);
982: VecRestoreArray(xx, &x);
983: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
984: return 0;
985: }
987: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
988: {
989: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
990: IS isrow = a->row;
991: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
992: const PetscInt *r, *vj;
993: PetscInt nz, k, idx;
994: const MatScalar *aa = a->a, *v, *diag;
995: PetscScalar *x, x0, x1, x2, x3, *t, *tp;
996: const PetscScalar *b;
998: VecGetArrayRead(bb, &b);
999: VecGetArray(xx, &x);
1000: t = a->solve_work;
1001: ISGetIndices(isrow, &r);
1003: /* solve U^T * D * y = b by forward substitution */
1004: tp = t;
1005: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1006: idx = 4 * r[k];
1007: tp[0] = b[idx];
1008: tp[1] = b[idx + 1];
1009: tp[2] = b[idx + 2];
1010: tp[3] = b[idx + 3];
1011: tp += 4;
1012: }
1014: for (k = 0; k < mbs; k++) {
1015: v = aa + 16 * ai[k];
1016: vj = aj + ai[k];
1017: tp = t + k * 4;
1018: x0 = tp[0];
1019: x1 = tp[1];
1020: x2 = tp[2];
1021: x3 = tp[3];
1022: nz = ai[k + 1] - ai[k];
1024: tp = t + (*vj) * 4;
1025: while (nz--) {
1026: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1027: tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1028: tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1029: tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1030: vj++;
1031: tp = t + (*vj) * 4;
1032: v += 16;
1033: }
1035: /* xk = inv(Dk)*(Dk*xk) */
1036: diag = aa + k * 16; /* ptr to inv(Dk) */
1037: tp = t + k * 4;
1038: tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1039: tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1040: tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1041: tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1042: }
1044: /* solve U*x = y by back substitution */
1045: for (k = mbs - 1; k >= 0; k--) {
1046: v = aa + 16 * ai[k];
1047: vj = aj + ai[k];
1048: tp = t + k * 4;
1049: x0 = tp[0];
1050: x1 = tp[1];
1051: x2 = tp[2];
1052: x3 = tp[3]; /* xk */
1053: nz = ai[k + 1] - ai[k];
1055: tp = t + (*vj) * 4;
1056: while (nz--) {
1057: /* xk += U(k,:)*x(:) */
1058: x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1059: x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1060: x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1061: x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1062: vj++;
1063: tp = t + (*vj) * 4;
1064: v += 16;
1065: }
1066: tp = t + k * 4;
1067: tp[0] = x0;
1068: tp[1] = x1;
1069: tp[2] = x2;
1070: tp[3] = x3;
1071: idx = 4 * r[k];
1072: x[idx] = x0;
1073: x[idx + 1] = x1;
1074: x[idx + 2] = x2;
1075: x[idx + 3] = x3;
1076: }
1078: ISRestoreIndices(isrow, &r);
1079: VecRestoreArrayRead(bb, &b);
1080: VecRestoreArray(xx, &x);
1081: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1082: return 0;
1083: }
1085: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1086: {
1087: const MatScalar *v, *diag;
1088: PetscScalar *xp, x0, x1, x2, x3;
1089: PetscInt nz, k;
1090: const PetscInt *vj;
1092: for (k = 0; k < mbs; k++) {
1093: v = aa + 16 * ai[k];
1094: xp = x + k * 4;
1095: x0 = xp[0];
1096: x1 = xp[1];
1097: x2 = xp[2];
1098: x3 = xp[3]; /* Dk*xk = k-th block of x */
1099: nz = ai[k + 1] - ai[k];
1100: vj = aj + ai[k];
1101: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102: PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103: while (nz--) {
1104: xp = x + (*vj) * 4;
1105: /* x(:) += U(k,:)^T*(Dk*xk) */
1106: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1107: xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1108: xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1109: xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1110: vj++;
1111: v += 16;
1112: }
1113: /* xk = inv(Dk)*(Dk*xk) */
1114: diag = aa + k * 16; /* ptr to inv(Dk) */
1115: xp = x + k * 4;
1116: xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1117: xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1118: xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1119: xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1120: }
1121: return 0;
1122: }
1124: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1125: {
1126: const MatScalar *v;
1127: PetscScalar *xp, x0, x1, x2, x3;
1128: PetscInt nz, k;
1129: const PetscInt *vj;
1131: for (k = mbs - 1; k >= 0; k--) {
1132: v = aa + 16 * ai[k];
1133: xp = x + k * 4;
1134: x0 = xp[0];
1135: x1 = xp[1];
1136: x2 = xp[2];
1137: x3 = xp[3]; /* xk */
1138: nz = ai[k + 1] - ai[k];
1139: vj = aj + ai[k];
1140: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1141: PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1142: while (nz--) {
1143: xp = x + (*vj) * 4;
1144: /* xk += U(k,:)*x(:) */
1145: x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1146: x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1147: x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1148: x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1149: vj++;
1150: v += 16;
1151: }
1152: xp = x + k * 4;
1153: xp[0] = x0;
1154: xp[1] = x1;
1155: xp[2] = x2;
1156: xp[3] = x3;
1157: }
1158: return 0;
1159: }
1161: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1162: {
1163: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1164: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1165: const MatScalar *aa = a->a;
1166: PetscScalar *x;
1167: const PetscScalar *b;
1169: VecGetArrayRead(bb, &b);
1170: VecGetArray(xx, &x);
1172: /* solve U^T * D * y = b by forward substitution */
1173: PetscArraycpy(x, b, 4 * mbs); /* x <- b */
1174: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1176: /* solve U*x = y by back substitution */
1177: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1178: VecRestoreArrayRead(bb, &b);
1179: VecRestoreArray(xx, &x);
1180: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1181: return 0;
1182: }
1184: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1185: {
1186: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1187: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1188: const MatScalar *aa = a->a;
1189: PetscScalar *x;
1190: const PetscScalar *b;
1192: VecGetArrayRead(bb, &b);
1193: VecGetArray(xx, &x);
1194: PetscArraycpy(x, b, 4 * mbs); /* x <- b */
1195: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1196: VecRestoreArrayRead(bb, &b);
1197: VecRestoreArray(xx, &x);
1198: PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
1199: return 0;
1200: }
1202: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1203: {
1204: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1205: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1206: const MatScalar *aa = a->a;
1207: PetscScalar *x;
1208: const PetscScalar *b;
1210: VecGetArrayRead(bb, &b);
1211: VecGetArray(xx, &x);
1212: PetscArraycpy(x, b, 4 * mbs);
1213: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x);
1214: VecRestoreArrayRead(bb, &b);
1215: VecRestoreArray(xx, &x);
1216: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
1217: return 0;
1218: }
1220: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1221: {
1222: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1223: IS isrow = a->row;
1224: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1225: const PetscInt *r;
1226: PetscInt nz, k, idx;
1227: const PetscInt *vj;
1228: const MatScalar *aa = a->a, *v, *diag;
1229: PetscScalar *x, x0, x1, x2, *t, *tp;
1230: const PetscScalar *b;
1232: VecGetArrayRead(bb, &b);
1233: VecGetArray(xx, &x);
1234: t = a->solve_work;
1235: ISGetIndices(isrow, &r);
1237: /* solve U^T * D * y = b by forward substitution */
1238: tp = t;
1239: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1240: idx = 3 * r[k];
1241: tp[0] = b[idx];
1242: tp[1] = b[idx + 1];
1243: tp[2] = b[idx + 2];
1244: tp += 3;
1245: }
1247: for (k = 0; k < mbs; k++) {
1248: v = aa + 9 * ai[k];
1249: vj = aj + ai[k];
1250: tp = t + k * 3;
1251: x0 = tp[0];
1252: x1 = tp[1];
1253: x2 = tp[2];
1254: nz = ai[k + 1] - ai[k];
1256: tp = t + (*vj) * 3;
1257: while (nz--) {
1258: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1259: tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1260: tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1261: vj++;
1262: tp = t + (*vj) * 3;
1263: v += 9;
1264: }
1266: /* xk = inv(Dk)*(Dk*xk) */
1267: diag = aa + k * 9; /* ptr to inv(Dk) */
1268: tp = t + k * 3;
1269: tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1270: tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1271: tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1272: }
1274: /* solve U*x = y by back substitution */
1275: for (k = mbs - 1; k >= 0; k--) {
1276: v = aa + 9 * ai[k];
1277: vj = aj + ai[k];
1278: tp = t + k * 3;
1279: x0 = tp[0];
1280: x1 = tp[1];
1281: x2 = tp[2]; /* xk */
1282: nz = ai[k + 1] - ai[k];
1284: tp = t + (*vj) * 3;
1285: while (nz--) {
1286: /* xk += U(k,:)*x(:) */
1287: x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1288: x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1289: x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1290: vj++;
1291: tp = t + (*vj) * 3;
1292: v += 9;
1293: }
1294: tp = t + k * 3;
1295: tp[0] = x0;
1296: tp[1] = x1;
1297: tp[2] = x2;
1298: idx = 3 * r[k];
1299: x[idx] = x0;
1300: x[idx + 1] = x1;
1301: x[idx + 2] = x2;
1302: }
1304: ISRestoreIndices(isrow, &r);
1305: VecRestoreArrayRead(bb, &b);
1306: VecRestoreArray(xx, &x);
1307: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1308: return 0;
1309: }
1311: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1312: {
1313: const MatScalar *v, *diag;
1314: PetscScalar *xp, x0, x1, x2;
1315: PetscInt nz, k;
1316: const PetscInt *vj;
1318: for (k = 0; k < mbs; k++) {
1319: v = aa + 9 * ai[k];
1320: xp = x + k * 3;
1321: x0 = xp[0];
1322: x1 = xp[1];
1323: x2 = xp[2]; /* Dk*xk = k-th block of x */
1324: nz = ai[k + 1] - ai[k];
1325: vj = aj + ai[k];
1326: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1327: PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1328: while (nz--) {
1329: xp = x + (*vj) * 3;
1330: /* x(:) += U(k,:)^T*(Dk*xk) */
1331: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1332: xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1333: xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1334: vj++;
1335: v += 9;
1336: }
1337: /* xk = inv(Dk)*(Dk*xk) */
1338: diag = aa + k * 9; /* ptr to inv(Dk) */
1339: xp = x + k * 3;
1340: xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1341: xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1342: xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1343: }
1344: return 0;
1345: }
1347: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1348: {
1349: const MatScalar *v;
1350: PetscScalar *xp, x0, x1, x2;
1351: PetscInt nz, k;
1352: const PetscInt *vj;
1354: for (k = mbs - 1; k >= 0; k--) {
1355: v = aa + 9 * ai[k];
1356: xp = x + k * 3;
1357: x0 = xp[0];
1358: x1 = xp[1];
1359: x2 = xp[2]; /* xk */
1360: nz = ai[k + 1] - ai[k];
1361: vj = aj + ai[k];
1362: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1363: PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1364: while (nz--) {
1365: xp = x + (*vj) * 3;
1366: /* xk += U(k,:)*x(:) */
1367: x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1368: x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1369: x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1370: vj++;
1371: v += 9;
1372: }
1373: xp = x + k * 3;
1374: xp[0] = x0;
1375: xp[1] = x1;
1376: xp[2] = x2;
1377: }
1378: return 0;
1379: }
1381: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1382: {
1383: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1384: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1385: const MatScalar *aa = a->a;
1386: PetscScalar *x;
1387: const PetscScalar *b;
1389: VecGetArrayRead(bb, &b);
1390: VecGetArray(xx, &x);
1392: /* solve U^T * D * y = b by forward substitution */
1393: PetscArraycpy(x, b, 3 * mbs);
1394: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);
1396: /* solve U*x = y by back substitution */
1397: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);
1399: VecRestoreArrayRead(bb, &b);
1400: VecRestoreArray(xx, &x);
1401: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1402: return 0;
1403: }
1405: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1406: {
1407: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1408: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1409: const MatScalar *aa = a->a;
1410: PetscScalar *x;
1411: const PetscScalar *b;
1413: VecGetArrayRead(bb, &b);
1414: VecGetArray(xx, &x);
1415: PetscArraycpy(x, b, 3 * mbs);
1416: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);
1417: VecRestoreArrayRead(bb, &b);
1418: VecRestoreArray(xx, &x);
1419: PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
1420: return 0;
1421: }
1423: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1424: {
1425: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1426: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1427: const MatScalar *aa = a->a;
1428: PetscScalar *x;
1429: const PetscScalar *b;
1431: VecGetArrayRead(bb, &b);
1432: VecGetArray(xx, &x);
1433: PetscArraycpy(x, b, 3 * mbs);
1434: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x);
1435: VecRestoreArrayRead(bb, &b);
1436: VecRestoreArray(xx, &x);
1437: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
1438: return 0;
1439: }
1441: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1442: {
1443: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1444: IS isrow = a->row;
1445: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1446: const PetscInt *r, *vj;
1447: PetscInt nz, k, k2, idx;
1448: const MatScalar *aa = a->a, *v, *diag;
1449: PetscScalar *x, x0, x1, *t;
1450: const PetscScalar *b;
1452: VecGetArrayRead(bb, &b);
1453: VecGetArray(xx, &x);
1454: t = a->solve_work;
1455: ISGetIndices(isrow, &r);
1457: /* solve U^T * D * y = perm(b) by forward substitution */
1458: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1459: idx = 2 * r[k];
1460: t[k * 2] = b[idx];
1461: t[k * 2 + 1] = b[idx + 1];
1462: }
1463: for (k = 0; k < mbs; k++) {
1464: v = aa + 4 * ai[k];
1465: vj = aj + ai[k];
1466: k2 = k * 2;
1467: x0 = t[k2];
1468: x1 = t[k2 + 1];
1469: nz = ai[k + 1] - ai[k];
1470: while (nz--) {
1471: t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1472: t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1473: vj++;
1474: v += 4;
1475: }
1476: diag = aa + k * 4; /* ptr to inv(Dk) */
1477: t[k2] = diag[0] * x0 + diag[2] * x1;
1478: t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1479: }
1481: /* solve U*x = y by back substitution */
1482: for (k = mbs - 1; k >= 0; k--) {
1483: v = aa + 4 * ai[k];
1484: vj = aj + ai[k];
1485: k2 = k * 2;
1486: x0 = t[k2];
1487: x1 = t[k2 + 1];
1488: nz = ai[k + 1] - ai[k];
1489: while (nz--) {
1490: x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1491: x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1492: vj++;
1493: v += 4;
1494: }
1495: t[k2] = x0;
1496: t[k2 + 1] = x1;
1497: idx = 2 * r[k];
1498: x[idx] = x0;
1499: x[idx + 1] = x1;
1500: }
1502: ISRestoreIndices(isrow, &r);
1503: VecRestoreArrayRead(bb, &b);
1504: VecRestoreArray(xx, &x);
1505: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1506: return 0;
1507: }
1509: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1510: {
1511: const MatScalar *v, *diag;
1512: PetscScalar x0, x1;
1513: PetscInt nz, k, k2;
1514: const PetscInt *vj;
1516: for (k = 0; k < mbs; k++) {
1517: v = aa + 4 * ai[k];
1518: vj = aj + ai[k];
1519: k2 = k * 2;
1520: x0 = x[k2];
1521: x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1522: nz = ai[k + 1] - ai[k];
1523: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1524: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1525: while (nz--) {
1526: /* x(:) += U(k,:)^T*(Dk*xk) */
1527: x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1528: x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1529: vj++;
1530: v += 4;
1531: }
1532: /* xk = inv(Dk)*(Dk*xk) */
1533: diag = aa + k * 4; /* ptr to inv(Dk) */
1534: x[k2] = diag[0] * x0 + diag[2] * x1;
1535: x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1536: }
1537: return 0;
1538: }
1540: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1541: {
1542: const MatScalar *v;
1543: PetscScalar x0, x1;
1544: PetscInt nz, k, k2;
1545: const PetscInt *vj;
1547: for (k = mbs - 1; k >= 0; k--) {
1548: v = aa + 4 * ai[k];
1549: vj = aj + ai[k];
1550: k2 = k * 2;
1551: x0 = x[k2];
1552: x1 = x[k2 + 1]; /* xk */
1553: nz = ai[k + 1] - ai[k];
1554: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1555: PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1556: while (nz--) {
1557: /* xk += U(k,:)*x(:) */
1558: x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1559: x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1560: vj++;
1561: v += 4;
1562: }
1563: x[k2] = x0;
1564: x[k2 + 1] = x1;
1565: }
1566: return 0;
1567: }
1569: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1570: {
1571: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1572: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1573: const MatScalar *aa = a->a;
1574: PetscScalar *x;
1575: const PetscScalar *b;
1577: VecGetArrayRead(bb, &b);
1578: VecGetArray(xx, &x);
1580: /* solve U^T * D * y = b by forward substitution */
1581: PetscArraycpy(x, b, 2 * mbs);
1582: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);
1584: /* solve U*x = y by back substitution */
1585: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);
1587: VecRestoreArrayRead(bb, &b);
1588: VecRestoreArray(xx, &x);
1589: PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs);
1590: return 0;
1591: }
1593: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1594: {
1595: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1596: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1597: const MatScalar *aa = a->a;
1598: PetscScalar *x;
1599: const PetscScalar *b;
1601: VecGetArrayRead(bb, &b);
1602: VecGetArray(xx, &x);
1603: PetscArraycpy(x, b, 2 * mbs);
1604: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);
1605: VecRestoreArrayRead(bb, &b);
1606: VecRestoreArray(xx, &x);
1607: PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs);
1608: return 0;
1609: }
1611: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1612: {
1613: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1614: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1615: const MatScalar *aa = a->a;
1616: PetscScalar *x;
1617: const PetscScalar *b;
1619: VecGetArrayRead(bb, &b);
1620: VecGetArray(xx, &x);
1621: PetscArraycpy(x, b, 2 * mbs);
1622: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x);
1623: VecRestoreArrayRead(bb, &b);
1624: VecRestoreArray(xx, &x);
1625: PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs));
1626: return 0;
1627: }
1629: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1630: {
1631: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1632: IS isrow = a->row;
1633: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1634: const MatScalar *aa = a->a, *v;
1635: const PetscScalar *b;
1636: PetscScalar *x, xk, *t;
1637: PetscInt nz, k, j;
1639: VecGetArrayRead(bb, &b);
1640: VecGetArray(xx, &x);
1641: t = a->solve_work;
1642: ISGetIndices(isrow, &rp);
1644: /* solve U^T*D*y = perm(b) by forward substitution */
1645: for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1646: for (k = 0; k < mbs; k++) {
1647: v = aa + ai[k];
1648: vj = aj + ai[k];
1649: xk = t[k];
1650: nz = ai[k + 1] - ai[k] - 1;
1651: for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1652: t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1653: }
1655: /* solve U*perm(x) = y by back substitution */
1656: for (k = mbs - 1; k >= 0; k--) {
1657: v = aa + adiag[k] - 1;
1658: vj = aj + adiag[k] - 1;
1659: nz = ai[k + 1] - ai[k] - 1;
1660: for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1661: x[rp[k]] = t[k];
1662: }
1664: ISRestoreIndices(isrow, &rp);
1665: VecRestoreArrayRead(bb, &b);
1666: VecRestoreArray(xx, &x);
1667: PetscLogFlops(4.0 * a->nz - 3.0 * mbs);
1668: return 0;
1669: }
1671: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1672: {
1673: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1674: IS isrow = a->row;
1675: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1676: const MatScalar *aa = a->a, *v;
1677: PetscScalar *x, xk, *t;
1678: const PetscScalar *b;
1679: PetscInt nz, k;
1681: VecGetArrayRead(bb, &b);
1682: VecGetArray(xx, &x);
1683: t = a->solve_work;
1684: ISGetIndices(isrow, &rp);
1686: /* solve U^T*D*y = perm(b) by forward substitution */
1687: for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1688: for (k = 0; k < mbs; k++) {
1689: v = aa + ai[k] + 1;
1690: vj = aj + ai[k] + 1;
1691: xk = t[k];
1692: nz = ai[k + 1] - ai[k] - 1;
1693: while (nz--) t[*vj++] += (*v++) * xk;
1694: t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1695: }
1697: /* solve U*perm(x) = y by back substitution */
1698: for (k = mbs - 1; k >= 0; k--) {
1699: v = aa + ai[k] + 1;
1700: vj = aj + ai[k] + 1;
1701: nz = ai[k + 1] - ai[k] - 1;
1702: while (nz--) t[k] += (*v++) * t[*vj++];
1703: x[rp[k]] = t[k];
1704: }
1706: ISRestoreIndices(isrow, &rp);
1707: VecRestoreArrayRead(bb, &b);
1708: VecRestoreArray(xx, &x);
1709: PetscLogFlops(4.0 * a->nz - 3 * mbs);
1710: return 0;
1711: }
1713: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1714: {
1715: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1716: IS isrow = a->row;
1717: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1718: const MatScalar *aa = a->a, *v;
1719: PetscReal diagk;
1720: PetscScalar *x, xk;
1721: const PetscScalar *b;
1722: PetscInt nz, k;
1724: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1725: VecGetArrayRead(bb, &b);
1726: VecGetArray(xx, &x);
1727: ISGetIndices(isrow, &rp);
1729: for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1730: for (k = 0; k < mbs; k++) {
1731: v = aa + ai[k];
1732: vj = aj + ai[k];
1733: xk = x[k];
1734: nz = ai[k + 1] - ai[k] - 1;
1735: while (nz--) x[*vj++] += (*v++) * xk;
1737: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1739: x[k] = xk * PetscSqrtReal(diagk);
1740: }
1741: ISRestoreIndices(isrow, &rp);
1742: VecRestoreArrayRead(bb, &b);
1743: VecRestoreArray(xx, &x);
1744: PetscLogFlops(2.0 * a->nz - mbs);
1745: return 0;
1746: }
1748: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1749: {
1750: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1751: IS isrow = a->row;
1752: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1753: const MatScalar *aa = a->a, *v;
1754: PetscReal diagk;
1755: PetscScalar *x, xk;
1756: const PetscScalar *b;
1757: PetscInt nz, k;
1759: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1760: VecGetArrayRead(bb, &b);
1761: VecGetArray(xx, &x);
1762: ISGetIndices(isrow, &rp);
1764: for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1765: for (k = 0; k < mbs; k++) {
1766: v = aa + ai[k] + 1;
1767: vj = aj + ai[k] + 1;
1768: xk = x[k];
1769: nz = ai[k + 1] - ai[k] - 1;
1770: while (nz--) x[*vj++] += (*v++) * xk;
1772: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1774: x[k] = xk * PetscSqrtReal(diagk);
1775: }
1776: ISRestoreIndices(isrow, &rp);
1777: VecRestoreArrayRead(bb, &b);
1778: VecRestoreArray(xx, &x);
1779: PetscLogFlops(2.0 * a->nz - mbs);
1780: return 0;
1781: }
1783: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1784: {
1785: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1786: IS isrow = a->row;
1787: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1788: const MatScalar *aa = a->a, *v;
1789: PetscReal diagk;
1790: PetscScalar *x, *t;
1791: const PetscScalar *b;
1792: PetscInt nz, k;
1794: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1795: VecGetArrayRead(bb, &b);
1796: VecGetArray(xx, &x);
1797: t = a->solve_work;
1798: ISGetIndices(isrow, &rp);
1800: for (k = mbs - 1; k >= 0; k--) {
1801: v = aa + ai[k];
1802: vj = aj + ai[k];
1803: diagk = PetscRealPart(aa[adiag[k]]);
1805: t[k] = b[k] * PetscSqrtReal(diagk);
1806: nz = ai[k + 1] - ai[k] - 1;
1807: while (nz--) t[k] += (*v++) * t[*vj++];
1808: x[rp[k]] = t[k];
1809: }
1810: ISRestoreIndices(isrow, &rp);
1811: VecRestoreArrayRead(bb, &b);
1812: VecRestoreArray(xx, &x);
1813: PetscLogFlops(2.0 * a->nz - mbs);
1814: return 0;
1815: }
1817: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1818: {
1819: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1820: IS isrow = a->row;
1821: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1822: const MatScalar *aa = a->a, *v;
1823: PetscReal diagk;
1824: PetscScalar *x, *t;
1825: const PetscScalar *b;
1826: PetscInt nz, k;
1828: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1829: VecGetArrayRead(bb, &b);
1830: VecGetArray(xx, &x);
1831: t = a->solve_work;
1832: ISGetIndices(isrow, &rp);
1834: for (k = mbs - 1; k >= 0; k--) {
1835: v = aa + ai[k] + 1;
1836: vj = aj + ai[k] + 1;
1837: diagk = PetscRealPart(aa[ai[k]]);
1839: t[k] = b[k] * PetscSqrtReal(diagk);
1840: nz = ai[k + 1] - ai[k] - 1;
1841: while (nz--) t[k] += (*v++) * t[*vj++];
1842: x[rp[k]] = t[k];
1843: }
1844: ISRestoreIndices(isrow, &rp);
1845: VecRestoreArrayRead(bb, &b);
1846: VecRestoreArray(xx, &x);
1847: PetscLogFlops(2.0 * a->nz - mbs);
1848: return 0;
1849: }
1851: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1852: {
1853: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1855: if (A->rmap->bs == 1) {
1856: MatSolve_SeqSBAIJ_1(A, bb->v, xx->v);
1857: } else {
1858: IS isrow = a->row;
1859: const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1860: const MatScalar *aa = a->a, *v;
1861: PetscScalar *x, *t;
1862: const PetscScalar *b;
1863: PetscInt nz, k, n, i, j;
1865: if (bb->n > a->solves_work_n) {
1866: PetscFree(a->solves_work);
1867: PetscMalloc1(bb->n * A->rmap->N, &a->solves_work);
1868: a->solves_work_n = bb->n;
1869: }
1870: n = bb->n;
1871: VecGetArrayRead(bb->v, &b);
1872: VecGetArray(xx->v, &x);
1873: t = a->solves_work;
1875: ISGetIndices(isrow, &rp);
1877: /* solve U^T*D*y = perm(b) by forward substitution */
1878: for (k = 0; k < mbs; k++) {
1879: for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1880: }
1881: for (k = 0; k < mbs; k++) {
1882: v = aa + ai[k];
1883: vj = aj + ai[k];
1884: nz = ai[k + 1] - ai[k] - 1;
1885: for (j = 0; j < nz; j++) {
1886: for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1887: v++;
1888: vj++;
1889: }
1890: for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1891: }
1893: /* solve U*perm(x) = y by back substitution */
1894: for (k = mbs - 1; k >= 0; k--) {
1895: v = aa + ai[k] - 1;
1896: vj = aj + ai[k] - 1;
1897: nz = ai[k + 1] - ai[k] - 1;
1898: for (j = 0; j < nz; j++) {
1899: for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1900: v++;
1901: vj++;
1902: }
1903: for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1904: }
1906: ISRestoreIndices(isrow, &rp);
1907: VecRestoreArrayRead(bb->v, &b);
1908: VecRestoreArray(xx->v, &x);
1909: PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs));
1910: }
1911: return 0;
1912: }
1914: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1915: {
1916: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1918: if (A->rmap->bs == 1) {
1919: MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v);
1920: } else {
1921: IS isrow = a->row;
1922: const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1923: const MatScalar *aa = a->a, *v;
1924: PetscScalar *x, *t;
1925: const PetscScalar *b;
1926: PetscInt nz, k, n, i;
1928: if (bb->n > a->solves_work_n) {
1929: PetscFree(a->solves_work);
1930: PetscMalloc1(bb->n * A->rmap->N, &a->solves_work);
1931: a->solves_work_n = bb->n;
1932: }
1933: n = bb->n;
1934: VecGetArrayRead(bb->v, &b);
1935: VecGetArray(xx->v, &x);
1936: t = a->solves_work;
1938: ISGetIndices(isrow, &rp);
1940: /* solve U^T*D*y = perm(b) by forward substitution */
1941: for (k = 0; k < mbs; k++) {
1942: for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1943: }
1944: for (k = 0; k < mbs; k++) {
1945: v = aa + ai[k];
1946: vj = aj + ai[k];
1947: nz = ai[k + 1] - ai[k];
1948: while (nz--) {
1949: for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1950: v++;
1951: vj++;
1952: }
1953: for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1954: }
1956: /* solve U*perm(x) = y by back substitution */
1957: for (k = mbs - 1; k >= 0; k--) {
1958: v = aa + ai[k];
1959: vj = aj + ai[k];
1960: nz = ai[k + 1] - ai[k];
1961: while (nz--) {
1962: for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1963: v++;
1964: vj++;
1965: }
1966: for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1967: }
1969: ISRestoreIndices(isrow, &rp);
1970: VecRestoreArrayRead(bb->v, &b);
1971: VecRestoreArray(xx->v, &x);
1972: PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs));
1973: }
1974: return 0;
1975: }
1977: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
1978: {
1979: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1980: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
1981: const MatScalar *aa = a->a, *v;
1982: const PetscScalar *b;
1983: PetscScalar *x, xi;
1984: PetscInt nz, i, j;
1986: VecGetArrayRead(bb, &b);
1987: VecGetArray(xx, &x);
1988: /* solve U^T*D*y = b by forward substitution */
1989: PetscArraycpy(x, b, mbs);
1990: for (i = 0; i < mbs; i++) {
1991: v = aa + ai[i];
1992: vj = aj + ai[i];
1993: xi = x[i];
1994: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
1995: for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
1996: x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1997: }
1998: /* solve U*x = y by backward substitution */
1999: for (i = mbs - 2; i >= 0; i--) {
2000: xi = x[i];
2001: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2002: vj = aj + adiag[i] - 1;
2003: nz = ai[i + 1] - ai[i] - 1;
2004: for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2005: x[i] = xi;
2006: }
2007: VecRestoreArrayRead(bb, &b);
2008: VecRestoreArray(xx, &x);
2009: PetscLogFlops(4.0 * a->nz - 3 * mbs);
2010: return 0;
2011: }
2013: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2014: {
2015: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2016: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2017: const MatScalar *aa = a->a, *v;
2018: const PetscScalar *b;
2019: PetscScalar *x, xi;
2020: PetscInt nz, i, j, neq, ldb, ldx;
2021: PetscBool isdense;
2023: if (!mbs) return 0;
2024: PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense);
2026: if (X != B) {
2027: PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense);
2029: }
2030: MatDenseGetArrayRead(B, &b);
2031: MatDenseGetLDA(B, &ldb);
2032: MatDenseGetArray(X, &x);
2033: MatDenseGetLDA(X, &ldx);
2034: for (neq = 0; neq < B->cmap->n; neq++) {
2035: /* solve U^T*D*y = b by forward substitution */
2036: PetscArraycpy(x, b, mbs);
2037: for (i = 0; i < mbs; i++) {
2038: v = aa + ai[i];
2039: vj = aj + ai[i];
2040: xi = x[i];
2041: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2042: for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2043: x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2044: }
2045: /* solve U*x = y by backward substitution */
2046: for (i = mbs - 2; i >= 0; i--) {
2047: xi = x[i];
2048: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2049: vj = aj + adiag[i] - 1;
2050: nz = ai[i + 1] - ai[i] - 1;
2051: for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2052: x[i] = xi;
2053: }
2054: b += ldb;
2055: x += ldx;
2056: }
2057: MatDenseRestoreArrayRead(B, &b);
2058: MatDenseRestoreArray(X, &x);
2059: PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs));
2060: return 0;
2061: }
2063: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2064: {
2065: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2066: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2067: const MatScalar *aa = a->a, *v;
2068: PetscScalar *x, xk;
2069: const PetscScalar *b;
2070: PetscInt nz, k;
2072: VecGetArrayRead(bb, &b);
2073: VecGetArray(xx, &x);
2075: /* solve U^T*D*y = b by forward substitution */
2076: PetscArraycpy(x, b, mbs);
2077: for (k = 0; k < mbs; k++) {
2078: v = aa + ai[k] + 1;
2079: vj = aj + ai[k] + 1;
2080: xk = x[k];
2081: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2082: while (nz--) x[*vj++] += (*v++) * xk;
2083: x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2084: }
2086: /* solve U*x = y by back substitution */
2087: for (k = mbs - 2; k >= 0; k--) {
2088: v = aa + ai[k] + 1;
2089: vj = aj + ai[k] + 1;
2090: xk = x[k];
2091: nz = ai[k + 1] - ai[k] - 1;
2092: while (nz--) xk += (*v++) * x[*vj++];
2093: x[k] = xk;
2094: }
2096: VecRestoreArrayRead(bb, &b);
2097: VecRestoreArray(xx, &x);
2098: PetscLogFlops(4.0 * a->nz - 3 * mbs);
2099: return 0;
2100: }
2102: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2103: {
2104: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2105: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2106: const MatScalar *aa = a->a, *v;
2107: PetscReal diagk;
2108: PetscScalar *x;
2109: const PetscScalar *b;
2110: PetscInt nz, k;
2112: /* solve U^T*D^(1/2)*x = b by forward substitution */
2113: VecGetArrayRead(bb, &b);
2114: VecGetArray(xx, &x);
2115: PetscArraycpy(x, b, mbs);
2116: for (k = 0; k < mbs; k++) {
2117: v = aa + ai[k];
2118: vj = aj + ai[k];
2119: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2120: while (nz--) x[*vj++] += (*v++) * x[k];
2121: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2123: x[k] *= PetscSqrtReal(diagk);
2124: }
2125: VecRestoreArrayRead(bb, &b);
2126: VecRestoreArray(xx, &x);
2127: PetscLogFlops(2.0 * a->nz - mbs);
2128: return 0;
2129: }
2131: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2132: {
2133: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2134: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2135: const MatScalar *aa = a->a, *v;
2136: PetscReal diagk;
2137: PetscScalar *x;
2138: const PetscScalar *b;
2139: PetscInt nz, k;
2141: /* solve U^T*D^(1/2)*x = b by forward substitution */
2142: VecGetArrayRead(bb, &b);
2143: VecGetArray(xx, &x);
2144: PetscArraycpy(x, b, mbs);
2145: for (k = 0; k < mbs; k++) {
2146: v = aa + ai[k] + 1;
2147: vj = aj + ai[k] + 1;
2148: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2149: while (nz--) x[*vj++] += (*v++) * x[k];
2150: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2152: x[k] *= PetscSqrtReal(diagk);
2153: }
2154: VecRestoreArrayRead(bb, &b);
2155: VecRestoreArray(xx, &x);
2156: PetscLogFlops(2.0 * a->nz - mbs);
2157: return 0;
2158: }
2160: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2161: {
2162: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2163: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2164: const MatScalar *aa = a->a, *v;
2165: PetscReal diagk;
2166: PetscScalar *x;
2167: const PetscScalar *b;
2168: PetscInt nz, k;
2170: /* solve D^(1/2)*U*x = b by back substitution */
2171: VecGetArrayRead(bb, &b);
2172: VecGetArray(xx, &x);
2174: for (k = mbs - 1; k >= 0; k--) {
2175: v = aa + ai[k];
2176: vj = aj + ai[k];
2177: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2179: x[k] = PetscSqrtReal(diagk) * b[k];
2180: nz = ai[k + 1] - ai[k] - 1;
2181: while (nz--) x[k] += (*v++) * x[*vj++];
2182: }
2183: VecRestoreArrayRead(bb, &b);
2184: VecRestoreArray(xx, &x);
2185: PetscLogFlops(2.0 * a->nz - mbs);
2186: return 0;
2187: }
2189: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2190: {
2191: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2192: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2193: const MatScalar *aa = a->a, *v;
2194: PetscReal diagk;
2195: PetscScalar *x;
2196: const PetscScalar *b;
2197: PetscInt nz, k;
2199: /* solve D^(1/2)*U*x = b by back substitution */
2200: VecGetArrayRead(bb, &b);
2201: VecGetArray(xx, &x);
2203: for (k = mbs - 1; k >= 0; k--) {
2204: v = aa + ai[k] + 1;
2205: vj = aj + ai[k] + 1;
2206: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2208: x[k] = PetscSqrtReal(diagk) * b[k];
2209: nz = ai[k + 1] - ai[k] - 1;
2210: while (nz--) x[k] += (*v++) * x[*vj++];
2211: }
2212: VecRestoreArrayRead(bb, &b);
2213: VecRestoreArray(xx, &x);
2214: PetscLogFlops(2.0 * a->nz - mbs);
2215: return 0;
2216: }
2218: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2219: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2220: {
2221: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2222: const PetscInt *rip, mbs = a->mbs, *ai, *aj;
2223: PetscInt *jutmp, bs = A->rmap->bs, i;
2224: PetscInt m, reallocs = 0, *levtmp;
2225: PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2226: PetscInt incrlev, *lev, shift, prow, nz;
2227: PetscReal f = info->fill, levels = info->levels;
2228: PetscBool perm_identity;
2230: /* check whether perm is the identity mapping */
2231: ISIdentity(perm, &perm_identity);
2233: if (perm_identity) {
2234: a->permute = PETSC_FALSE;
2235: ai = a->i;
2236: aj = a->j;
2237: } else { /* non-trivial permutation */
2238: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2239: }
2241: /* initialization */
2242: ISGetIndices(perm, &rip);
2243: umax = (PetscInt)(f * ai[mbs] + 1);
2244: PetscMalloc1(umax, &lev);
2245: umax += mbs + 1;
2246: shift = mbs + 1;
2247: PetscMalloc1(mbs + 1, &iu);
2248: PetscMalloc1(umax, &ju);
2249: iu[0] = mbs + 1;
2250: juidx = mbs + 1;
2251: /* prowl: linked list for pivot row */
2252: PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp);
2253: /* q: linked list for col index */
2255: for (i = 0; i < mbs; i++) {
2256: prowl[i] = mbs;
2257: q[i] = 0;
2258: }
2260: /* for each row k */
2261: for (k = 0; k < mbs; k++) {
2262: nzk = 0;
2263: q[k] = mbs;
2264: /* copy current row into linked list */
2265: nz = ai[rip[k] + 1] - ai[rip[k]];
2266: j = ai[rip[k]];
2267: while (nz--) {
2268: vj = rip[aj[j++]];
2269: if (vj > k) {
2270: qm = k;
2271: do {
2272: m = qm;
2273: qm = q[m];
2274: } while (qm < vj);
2276: nzk++;
2277: q[m] = vj;
2278: q[vj] = qm;
2279: levtmp[vj] = 0; /* initialize lev for nonzero element */
2280: }
2281: }
2283: /* modify nonzero structure of k-th row by computing fill-in
2284: for each row prow to be merged in */
2285: prow = k;
2286: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2288: while (prow < k) {
2289: /* merge row prow into k-th row */
2290: jmin = iu[prow] + 1;
2291: jmax = iu[prow + 1];
2292: qm = k;
2293: for (j = jmin; j < jmax; j++) {
2294: incrlev = lev[j - shift] + 1;
2295: if (incrlev > levels) continue;
2296: vj = ju[j];
2297: do {
2298: m = qm;
2299: qm = q[m];
2300: } while (qm < vj);
2301: if (qm != vj) { /* a fill */
2302: nzk++;
2303: q[m] = vj;
2304: q[vj] = qm;
2305: qm = vj;
2306: levtmp[vj] = incrlev;
2307: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2308: }
2309: prow = prowl[prow]; /* next pivot row */
2310: }
2312: /* add k to row list for first nonzero element in k-th row */
2313: if (nzk > 1) {
2314: i = q[k]; /* col value of first nonzero element in k_th row of U */
2315: prowl[k] = prowl[i];
2316: prowl[i] = k;
2317: }
2318: iu[k + 1] = iu[k] + nzk;
2320: /* allocate more space to ju and lev if needed */
2321: if (iu[k + 1] > umax) {
2322: /* estimate how much additional space we will need */
2323: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2324: /* just double the memory each time */
2325: maxadd = umax;
2326: if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2327: umax += maxadd;
2329: /* allocate a longer ju */
2330: PetscMalloc1(umax, &jutmp);
2331: PetscArraycpy(jutmp, ju, iu[k]);
2332: PetscFree(ju);
2333: ju = jutmp;
2335: PetscMalloc1(umax, &jutmp);
2336: PetscArraycpy(jutmp, lev, iu[k] - shift);
2337: PetscFree(lev);
2338: lev = jutmp;
2339: reallocs += 2; /* count how many times we realloc */
2340: }
2342: /* save nonzero structure of k-th row in ju */
2343: i = k;
2344: while (nzk--) {
2345: i = q[i];
2346: ju[juidx] = i;
2347: lev[juidx - shift] = levtmp[i];
2348: juidx++;
2349: }
2350: }
2352: #if defined(PETSC_USE_INFO)
2353: if (ai[mbs] != 0) {
2354: PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2355: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af);
2356: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
2357: PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af);
2358: PetscInfo(A, "for best performance.\n");
2359: } else {
2360: PetscInfo(A, "Empty matrix\n");
2361: }
2362: #endif
2364: ISRestoreIndices(perm, &rip);
2365: PetscFree3(prowl, q, levtmp);
2366: PetscFree(lev);
2368: /* put together the new matrix */
2369: MatSeqSBAIJSetPreallocation(B, bs, 0, NULL);
2371: b = (Mat_SeqSBAIJ *)(B)->data;
2372: PetscFree2(b->imax, b->ilen);
2374: b->singlemalloc = PETSC_FALSE;
2375: b->free_a = PETSC_TRUE;
2376: b->free_ij = PETSC_TRUE;
2377: /* the next line frees the default space generated by the Create() */
2378: PetscFree3(b->a, b->j, b->i);
2379: PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a);
2380: b->j = ju;
2381: b->i = iu;
2382: b->diag = NULL;
2383: b->ilen = NULL;
2384: b->imax = NULL;
2386: ISDestroy(&b->row);
2387: ISDestroy(&b->icol);
2388: b->row = perm;
2389: b->icol = perm;
2390: PetscObjectReference((PetscObject)perm);
2391: PetscObjectReference((PetscObject)perm);
2392: PetscMalloc1(bs * mbs + bs, &b->solve_work);
2393: /* In b structure: Free imax, ilen, old a, old j.
2394: Allocate idnew, solve_work, new a, new j */
2395: b->maxnz = b->nz = iu[mbs];
2397: (B)->info.factor_mallocs = reallocs;
2398: (B)->info.fill_ratio_given = f;
2399: if (ai[mbs] != 0) {
2400: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2401: } else {
2402: (B)->info.fill_ratio_needed = 0.0;
2403: }
2404: MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity);
2405: return 0;
2406: }
2408: /*
2409: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2410: */
2411: #include <petscbt.h>
2412: #include <../src/mat/utils/freespace.h>
2413: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2414: {
2415: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2416: PetscBool perm_identity, free_ij = PETSC_TRUE, missing;
2417: PetscInt bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2418: const PetscInt *rip;
2419: PetscInt reallocs = 0, i, *ui, *udiag, *cols;
2420: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2421: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2422: PetscReal fill = info->fill, levels = info->levels;
2423: PetscFreeSpaceList free_space = NULL, current_space = NULL;
2424: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2425: PetscBT lnkbt;
2428: MatMissingDiagonal(A, &missing, &d);
2430: if (bs > 1) {
2431: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info);
2432: return 0;
2433: }
2435: /* check whether perm is the identity mapping */
2436: ISIdentity(perm, &perm_identity);
2438: a->permute = PETSC_FALSE;
2440: PetscMalloc1(am + 1, &ui);
2441: PetscMalloc1(am + 1, &udiag);
2442: ui[0] = 0;
2444: /* ICC(0) without matrix ordering: simply rearrange column indices */
2445: if (!levels) {
2446: /* reuse the column pointers and row offsets for memory savings */
2447: for (i = 0; i < am; i++) {
2448: ncols = ai[i + 1] - ai[i];
2449: ui[i + 1] = ui[i] + ncols;
2450: udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2451: }
2452: PetscMalloc1(ui[am] + 1, &uj);
2453: cols = uj;
2454: for (i = 0; i < am; i++) {
2455: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2456: ncols = ai[i + 1] - ai[i] - 1;
2457: for (j = 0; j < ncols; j++) *cols++ = aj[j];
2458: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2459: }
2460: } else { /* case: levels>0 */
2461: ISGetIndices(perm, &rip);
2463: /* initialization */
2464: /* jl: linked list for storing indices of the pivot rows
2465: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2466: PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl);
2467: for (i = 0; i < am; i++) {
2468: jl[i] = am;
2469: il[i] = 0;
2470: }
2472: /* create and initialize a linked list for storing column indices of the active row k */
2473: nlnk = am + 1;
2474: PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt);
2476: /* initial FreeSpace size is fill*(ai[am]+1) */
2477: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space);
2479: current_space = free_space;
2481: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl);
2483: current_space_lvl = free_space_lvl;
2485: for (k = 0; k < am; k++) { /* for each active row k */
2486: /* initialize lnk by the column indices of row k */
2487: nzk = 0;
2488: ncols = ai[k + 1] - ai[k];
2490: cols = aj + ai[k];
2491: PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt);
2492: nzk += nlnk;
2494: /* update lnk by computing fill-in for each pivot row to be merged in */
2495: prow = jl[k]; /* 1st pivot row */
2497: while (prow < k) {
2498: nextprow = jl[prow];
2500: /* merge prow into k-th row */
2501: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2502: jmax = ui[prow + 1];
2503: ncols = jmax - jmin;
2504: i = jmin - ui[prow];
2506: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2507: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2508: j = *(uj - 1);
2509: PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j);
2510: nzk += nlnk;
2512: /* update il and jl for prow */
2513: if (jmin < jmax) {
2514: il[prow] = jmin;
2516: j = *cols;
2517: jl[prow] = jl[j];
2518: jl[j] = prow;
2519: }
2520: prow = nextprow;
2521: }
2523: /* if free space is not available, make more free space */
2524: if (current_space->local_remaining < nzk) {
2525: i = am - k + 1; /* num of unfactored rows */
2526: i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2527: PetscFreeSpaceGet(i, ¤t_space);
2528: PetscFreeSpaceGet(i, ¤t_space_lvl);
2529: reallocs++;
2530: }
2532: /* copy data into free_space and free_space_lvl, then initialize lnk */
2534: PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt);
2536: /* add the k-th row into il and jl */
2537: if (nzk > 1) {
2538: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2539: jl[k] = jl[i];
2540: jl[i] = k;
2541: il[k] = ui[k] + 1;
2542: }
2543: uj_ptr[k] = current_space->array;
2544: uj_lvl_ptr[k] = current_space_lvl->array;
2546: current_space->array += nzk;
2547: current_space->local_used += nzk;
2548: current_space->local_remaining -= nzk;
2549: current_space_lvl->array += nzk;
2550: current_space_lvl->local_used += nzk;
2551: current_space_lvl->local_remaining -= nzk;
2553: ui[k + 1] = ui[k] + nzk;
2554: }
2556: ISRestoreIndices(perm, &rip);
2557: PetscFree4(uj_ptr, uj_lvl_ptr, il, jl);
2559: /* destroy list of free space and other temporary array(s) */
2560: PetscMalloc1(ui[am] + 1, &uj);
2561: PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag); /* store matrix factor */
2562: PetscIncompleteLLDestroy(lnk, lnkbt);
2563: PetscFreeSpaceDestroy(free_space_lvl);
2565: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2567: /* put together the new matrix in MATSEQSBAIJ format */
2568: MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL);
2570: b = (Mat_SeqSBAIJ *)(fact)->data;
2571: PetscFree2(b->imax, b->ilen);
2573: b->singlemalloc = PETSC_FALSE;
2574: b->free_a = PETSC_TRUE;
2575: b->free_ij = free_ij;
2577: PetscMalloc1(ui[am] + 1, &b->a);
2579: b->j = uj;
2580: b->i = ui;
2581: b->diag = udiag;
2582: b->free_diag = PETSC_TRUE;
2583: b->ilen = NULL;
2584: b->imax = NULL;
2585: b->row = perm;
2586: b->col = perm;
2588: PetscObjectReference((PetscObject)perm);
2589: PetscObjectReference((PetscObject)perm);
2591: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2593: PetscMalloc1(am + 1, &b->solve_work);
2595: b->maxnz = b->nz = ui[am];
2597: fact->info.factor_mallocs = reallocs;
2598: fact->info.fill_ratio_given = fill;
2599: if (ai[am] != 0) {
2600: fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2601: } else {
2602: fact->info.fill_ratio_needed = 0.0;
2603: }
2604: #if defined(PETSC_USE_INFO)
2605: if (ai[am] != 0) {
2606: PetscReal af = fact->info.fill_ratio_needed;
2607: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
2608: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
2609: PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
2610: } else {
2611: PetscInfo(A, "Empty matrix\n");
2612: }
2613: #endif
2614: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2615: return 0;
2616: }
2618: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2619: {
2620: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2621: Mat_SeqSBAIJ *b;
2622: PetscBool perm_identity, free_ij = PETSC_TRUE;
2623: PetscInt bs = A->rmap->bs, am = a->mbs;
2624: const PetscInt *cols, *rip, *ai = a->i, *aj = a->j;
2625: PetscInt reallocs = 0, i, *ui;
2626: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2627: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2628: PetscReal fill = info->fill, levels = info->levels, ratio_needed;
2629: PetscFreeSpaceList free_space = NULL, current_space = NULL;
2630: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2631: PetscBT lnkbt;
2633: /*
2634: This code originally uses Modified Sparse Row (MSR) storage
2635: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2636: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2637: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2638: thus the original code in MSR format is still used for these cases.
2639: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2640: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2641: */
2642: if (bs > 1) {
2643: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info);
2644: return 0;
2645: }
2647: /* check whether perm is the identity mapping */
2648: ISIdentity(perm, &perm_identity);
2650: a->permute = PETSC_FALSE;
2652: /* special case that simply copies fill pattern */
2653: if (!levels) {
2654: /* reuse the column pointers and row offsets for memory savings */
2655: ui = a->i;
2656: uj = a->j;
2657: free_ij = PETSC_FALSE;
2658: ratio_needed = 1.0;
2659: } else { /* case: levels>0 */
2660: ISGetIndices(perm, &rip);
2662: /* initialization */
2663: PetscMalloc1(am + 1, &ui);
2664: ui[0] = 0;
2666: /* jl: linked list for storing indices of the pivot rows
2667: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2668: PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl);
2669: for (i = 0; i < am; i++) {
2670: jl[i] = am;
2671: il[i] = 0;
2672: }
2674: /* create and initialize a linked list for storing column indices of the active row k */
2675: nlnk = am + 1;
2676: PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt);
2678: /* initial FreeSpace size is fill*(ai[am]+1) */
2679: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space);
2681: current_space = free_space;
2683: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl);
2685: current_space_lvl = free_space_lvl;
2687: for (k = 0; k < am; k++) { /* for each active row k */
2688: /* initialize lnk by the column indices of row rip[k] */
2689: nzk = 0;
2690: ncols = ai[rip[k] + 1] - ai[rip[k]];
2691: cols = aj + ai[rip[k]];
2692: PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt);
2693: nzk += nlnk;
2695: /* update lnk by computing fill-in for each pivot row to be merged in */
2696: prow = jl[k]; /* 1st pivot row */
2698: while (prow < k) {
2699: nextprow = jl[prow];
2701: /* merge prow into k-th row */
2702: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2703: jmax = ui[prow + 1];
2704: ncols = jmax - jmin;
2705: i = jmin - ui[prow];
2706: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2707: j = *(uj_lvl_ptr[prow] + i - 1);
2708: cols_lvl = uj_lvl_ptr[prow] + i;
2709: PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j);
2710: nzk += nlnk;
2712: /* update il and jl for prow */
2713: if (jmin < jmax) {
2714: il[prow] = jmin;
2715: j = *cols;
2716: jl[prow] = jl[j];
2717: jl[j] = prow;
2718: }
2719: prow = nextprow;
2720: }
2722: /* if free space is not available, make more free space */
2723: if (current_space->local_remaining < nzk) {
2724: i = am - k + 1; /* num of unfactored rows */
2725: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2726: PetscFreeSpaceGet(i, ¤t_space);
2727: PetscFreeSpaceGet(i, ¤t_space_lvl);
2728: reallocs++;
2729: }
2731: /* copy data into free_space and free_space_lvl, then initialize lnk */
2732: PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt);
2734: /* add the k-th row into il and jl */
2735: if (nzk - 1 > 0) {
2736: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2737: jl[k] = jl[i];
2738: jl[i] = k;
2739: il[k] = ui[k] + 1;
2740: }
2741: uj_ptr[k] = current_space->array;
2742: uj_lvl_ptr[k] = current_space_lvl->array;
2744: current_space->array += nzk;
2745: current_space->local_used += nzk;
2746: current_space->local_remaining -= nzk;
2747: current_space_lvl->array += nzk;
2748: current_space_lvl->local_used += nzk;
2749: current_space_lvl->local_remaining -= nzk;
2751: ui[k + 1] = ui[k] + nzk;
2752: }
2754: ISRestoreIndices(perm, &rip);
2755: PetscFree4(uj_ptr, uj_lvl_ptr, il, jl);
2757: /* destroy list of free space and other temporary array(s) */
2758: PetscMalloc1(ui[am] + 1, &uj);
2759: PetscFreeSpaceContiguous(&free_space, uj);
2760: PetscIncompleteLLDestroy(lnk, lnkbt);
2761: PetscFreeSpaceDestroy(free_space_lvl);
2762: if (ai[am] != 0) {
2763: ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2764: } else {
2765: ratio_needed = 0.0;
2766: }
2767: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2769: /* put together the new matrix in MATSEQSBAIJ format */
2770: MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL);
2772: b = (Mat_SeqSBAIJ *)(fact)->data;
2774: PetscFree2(b->imax, b->ilen);
2776: b->singlemalloc = PETSC_FALSE;
2777: b->free_a = PETSC_TRUE;
2778: b->free_ij = free_ij;
2780: PetscMalloc1(ui[am] + 1, &b->a);
2782: b->j = uj;
2783: b->i = ui;
2784: b->diag = NULL;
2785: b->ilen = NULL;
2786: b->imax = NULL;
2787: b->row = perm;
2788: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2790: PetscObjectReference((PetscObject)perm);
2791: b->icol = perm;
2792: PetscObjectReference((PetscObject)perm);
2793: PetscMalloc1(am + 1, &b->solve_work);
2795: b->maxnz = b->nz = ui[am];
2797: fact->info.factor_mallocs = reallocs;
2798: fact->info.fill_ratio_given = fill;
2799: fact->info.fill_ratio_needed = ratio_needed;
2800: #if defined(PETSC_USE_INFO)
2801: if (ai[am] != 0) {
2802: PetscReal af = fact->info.fill_ratio_needed;
2803: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
2804: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
2805: PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
2806: } else {
2807: PetscInfo(A, "Empty matrix\n");
2808: }
2809: #endif
2810: if (perm_identity) {
2811: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2812: } else {
2813: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2814: }
2815: return 0;
2816: }