Actual source code: baijfact9.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 5 by 5
11: */
12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_inplace(Mat C, Mat A, const MatFactorInfo *info)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
15: IS isrow = b->row, isicol = b->icol;
16: const PetscInt *r, *ic, *bi = b->i, *bj = b->j, *ajtmpold, *ajtmp;
17: PetscInt i, j, n = a->mbs, nz, row, idx, ipvt[5];
18: const PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
19: MatScalar *w, *pv, *rtmp, *x, *pc;
20: const MatScalar *v, *aa = a->a;
21: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
22: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
23: MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14;
24: MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12;
25: MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
26: MatScalar *ba = b->a, work[25];
27: PetscReal shift = info->shiftamount;
28: PetscBool allowzeropivot, zeropivotdetected;
30: allowzeropivot = PetscNot(A->erroriffailure);
31: ISGetIndices(isrow, &r);
32: ISGetIndices(isicol, &ic);
33: PetscMalloc1(25 * (n + 1), &rtmp);
35: #define PETSC_USE_MEMZERO 1
36: #define PETSC_USE_MEMCPY 1
38: for (i = 0; i < n; i++) {
39: nz = bi[i + 1] - bi[i];
40: ajtmp = bj + bi[i];
41: for (j = 0; j < nz; j++) {
42: #if defined(PETSC_USE_MEMZERO)
43: PetscArrayzero(rtmp + 25 * ajtmp[j], 25);
44: #else
45: x = rtmp + 25 * ajtmp[j];
46: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
47: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
48: x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
49: #endif
50: }
51: /* load in initial (unfactored row) */
52: idx = r[i];
53: nz = ai[idx + 1] - ai[idx];
54: ajtmpold = aj + ai[idx];
55: v = aa + 25 * ai[idx];
56: for (j = 0; j < nz; j++) {
57: #if defined(PETSC_USE_MEMCPY)
58: PetscArraycpy(rtmp + 25 * ic[ajtmpold[j]], v, 25);
59: #else
60: x = rtmp + 25 * ic[ajtmpold[j]];
61: x[0] = v[0];
62: x[1] = v[1];
63: x[2] = v[2];
64: x[3] = v[3];
65: x[4] = v[4];
66: x[5] = v[5];
67: x[6] = v[6];
68: x[7] = v[7];
69: x[8] = v[8];
70: x[9] = v[9];
71: x[10] = v[10];
72: x[11] = v[11];
73: x[12] = v[12];
74: x[13] = v[13];
75: x[14] = v[14];
76: x[15] = v[15];
77: x[16] = v[16];
78: x[17] = v[17];
79: x[18] = v[18];
80: x[19] = v[19];
81: x[20] = v[20];
82: x[21] = v[21];
83: x[22] = v[22];
84: x[23] = v[23];
85: x[24] = v[24];
86: #endif
87: v += 25;
88: }
89: row = *ajtmp++;
90: while (row < i) {
91: pc = rtmp + 25 * row;
92: p1 = pc[0];
93: p2 = pc[1];
94: p3 = pc[2];
95: p4 = pc[3];
96: p5 = pc[4];
97: p6 = pc[5];
98: p7 = pc[6];
99: p8 = pc[7];
100: p9 = pc[8];
101: p10 = pc[9];
102: p11 = pc[10];
103: p12 = pc[11];
104: p13 = pc[12];
105: p14 = pc[13];
106: p15 = pc[14];
107: p16 = pc[15];
108: p17 = pc[16];
109: p18 = pc[17];
110: p19 = pc[18];
111: p20 = pc[19];
112: p21 = pc[20];
113: p22 = pc[21];
114: p23 = pc[22];
115: p24 = pc[23];
116: p25 = pc[24];
117: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
118: pv = ba + 25 * diag_offset[row];
119: pj = bj + diag_offset[row] + 1;
120: x1 = pv[0];
121: x2 = pv[1];
122: x3 = pv[2];
123: x4 = pv[3];
124: x5 = pv[4];
125: x6 = pv[5];
126: x7 = pv[6];
127: x8 = pv[7];
128: x9 = pv[8];
129: x10 = pv[9];
130: x11 = pv[10];
131: x12 = pv[11];
132: x13 = pv[12];
133: x14 = pv[13];
134: x15 = pv[14];
135: x16 = pv[15];
136: x17 = pv[16];
137: x18 = pv[17];
138: x19 = pv[18];
139: x20 = pv[19];
140: x21 = pv[20];
141: x22 = pv[21];
142: x23 = pv[22];
143: x24 = pv[23];
144: x25 = pv[24];
145: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
146: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
147: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
148: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
149: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
151: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
152: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
153: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
154: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
155: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
157: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
158: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
159: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
160: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
161: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
163: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
164: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
165: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
166: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
167: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
169: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
170: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
171: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
172: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
173: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
175: nz = bi[row + 1] - diag_offset[row] - 1;
176: pv += 25;
177: for (j = 0; j < nz; j++) {
178: x1 = pv[0];
179: x2 = pv[1];
180: x3 = pv[2];
181: x4 = pv[3];
182: x5 = pv[4];
183: x6 = pv[5];
184: x7 = pv[6];
185: x8 = pv[7];
186: x9 = pv[8];
187: x10 = pv[9];
188: x11 = pv[10];
189: x12 = pv[11];
190: x13 = pv[12];
191: x14 = pv[13];
192: x15 = pv[14];
193: x16 = pv[15];
194: x17 = pv[16];
195: x18 = pv[17];
196: x19 = pv[18];
197: x20 = pv[19];
198: x21 = pv[20];
199: x22 = pv[21];
200: x23 = pv[22];
201: x24 = pv[23];
202: x25 = pv[24];
203: x = rtmp + 25 * pj[j];
204: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
205: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
206: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
207: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
208: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
210: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
211: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
212: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
213: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
214: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
216: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
217: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
218: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
219: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
220: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
222: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
223: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
224: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
225: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
226: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
228: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
229: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
230: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
231: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
232: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
234: pv += 25;
235: }
236: PetscLogFlops(250.0 * nz + 225.0);
237: }
238: row = *ajtmp++;
239: }
240: /* finished row so stick it into b->a */
241: pv = ba + 25 * bi[i];
242: pj = bj + bi[i];
243: nz = bi[i + 1] - bi[i];
244: for (j = 0; j < nz; j++) {
245: #if defined(PETSC_USE_MEMCPY)
246: PetscArraycpy(pv, rtmp + 25 * pj[j], 25);
247: #else
248: x = rtmp + 25 * pj[j];
249: pv[0] = x[0];
250: pv[1] = x[1];
251: pv[2] = x[2];
252: pv[3] = x[3];
253: pv[4] = x[4];
254: pv[5] = x[5];
255: pv[6] = x[6];
256: pv[7] = x[7];
257: pv[8] = x[8];
258: pv[9] = x[9];
259: pv[10] = x[10];
260: pv[11] = x[11];
261: pv[12] = x[12];
262: pv[13] = x[13];
263: pv[14] = x[14];
264: pv[15] = x[15];
265: pv[16] = x[16];
266: pv[17] = x[17];
267: pv[18] = x[18];
268: pv[19] = x[19];
269: pv[20] = x[20];
270: pv[21] = x[21];
271: pv[22] = x[22];
272: pv[23] = x[23];
273: pv[24] = x[24];
274: #endif
275: pv += 25;
276: }
277: /* invert diagonal block */
278: w = ba + 25 * diag_offset[i];
279: PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
280: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
281: }
283: PetscFree(rtmp);
284: ISRestoreIndices(isicol, &ic);
285: ISRestoreIndices(isrow, &r);
287: C->ops->solve = MatSolve_SeqBAIJ_5_inplace;
288: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_inplace;
289: C->assembled = PETSC_TRUE;
291: PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs); /* from inverting diagonal blocks */
292: return 0;
293: }
295: /* MatLUFactorNumeric_SeqBAIJ_5 -
296: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
297: PetscKernel_A_gets_A_times_B()
298: PetscKernel_A_gets_A_minus_B_times_C()
299: PetscKernel_A_gets_inverse_A()
300: */
302: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5(Mat B, Mat A, const MatFactorInfo *info)
303: {
304: Mat C = B;
305: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
306: IS isrow = b->row, isicol = b->icol;
307: const PetscInt *r, *ic;
308: PetscInt i, j, k, nz, nzL, row;
309: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
310: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
311: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a, work[25];
312: PetscInt flg, ipvt[5];
313: PetscReal shift = info->shiftamount;
314: PetscBool allowzeropivot, zeropivotdetected;
316: allowzeropivot = PetscNot(A->erroriffailure);
317: ISGetIndices(isrow, &r);
318: ISGetIndices(isicol, &ic);
320: /* generate work space needed by the factorization */
321: PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
322: PetscArrayzero(rtmp, bs2 * n);
324: for (i = 0; i < n; i++) {
325: /* zero rtmp */
326: /* L part */
327: nz = bi[i + 1] - bi[i];
328: bjtmp = bj + bi[i];
329: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
331: /* U part */
332: nz = bdiag[i] - bdiag[i + 1];
333: bjtmp = bj + bdiag[i + 1] + 1;
334: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
336: /* load in initial (unfactored row) */
337: nz = ai[r[i] + 1] - ai[r[i]];
338: ajtmp = aj + ai[r[i]];
339: v = aa + bs2 * ai[r[i]];
340: for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2);
342: /* elimination */
343: bjtmp = bj + bi[i];
344: nzL = bi[i + 1] - bi[i];
345: for (k = 0; k < nzL; k++) {
346: row = bjtmp[k];
347: pc = rtmp + bs2 * row;
348: for (flg = 0, j = 0; j < bs2; j++) {
349: if (pc[j] != 0.0) {
350: flg = 1;
351: break;
352: }
353: }
354: if (flg) {
355: pv = b->a + bs2 * bdiag[row];
356: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
357: PetscKernel_A_gets_A_times_B_5(pc, pv, mwork);
359: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
360: pv = b->a + bs2 * (bdiag[row + 1] + 1);
361: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
362: for (j = 0; j < nz; j++) {
363: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
364: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
365: v = rtmp + bs2 * pj[j];
366: PetscKernel_A_gets_A_minus_B_times_C_5(v, pc, pv);
367: pv += bs2;
368: }
369: PetscLogFlops(250.0 * nz + 225); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
370: }
371: }
373: /* finished row so stick it into b->a */
374: /* L part */
375: pv = b->a + bs2 * bi[i];
376: pj = b->j + bi[i];
377: nz = bi[i + 1] - bi[i];
378: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
380: /* Mark diagonal and invert diagonal for simpler triangular solves */
381: pv = b->a + bs2 * bdiag[i];
382: pj = b->j + bdiag[i];
383: PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
384: PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
385: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
387: /* U part */
388: pv = b->a + bs2 * (bdiag[i + 1] + 1);
389: pj = b->j + bdiag[i + 1] + 1;
390: nz = bdiag[i] - bdiag[i + 1] - 1;
391: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
392: }
394: PetscFree2(rtmp, mwork);
395: ISRestoreIndices(isicol, &ic);
396: ISRestoreIndices(isrow, &r);
398: C->ops->solve = MatSolve_SeqBAIJ_5;
399: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
400: C->assembled = PETSC_TRUE;
402: PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n); /* from inverting diagonal blocks */
403: return 0;
404: }
406: /*
407: Version for when blocks are 5 by 5 Using natural ordering
408: */
409: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
410: {
411: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
412: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j, ipvt[5];
413: PetscInt *ajtmpold, *ajtmp, nz, row;
414: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
415: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
416: MatScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
417: MatScalar x16, x17, x18, x19, x20, x21, x22, x23, x24, x25;
418: MatScalar p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15;
419: MatScalar p16, p17, p18, p19, p20, p21, p22, p23, p24, p25;
420: MatScalar m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
421: MatScalar m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
422: MatScalar *ba = b->a, *aa = a->a, work[25];
423: PetscReal shift = info->shiftamount;
424: PetscBool allowzeropivot, zeropivotdetected;
426: allowzeropivot = PetscNot(A->erroriffailure);
427: PetscMalloc1(25 * (n + 1), &rtmp);
428: for (i = 0; i < n; i++) {
429: nz = bi[i + 1] - bi[i];
430: ajtmp = bj + bi[i];
431: for (j = 0; j < nz; j++) {
432: x = rtmp + 25 * ajtmp[j];
433: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
434: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
435: x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
436: }
437: /* load in initial (unfactored row) */
438: nz = ai[i + 1] - ai[i];
439: ajtmpold = aj + ai[i];
440: v = aa + 25 * ai[i];
441: for (j = 0; j < nz; j++) {
442: x = rtmp + 25 * ajtmpold[j];
443: x[0] = v[0];
444: x[1] = v[1];
445: x[2] = v[2];
446: x[3] = v[3];
447: x[4] = v[4];
448: x[5] = v[5];
449: x[6] = v[6];
450: x[7] = v[7];
451: x[8] = v[8];
452: x[9] = v[9];
453: x[10] = v[10];
454: x[11] = v[11];
455: x[12] = v[12];
456: x[13] = v[13];
457: x[14] = v[14];
458: x[15] = v[15];
459: x[16] = v[16];
460: x[17] = v[17];
461: x[18] = v[18];
462: x[19] = v[19];
463: x[20] = v[20];
464: x[21] = v[21];
465: x[22] = v[22];
466: x[23] = v[23];
467: x[24] = v[24];
468: v += 25;
469: }
470: row = *ajtmp++;
471: while (row < i) {
472: pc = rtmp + 25 * row;
473: p1 = pc[0];
474: p2 = pc[1];
475: p3 = pc[2];
476: p4 = pc[3];
477: p5 = pc[4];
478: p6 = pc[5];
479: p7 = pc[6];
480: p8 = pc[7];
481: p9 = pc[8];
482: p10 = pc[9];
483: p11 = pc[10];
484: p12 = pc[11];
485: p13 = pc[12];
486: p14 = pc[13];
487: p15 = pc[14];
488: p16 = pc[15];
489: p17 = pc[16];
490: p18 = pc[17];
491: p19 = pc[18];
492: p20 = pc[19];
493: p21 = pc[20];
494: p22 = pc[21];
495: p23 = pc[22];
496: p24 = pc[23];
497: p25 = pc[24];
498: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
499: pv = ba + 25 * diag_offset[row];
500: pj = bj + diag_offset[row] + 1;
501: x1 = pv[0];
502: x2 = pv[1];
503: x3 = pv[2];
504: x4 = pv[3];
505: x5 = pv[4];
506: x6 = pv[5];
507: x7 = pv[6];
508: x8 = pv[7];
509: x9 = pv[8];
510: x10 = pv[9];
511: x11 = pv[10];
512: x12 = pv[11];
513: x13 = pv[12];
514: x14 = pv[13];
515: x15 = pv[14];
516: x16 = pv[15];
517: x17 = pv[16];
518: x18 = pv[17];
519: x19 = pv[18];
520: x20 = pv[19];
521: x21 = pv[20];
522: x22 = pv[21];
523: x23 = pv[22];
524: x24 = pv[23];
525: x25 = pv[24];
526: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
527: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
528: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
529: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
530: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
532: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
533: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
534: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
535: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
536: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
538: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
539: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
540: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
541: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
542: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
544: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
545: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
546: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
547: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
548: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
550: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
551: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
552: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
553: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
554: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
556: nz = bi[row + 1] - diag_offset[row] - 1;
557: pv += 25;
558: for (j = 0; j < nz; j++) {
559: x1 = pv[0];
560: x2 = pv[1];
561: x3 = pv[2];
562: x4 = pv[3];
563: x5 = pv[4];
564: x6 = pv[5];
565: x7 = pv[6];
566: x8 = pv[7];
567: x9 = pv[8];
568: x10 = pv[9];
569: x11 = pv[10];
570: x12 = pv[11];
571: x13 = pv[12];
572: x14 = pv[13];
573: x15 = pv[14];
574: x16 = pv[15];
575: x17 = pv[16];
576: x18 = pv[17];
577: x19 = pv[18];
578: x20 = pv[19];
579: x21 = pv[20];
580: x22 = pv[21];
581: x23 = pv[22];
582: x24 = pv[23];
583: x25 = pv[24];
584: x = rtmp + 25 * pj[j];
585: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
586: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
587: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
588: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
589: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
591: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
592: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
593: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
594: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
595: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
597: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
598: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
599: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
600: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
601: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
603: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
604: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
605: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
606: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
607: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
609: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
610: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
611: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
612: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
613: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
614: pv += 25;
615: }
616: PetscLogFlops(250.0 * nz + 225.0);
617: }
618: row = *ajtmp++;
619: }
620: /* finished row so stick it into b->a */
621: pv = ba + 25 * bi[i];
622: pj = bj + bi[i];
623: nz = bi[i + 1] - bi[i];
624: for (j = 0; j < nz; j++) {
625: x = rtmp + 25 * pj[j];
626: pv[0] = x[0];
627: pv[1] = x[1];
628: pv[2] = x[2];
629: pv[3] = x[3];
630: pv[4] = x[4];
631: pv[5] = x[5];
632: pv[6] = x[6];
633: pv[7] = x[7];
634: pv[8] = x[8];
635: pv[9] = x[9];
636: pv[10] = x[10];
637: pv[11] = x[11];
638: pv[12] = x[12];
639: pv[13] = x[13];
640: pv[14] = x[14];
641: pv[15] = x[15];
642: pv[16] = x[16];
643: pv[17] = x[17];
644: pv[18] = x[18];
645: pv[19] = x[19];
646: pv[20] = x[20];
647: pv[21] = x[21];
648: pv[22] = x[22];
649: pv[23] = x[23];
650: pv[24] = x[24];
651: pv += 25;
652: }
653: /* invert diagonal block */
654: w = ba + 25 * diag_offset[i];
655: PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
656: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
657: }
659: PetscFree(rtmp);
661: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_inplace;
662: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace;
663: C->assembled = PETSC_TRUE;
665: PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs); /* from inverting diagonal blocks */
666: return 0;
667: }
669: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
670: {
671: Mat C = B;
672: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
673: PetscInt i, j, k, nz, nzL, row;
674: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
675: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
676: MatScalar *rtmp, *pc, *mwork, *v, *vv, *pv, *aa = a->a, work[25];
677: PetscInt flg, ipvt[5];
678: PetscReal shift = info->shiftamount;
679: PetscBool allowzeropivot, zeropivotdetected;
681: allowzeropivot = PetscNot(A->erroriffailure);
683: /* generate work space needed by the factorization */
684: PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
685: PetscArrayzero(rtmp, bs2 * n);
687: for (i = 0; i < n; i++) {
688: /* zero rtmp */
689: /* L part */
690: nz = bi[i + 1] - bi[i];
691: bjtmp = bj + bi[i];
692: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
694: /* U part */
695: nz = bdiag[i] - bdiag[i + 1];
696: bjtmp = bj + bdiag[i + 1] + 1;
697: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
699: /* load in initial (unfactored row) */
700: nz = ai[i + 1] - ai[i];
701: ajtmp = aj + ai[i];
702: v = aa + bs2 * ai[i];
703: for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2);
705: /* elimination */
706: bjtmp = bj + bi[i];
707: nzL = bi[i + 1] - bi[i];
708: for (k = 0; k < nzL; k++) {
709: row = bjtmp[k];
710: pc = rtmp + bs2 * row;
711: for (flg = 0, j = 0; j < bs2; j++) {
712: if (pc[j] != 0.0) {
713: flg = 1;
714: break;
715: }
716: }
717: if (flg) {
718: pv = b->a + bs2 * bdiag[row];
719: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
720: PetscKernel_A_gets_A_times_B_5(pc, pv, mwork);
722: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
723: pv = b->a + bs2 * (bdiag[row + 1] + 1);
724: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
725: for (j = 0; j < nz; j++) {
726: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
727: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
728: vv = rtmp + bs2 * pj[j];
729: PetscKernel_A_gets_A_minus_B_times_C_5(vv, pc, pv);
730: pv += bs2;
731: }
732: PetscLogFlops(250.0 * nz + 225); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
733: }
734: }
736: /* finished row so stick it into b->a */
737: /* L part */
738: pv = b->a + bs2 * bi[i];
739: pj = b->j + bi[i];
740: nz = bi[i + 1] - bi[i];
741: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
743: /* Mark diagonal and invert diagonal for simpler triangular solves */
744: pv = b->a + bs2 * bdiag[i];
745: pj = b->j + bdiag[i];
746: PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
747: PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
748: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
750: /* U part */
751: pv = b->a + bs2 * (bdiag[i + 1] + 1);
752: pj = b->j + bdiag[i + 1] + 1;
753: nz = bdiag[i] - bdiag[i + 1] - 1;
754: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
755: }
756: PetscFree2(rtmp, mwork);
758: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
759: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
760: C->assembled = PETSC_TRUE;
762: PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n); /* from inverting diagonal blocks */
763: return 0;
764: }