Actual source code: baijfact11.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 4 by 4
11: */
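/*
   How the bs = 4 routines in this file work: the factorization proceeds one block
   row at a time.  The (permuted) block row i is expanded into the dense work array
   rtmp, one 16-entry 4x4 block per block column.  For every block column `row` < i
   whose block is nonzero, the multiplier L(i,row) = rtmp(row) * inv(U(row,row)) is
   formed (the diagonal blocks are stored already inverted) and the remaining blocks
   of row i receive a rank-4 update.  The finished row is then copied back into b->a
   and its diagonal block is inverted, so the triangular solves never divide by a block.
*/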
12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
15: IS isrow = b->row, isicol = b->icol;
16: const PetscInt *r, *ic;
17: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
18: PetscInt *ajtmpold, *ajtmp, nz, row;
19: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
20: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
21: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
22: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
23: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
24: MatScalar m13, m14, m15, m16;
25: MatScalar *ba = b->a, *aa = a->a;
26: PetscBool pivotinblocks = b->pivotinblocks;
27: PetscReal shift = info->shiftamount;
28: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
30: ISGetIndices(isrow, &r);
31: ISGetIndices(isicol, &ic);
32: PetscMalloc1(16 * (n + 1), &rtmp);
33: allowzeropivot = PetscNot(A->erroriffailure);
35: for (i = 0; i < n; i++) {
36: nz = bi[i + 1] - bi[i];
37: ajtmp = bj + bi[i];
38: for (j = 0; j < nz; j++) {
39: x = rtmp + 16 * ajtmp[j];
40: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
41: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
42: }
43: /* load in initial (unfactored row) */
44: idx = r[i];
45: nz = ai[idx + 1] - ai[idx];
46: ajtmpold = aj + ai[idx];
47: v = aa + 16 * ai[idx];
48: for (j = 0; j < nz; j++) {
49: x = rtmp + 16 * ic[ajtmpold[j]];
50: x[0] = v[0];
51: x[1] = v[1];
52: x[2] = v[2];
53: x[3] = v[3];
54: x[4] = v[4];
55: x[5] = v[5];
56: x[6] = v[6];
57: x[7] = v[7];
58: x[8] = v[8];
59: x[9] = v[9];
60: x[10] = v[10];
61: x[11] = v[11];
62: x[12] = v[12];
63: x[13] = v[13];
64: x[14] = v[14];
65: x[15] = v[15];
66: v += 16;
67: }
68: row = *ajtmp++;
69: while (row < i) {
70: pc = rtmp + 16 * row;
71: p1 = pc[0];
72: p2 = pc[1];
73: p3 = pc[2];
74: p4 = pc[3];
75: p5 = pc[4];
76: p6 = pc[5];
77: p7 = pc[6];
78: p8 = pc[7];
79: p9 = pc[8];
80: p10 = pc[9];
81: p11 = pc[10];
82: p12 = pc[11];
83: p13 = pc[12];
84: p14 = pc[13];
85: p15 = pc[14];
86: p16 = pc[15];
87: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
88: pv = ba + 16 * diag_offset[row];
89: pj = bj + diag_offset[row] + 1;
90: x1 = pv[0];
91: x2 = pv[1];
92: x3 = pv[2];
93: x4 = pv[3];
94: x5 = pv[4];
95: x6 = pv[5];
96: x7 = pv[6];
97: x8 = pv[7];
98: x9 = pv[8];
99: x10 = pv[9];
100: x11 = pv[10];
101: x12 = pv[11];
102: x13 = pv[12];
103: x14 = pv[13];
104: x15 = pv[14];
105: x16 = pv[15];
106: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
107: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
108: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
109: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
111: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
112: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
113: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
114: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
116: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
117: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
118: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
119: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
121: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
122: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
123: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
124: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
126: nz = bi[row + 1] - diag_offset[row] - 1;
127: pv += 16;
128: for (j = 0; j < nz; j++) {
129: x1 = pv[0];
130: x2 = pv[1];
131: x3 = pv[2];
132: x4 = pv[3];
133: x5 = pv[4];
134: x6 = pv[5];
135: x7 = pv[6];
136: x8 = pv[7];
137: x9 = pv[8];
138: x10 = pv[9];
139: x11 = pv[10];
140: x12 = pv[11];
141: x13 = pv[12];
142: x14 = pv[13];
143: x15 = pv[14];
144: x16 = pv[15];
145: x = rtmp + 16 * pj[j];
146: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
147: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
148: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
149: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
151: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
152: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
153: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
154: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
156: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
157: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
158: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
159: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
161: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
162: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
163: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
164: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
166: pv += 16;
167: }
168: PetscLogFlops(128.0 * nz + 112.0);
169: }
170: row = *ajtmp++;
171: }
172: /* finished row so stick it into b->a */
173: pv = ba + 16 * bi[i];
174: pj = bj + bi[i];
175: nz = bi[i + 1] - bi[i];
176: for (j = 0; j < nz; j++) {
177: x = rtmp + 16 * pj[j];
178: pv[0] = x[0];
179: pv[1] = x[1];
180: pv[2] = x[2];
181: pv[3] = x[3];
182: pv[4] = x[4];
183: pv[5] = x[5];
184: pv[6] = x[6];
185: pv[7] = x[7];
186: pv[8] = x[8];
187: pv[9] = x[9];
188: pv[10] = x[10];
189: pv[11] = x[11];
190: pv[12] = x[12];
191: pv[13] = x[13];
192: pv[14] = x[14];
193: pv[15] = x[15];
194: pv += 16;
195: }
196: /* invert diagonal block */
197: w = ba + 16 * diag_offset[i];
198: if (pivotinblocks) {
199: PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
200: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201: } else {
202: PetscKernel_A_gets_inverse_A_4_nopivot(w);
203: }
204: }
206: PetscFree(rtmp);
207: ISRestoreIndices(isicol, &ic);
208: ISRestoreIndices(isrow, &r);
210: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
211: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
212: C->assembled = PETSC_TRUE;
214: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs); /* from inverting diagonal blocks */
215: return 0;
216: }
218: /* MatLUFactorNumeric_SeqBAIJ_4 -
219: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
220: PetscKernel_A_gets_A_times_B()
221: PetscKernel_A_gets_A_minus_B_times_C()
222: PetscKernel_A_gets_inverse_A()
223: */
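/*
   For orientation, a plain-C sketch of the 4x4 update kernel that the hand-unrolled
   code (and the kernels from petsc/private/kernels/blockinvert.h) computes; the
   Sketch_ name is illustrative only and is not part of PETSc.  Blocks are stored
   contiguously, column-major, 16 MatScalars per block.  PetscKernel_A_gets_A_times_B_4(A,B,W)
   analogously overwrites A with A*B (W is a 16-entry scratch block), and
   PetscKernel_A_gets_inverse_A_4() inverts a block in place, optionally shifting and pivoting.
*/
static inline void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A, const MatScalar *B, const MatScalar *C)
{
  PetscInt r, c, k;

  for (c = 0; c < 4; c++) {   /* column of the 4x4 result */
    for (r = 0; r < 4; r++) { /* row of the 4x4 result */
      MatScalar s = 0.0;
      for (k = 0; k < 4; k++) s += B[r + 4 * k] * C[k + 4 * c];
      A[r + 4 * c] -= s; /* A <- A - B*C */
    }
  }
}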
225: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
226: {
227: Mat C = B;
228: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
229: IS isrow = b->row, isicol = b->icol;
230: const PetscInt *r, *ic;
231: PetscInt i, j, k, nz, nzL, row;
232: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
233: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
234: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
235: PetscInt flg;
236: PetscReal shift;
237: PetscBool allowzeropivot, zeropivotdetected;
239: allowzeropivot = PetscNot(A->erroriffailure);
240: ISGetIndices(isrow, &r);
241: ISGetIndices(isicol, &ic);
243: if (info->shifttype == MAT_SHIFT_NONE) {
244: shift = 0;
245: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
246: shift = info->shiftamount;
247: }
249: /* generate work space needed by the factorization */
250: PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
251: PetscArrayzero(rtmp, bs2 * n);
253: for (i = 0; i < n; i++) {
254: /* zero rtmp */
255: /* L part */
256: nz = bi[i + 1] - bi[i];
257: bjtmp = bj + bi[i];
258: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
260: /* U part */
261: nz = bdiag[i] - bdiag[i + 1];
262: bjtmp = bj + bdiag[i + 1] + 1;
263: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
265: /* load in initial (unfactored row) */
266: nz = ai[r[i] + 1] - ai[r[i]];
267: ajtmp = aj + ai[r[i]];
268: v = aa + bs2 * ai[r[i]];
269: for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2);
271: /* elimination */
272: bjtmp = bj + bi[i];
273: nzL = bi[i + 1] - bi[i];
274: for (k = 0; k < nzL; k++) {
275: row = bjtmp[k];
276: pc = rtmp + bs2 * row;
277: for (flg = 0, j = 0; j < bs2; j++) {
278: if (pc[j] != 0.0) {
279: flg = 1;
280: break;
281: }
282: }
283: if (flg) {
284: pv = b->a + bs2 * bdiag[row];
285: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
286: PetscKernel_A_gets_A_times_B_4(pc, pv, mwork);
288: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
289: pv = b->a + bs2 * (bdiag[row + 1] + 1);
290: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
291: for (j = 0; j < nz; j++) {
292: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
293: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
294: v = rtmp + bs2 * pj[j];
295: PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv);
296: pv += bs2;
297: }
298: PetscLogFlops(128.0 * nz + 112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
299: }
300: }
302: /* finished row so stick it into b->a */
303: /* L part */
304: pv = b->a + bs2 * bi[i];
305: pj = b->j + bi[i];
306: nz = bi[i + 1] - bi[i];
307: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
309: /* Mark diagonal and invert diagonal for simpler triangular solves */
310: pv = b->a + bs2 * bdiag[i];
311: pj = b->j + bdiag[i];
312: PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
313: PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected);
314: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316: /* U part */
317: pv = b->a + bs2 * (bdiag[i + 1] + 1);
318: pj = b->j + bdiag[i + 1] + 1;
319: nz = bdiag[i] - bdiag[i + 1] - 1;
320: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
321: }
323: PetscFree2(rtmp, mwork);
324: ISRestoreIndices(isicol, &ic);
325: ISRestoreIndices(isrow, &r);
327: C->ops->solve = MatSolve_SeqBAIJ_4;
328: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
329: C->assembled = PETSC_TRUE;
331: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n); /* from inverting diagonal blocks */
332: return 0;
333: }
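/*
   A minimal usage sketch (assumptions: the matrix, index sets, and vectors come from
   the caller; the Sketch_ helper is illustrative and not part of PETSc).  For a
   SeqBAIJ matrix with block size 4, the numeric stage of this sequence ends up in one
   of the MatLUFactorNumeric_SeqBAIJ_4* routines in this file.
*/
static inline PetscErrorCode Sketch_BAIJ4_LUFactorAndSolve(Mat A, IS isrow, IS iscol, Vec b, Vec x)
{
  Mat           F;
  MatFactorInfo info;

  MatFactorInfoInitialize(&info);
  MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &F);
  MatLUFactorSymbolic(F, A, isrow, iscol, &info);
  MatLUFactorNumeric(F, A, &info); /* numeric factorization: dispatches on block size and ordering */
  MatSolve(F, b, x);
  MatDestroy(&F);
  return 0;
}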
335: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
336: {
337: /*
338: Default version for when blocks are 4 by 4 using natural ordering
339: */
340: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
341: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
342: PetscInt *ajtmpold, *ajtmp, nz, row;
343: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
344: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
345: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
346: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
347: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
348: MatScalar m13, m14, m15, m16;
349: MatScalar *ba = b->a, *aa = a->a;
350: PetscBool pivotinblocks = b->pivotinblocks;
351: PetscReal shift = info->shiftamount;
352: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
354: allowzeropivot = PetscNot(A->erroriffailure);
355: PetscMalloc1(16 * (n + 1), &rtmp);
357: for (i = 0; i < n; i++) {
358: nz = bi[i + 1] - bi[i];
359: ajtmp = bj + bi[i];
360: for (j = 0; j < nz; j++) {
361: x = rtmp + 16 * ajtmp[j];
362: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
363: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
364: }
365: /* load in initial (unfactored row) */
366: nz = ai[i + 1] - ai[i];
367: ajtmpold = aj + ai[i];
368: v = aa + 16 * ai[i];
369: for (j = 0; j < nz; j++) {
370: x = rtmp + 16 * ajtmpold[j];
371: x[0] = v[0];
372: x[1] = v[1];
373: x[2] = v[2];
374: x[3] = v[3];
375: x[4] = v[4];
376: x[5] = v[5];
377: x[6] = v[6];
378: x[7] = v[7];
379: x[8] = v[8];
380: x[9] = v[9];
381: x[10] = v[10];
382: x[11] = v[11];
383: x[12] = v[12];
384: x[13] = v[13];
385: x[14] = v[14];
386: x[15] = v[15];
387: v += 16;
388: }
389: row = *ajtmp++;
390: while (row < i) {
391: pc = rtmp + 16 * row;
392: p1 = pc[0];
393: p2 = pc[1];
394: p3 = pc[2];
395: p4 = pc[3];
396: p5 = pc[4];
397: p6 = pc[5];
398: p7 = pc[6];
399: p8 = pc[7];
400: p9 = pc[8];
401: p10 = pc[9];
402: p11 = pc[10];
403: p12 = pc[11];
404: p13 = pc[12];
405: p14 = pc[13];
406: p15 = pc[14];
407: p16 = pc[15];
408: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
409: pv = ba + 16 * diag_offset[row];
410: pj = bj + diag_offset[row] + 1;
411: x1 = pv[0];
412: x2 = pv[1];
413: x3 = pv[2];
414: x4 = pv[3];
415: x5 = pv[4];
416: x6 = pv[5];
417: x7 = pv[6];
418: x8 = pv[7];
419: x9 = pv[8];
420: x10 = pv[9];
421: x11 = pv[10];
422: x12 = pv[11];
423: x13 = pv[12];
424: x14 = pv[13];
425: x15 = pv[14];
426: x16 = pv[15];
427: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
428: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
429: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
430: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
432: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
433: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
434: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
435: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
437: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
438: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
439: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
440: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
442: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
443: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
444: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
445: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
446: nz = bi[row + 1] - diag_offset[row] - 1;
447: pv += 16;
448: for (j = 0; j < nz; j++) {
449: x1 = pv[0];
450: x2 = pv[1];
451: x3 = pv[2];
452: x4 = pv[3];
453: x5 = pv[4];
454: x6 = pv[5];
455: x7 = pv[6];
456: x8 = pv[7];
457: x9 = pv[8];
458: x10 = pv[9];
459: x11 = pv[10];
460: x12 = pv[11];
461: x13 = pv[12];
462: x14 = pv[13];
463: x15 = pv[14];
464: x16 = pv[15];
465: x = rtmp + 16 * pj[j];
466: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
467: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
468: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
469: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
471: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
472: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
473: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
474: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
476: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
477: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
478: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
479: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
481: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
482: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
483: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
484: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
486: pv += 16;
487: }
488: PetscLogFlops(128.0 * nz + 112.0);
489: }
490: row = *ajtmp++;
491: }
492: /* finished row so stick it into b->a */
493: pv = ba + 16 * bi[i];
494: pj = bj + bi[i];
495: nz = bi[i + 1] - bi[i];
496: for (j = 0; j < nz; j++) {
497: x = rtmp + 16 * pj[j];
498: pv[0] = x[0];
499: pv[1] = x[1];
500: pv[2] = x[2];
501: pv[3] = x[3];
502: pv[4] = x[4];
503: pv[5] = x[5];
504: pv[6] = x[6];
505: pv[7] = x[7];
506: pv[8] = x[8];
507: pv[9] = x[9];
508: pv[10] = x[10];
509: pv[11] = x[11];
510: pv[12] = x[12];
511: pv[13] = x[13];
512: pv[14] = x[14];
513: pv[15] = x[15];
514: pv += 16;
515: }
516: /* invert diagonal block */
517: w = ba + 16 * diag_offset[i];
518: if (pivotinblocks) {
519: PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
520: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
521: } else {
522: PetscKernel_A_gets_inverse_A_4_nopivot(w);
523: }
524: }
526: PetscFree(rtmp);
528: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
529: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
530: C->assembled = PETSC_TRUE;
532: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs); /* from inverting diagonal blocks */
533: return 0;
534: }
536: /*
537: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
538: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
539: */
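/*
   Same algorithm as MatLUFactorNumeric_SeqBAIJ_4() above, except that no row/column
   permutation is applied: block row i of A is read directly through ai/aj/aa rather
   than through the isrow/isicol index sets.
*/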
540: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
541: {
542: Mat C = B;
543: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
544: PetscInt i, j, k, nz, nzL, row;
545: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
546: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
547: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
548: PetscInt flg;
549: PetscReal shift;
550: PetscBool allowzeropivot, zeropivotdetected;
552: allowzeropivot = PetscNot(A->erroriffailure);
554: /* generate work space needed by the factorization */
555: PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
556: PetscArrayzero(rtmp, bs2 * n);
558: if (info->shifttype == MAT_SHIFT_NONE) {
559: shift = 0;
560: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
561: shift = info->shiftamount;
562: }
564: for (i = 0; i < n; i++) {
565: /* zero rtmp */
566: /* L part */
567: nz = bi[i + 1] - bi[i];
568: bjtmp = bj + bi[i];
569: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
571: /* U part */
572: nz = bdiag[i] - bdiag[i + 1];
573: bjtmp = bj + bdiag[i + 1] + 1;
574: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
576: /* load in initial (unfactored row) */
577: nz = ai[i + 1] - ai[i];
578: ajtmp = aj + ai[i];
579: v = aa + bs2 * ai[i];
580: for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2);
582: /* elimination */
583: bjtmp = bj + bi[i];
584: nzL = bi[i + 1] - bi[i];
585: for (k = 0; k < nzL; k++) {
586: row = bjtmp[k];
587: pc = rtmp + bs2 * row;
588: for (flg = 0, j = 0; j < bs2; j++) {
589: if (pc[j] != 0.0) {
590: flg = 1;
591: break;
592: }
593: }
594: if (flg) {
595: pv = b->a + bs2 * bdiag[row];
596: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
597: PetscKernel_A_gets_A_times_B_4(pc, pv, mwork);
599: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
600: pv = b->a + bs2 * (bdiag[row + 1] + 1);
601: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
602: for (j = 0; j < nz; j++) {
603: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
604: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
605: v = rtmp + bs2 * pj[j];
606: PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv);
607: pv += bs2;
608: }
609: PetscLogFlops(128.0 * nz + 112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
610: }
611: }
613: /* finished row so stick it into b->a */
614: /* L part */
615: pv = b->a + bs2 * bi[i];
616: pj = b->j + bi[i];
617: nz = bi[i + 1] - bi[i];
618: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
620: /* Mark diagonal and invert diagonal for simpler triangular solves */
621: pv = b->a + bs2 * bdiag[i];
622: pj = b->j + bdiag[i];
623: PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
624: PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected);
625: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
627: /* U part */
628: pv = b->a + bs2 * (bdiag[i + 1] + 1);
629: pj = b->j + bdiag[i + 1] + 1;
630: nz = bdiag[i] - bdiag[i + 1] - 1;
631: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
632: }
633: PetscFree2(rtmp, mwork);
635: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
636: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
637: C->assembled = PETSC_TRUE;
639: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n); /* from inverting diagonal blocks */
640: return 0;
641: }
643: #if defined(PETSC_HAVE_SSE)
645: #include PETSC_HAVE_SSE
647: /* SSE version for when blocks are 4 by 4 using natural ordering */
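/*
   The SSE variants below perform the same row-by-row block elimination, but hold each
   4x4 block in four packed-single XMM registers (one register per column).  A multiplier
   block is tested against zero with CMPNEQ/OR reduced through MOVEMASK to a single
   integer, and the block-block products are formed one column at a time by broadcasting
   a scalar (shuffle 0x00), multiplying, and accumulating.
*/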
648: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C, Mat A, const MatFactorInfo *info)
649: {
650: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
651: int i, j, n = a->mbs;
652: int *bj = b->j, *bjtmp, *pj;
653: int row;
654: int *ajtmpold, nz, *bi = b->i;
655: int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
656: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
657: MatScalar *ba = b->a, *aa = a->a;
658: int nonzero = 0;
659: PetscBool pivotinblocks = b->pivotinblocks;
660: PetscReal shift = info->shiftamount;
661: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
663: allowzeropivot = PetscNot(A->erroriffailure);
664: SSE_SCOPE_BEGIN;
668: PetscMalloc1(16 * (n + 1), &rtmp);
670: /* if ((unsigned long)bj==(unsigned long)aj) { */
671: /* colscale = 4; */
672: /* } */
673: for (i = 0; i < n; i++) {
674: nz = bi[i + 1] - bi[i];
675: bjtmp = bj + bi[i];
676: /* zero out the 4x4 block accumulators */
677: /* zero out one register */
678: XOR_PS(XMM7, XMM7);
679: for (j = 0; j < nz; j++) {
680: x = rtmp + 16 * bjtmp[j];
681: /* x = rtmp+4*bjtmp[j]; */
682: SSE_INLINE_BEGIN_1(x)
683: /* Copy zero register to memory locations */
684: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
685: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
686: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
687: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
688: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
689: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
690: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
691: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
692: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
693: SSE_INLINE_END_1;
694: }
695: /* load in initial (unfactored row) */
696: nz = ai[i + 1] - ai[i];
697: ajtmpold = aj + ai[i];
698: v = aa + 16 * ai[i];
699: for (j = 0; j < nz; j++) {
700: x = rtmp + 16 * ajtmpold[j];
701: /* x = rtmp+colscale*ajtmpold[j]; */
702: /* Copy v block into x block */
703: SSE_INLINE_BEGIN_2(v, x)
704: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
705: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
706: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
708: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
709: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
711: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
712: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
714: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
715: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
717: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
718: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
720: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
721: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
723: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
724: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
726: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
727: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
728: SSE_INLINE_END_2;
730: v += 16;
731: }
732: /* row = (*bjtmp++)/4; */
733: row = *bjtmp++;
734: while (row < i) {
735: pc = rtmp + 16 * row;
736: SSE_INLINE_BEGIN_1(pc)
737: /* Load block from lower triangle */
738: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
739: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
740: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
742: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
743: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
745: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
746: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
748: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
749: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
751: /* Compare block to zero block */
753: SSE_COPY_PS(XMM4, XMM7)
754: SSE_CMPNEQ_PS(XMM4, XMM0)
756: SSE_COPY_PS(XMM5, XMM7)
757: SSE_CMPNEQ_PS(XMM5, XMM1)
759: SSE_COPY_PS(XMM6, XMM7)
760: SSE_CMPNEQ_PS(XMM6, XMM2)
762: SSE_CMPNEQ_PS(XMM7, XMM3)
764: /* Reduce the comparisons to one SSE register */
765: SSE_OR_PS(XMM6, XMM7)
766: SSE_OR_PS(XMM5, XMM4)
767: SSE_OR_PS(XMM5, XMM6)
768: SSE_INLINE_END_1;
770: /* Reduce the one SSE register to an integer register for branching */
771: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
772: MOVEMASK(nonzero, XMM5);
774: /* If block is nonzero ... */
775: if (nonzero) {
776: pv = ba + 16 * diag_offset[row];
777: PREFETCH_L1(&pv[16]);
778: pj = bj + diag_offset[row] + 1;
780: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
781: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
782: /* but the diagonal was inverted already */
783: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
785: SSE_INLINE_BEGIN_2(pv, pc)
786: /* Column 0, product is accumulated in XMM4 */
787: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
788: SSE_SHUFFLE(XMM4, XMM4, 0x00)
789: SSE_MULT_PS(XMM4, XMM0)
791: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
792: SSE_SHUFFLE(XMM5, XMM5, 0x00)
793: SSE_MULT_PS(XMM5, XMM1)
794: SSE_ADD_PS(XMM4, XMM5)
796: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
797: SSE_SHUFFLE(XMM6, XMM6, 0x00)
798: SSE_MULT_PS(XMM6, XMM2)
799: SSE_ADD_PS(XMM4, XMM6)
801: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
802: SSE_SHUFFLE(XMM7, XMM7, 0x00)
803: SSE_MULT_PS(XMM7, XMM3)
804: SSE_ADD_PS(XMM4, XMM7)
806: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
807: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
809: /* Column 1, product is accumulated in XMM5 */
810: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
811: SSE_SHUFFLE(XMM5, XMM5, 0x00)
812: SSE_MULT_PS(XMM5, XMM0)
814: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
815: SSE_SHUFFLE(XMM6, XMM6, 0x00)
816: SSE_MULT_PS(XMM6, XMM1)
817: SSE_ADD_PS(XMM5, XMM6)
819: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
820: SSE_SHUFFLE(XMM7, XMM7, 0x00)
821: SSE_MULT_PS(XMM7, XMM2)
822: SSE_ADD_PS(XMM5, XMM7)
824: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
825: SSE_SHUFFLE(XMM6, XMM6, 0x00)
826: SSE_MULT_PS(XMM6, XMM3)
827: SSE_ADD_PS(XMM5, XMM6)
829: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
830: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
832: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
834: /* Column 2, product is accumulated in XMM6 */
835: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
836: SSE_SHUFFLE(XMM6, XMM6, 0x00)
837: SSE_MULT_PS(XMM6, XMM0)
839: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
840: SSE_SHUFFLE(XMM7, XMM7, 0x00)
841: SSE_MULT_PS(XMM7, XMM1)
842: SSE_ADD_PS(XMM6, XMM7)
844: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
845: SSE_SHUFFLE(XMM7, XMM7, 0x00)
846: SSE_MULT_PS(XMM7, XMM2)
847: SSE_ADD_PS(XMM6, XMM7)
849: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
850: SSE_SHUFFLE(XMM7, XMM7, 0x00)
851: SSE_MULT_PS(XMM7, XMM3)
852: SSE_ADD_PS(XMM6, XMM7)
854: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
855: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
857: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
858: /* Column 3, product is accumulated in XMM0 */
859: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
860: SSE_SHUFFLE(XMM7, XMM7, 0x00)
861: SSE_MULT_PS(XMM0, XMM7)
863: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
864: SSE_SHUFFLE(XMM7, XMM7, 0x00)
865: SSE_MULT_PS(XMM1, XMM7)
866: SSE_ADD_PS(XMM0, XMM1)
868: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
869: SSE_SHUFFLE(XMM1, XMM1, 0x00)
870: SSE_MULT_PS(XMM1, XMM2)
871: SSE_ADD_PS(XMM0, XMM1)
873: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
874: SSE_SHUFFLE(XMM7, XMM7, 0x00)
875: SSE_MULT_PS(XMM3, XMM7)
876: SSE_ADD_PS(XMM0, XMM3)
878: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
879: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
881: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
882: /* This is code to be maintained and read by humans after all. */
883: /* Copy Multiplier Col 3 into XMM3 */
884: SSE_COPY_PS(XMM3, XMM0)
885: /* Copy Multiplier Col 2 into XMM2 */
886: SSE_COPY_PS(XMM2, XMM6)
887: /* Copy Multiplier Col 1 into XMM1 */
888: SSE_COPY_PS(XMM1, XMM5)
889: /* Copy Multiplier Col 0 into XMM0 */
890: SSE_COPY_PS(XMM0, XMM4)
891: SSE_INLINE_END_2;
893: /* Update the row: */
894: nz = bi[row + 1] - diag_offset[row] - 1;
895: pv += 16;
896: for (j = 0; j < nz; j++) {
897: PREFETCH_L1(&pv[16]);
898: x = rtmp + 16 * pj[j];
899: /* x = rtmp + 4*pj[j]; */
901: /* X:=X-M*PV, One column at a time */
902: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
903: SSE_INLINE_BEGIN_2(x, pv)
904: /* Load First Column of X*/
905: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
906: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
908: /* Matrix-Vector Product: */
909: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
910: SSE_SHUFFLE(XMM5, XMM5, 0x00)
911: SSE_MULT_PS(XMM5, XMM0)
912: SSE_SUB_PS(XMM4, XMM5)
914: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
915: SSE_SHUFFLE(XMM6, XMM6, 0x00)
916: SSE_MULT_PS(XMM6, XMM1)
917: SSE_SUB_PS(XMM4, XMM6)
919: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
920: SSE_SHUFFLE(XMM7, XMM7, 0x00)
921: SSE_MULT_PS(XMM7, XMM2)
922: SSE_SUB_PS(XMM4, XMM7)
924: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
925: SSE_SHUFFLE(XMM5, XMM5, 0x00)
926: SSE_MULT_PS(XMM5, XMM3)
927: SSE_SUB_PS(XMM4, XMM5)
929: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
930: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
932: /* Second Column */
933: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
934: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
936: /* Matrix-Vector Product: */
937: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
938: SSE_SHUFFLE(XMM6, XMM6, 0x00)
939: SSE_MULT_PS(XMM6, XMM0)
940: SSE_SUB_PS(XMM5, XMM6)
942: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
943: SSE_SHUFFLE(XMM7, XMM7, 0x00)
944: SSE_MULT_PS(XMM7, XMM1)
945: SSE_SUB_PS(XMM5, XMM7)
947: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
948: SSE_SHUFFLE(XMM4, XMM4, 0x00)
949: SSE_MULT_PS(XMM4, XMM2)
950: SSE_SUB_PS(XMM5, XMM4)
952: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
953: SSE_SHUFFLE(XMM6, XMM6, 0x00)
954: SSE_MULT_PS(XMM6, XMM3)
955: SSE_SUB_PS(XMM5, XMM6)
957: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
958: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
960: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
962: /* Third Column */
963: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
964: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
966: /* Matrix-Vector Product: */
967: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
968: SSE_SHUFFLE(XMM7, XMM7, 0x00)
969: SSE_MULT_PS(XMM7, XMM0)
970: SSE_SUB_PS(XMM6, XMM7)
972: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
973: SSE_SHUFFLE(XMM4, XMM4, 0x00)
974: SSE_MULT_PS(XMM4, XMM1)
975: SSE_SUB_PS(XMM6, XMM4)
977: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
978: SSE_SHUFFLE(XMM5, XMM5, 0x00)
979: SSE_MULT_PS(XMM5, XMM2)
980: SSE_SUB_PS(XMM6, XMM5)
982: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
983: SSE_SHUFFLE(XMM7, XMM7, 0x00)
984: SSE_MULT_PS(XMM7, XMM3)
985: SSE_SUB_PS(XMM6, XMM7)
987: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
988: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
990: /* Fourth Column */
991: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
992: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
994: /* Matrix-Vector Product: */
995: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
996: SSE_SHUFFLE(XMM5, XMM5, 0x00)
997: SSE_MULT_PS(XMM5, XMM0)
998: SSE_SUB_PS(XMM4, XMM5)
1000: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1001: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1002: SSE_MULT_PS(XMM6, XMM1)
1003: SSE_SUB_PS(XMM4, XMM6)
1005: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1006: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1007: SSE_MULT_PS(XMM7, XMM2)
1008: SSE_SUB_PS(XMM4, XMM7)
1010: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1011: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1012: SSE_MULT_PS(XMM5, XMM3)
1013: SSE_SUB_PS(XMM4, XMM5)
1015: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1016: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1017: SSE_INLINE_END_2;
1018: pv += 16;
1019: }
1020: PetscLogFlops(128.0 * nz + 112.0);
1021: }
1022: row = *bjtmp++;
1023: /* row = (*bjtmp++)/4; */
1024: }
1025: /* finished row so stick it into b->a */
1026: pv = ba + 16 * bi[i];
1027: pj = bj + bi[i];
1028: nz = bi[i + 1] - bi[i];
1030: /* Copy x block back into pv block */
1031: for (j = 0; j < nz; j++) {
1032: x = rtmp + 16 * pj[j];
1033: /* x = rtmp+4*pj[j]; */
1035: SSE_INLINE_BEGIN_2(x, pv)
1036: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1037: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1038: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1040: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1041: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1043: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1044: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1046: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1047: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1049: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1050: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1052: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1053: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1055: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1056: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1058: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1059: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1060: SSE_INLINE_END_2;
1061: pv += 16;
1062: }
1063: /* invert diagonal block */
1064: w = ba + 16 * diag_offset[i];
1065: if (pivotinblocks) {
1066: PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
1067: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1068: } else {
1069: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1070: }
1071: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1072: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1073: }
1075: PetscFree(rtmp);
1077: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1078: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1079: C->assembled = PETSC_TRUE;
1081: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs);
1082: /* Flop Count from inverting diagonal blocks */
1083: SSE_SCOPE_END;
1084: return 0;
1085: }
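/*
   The _usj ("unsigned short j") variants below follow the same structure but
   reinterpret the column-index array b->j as unsigned short, presumably to halve the
   index memory traffic; this assumes the number of block columns fits in 16 bits.
*/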
1087: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
1088: {
1089: Mat A = C;
1090: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1091: int i, j, n = a->mbs;
1092: unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1093: unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
1094: unsigned int row;
1095: int nz, *bi = b->i;
1096: int *diag_offset = b->diag, *ai = a->i;
1097: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
1098: MatScalar *ba = b->a, *aa = a->a;
1099: int nonzero = 0;
1100: PetscBool pivotinblocks = b->pivotinblocks;
1101: PetscReal shift = info->shiftamount;
1102: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1104: allowzeropivot = PetscNot(A->erroriffailure);
1105: SSE_SCOPE_BEGIN;
1109: PetscMalloc1(16 * (n + 1), &rtmp);
1111: /* if ((unsigned long)bj==(unsigned long)aj) { */
1112: /* colscale = 4; */
1113: /* } */
1115: for (i = 0; i < n; i++) {
1116: nz = bi[i + 1] - bi[i];
1117: bjtmp = bj + bi[i];
1118: /* zero out the 4x4 block accumulators */
1119: /* zero out one register */
1120: XOR_PS(XMM7, XMM7);
1121: for (j = 0; j < nz; j++) {
1122: x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1123: /* x = rtmp+4*bjtmp[j]; */
1124: SSE_INLINE_BEGIN_1(x)
1125: /* Copy zero register to memory locations */
1126: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1127: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1128: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1129: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1130: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1131: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1132: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1133: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1134: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1135: SSE_INLINE_END_1;
1136: }
1137: /* load in initial (unfactored row) */
1138: nz = ai[i + 1] - ai[i];
1139: ajtmp = aj + ai[i];
1140: v = aa + 16 * ai[i];
1141: for (j = 0; j < nz; j++) {
1142: x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1143: /* x = rtmp+colscale*ajtmp[j]; */
1144: /* Copy v block into x block */
1145: SSE_INLINE_BEGIN_2(v, x)
1146: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1147: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1148: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1150: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1151: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1153: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1154: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1156: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1157: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1159: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1160: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1162: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1163: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1165: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1166: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1168: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1169: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1170: SSE_INLINE_END_2;
1172: v += 16;
1173: }
1174: /* row = (*bjtmp++)/4; */
1175: row = (unsigned int)(*bjtmp++);
1176: while (row < i) {
1177: pc = rtmp + 16 * row;
1178: SSE_INLINE_BEGIN_1(pc)
1179: /* Load block from lower triangle */
1180: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1181: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1182: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1184: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1185: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1187: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1188: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1190: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1191: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1193: /* Compare block to zero block */
1195: SSE_COPY_PS(XMM4, XMM7)
1196: SSE_CMPNEQ_PS(XMM4, XMM0)
1198: SSE_COPY_PS(XMM5, XMM7)
1199: SSE_CMPNEQ_PS(XMM5, XMM1)
1201: SSE_COPY_PS(XMM6, XMM7)
1202: SSE_CMPNEQ_PS(XMM6, XMM2)
1204: SSE_CMPNEQ_PS(XMM7, XMM3)
1206: /* Reduce the comparisons to one SSE register */
1207: SSE_OR_PS(XMM6, XMM7)
1208: SSE_OR_PS(XMM5, XMM4)
1209: SSE_OR_PS(XMM5, XMM6)
1210: SSE_INLINE_END_1;
1212: /* Reduce the one SSE register to an integer register for branching */
1213: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1214: MOVEMASK(nonzero, XMM5);
1216: /* If block is nonzero ... */
1217: if (nonzero) {
1218: pv = ba + 16 * diag_offset[row];
1219: PREFETCH_L1(&pv[16]);
1220: pj = bj + diag_offset[row] + 1;
1222: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1223: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1224: /* but the diagonal was inverted already */
1225: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1227: SSE_INLINE_BEGIN_2(pv, pc)
1228: /* Column 0, product is accumulated in XMM4 */
1229: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1230: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1231: SSE_MULT_PS(XMM4, XMM0)
1233: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1234: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1235: SSE_MULT_PS(XMM5, XMM1)
1236: SSE_ADD_PS(XMM4, XMM5)
1238: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1239: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1240: SSE_MULT_PS(XMM6, XMM2)
1241: SSE_ADD_PS(XMM4, XMM6)
1243: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1244: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1245: SSE_MULT_PS(XMM7, XMM3)
1246: SSE_ADD_PS(XMM4, XMM7)
1248: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1249: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1251: /* Column 1, product is accumulated in XMM5 */
1252: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1253: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1254: SSE_MULT_PS(XMM5, XMM0)
1256: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1257: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1258: SSE_MULT_PS(XMM6, XMM1)
1259: SSE_ADD_PS(XMM5, XMM6)
1261: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1262: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1263: SSE_MULT_PS(XMM7, XMM2)
1264: SSE_ADD_PS(XMM5, XMM7)
1266: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1267: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1268: SSE_MULT_PS(XMM6, XMM3)
1269: SSE_ADD_PS(XMM5, XMM6)
1271: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1272: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1274: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1276: /* Column 2, product is accumulated in XMM6 */
1277: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1278: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1279: SSE_MULT_PS(XMM6, XMM0)
1281: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1282: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1283: SSE_MULT_PS(XMM7, XMM1)
1284: SSE_ADD_PS(XMM6, XMM7)
1286: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1287: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288: SSE_MULT_PS(XMM7, XMM2)
1289: SSE_ADD_PS(XMM6, XMM7)
1291: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1292: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293: SSE_MULT_PS(XMM7, XMM3)
1294: SSE_ADD_PS(XMM6, XMM7)
1296: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1297: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1299: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1300: /* Column 3, product is accumulated in XMM0 */
1301: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1302: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1303: SSE_MULT_PS(XMM0, XMM7)
1305: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1306: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1307: SSE_MULT_PS(XMM1, XMM7)
1308: SSE_ADD_PS(XMM0, XMM1)
1310: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1311: SSE_SHUFFLE(XMM1, XMM1, 0x00)
1312: SSE_MULT_PS(XMM1, XMM2)
1313: SSE_ADD_PS(XMM0, XMM1)
1315: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1316: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1317: SSE_MULT_PS(XMM3, XMM7)
1318: SSE_ADD_PS(XMM0, XMM3)
1320: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1321: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1323: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1324: /* This is code to be maintained and read by humans after all. */
1325: /* Copy Multiplier Col 3 into XMM3 */
1326: SSE_COPY_PS(XMM3, XMM0)
1327: /* Copy Multiplier Col 2 into XMM2 */
1328: SSE_COPY_PS(XMM2, XMM6)
1329: /* Copy Multiplier Col 1 into XMM1 */
1330: SSE_COPY_PS(XMM1, XMM5)
1331: /* Copy Multiplier Col 0 into XMM0 */
1332: SSE_COPY_PS(XMM0, XMM4)
1333: SSE_INLINE_END_2;
1335: /* Update the row: */
1336: nz = bi[row + 1] - diag_offset[row] - 1;
1337: pv += 16;
1338: for (j = 0; j < nz; j++) {
1339: PREFETCH_L1(&pv[16]);
1340: x = rtmp + 16 * ((unsigned int)pj[j]);
1341: /* x = rtmp + 4*pj[j]; */
1343: /* X:=X-M*PV, One column at a time */
1344: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1345: SSE_INLINE_BEGIN_2(x, pv)
1346: /* Load First Column of X*/
1347: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1348: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1350: /* Matrix-Vector Product: */
1351: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1352: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1353: SSE_MULT_PS(XMM5, XMM0)
1354: SSE_SUB_PS(XMM4, XMM5)
1356: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1357: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1358: SSE_MULT_PS(XMM6, XMM1)
1359: SSE_SUB_PS(XMM4, XMM6)
1361: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1362: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1363: SSE_MULT_PS(XMM7, XMM2)
1364: SSE_SUB_PS(XMM4, XMM7)
1366: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1367: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1368: SSE_MULT_PS(XMM5, XMM3)
1369: SSE_SUB_PS(XMM4, XMM5)
1371: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1372: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1374: /* Second Column */
1375: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1376: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1378: /* Matrix-Vector Product: */
1379: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1380: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1381: SSE_MULT_PS(XMM6, XMM0)
1382: SSE_SUB_PS(XMM5, XMM6)
1384: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1385: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1386: SSE_MULT_PS(XMM7, XMM1)
1387: SSE_SUB_PS(XMM5, XMM7)
1389: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1390: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1391: SSE_MULT_PS(XMM4, XMM2)
1392: SSE_SUB_PS(XMM5, XMM4)
1394: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1395: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1396: SSE_MULT_PS(XMM6, XMM3)
1397: SSE_SUB_PS(XMM5, XMM6)
1399: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1400: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1402: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1404: /* Third Column */
1405: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1406: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1408: /* Matrix-Vector Product: */
1409: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1410: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1411: SSE_MULT_PS(XMM7, XMM0)
1412: SSE_SUB_PS(XMM6, XMM7)
1414: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1415: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1416: SSE_MULT_PS(XMM4, XMM1)
1417: SSE_SUB_PS(XMM6, XMM4)
1419: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1420: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1421: SSE_MULT_PS(XMM5, XMM2)
1422: SSE_SUB_PS(XMM6, XMM5)
1424: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1425: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1426: SSE_MULT_PS(XMM7, XMM3)
1427: SSE_SUB_PS(XMM6, XMM7)
1429: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1430: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1432: /* Fourth Column */
1433: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1434: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1436: /* Matrix-Vector Product: */
1437: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1438: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1439: SSE_MULT_PS(XMM5, XMM0)
1440: SSE_SUB_PS(XMM4, XMM5)
1442: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1443: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1444: SSE_MULT_PS(XMM6, XMM1)
1445: SSE_SUB_PS(XMM4, XMM6)
1447: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1448: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1449: SSE_MULT_PS(XMM7, XMM2)
1450: SSE_SUB_PS(XMM4, XMM7)
1452: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1453: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1454: SSE_MULT_PS(XMM5, XMM3)
1455: SSE_SUB_PS(XMM4, XMM5)
1457: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1458: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1459: SSE_INLINE_END_2;
1460: pv += 16;
1461: }
1462: PetscLogFlops(128.0 * nz + 112.0);
1463: }
1464: row = (unsigned int)(*bjtmp++);
1465: /* row = (*bjtmp++)/4; */
1466: /* bjtmp++; */
1467: }
1468: /* finished row so stick it into b->a */
1469: pv = ba + 16 * bi[i];
1470: pj = bj + bi[i];
1471: nz = bi[i + 1] - bi[i];
1473: /* Copy x block back into pv block */
1474: for (j = 0; j < nz; j++) {
1475: x = rtmp + 16 * ((unsigned int)pj[j]);
1476: /* x = rtmp+4*pj[j]; */
1478: SSE_INLINE_BEGIN_2(x, pv)
1479: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1480: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1481: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1483: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1484: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1486: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1487: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1489: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1490: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1492: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1493: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1495: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1496: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1498: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1499: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1501: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1502: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1503: SSE_INLINE_END_2;
1504: pv += 16;
1505: }
1506: /* invert diagonal block */
1507: w = ba + 16 * diag_offset[i];
1508: if (pivotinblocks) {
1509: PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
1510: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1511: } else {
1512: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1513: }
1514: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1515: }
1517: PetscFree(rtmp);
1519: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1520: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1521: C->assembled = PETSC_TRUE;
1523: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs);
1524: /* Flop Count from inverting diagonal blocks */
1525: SSE_SCOPE_END;
1526: return 0;
1527: }
1529: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1530: {
1531: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1532: int i, j, n = a->mbs;
1533: unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1534: unsigned int row;
1535: int *ajtmpold, nz, *bi = b->i;
1536: int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1537: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
1538: MatScalar *ba = b->a, *aa = a->a;
1539: int nonzero = 0;
1540: PetscBool pivotinblocks = b->pivotinblocks;
1541: PetscReal shift = info->shiftamount;
1542: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1544: allowzeropivot = PetscNot(A->erroriffailure);
1545: SSE_SCOPE_BEGIN;
1549: PetscMalloc1(16 * (n + 1), &rtmp);
1551: /* if ((unsigned long)bj==(unsigned long)aj) { */
1552: /* colscale = 4; */
1553: /* } */
1554: if ((unsigned long)bj == (unsigned long)aj) return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C, info));
1556: for (i = 0; i < n; i++) {
1557: nz = bi[i + 1] - bi[i];
1558: bjtmp = bj + bi[i];
1559: /* zero out the 4x4 block accumulators */
1560: /* zero out one register */
1561: XOR_PS(XMM7, XMM7);
1562: for (j = 0; j < nz; j++) {
1563: x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1564: /* x = rtmp+4*bjtmp[j]; */
1565: SSE_INLINE_BEGIN_1(x)
1566: /* Copy zero register to memory locations */
1567: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1568: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1569: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1570: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1571: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1572: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1573: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1574: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1575: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1576: SSE_INLINE_END_1;
1577: }
1578: /* load in initial (unfactored row) */
1579: nz = ai[i + 1] - ai[i];
1580: ajtmpold = aj + ai[i];
1581: v = aa + 16 * ai[i];
1582: for (j = 0; j < nz; j++) {
1583: x = rtmp + 16 * ajtmpold[j];
1584: /* x = rtmp+colscale*ajtmpold[j]; */
1585: /* Copy v block into x block */
1586: SSE_INLINE_BEGIN_2(v, x)
1587: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1588: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1589: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1591: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1592: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1594: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1595: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1597: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1598: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1600: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1601: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1603: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1604: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1606: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1607: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1609: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1610: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1611: SSE_INLINE_END_2;
1613: v += 16;
1614: }
1615: /* row = (*bjtmp++)/4; */
1616: row = (unsigned int)(*bjtmp++);
1617: while (row < i) {
1618: pc = rtmp + 16 * row;
1619: SSE_INLINE_BEGIN_1(pc)
1620: /* Load block from lower triangle */
1621: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1622: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1623: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1625: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1626: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1628: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1629: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1631: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1632: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1634: /* Compare block to zero block */
1636: SSE_COPY_PS(XMM4, XMM7)
1637: SSE_CMPNEQ_PS(XMM4, XMM0)
1639: SSE_COPY_PS(XMM5, XMM7)
1640: SSE_CMPNEQ_PS(XMM5, XMM1)
1642: SSE_COPY_PS(XMM6, XMM7)
1643: SSE_CMPNEQ_PS(XMM6, XMM2)
1645: SSE_CMPNEQ_PS(XMM7, XMM3)
1647: /* Reduce the comparisons to one SSE register */
1648: SSE_OR_PS(XMM6, XMM7)
1649: SSE_OR_PS(XMM5, XMM4)
1650: SSE_OR_PS(XMM5, XMM6)
1651: SSE_INLINE_END_1;
1653: /* Reduce the one SSE register to an integer register for branching */
1654: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1655: MOVEMASK(nonzero, XMM5);
1657: /* If block is nonzero ... */
1658: if (nonzero) {
1659: pv = ba + 16 * diag_offset[row];
1660: PREFETCH_L1(&pv[16]);
1661: pj = bj + diag_offset[row] + 1;
1663: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1664: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1665: /* but the diagonal was inverted already */
1666: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1668: SSE_INLINE_BEGIN_2(pv, pc)
1669: /* Column 0, product is accumulated in XMM4 */
1670: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1671: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1672: SSE_MULT_PS(XMM4, XMM0)
1674: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1675: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1676: SSE_MULT_PS(XMM5, XMM1)
1677: SSE_ADD_PS(XMM4, XMM5)
1679: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1680: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1681: SSE_MULT_PS(XMM6, XMM2)
1682: SSE_ADD_PS(XMM4, XMM6)
1684: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1685: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1686: SSE_MULT_PS(XMM7, XMM3)
1687: SSE_ADD_PS(XMM4, XMM7)
1689: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1690: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1692: /* Column 1, product is accumulated in XMM5 */
1693: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1694: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1695: SSE_MULT_PS(XMM5, XMM0)
1697: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1698: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1699: SSE_MULT_PS(XMM6, XMM1)
1700: SSE_ADD_PS(XMM5, XMM6)
1702: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1703: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1704: SSE_MULT_PS(XMM7, XMM2)
1705: SSE_ADD_PS(XMM5, XMM7)
1707: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1708: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1709: SSE_MULT_PS(XMM6, XMM3)
1710: SSE_ADD_PS(XMM5, XMM6)
1712: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1713: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1715: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1717: /* Column 2, product is accumulated in XMM6 */
1718: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1719: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1720: SSE_MULT_PS(XMM6, XMM0)
1722: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1723: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1724: SSE_MULT_PS(XMM7, XMM1)
1725: SSE_ADD_PS(XMM6, XMM7)
1727: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1728: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1729: SSE_MULT_PS(XMM7, XMM2)
1730: SSE_ADD_PS(XMM6, XMM7)
1732: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1733: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1734: SSE_MULT_PS(XMM7, XMM3)
1735: SSE_ADD_PS(XMM6, XMM7)
1737: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1738: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1740: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1741: /* Column 3, product is accumulated in XMM0 */
1742: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1743: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1744: SSE_MULT_PS(XMM0, XMM7)
1746: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1747: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1748: SSE_MULT_PS(XMM1, XMM7)
1749: SSE_ADD_PS(XMM0, XMM1)
1751: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1752: SSE_SHUFFLE(XMM1, XMM1, 0x00)
1753: SSE_MULT_PS(XMM1, XMM2)
1754: SSE_ADD_PS(XMM0, XMM1)
1756: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1757: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1758: SSE_MULT_PS(XMM3, XMM7)
1759: SSE_ADD_PS(XMM0, XMM3)
1761: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1762: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1764: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1765: /* This is code to be maintained and read by humans after all. */
1766: /* Copy Multiplier Col 3 into XMM3 */
1767: SSE_COPY_PS(XMM3, XMM0)
1768: /* Copy Multiplier Col 2 into XMM2 */
1769: SSE_COPY_PS(XMM2, XMM6)
1770: /* Copy Multiplier Col 1 into XMM1 */
1771: SSE_COPY_PS(XMM1, XMM5)
1772: /* Copy Multiplier Col 0 into XMM0 */
1773: SSE_COPY_PS(XMM0, XMM4)
1774: SSE_INLINE_END_2;
1776: /* Update the row: */
1777: nz = bi[row + 1] - diag_offset[row] - 1;
1778: pv += 16;
1779: for (j = 0; j < nz; j++) {
1780: PREFETCH_L1(&pv[16]);
1781: x = rtmp + 16 * ((unsigned int)pj[j]);
1782: /* x = rtmp + 4*pj[j]; */
1784: /* X:=X-M*PV, One column at a time */
1785: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1786: SSE_INLINE_BEGIN_2(x, pv)
1787: /* Load First Column of X*/
1788: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1789: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1791: /* Matrix-Vector Product: */
1792: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1793: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1794: SSE_MULT_PS(XMM5, XMM0)
1795: SSE_SUB_PS(XMM4, XMM5)
1797: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1798: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1799: SSE_MULT_PS(XMM6, XMM1)
1800: SSE_SUB_PS(XMM4, XMM6)
1802: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1803: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1804: SSE_MULT_PS(XMM7, XMM2)
1805: SSE_SUB_PS(XMM4, XMM7)
1807: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1808: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1809: SSE_MULT_PS(XMM5, XMM3)
1810: SSE_SUB_PS(XMM4, XMM5)
1812: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1813: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1815: /* Second Column */
1816: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1817: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1819: /* Matrix-Vector Product: */
1820: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1821: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1822: SSE_MULT_PS(XMM6, XMM0)
1823: SSE_SUB_PS(XMM5, XMM6)
1825: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1826: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1827: SSE_MULT_PS(XMM7, XMM1)
1828: SSE_SUB_PS(XMM5, XMM7)
1830: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1831: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1832: SSE_MULT_PS(XMM4, XMM2)
1833: SSE_SUB_PS(XMM5, XMM4)
1835: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1836: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1837: SSE_MULT_PS(XMM6, XMM3)
1838: SSE_SUB_PS(XMM5, XMM6)
1840: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1841: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1843: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1845: /* Third Column */
1846: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1847: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1849: /* Matrix-Vector Product: */
1850: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1851: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1852: SSE_MULT_PS(XMM7, XMM0)
1853: SSE_SUB_PS(XMM6, XMM7)
1855: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1856: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1857: SSE_MULT_PS(XMM4, XMM1)
1858: SSE_SUB_PS(XMM6, XMM4)
1860: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1861: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1862: SSE_MULT_PS(XMM5, XMM2)
1863: SSE_SUB_PS(XMM6, XMM5)
1865: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1866: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1867: SSE_MULT_PS(XMM7, XMM3)
1868: SSE_SUB_PS(XMM6, XMM7)
1870: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1871: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1873: /* Fourth Column */
1874: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1875: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1877: /* Matrix-Vector Product: */
1878: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1879: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1880: SSE_MULT_PS(XMM5, XMM0)
1881: SSE_SUB_PS(XMM4, XMM5)
1883: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1884: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1885: SSE_MULT_PS(XMM6, XMM1)
1886: SSE_SUB_PS(XMM4, XMM6)
1888: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1889: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1890: SSE_MULT_PS(XMM7, XMM2)
1891: SSE_SUB_PS(XMM4, XMM7)
1893: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1894: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1895: SSE_MULT_PS(XMM5, XMM3)
1896: SSE_SUB_PS(XMM4, XMM5)
1898: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1899: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1900: SSE_INLINE_END_2;
1901: pv += 16;
1902: }
1903: PetscLogFlops(128.0 * nz + 112.0);
1904: }
1905: row = (unsigned int)(*bjtmp++);
1906: /* row = (*bjtmp++)/4; */
1907: /* bjtmp++; */
1908: }
1909: /* finished row so stick it into b->a */
1910: pv = ba + 16 * bi[i];
1911: pj = bj + bi[i];
1912: nz = bi[i + 1] - bi[i];
1914: /* Copy x block back into pv block */
1915: for (j = 0; j < nz; j++) {
1916: x = rtmp + 16 * ((unsigned int)pj[j]);
1917: /* x = rtmp+4*pj[j]; */
1919: SSE_INLINE_BEGIN_2(x, pv)
1920: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1921: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1922: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1924: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1925: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1927: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1928: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1930: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1931: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1933: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1934: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1936: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1937: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1939: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1940: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1942: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1943: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1944: SSE_INLINE_END_2;
1945: pv += 16;
1946: }
1947: /* invert diagonal block */
1948: w = ba + 16 * diag_offset[i];
1949: if (pivotinblocks) {
1950: PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
1951: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1952: } else {
1953: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1954: }
1955: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1956: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1957: }
1959: PetscFree(rtmp);
1961: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1962: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1963: C->assembled = PETSC_TRUE;
1965: PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs);
1966: /* Flop Count from inverting diagonal blocks */
1967: SSE_SCOPE_END;
1968: return 0;
1969: }
1971: #endif