Actual source code: baijsolvnat4.c
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>
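
/*
  All kernels in this file assume the SeqBAIJ factored-matrix layout with block size 4:
  a->i (ai) and a->j (aj) give block-CSR row pointers and block column indices,
  a->diag (or adiag) locates the diagonal block of each block row, and a->a (aa) holds
  the numerical values with 16 scalars per block stored column by column (v[0..3] is the
  first column of a 4x4 block, v[4..7] the second, and so on). The diagonal blocks hold
  the inverse of the pivot blocks, computed during factorization.
*/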
/*
  Special case where the matrix was ILU(0) factored in the natural
  ordering. This eliminates the need for the column and row permutation.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt           n = a->mbs;
  const PetscInt    *ai = a->i, *aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  VecGetArrayRead(bb, &b);
  VecGetArray(xx, &x);

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
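    /*
      Block row i of the unit lower triangular factor L is applied as
        x_i = b_i - sum_{j < i} L_ij * x_j
      where each L_ij is a 4x4 block; the first block row is the identity, so
      x_0 = b_0. The four update lines below form one column-oriented 4x4
      matrix-vector product per block.
    */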
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
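    /*
      Block row i of the upper triangular factor is applied as
        x_i = inv(D_i) * (x_i - sum_{j > i} U_ij * x_j)
      where inv(D_i) is the already-inverted diagonal block stored at diag[i],
      so the final step is a plain 4x4 multiply rather than a small dense solve.
    */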
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  VecRestoreArrayRead(bb, &b);
  VecRestoreArray(xx, &x);
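  /*
    Flop count: each stored 4x4 block contributes one 4x4 matrix-vector product
    (2 * 16 flops); the subtracted term is the customary correction PETSc applies
    when counting flops for triangular solves.
  */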
  PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
  return 0;
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, k, nz, idx, jdx, idt;
  const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

  VecGetArrayRead(bb, &b);
  VecGetArray(xx, &x);
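  /*
    This kernel uses the non-inplace factor layout: the row pointers ai cover only
    the strictly lower triangular blocks of each block row, while the upper triangular
    blocks and the inverted diagonal blocks are stored separately and addressed through
    adiag (see the backward solve below).
  */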
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];
  x[1] = b[1 + idx];
  x[2] = b[2 + idx];
  x[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + bs2 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = bs * i;
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    for (k = 0; k < nz; k++) {
      jdx = bs * vi[k];
      x1  = x[jdx];
      x2  = x[1 + jdx];
      x3  = x[2 + jdx];
      x4  = x[3 + jdx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }

    x[idx]     = s1;
    x[1 + idx] = s2;
    x[2 + idx] = s3;
    x[3 + idx] = s4;
  }

  /* backward solve the upper triangular */
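  /*
    For block row i, the upper triangular blocks U_ij (j > i) occupy positions
    adiag[i + 1] + 1 through adiag[i] - 1, with the inverted diagonal block at
    adiag[i]; adiag decreases with i, i.e. U is stored by rows starting from the
    last block row.
  */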
  for (i = n - 1; i >= 0; i--) {
    v   = aa + bs2 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = bs * i;
    s1  = x[idt];
    s2  = x[1 + idt];
    s3  = x[2 + idt];
    s4  = x[3 + idt];

    for (k = 0; k < nz; k++) {
      idx = bs * vi[k];
      x1  = x[idx];
      x2  = x[1 + idx];
      x3  = x[2 + idx];
      x4  = x[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  VecRestoreArrayRead(bb, &b);
  VecRestoreArray(xx, &x);
  PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n);
  return 0;
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  VecGetArrayRead(bb, &b);
  VecGetArray(xx, &x);

  {
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    MatScalar       *t = (MatScalar *)x;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;
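
    /*
      Demotion scheme: t reinterprets the memory of the PetscScalar array x as
      MatScalar (single precision in mixed-precision builds). The forward solve
      keeps its intermediate results in t; the backward solve then writes the
      final values into x in full PetscScalar precision, starting from the last
      block row so the wider writes never overwrite single-precision entries
      that are still needed.
    */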
    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  VecRestoreArrayRead(bb, &b);
  VecRestoreArray(xx, &x);
  PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
  return 0;
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
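/*
  The two kernels below compute the solve in single precision using SSE intrinsics
  and promote the result back to PetscScalar at the end (see the demotion notes in
  each routine). The _usj variant reads the block column indices through an unsigned
  short pointer instead of int, presumably for factorizations whose index arrays
  were stored in that narrower form.
*/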
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j;
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x, *b;

  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  VecGetArray(bb, &b);
  VecGetArray(xx, &x);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
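        /*
          XMM7 holds the running right-hand side for the current block row and
          XMM6 holds the four entries of x at block column jdx. Each SHUFFLE
          (0x00, 0x55, 0xAA, 0xFF) broadcasts one component of x across a
          register, which is then multiplied by the corresponding column of the
          4x4 block and subtracted from XMM7.
        */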
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
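    /*
      Because x (double) and t (float) share the same storage, the promotion runs
      from the last block row to the first, and within each block from entry 3 down
      to entry 0; converting in increasing order would overwrite float entries of
      the low block rows before they are read, which is why the CONVERT_ macro
      cannot be used here.
    */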
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  VecRestoreArray(bb, &b);
  VecRestoreArray(xx, &x);
  PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
  SSE_SCOPE_END;
  return 0;
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  PetscScalar *x, *b;

  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
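  /* This routine is identical to the _usj variant above except that the block
     column indices are read as plain int rather than unsigned short. */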
  PREFETCH_NTA(aa + 16 * ai[1]);

  VecGetArray(bb, &b);
  VecGetArray(xx, &x);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);
        /* jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);
        /* idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  VecRestoreArray(bb, &b);
  VecRestoreArray(xx, &x);
  PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
  SSE_SCOPE_END;
  return 0;
}

#endif
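
/*
  These kernels provide MatSolve for a block-size-4 SeqBAIJ matrix that has been
  ILU/LU factored in the natural ordering. A minimal usage sketch of how such a
  solve is typically reached (matrix size, nonzero estimate, and the block-row
  count nb are illustrative assumptions):

    Mat A;
    Vec b, x;
    KSP ksp;
    PC  pc;

    MatCreateSeqBAIJ(PETSC_COMM_SELF, 4, 4 * nb, 4 * nb, 5, NULL, &A);
    // ... fill with MatSetValuesBlocked() and assemble ...
    KSPCreate(PETSC_COMM_SELF, &ksp);
    KSPSetOperators(ksp, A, A);
    KSPGetPC(ksp, &pc);
    PCSetType(pc, PCILU);  // ILU(0); the default ordering is natural
    KSPSolve(ksp, b, x);   // the preconditioner application dispatches to MatSolve()
*/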