Actual source code: baijsolvnat4.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: /*
  5:       Special case where the matrix was ILU(0) factored in the natural
  6:    ordering, which eliminates the need for the row and column permutations.
  7: */
  8: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
  9: {
 10:   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
 11:   PetscInt           n  = a->mbs;
 12:   const PetscInt    *ai = a->i, *aj = a->j;
 13:   const PetscInt    *diag = a->diag;
 14:   const MatScalar   *aa   = a->a;
 15:   PetscScalar       *x;
 16:   const PetscScalar *b;

 18:   VecGetArrayRead(bb, &b);
 19:   VecGetArray(xx, &x);

 21: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
 22:   {
 23:     static PetscScalar w[2000]; /* very BAD: fixed-size work array needs to be fixed */
 24:     fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
 25:   }
 26: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
 27:   fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
 28: #else
 29:   {
 30:     PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
 31:     const MatScalar *v;
 32:     PetscInt         jdx, idt, idx, nz, i, ai16;
 33:     const PetscInt  *vi;

 35:     /* forward solve the lower triangular */
 36:     idx  = 0;
 37:     x[0] = b[0];
 38:     x[1] = b[1];
 39:     x[2] = b[2];
 40:     x[3] = b[3];
 41:     for (i = 1; i < n; i++) {
 42:       v  = aa + 16 * ai[i];
 43:       vi = aj + ai[i];
 44:       nz = diag[i] - ai[i];
 45:       idx += 4;
 46:       s1 = b[idx];
 47:       s2 = b[1 + idx];
 48:       s3 = b[2 + idx];
 49:       s4 = b[3 + idx];
 50:       while (nz--) {
 51:         jdx = 4 * (*vi++);
 52:         x1  = x[jdx];
 53:         x2  = x[1 + jdx];
 54:         x3  = x[2 + jdx];
 55:         x4  = x[3 + jdx];
 56:         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
 57:         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
 58:         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
 59:         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
 60:         v += 16;
 61:       }
 62:       x[idx]     = s1;
 63:       x[1 + idx] = s2;
 64:       x[2 + idx] = s3;
 65:       x[3 + idx] = s4;
 66:     }
 67:     /* backward solve the upper triangular */
 68:     idt = 4 * (n - 1);
 69:     for (i = n - 1; i >= 0; i--) {
 70:       ai16 = 16 * diag[i];
 71:       v    = aa + ai16 + 16;
 72:       vi   = aj + diag[i] + 1;
 73:       nz   = ai[i + 1] - diag[i] - 1;
 74:       s1   = x[idt];
 75:       s2   = x[1 + idt];
 76:       s3   = x[2 + idt];
 77:       s4   = x[3 + idt];
 78:       while (nz--) {
 79:         idx = 4 * (*vi++);
 80:         x1  = x[idx];
 81:         x2  = x[1 + idx];
 82:         x3  = x[2 + idx];
 83:         x4  = x[3 + idx];
 84:         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
 85:         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
 86:         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
 87:         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
 88:         v += 16;
 89:       }
 90:       v          = aa + ai16;
 91:       x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
 92:       x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
 93:       x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
 94:       x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
 95:       idt -= 4;
 96:     }
 97:   }
 98: #endif

100:   VecRestoreArrayRead(bb, &b);
101:   VecRestoreArray(xx, &x);
102:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
103:   return 0;
104: }
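
As a reference for the unrolled arithmetic above: each 4x4 block in aa is stored column-major, so v[0..3] is the first column of the block and entry (r,c) is v[r + 4*c]. The standalone sketch below (plain C, not part of PETSc; Scalar, block4_multsub, and block4_mult are illustrative names) spells out the two block operations the kernel unrolls: the multiply-subtract used while sweeping the off-diagonal blocks, and the multiply by the already-inverted diagonal block at the end of the back solve. These block updates are what the PetscLogFlops calls at the end of the routines account for, at roughly two flops per stored scalar entry.

#include <stdio.h>

typedef double Scalar; /* stand-in for PetscScalar/MatScalar */

/* s -= v*x for one 4x4 column-major block: entry (r,c) is v[r + 4*c],
   matching lines such as  s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4 */
void block4_multsub(Scalar s[4], const Scalar v[16], const Scalar x[4])
{
  for (int c = 0; c < 4; c++)
    for (int r = 0; r < 4; r++) s[r] -= v[r + 4 * c] * x[c];
}

/* x = v*s, as used with the already-inverted diagonal block in the back solve */
void block4_mult(Scalar x[4], const Scalar v[16], const Scalar s[4])
{
  for (int r = 0; r < 4; r++) x[r] = 0.0;
  for (int c = 0; c < 4; c++)
    for (int r = 0; r < 4; r++) x[r] += v[r + 4 * c] * s[c];
}

int main(void)
{
  Scalar I4[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}; /* identity block */
  Scalar s[4] = {1, 2, 3, 4}, x[4] = {1, 1, 1, 1}, y[4];

  block4_multsub(s, I4, x); /* s becomes {0, 1, 2, 3} */
  block4_mult(y, I4, s);    /* y equals s */
  printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);
  return 0;
}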

106: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
107: {
108:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
109:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
110:   PetscInt           i, k, nz, idx, jdx, idt;
111:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
112:   const MatScalar   *aa = a->a, *v;
113:   PetscScalar       *x;
114:   const PetscScalar *b;
115:   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

117:   VecGetArrayRead(bb, &b);
118:   VecGetArray(xx, &x);
119:   /* forward solve the lower triangular */
120:   idx  = 0;
121:   x[0] = b[idx];
122:   x[1] = b[1 + idx];
123:   x[2] = b[2 + idx];
124:   x[3] = b[3 + idx];
125:   for (i = 1; i < n; i++) {
126:     v   = aa + bs2 * ai[i];
127:     vi  = aj + ai[i];
128:     nz  = ai[i + 1] - ai[i];
129:     idx = bs * i;
130:     s1  = b[idx];
131:     s2  = b[1 + idx];
132:     s3  = b[2 + idx];
133:     s4  = b[3 + idx];
134:     for (k = 0; k < nz; k++) {
135:       jdx = bs * vi[k];
136:       x1  = x[jdx];
137:       x2  = x[1 + jdx];
138:       x3  = x[2 + jdx];
139:       x4  = x[3 + jdx];
140:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
141:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
142:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
143:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

145:       v += bs2;
146:     }

148:     x[idx]     = s1;
149:     x[1 + idx] = s2;
150:     x[2 + idx] = s3;
151:     x[3 + idx] = s4;
152:   }

154:   /* backward solve the upper triangular */
155:   for (i = n - 1; i >= 0; i--) {
156:     v   = aa + bs2 * (adiag[i + 1] + 1);
157:     vi  = aj + adiag[i + 1] + 1;
158:     nz  = adiag[i] - adiag[i + 1] - 1;
159:     idt = bs * i;
160:     s1  = x[idt];
161:     s2  = x[1 + idt];
162:     s3  = x[2 + idt];
163:     s4  = x[3 + idt];

165:     for (k = 0; k < nz; k++) {
166:       idx = bs * vi[k];
167:       x1  = x[idx];
168:       x2  = x[1 + idx];
169:       x3  = x[2 + idx];
170:       x4  = x[3 + idx];
171:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
172:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
173:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
174:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

176:       v += bs2;
177:     }
178:     /* x = inv_diagonal*x */
179:     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
180:     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
181:     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
182:     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
183:   }

185:   VecRestoreArrayRead(bb, &b);
186:   VecRestoreArray(xx, &x);
187:   PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n);
188:   return 0;
189: }
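
MatSolve_SeqBAIJ_4_NaturalOrdering above works off the newer factored layout: for block row i, the upper-triangular blocks occupy positions adiag[i+1]+1 through adiag[i]-1 of aa/aj, so when the k loop finishes, v points at the inverted diagonal block stored at position adiag[i]. The sketch below shows one way user code typically reaches kernels like these; it is a hedged example, not PETSc source: it assumes a recent PETSc (PetscCall, PETSC_SUCCESS) and a SeqBAIJ matrix with block size 4, and the helper name solve_with_ilu0_natural is illustrative. Which kernel MatSolve actually dispatches to depends on the factorization, ordering, and build options.

#include <petscmat.h>

static PetscErrorCode solve_with_ilu0_natural(Mat A, Vec b, Vec x)
{
  Mat           F;
  IS            row, col;
  MatFactorInfo info;

  PetscFunctionBeginUser;
  PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &row, &col));
  PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_ILU, &F));
  PetscCall(MatFactorInfoInitialize(&info));
  info.levels = 0; /* ILU(0) */
  info.fill   = 1.0;
  PetscCall(MatILUFactorSymbolic(F, A, row, col, &info));
  PetscCall(MatLUFactorNumeric(F, A, &info));
  PetscCall(MatSolve(F, b, x)); /* dispatches to a kernel such as the ones in this file */
  PetscCall(ISDestroy(&row));
  PetscCall(ISDestroy(&col));
  PetscCall(MatDestroy(&F));
  PetscFunctionReturn(PETSC_SUCCESS);
}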

191: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
192: {
193:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
194:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
195:   const MatScalar   *aa = a->a;
196:   const PetscScalar *b;
197:   PetscScalar       *x;

199:   VecGetArrayRead(bb, &b);
200:   VecGetArray(xx, &x);

202:   {
203:     MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
204:     const MatScalar *v;
205:     MatScalar       *t = (MatScalar *)x;
206:     PetscInt         jdx, idt, idx, nz, i, ai16;
207:     const PetscInt  *vi;

209:     /* forward solve the lower triangular */
210:     idx  = 0;
211:     t[0] = (MatScalar)b[0];
212:     t[1] = (MatScalar)b[1];
213:     t[2] = (MatScalar)b[2];
214:     t[3] = (MatScalar)b[3];
215:     for (i = 1; i < n; i++) {
216:       v  = aa + 16 * ai[i];
217:       vi = aj + ai[i];
218:       nz = diag[i] - ai[i];
219:       idx += 4;
220:       s1 = (MatScalar)b[idx];
221:       s2 = (MatScalar)b[1 + idx];
222:       s3 = (MatScalar)b[2 + idx];
223:       s4 = (MatScalar)b[3 + idx];
224:       while (nz--) {
225:         jdx = 4 * (*vi++);
226:         x1  = t[jdx];
227:         x2  = t[1 + jdx];
228:         x3  = t[2 + jdx];
229:         x4  = t[3 + jdx];
230:         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
231:         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
232:         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
233:         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
234:         v += 16;
235:       }
236:       t[idx]     = s1;
237:       t[1 + idx] = s2;
238:       t[2 + idx] = s3;
239:       t[3 + idx] = s4;
240:     }
241:     /* backward solve the upper triangular */
242:     idt = 4 * (n - 1);
243:     for (i = n - 1; i >= 0; i--) {
244:       ai16 = 16 * diag[i];
245:       v    = aa + ai16 + 16;
246:       vi   = aj + diag[i] + 1;
247:       nz   = ai[i + 1] - diag[i] - 1;
248:       s1   = t[idt];
249:       s2   = t[1 + idt];
250:       s3   = t[2 + idt];
251:       s4   = t[3 + idt];
252:       while (nz--) {
253:         idx = 4 * (*vi++);
254:         x1  = (MatScalar)x[idx];
255:         x2  = (MatScalar)x[1 + idx];
256:         x3  = (MatScalar)x[2 + idx];
257:         x4  = (MatScalar)x[3 + idx];
258:         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
259:         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
260:         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
261:         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
262:         v += 16;
263:       }
264:       v          = aa + ai16;
265:       x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
266:       x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
267:       x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
268:       x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
269:       idt -= 4;
270:     }
271:   }

273:   VecRestoreArrayRead(bb, &b);
274:   VecRestoreArray(xx, &x);
275:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
276:   return 0;
277: }
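
MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion above relies on PetscScalar being double while MatScalar is single precision: the solution vector's own storage is reused as a float work array (t = (MatScalar *)x), the forward solve runs entirely in single precision, and the back solve writes each finished block row back into x as doubles. Because the back solve runs from the last block row to the first, the 8-byte stores into x[idt..idt+3] never overwrite float entries of t that are still needed. The standalone sketch below (plain C, not PETSc code, illustrative only) shows the same in-place demotion/promotion trick in isolation.

#include <stdio.h>

int main(void)
{
  double x[8] = {1, 2, 3, 4, 5, 6, 7, 8}; /* plays the role of the solution vector */
  float *t = (float *)x;                  /* same bytes, viewed as floats */
  int    n = 8, i;

  /* demote: the floats now occupy the front half of the double array */
  for (i = 0; i < n; i++) t[i] = (float)x[i];
  /* ... single-precision work on t[] would go here ... */
  /* promote in place, from the back to the front, so no float is clobbered before use */
  for (i = n - 1; i >= 0; i--) x[i] = (double)t[i];

  for (i = 0; i < n; i++) printf("%g ", x[i]);
  printf("\n");
  return 0;
}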

279: #if defined(PETSC_HAVE_SSE)

281:   #include PETSC_HAVE_SSE
282: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
283: {
284:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
285:   unsigned short *aj = (unsigned short *)a->j;
286:   int            *ai = a->i, n = a->mbs, *diag = a->diag;
287:   MatScalar      *aa = a->a;
288:   PetscScalar    *x, *b;

290:   SSE_SCOPE_BEGIN;
291:   /*
292:      Note: This code currently uses demotion of double
293:      to float when performing the mixed-mode computation.
294:      This may not be numerically reasonable for all applications.
295:   */
296:   PREFETCH_NTA(aa + 16 * ai[1]);

298:   VecGetArray(bb, &b);
299:   VecGetArray(xx, &x);
300:   {
301:     /* x will first be computed in single precision then promoted in place to double */
302:     MatScalar      *v, *t = (MatScalar *)x;
303:     int             nz, i, idt, ai16;
304:     unsigned int    jdx, idx;
305:     unsigned short *vi;
306:     /* Forward solve the lower triangular factor. */

308:     /* First block is the identity. */
309:     idx = 0;
310:     CONVERT_DOUBLE4_FLOAT4(t, b);
311:     v = aa + 16 * ((unsigned int)ai[1]);

313:     for (i = 1; i < n;) {
314:       PREFETCH_NTA(&v[8]);
315:       vi = aj + ai[i];
316:       nz = diag[i] - ai[i];
317:       idx += 4;

319:       /* Demote RHS from double to float. */
320:       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
321:       LOAD_PS(&t[idx], XMM7);

323:       while (nz--) {
324:         PREFETCH_NTA(&v[16]);
325:         jdx = 4 * ((unsigned int)(*vi++));

327:         /* 4x4 Matrix-Vector product with negative accumulation: */
328:         SSE_INLINE_BEGIN_2(&t[jdx], v)
329:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

331:         /* First Column */
332:         SSE_COPY_PS(XMM0, XMM6)
333:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
334:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
335:         SSE_SUB_PS(XMM7, XMM0)

337:         /* Second Column */
338:         SSE_COPY_PS(XMM1, XMM6)
339:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
340:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
341:         SSE_SUB_PS(XMM7, XMM1)

343:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

345:         /* Third Column */
346:         SSE_COPY_PS(XMM2, XMM6)
347:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
348:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
349:         SSE_SUB_PS(XMM7, XMM2)

351:         /* Fourth Column */
352:         SSE_COPY_PS(XMM3, XMM6)
353:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
354:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
355:         SSE_SUB_PS(XMM7, XMM3)
356:         SSE_INLINE_END_2

358:         v += 16;
359:       }
360:       v = aa + 16 * ai[++i];
361:       PREFETCH_NTA(v);
362:       STORE_PS(&t[idx], XMM7);
363:     }

365:     /* Backward solve the upper triangular factor. */

367:     idt  = 4 * (n - 1);
368:     ai16 = 16 * diag[n - 1];
369:     v    = aa + ai16 + 16;
370:     for (i = n - 1; i >= 0;) {
371:       PREFETCH_NTA(&v[8]);
372:       vi = aj + diag[i] + 1;
373:       nz = ai[i + 1] - diag[i] - 1;

375:       LOAD_PS(&t[idt], XMM7);

377:       while (nz--) {
378:         PREFETCH_NTA(&v[16]);
379:         idx = 4 * ((unsigned int)(*vi++));

381:         /* 4x4 Matrix-Vector Product with negative accumulation: */
382:         SSE_INLINE_BEGIN_2(&t[idx], v)
383:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

385:         /* First Column */
386:         SSE_COPY_PS(XMM0, XMM6)
387:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
388:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
389:         SSE_SUB_PS(XMM7, XMM0)

391:         /* Second Column */
392:         SSE_COPY_PS(XMM1, XMM6)
393:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
394:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
395:         SSE_SUB_PS(XMM7, XMM1)

397:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

399:         /* Third Column */
400:         SSE_COPY_PS(XMM2, XMM6)
401:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
402:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
403:         SSE_SUB_PS(XMM7, XMM2)

405:         /* Fourth Column */
406:         SSE_COPY_PS(XMM3, XMM6)
407:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
408:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
409:         SSE_SUB_PS(XMM7, XMM3)
410:         SSE_INLINE_END_2
411:         v += 16;
412:       }
413:       v    = aa + ai16;
414:       ai16 = 16 * diag[--i];
415:       PREFETCH_NTA(aa + ai16 + 16);
416:       /*
417:          Scale the result by the diagonal 4x4 block,
418:          which was inverted as part of the factorization
419:       */
420:       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
421:       /* First Column */
422:       SSE_COPY_PS(XMM0, XMM7)
423:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
424:       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

426:       /* Second Column */
427:       SSE_COPY_PS(XMM1, XMM7)
428:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
429:       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
430:       SSE_ADD_PS(XMM0, XMM1)

432:       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

434:       /* Third Column */
435:       SSE_COPY_PS(XMM2, XMM7)
436:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
437:       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
438:       SSE_ADD_PS(XMM0, XMM2)

440:       /* Fourth Column */
441:       SSE_COPY_PS(XMM3, XMM7)
442:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
443:       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
444:       SSE_ADD_PS(XMM0, XMM3)

446:       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
447:       SSE_INLINE_END_3

449:       v = aa + ai16 + 16;
450:       idt -= 4;
451:     }

453:     /* Convert t from single precision back to double precision (in place) */
454:     idt = 4 * (n - 1);
455:     for (i = n - 1; i >= 0; i--) {
456:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
457:       /* Unfortunately, CONVERT_ counts up from 0 to 3, which would overwrite entries of t before they are read here. */
458:       PetscScalar *xtemp = &x[idt];
459:       MatScalar   *ttemp = &t[idt];
460:       xtemp[3]           = (PetscScalar)ttemp[3];
461:       xtemp[2]           = (PetscScalar)ttemp[2];
462:       xtemp[1]           = (PetscScalar)ttemp[1];
463:       xtemp[0]           = (PetscScalar)ttemp[0];
464:       idt -= 4;
465:     }

467:   } /* End of artificial scope. */
468:   VecRestoreArray(bb, &b);
469:   VecRestoreArray(xx, &x);
470:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
471:   SSE_SCOPE_END;
472:   return 0;
473: }
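
The SSE kernels express the same 4x4 block multiply-subtract through the SSE_* macros supplied by the PETSC_HAVE_SSE header: the accumulator for one block row lives in XMM7, each column of the column-major block is scaled by one broadcast element of the source vector, and the four products are subtracted. A rough equivalent written with standard SSE intrinsics is sketched below; it is an assumption-laden illustration (single-precision data, unaligned loads where the original macros use aligned ones), not the macros' actual expansion.

#include <stdio.h>
#include <xmmintrin.h>

/* acc -= v*x for one 4x4 column-major block of floats */
static __m128 block4_multsub_sse(__m128 acc, const float *v, const float *x)
{
  __m128 xv = _mm_loadu_ps(x);
  __m128 c0 = _mm_shuffle_ps(xv, xv, 0x00); /* broadcast x[0] */
  __m128 c1 = _mm_shuffle_ps(xv, xv, 0x55); /* broadcast x[1] */
  __m128 c2 = _mm_shuffle_ps(xv, xv, 0xAA); /* broadcast x[2] */
  __m128 c3 = _mm_shuffle_ps(xv, xv, 0xFF); /* broadcast x[3] */

  acc = _mm_sub_ps(acc, _mm_mul_ps(c0, _mm_loadu_ps(v + 0)));  /* first column  */
  acc = _mm_sub_ps(acc, _mm_mul_ps(c1, _mm_loadu_ps(v + 4)));  /* second column */
  acc = _mm_sub_ps(acc, _mm_mul_ps(c2, _mm_loadu_ps(v + 8)));  /* third column  */
  acc = _mm_sub_ps(acc, _mm_mul_ps(c3, _mm_loadu_ps(v + 12))); /* fourth column */
  return acc;
}

int main(void)
{
  float  v[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}; /* identity block */
  float  x[4] = {1, 2, 3, 4}, s[4] = {5, 5, 5, 5}, out[4];
  __m128 acc = _mm_loadu_ps(s);

  acc = block4_multsub_sse(acc, v, x); /* acc = s - v*x = {4, 3, 2, 1} */
  _mm_storeu_ps(out, acc);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}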

475: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
476: {
477:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
478:   int         *aj = a->j;
479:   int         *ai = a->i, n = a->mbs, *diag = a->diag;
480:   MatScalar   *aa = a->a;
481:   PetscScalar *x, *b;

483:   SSE_SCOPE_BEGIN;
484:   /*
485:      Note: This code currently uses demotion of double
486:      to float when performing the mixed-mode computation.
487:      This may not be numerically reasonable for all applications.
488:   */
489:   PREFETCH_NTA(aa + 16 * ai[1]);

491:   VecGetArray(bb, &b);
492:   VecGetArray(xx, &x);
493:   {
494:     /* x will first be computed in single precision then promoted in place to double */
495:     MatScalar *v, *t = (MatScalar *)x;
496:     int        nz, i, idt, ai16;
497:     int        jdx, idx;
498:     int       *vi;
499:     /* Forward solve the lower triangular factor. */

501:     /* First block is the identity. */
502:     idx = 0;
503:     CONVERT_DOUBLE4_FLOAT4(t, b);
504:     v = aa + 16 * ai[1];

506:     for (i = 1; i < n;) {
507:       PREFETCH_NTA(&v[8]);
508:       vi = aj + ai[i];
509:       nz = diag[i] - ai[i];
510:       idx += 4;

512:       /* Demote RHS from double to float. */
513:       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
514:       LOAD_PS(&t[idx], XMM7);

516:       while (nz--) {
517:         PREFETCH_NTA(&v[16]);
518:         jdx = 4 * (*vi++);
519:         /*          jdx = *vi++; */

521:         /* 4x4 Matrix-Vector product with negative accumulation: */
522:         SSE_INLINE_BEGIN_2(&t[jdx], v)
523:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

525:         /* First Column */
526:         SSE_COPY_PS(XMM0, XMM6)
527:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
528:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
529:         SSE_SUB_PS(XMM7, XMM0)

531:         /* Second Column */
532:         SSE_COPY_PS(XMM1, XMM6)
533:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
534:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
535:         SSE_SUB_PS(XMM7, XMM1)

537:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

539:         /* Third Column */
540:         SSE_COPY_PS(XMM2, XMM6)
541:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
542:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
543:         SSE_SUB_PS(XMM7, XMM2)

545:         /* Fourth Column */
546:         SSE_COPY_PS(XMM3, XMM6)
547:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
548:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
549:         SSE_SUB_PS(XMM7, XMM3)
550:         SSE_INLINE_END_2

552:         v += 16;
553:       }
554:       v = aa + 16 * ai[++i];
555:       PREFETCH_NTA(v);
556:       STORE_PS(&t[idx], XMM7);
557:     }

559:     /* Backward solve the upper triangular factor. */

561:     idt  = 4 * (n - 1);
562:     ai16 = 16 * diag[n - 1];
563:     v    = aa + ai16 + 16;
564:     for (i = n - 1; i >= 0;) {
565:       PREFETCH_NTA(&v[8]);
566:       vi = aj + diag[i] + 1;
567:       nz = ai[i + 1] - diag[i] - 1;

569:       LOAD_PS(&t[idt], XMM7);

571:       while (nz--) {
572:         PREFETCH_NTA(&v[16]);
573:         idx = 4 * (*vi++);
574:         /*          idx = *vi++; */

576:         /* 4x4 Matrix-Vector Product with negative accumulation: */
577:         SSE_INLINE_BEGIN_2(&t[idx], v)
578:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

580:         /* First Column */
581:         SSE_COPY_PS(XMM0, XMM6)
582:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
583:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
584:         SSE_SUB_PS(XMM7, XMM0)

586:         /* Second Column */
587:         SSE_COPY_PS(XMM1, XMM6)
588:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
589:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
590:         SSE_SUB_PS(XMM7, XMM1)

592:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

594:         /* Third Column */
595:         SSE_COPY_PS(XMM2, XMM6)
596:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
597:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
598:         SSE_SUB_PS(XMM7, XMM2)

600:         /* Fourth Column */
601:         SSE_COPY_PS(XMM3, XMM6)
602:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
603:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
604:         SSE_SUB_PS(XMM7, XMM3)
605:         SSE_INLINE_END_2
606:         v += 16;
607:       }
608:       v    = aa + ai16;
609:       ai16 = 16 * diag[--i];
610:       PREFETCH_NTA(aa + ai16 + 16);
611:       /*
612:          Scale the result by the diagonal 4x4 block,
613:          which was inverted as part of the factorization
614:       */
615:       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
616:       /* First Column */
617:       SSE_COPY_PS(XMM0, XMM7)
618:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
619:       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

621:       /* Second Column */
622:       SSE_COPY_PS(XMM1, XMM7)
623:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
624:       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
625:       SSE_ADD_PS(XMM0, XMM1)

627:       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

629:       /* Third Column */
630:       SSE_COPY_PS(XMM2, XMM7)
631:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
632:       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
633:       SSE_ADD_PS(XMM0, XMM2)

635:       /* Fourth Column */
636:       SSE_COPY_PS(XMM3, XMM7)
637:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
638:       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
639:       SSE_ADD_PS(XMM0, XMM3)

641:       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
642:       SSE_INLINE_END_3

644:       v = aa + ai16 + 16;
645:       idt -= 4;
646:     }

648:     /* Convert t from single precision back to double precision (in place) */
649:     idt = 4 * (n - 1);
650:     for (i = n - 1; i >= 0; i--) {
651:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
652:       /* Unfortunately, CONVERT_ counts up from 0 to 3, which would overwrite entries of t before they are read here. */
653:       PetscScalar *xtemp = &x[idt];
654:       MatScalar   *ttemp = &t[idt];
655:       xtemp[3]           = (PetscScalar)ttemp[3];
656:       xtemp[2]           = (PetscScalar)ttemp[2];
657:       xtemp[1]           = (PetscScalar)ttemp[1];
658:       xtemp[0]           = (PetscScalar)ttemp[0];
659:       idt -= 4;
660:     }

662:   } /* End of artificial scope. */
663:   VecRestoreArray(bb, &b);
664:   VecRestoreArray(xx, &x);
665:   PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n);
666:   SSE_SCOPE_END;
667:   return 0;
668: }

670: #endif
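
Since the demotion kernels trade accuracy for speed, as the comments above warn, it can be worth checking their output against a full double-precision solve. The helper below is a hedged sketch using standard PETSc Vec calls; report_demotion_error and the argument names are illustrative, with xd the reference (double-precision) solution and xs the mixed-precision one.

#include <petscvec.h>

static PetscErrorCode report_demotion_error(Vec xd, Vec xs)
{
  Vec       d;
  PetscReal err, ref;

  PetscFunctionBeginUser;
  PetscCall(VecDuplicate(xd, &d));
  PetscCall(VecCopy(xs, d));
  PetscCall(VecAXPY(d, -1.0, xd)); /* d = xs - xd */
  PetscCall(VecNorm(d, NORM_2, &err));
  PetscCall(VecNorm(xd, NORM_2, &ref));
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "relative difference %g\n", (double)(err / ref)));
  PetscCall(VecDestroy(&d));
  PetscFunctionReturn(PETSC_SUCCESS);
}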