Actual source code: baijfact11.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: /* ------------------------------------------------------------*/
  9: /*
 10:       Version for when blocks are 4 by 4
 11: */
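/*
      Layout note (inferred from the index arithmetic below rather than stated anywhere in this
      file): each 4x4 block is stored as 16 contiguous MatScalars in column-major order, so entry
      (r,c) of a block starting at p is p[4*c + r]; rtmp holds the current block row of the factor
      as dense 4x4 blocks indexed by block column.
*/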
 12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
 13: {
 14:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 15:   IS              isrow = b->row, isicol = b->icol;
 16:   const PetscInt *r, *ic;
 17:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
 18:   PetscInt       *ajtmpold, *ajtmp, nz, row;
 19:   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
 20:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
 21:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
 22:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
 23:   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
 24:   MatScalar       m13, m14, m15, m16;
 25:   MatScalar      *ba = b->a, *aa = a->a;
 26:   PetscBool       pivotinblocks = b->pivotinblocks;
 27:   PetscReal       shift         = info->shiftamount;
 28:   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

 30:   ISGetIndices(isrow, &r);
 31:   ISGetIndices(isicol, &ic);
 32:   PetscMalloc1(16 * (n + 1), &rtmp);
 33:   allowzeropivot = PetscNot(A->erroriffailure);

 35:   for (i = 0; i < n; i++) {
 36:     nz    = bi[i + 1] - bi[i];
 37:     ajtmp = bj + bi[i];
 38:     for (j = 0; j < nz; j++) {
 39:       x    = rtmp + 16 * ajtmp[j];
 40:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
 41:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 42:     }
 43:     /* load in initial (unfactored row) */
 44:     idx      = r[i];
 45:     nz       = ai[idx + 1] - ai[idx];
 46:     ajtmpold = aj + ai[idx];
 47:     v        = aa + 16 * ai[idx];
 48:     for (j = 0; j < nz; j++) {
 49:       x     = rtmp + 16 * ic[ajtmpold[j]];
 50:       x[0]  = v[0];
 51:       x[1]  = v[1];
 52:       x[2]  = v[2];
 53:       x[3]  = v[3];
 54:       x[4]  = v[4];
 55:       x[5]  = v[5];
 56:       x[6]  = v[6];
 57:       x[7]  = v[7];
 58:       x[8]  = v[8];
 59:       x[9]  = v[9];
 60:       x[10] = v[10];
 61:       x[11] = v[11];
 62:       x[12] = v[12];
 63:       x[13] = v[13];
 64:       x[14] = v[14];
 65:       x[15] = v[15];
 66:       v += 16;
 67:     }
 68:     row = *ajtmp++;
 69:     while (row < i) {
 70:       pc  = rtmp + 16 * row;
 71:       p1  = pc[0];
 72:       p2  = pc[1];
 73:       p3  = pc[2];
 74:       p4  = pc[3];
 75:       p5  = pc[4];
 76:       p6  = pc[5];
 77:       p7  = pc[6];
 78:       p8  = pc[7];
 79:       p9  = pc[8];
 80:       p10 = pc[9];
 81:       p11 = pc[10];
 82:       p12 = pc[11];
 83:       p13 = pc[12];
 84:       p14 = pc[13];
 85:       p15 = pc[14];
 86:       p16 = pc[15];
 87:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
 88:         pv    = ba + 16 * diag_offset[row];
 89:         pj    = bj + diag_offset[row] + 1;
 90:         x1    = pv[0];
 91:         x2    = pv[1];
 92:         x3    = pv[2];
 93:         x4    = pv[3];
 94:         x5    = pv[4];
 95:         x6    = pv[5];
 96:         x7    = pv[6];
 97:         x8    = pv[7];
 98:         x9    = pv[8];
 99:         x10   = pv[9];
100:         x11   = pv[10];
101:         x12   = pv[11];
102:         x13   = pv[12];
103:         x14   = pv[13];
104:         x15   = pv[14];
105:         x16   = pv[15];
106:         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
107:         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
108:         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
109:         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

111:         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
112:         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
113:         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
114:         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

116:         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
117:         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
118:         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
119:         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

121:         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
122:         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
123:         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
124:         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
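              /* pc now holds the multiplier block M = (L block of the working row) * inv(U(row,row));
                 the diagonal block pointed to by pv was already inverted when block row "row" was completed */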

126:         nz = bi[row + 1] - diag_offset[row] - 1;
127:         pv += 16;
128:         for (j = 0; j < nz; j++) {
129:           x1  = pv[0];
130:           x2  = pv[1];
131:           x3  = pv[2];
132:           x4  = pv[3];
133:           x5  = pv[4];
134:           x6  = pv[5];
135:           x7  = pv[6];
136:           x8  = pv[7];
137:           x9  = pv[8];
138:           x10 = pv[9];
139:           x11 = pv[10];
140:           x12 = pv[11];
141:           x13 = pv[12];
142:           x14 = pv[13];
143:           x15 = pv[14];
144:           x16 = pv[15];
145:           x   = rtmp + 16 * pj[j];
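                /* x := x - M * (U block of row "row" at column pj[j]): a dense 4x4, column-major multiply-subtract written out */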
146:           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
147:           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
148:           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
149:           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

151:           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
152:           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
153:           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
154:           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

156:           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
157:           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
158:           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
159:           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

161:           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
162:           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
163:           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
164:           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

166:           pv += 16;
167:         }
168:         PetscLogFlops(128.0 * nz + 112.0);
169:       }
170:       row = *ajtmp++;
171:     }
172:     /* finished row so stick it into b->a */
173:     pv = ba + 16 * bi[i];
174:     pj = bj + bi[i];
175:     nz = bi[i + 1] - bi[i];
176:     for (j = 0; j < nz; j++) {
177:       x      = rtmp + 16 * pj[j];
178:       pv[0]  = x[0];
179:       pv[1]  = x[1];
180:       pv[2]  = x[2];
181:       pv[3]  = x[3];
182:       pv[4]  = x[4];
183:       pv[5]  = x[5];
184:       pv[6]  = x[6];
185:       pv[7]  = x[7];
186:       pv[8]  = x[8];
187:       pv[9]  = x[9];
188:       pv[10] = x[10];
189:       pv[11] = x[11];
190:       pv[12] = x[12];
191:       pv[13] = x[13];
192:       pv[14] = x[14];
193:       pv[15] = x[15];
194:       pv += 16;
195:     }
196:     /* invert diagonal block */
197:     w = ba + 16 * diag_offset[i];
198:     if (pivotinblocks) {
199:       PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
200:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201:     } else {
202:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
203:     }
204:   }

206:   PetscFree(rtmp);
207:   ISRestoreIndices(isicol, &ic);
208:   ISRestoreIndices(isrow, &r);

210:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
211:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
212:   C->assembled           = PETSC_TRUE;

214:   PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs); /* from inverting diagonal blocks */
215:   return 0;
216: }

218: /* MatLUFactorNumeric_SeqBAIJ_4 -
 219:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), with manually re-implemented versions of
220:        PetscKernel_A_gets_A_times_B()
221:        PetscKernel_A_gets_A_minus_B_times_C()
222:        PetscKernel_A_gets_inverse_A()
223: */
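/*
   For orientation only: a plain-C sketch (deliberately compiled out, and written under the
   assumption of the column-major 4x4 block layout used in this file) of what the two block
   kernels named above compute.  The PetscKernel_* macros are the real implementation; the
   helpers below are illustrative and are not referenced anywhere.
*/
#if 0
static void Sketch_A_gets_A_times_B_4(MatScalar *A, const MatScalar *B, MatScalar *W)
{
  PetscInt r, c, k;
  /* W := A * B, then A := W  (all blocks 4x4, column-major; W is a 16-entry scratch block) */
  for (c = 0; c < 4; c++) {
    for (r = 0; r < 4; r++) {
      W[4 * c + r] = 0.0;
      for (k = 0; k < 4; k++) W[4 * c + r] += A[4 * k + r] * B[4 * c + k];
    }
  }
  for (k = 0; k < 16; k++) A[k] = W[k];
}

static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A, const MatScalar *B, const MatScalar *C)
{
  PetscInt r, c, k;
  /* A := A - B * C  (all blocks 4x4, column-major) */
  for (c = 0; c < 4; c++) {
    for (r = 0; r < 4; r++) {
      for (k = 0; k < 4; k++) A[4 * c + r] -= B[4 * k + r] * C[4 * c + k];
    }
  }
}
#endif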

225: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
226: {
227:   Mat             C = B;
228:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
229:   IS              isrow = b->row, isicol = b->icol;
230:   const PetscInt *r, *ic;
231:   PetscInt        i, j, k, nz, nzL, row;
232:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
233:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
234:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
235:   PetscInt        flg;
236:   PetscReal       shift;
237:   PetscBool       allowzeropivot, zeropivotdetected;

239:   allowzeropivot = PetscNot(A->erroriffailure);
240:   ISGetIndices(isrow, &r);
241:   ISGetIndices(isicol, &ic);

243:   if (info->shifttype == MAT_SHIFT_NONE) {
244:     shift = 0;
245:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
246:     shift = info->shiftamount;
247:   }
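  /* shift (nonzero only when MAT_SHIFT_INBLOCKS is requested) is forwarded to the 4x4 inversion
     kernel below so that a zero pivot met while inverting a diagonal block can be shifted rather
     than immediately failing */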

249:   /* generate work space needed by the factorization */
250:   PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
251:   PetscArrayzero(rtmp, bs2 * n);

253:   for (i = 0; i < n; i++) {
254:     /* zero rtmp */
255:     /* L part */
256:     nz    = bi[i + 1] - bi[i];
257:     bjtmp = bj + bi[i];
258:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

260:     /* U part */
261:     nz    = bdiag[i] - bdiag[i + 1];
262:     bjtmp = bj + bdiag[i + 1] + 1;
263:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
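      /* as the index arithmetic here and below implies, U is stored from the end of b->j/b->a:
         bdiag[i] locates the diagonal block of block row i, bdiag decreases with i, and U(i,:)
         excluding the diagonal occupies positions bdiag[i+1]+1 .. bdiag[i]-1 */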

265:     /* load in initial (unfactored row) */
266:     nz    = ai[r[i] + 1] - ai[r[i]];
267:     ajtmp = aj + ai[r[i]];
268:     v     = aa + bs2 * ai[r[i]];
269:     for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2);

271:     /* elimination */
272:     bjtmp = bj + bi[i];
273:     nzL   = bi[i + 1] - bi[i];
274:     for (k = 0; k < nzL; k++) {
275:       row = bjtmp[k];
276:       pc  = rtmp + bs2 * row;
277:       for (flg = 0, j = 0; j < bs2; j++) {
278:         if (pc[j] != 0.0) {
279:           flg = 1;
280:           break;
281:         }
282:       }
283:       if (flg) {
284:         pv = b->a + bs2 * bdiag[row];
285:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
286:         PetscKernel_A_gets_A_times_B_4(pc, pv, mwork);

288:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
289:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 290:         nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
291:         for (j = 0; j < nz; j++) {
292:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
293:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
294:           v = rtmp + bs2 * pj[j];
295:           PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv);
296:           pv += bs2;
297:         }
 298:         PetscLogFlops(128.0 * nz + 112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
299:       }
300:     }

302:     /* finished row so stick it into b->a */
303:     /* L part */
304:     pv = b->a + bs2 * bi[i];
305:     pj = b->j + bi[i];
306:     nz = bi[i + 1] - bi[i];
307:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);

309:     /* Mark diagonal and invert diagonal for simpler triangular solves */
310:     pv = b->a + bs2 * bdiag[i];
311:     pj = b->j + bdiag[i];
312:     PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
313:     PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected);
314:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
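      /* storing inv(U(i,i)) in place of U(i,i) lets MatSolve_SeqBAIJ_4() replace the small triangular
         solves with plain 4x4 block multiplications */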

316:     /* U part */
317:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
318:     pj = b->j + bdiag[i + 1] + 1;
319:     nz = bdiag[i] - bdiag[i + 1] - 1;
320:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
321:   }

323:   PetscFree2(rtmp, mwork);
324:   ISRestoreIndices(isicol, &ic);
325:   ISRestoreIndices(isrow, &r);

327:   C->ops->solve          = MatSolve_SeqBAIJ_4;
328:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
329:   C->assembled           = PETSC_TRUE;

331:   PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n); /* from inverting diagonal blocks */
332:   return 0;
333: }

335: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
336: {
  337:   /*
  338:     Default version for when blocks are 4 by 4, using the natural ordering
  339:   */
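  /* Same algorithm as MatLUFactorNumeric_SeqBAIJ_4_inplace() above, minus the r[]/ic[] permutation
     lookups: with the natural ordering, block row/column i of A is block row/column i of the factor */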
340:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
341:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
342:   PetscInt    *ajtmpold, *ajtmp, nz, row;
343:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
344:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
345:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
346:   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
347:   MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
348:   MatScalar    m13, m14, m15, m16;
349:   MatScalar   *ba = b->a, *aa = a->a;
350:   PetscBool    pivotinblocks = b->pivotinblocks;
351:   PetscReal    shift         = info->shiftamount;
352:   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

354:   allowzeropivot = PetscNot(A->erroriffailure);
355:   PetscMalloc1(16 * (n + 1), &rtmp);

357:   for (i = 0; i < n; i++) {
358:     nz    = bi[i + 1] - bi[i];
359:     ajtmp = bj + bi[i];
360:     for (j = 0; j < nz; j++) {
361:       x    = rtmp + 16 * ajtmp[j];
362:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
363:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
364:     }
365:     /* load in initial (unfactored row) */
366:     nz       = ai[i + 1] - ai[i];
367:     ajtmpold = aj + ai[i];
368:     v        = aa + 16 * ai[i];
369:     for (j = 0; j < nz; j++) {
370:       x     = rtmp + 16 * ajtmpold[j];
371:       x[0]  = v[0];
372:       x[1]  = v[1];
373:       x[2]  = v[2];
374:       x[3]  = v[3];
375:       x[4]  = v[4];
376:       x[5]  = v[5];
377:       x[6]  = v[6];
378:       x[7]  = v[7];
379:       x[8]  = v[8];
380:       x[9]  = v[9];
381:       x[10] = v[10];
382:       x[11] = v[11];
383:       x[12] = v[12];
384:       x[13] = v[13];
385:       x[14] = v[14];
386:       x[15] = v[15];
387:       v += 16;
388:     }
389:     row = *ajtmp++;
390:     while (row < i) {
391:       pc  = rtmp + 16 * row;
392:       p1  = pc[0];
393:       p2  = pc[1];
394:       p3  = pc[2];
395:       p4  = pc[3];
396:       p5  = pc[4];
397:       p6  = pc[5];
398:       p7  = pc[6];
399:       p8  = pc[7];
400:       p9  = pc[8];
401:       p10 = pc[9];
402:       p11 = pc[10];
403:       p12 = pc[11];
404:       p13 = pc[12];
405:       p14 = pc[13];
406:       p15 = pc[14];
407:       p16 = pc[15];
408:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
409:         pv    = ba + 16 * diag_offset[row];
410:         pj    = bj + diag_offset[row] + 1;
411:         x1    = pv[0];
412:         x2    = pv[1];
413:         x3    = pv[2];
414:         x4    = pv[3];
415:         x5    = pv[4];
416:         x6    = pv[5];
417:         x7    = pv[6];
418:         x8    = pv[7];
419:         x9    = pv[8];
420:         x10   = pv[9];
421:         x11   = pv[10];
422:         x12   = pv[11];
423:         x13   = pv[12];
424:         x14   = pv[13];
425:         x15   = pv[14];
426:         x16   = pv[15];
427:         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
428:         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
429:         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
430:         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

432:         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
433:         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
434:         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
435:         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

437:         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
438:         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
439:         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
440:         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

442:         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
443:         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
444:         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
445:         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
446:         nz           = bi[row + 1] - diag_offset[row] - 1;
447:         pv += 16;
448:         for (j = 0; j < nz; j++) {
449:           x1  = pv[0];
450:           x2  = pv[1];
451:           x3  = pv[2];
452:           x4  = pv[3];
453:           x5  = pv[4];
454:           x6  = pv[5];
455:           x7  = pv[6];
456:           x8  = pv[7];
457:           x9  = pv[8];
458:           x10 = pv[9];
459:           x11 = pv[10];
460:           x12 = pv[11];
461:           x13 = pv[12];
462:           x14 = pv[13];
463:           x15 = pv[14];
464:           x16 = pv[15];
465:           x   = rtmp + 16 * pj[j];
466:           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
467:           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
468:           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
469:           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

471:           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
472:           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
473:           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
474:           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

476:           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
477:           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
478:           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
479:           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

481:           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
482:           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
483:           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
484:           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

486:           pv += 16;
487:         }
488:         PetscLogFlops(128.0 * nz + 112.0);
489:       }
490:       row = *ajtmp++;
491:     }
492:     /* finished row so stick it into b->a */
493:     pv = ba + 16 * bi[i];
494:     pj = bj + bi[i];
495:     nz = bi[i + 1] - bi[i];
496:     for (j = 0; j < nz; j++) {
497:       x      = rtmp + 16 * pj[j];
498:       pv[0]  = x[0];
499:       pv[1]  = x[1];
500:       pv[2]  = x[2];
501:       pv[3]  = x[3];
502:       pv[4]  = x[4];
503:       pv[5]  = x[5];
504:       pv[6]  = x[6];
505:       pv[7]  = x[7];
506:       pv[8]  = x[8];
507:       pv[9]  = x[9];
508:       pv[10] = x[10];
509:       pv[11] = x[11];
510:       pv[12] = x[12];
511:       pv[13] = x[13];
512:       pv[14] = x[14];
513:       pv[15] = x[15];
514:       pv += 16;
515:     }
516:     /* invert diagonal block */
517:     w = ba + 16 * diag_offset[i];
518:     if (pivotinblocks) {
519:       PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
520:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
521:     } else {
522:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
523:     }
524:   }

526:   PetscFree(rtmp);

528:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
529:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
530:   C->assembled           = PETSC_TRUE;

532:   PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs); /* from inverting diagonal blocks */
533:   return 0;
534: }

536: /*
537:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
538:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
539: */
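/*
   This is MatLUFactorNumeric_SeqBAIJ_4() above specialized to the natural ordering: the IS-based
   row/column permutations drop out, so block row i of A maps directly to block row i of the factor.
*/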
540: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
541: {
542:   Mat             C = B;
543:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
544:   PetscInt        i, j, k, nz, nzL, row;
545:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
546:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
547:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
548:   PetscInt        flg;
549:   PetscReal       shift;
550:   PetscBool       allowzeropivot, zeropivotdetected;

552:   allowzeropivot = PetscNot(A->erroriffailure);

554:   /* generate work space needed by the factorization */
555:   PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
556:   PetscArrayzero(rtmp, bs2 * n);

558:   if (info->shifttype == MAT_SHIFT_NONE) {
559:     shift = 0;
560:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
561:     shift = info->shiftamount;
562:   }

564:   for (i = 0; i < n; i++) {
565:     /* zero rtmp */
566:     /* L part */
567:     nz    = bi[i + 1] - bi[i];
568:     bjtmp = bj + bi[i];
569:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

571:     /* U part */
572:     nz    = bdiag[i] - bdiag[i + 1];
573:     bjtmp = bj + bdiag[i + 1] + 1;
574:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

576:     /* load in initial (unfactored row) */
577:     nz    = ai[i + 1] - ai[i];
578:     ajtmp = aj + ai[i];
579:     v     = aa + bs2 * ai[i];
580:     for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2);

582:     /* elimination */
583:     bjtmp = bj + bi[i];
584:     nzL   = bi[i + 1] - bi[i];
585:     for (k = 0; k < nzL; k++) {
586:       row = bjtmp[k];
587:       pc  = rtmp + bs2 * row;
588:       for (flg = 0, j = 0; j < bs2; j++) {
589:         if (pc[j] != 0.0) {
590:           flg = 1;
591:           break;
592:         }
593:       }
594:       if (flg) {
595:         pv = b->a + bs2 * bdiag[row];
596:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
597:         PetscKernel_A_gets_A_times_B_4(pc, pv, mwork);

599:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
600:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 601:         nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
602:         for (j = 0; j < nz; j++) {
603:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
604:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
605:           v = rtmp + bs2 * pj[j];
606:           PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv);
607:           pv += bs2;
608:         }
 609:         PetscLogFlops(128.0 * nz + 112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
610:       }
611:     }

613:     /* finished row so stick it into b->a */
614:     /* L part */
615:     pv = b->a + bs2 * bi[i];
616:     pj = b->j + bi[i];
617:     nz = bi[i + 1] - bi[i];
618:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);

620:     /* Mark diagonal and invert diagonal for simpler triangular solves */
621:     pv = b->a + bs2 * bdiag[i];
622:     pj = b->j + bdiag[i];
623:     PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
624:     PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected);
625:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

627:     /* U part */
628:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
629:     pj = b->j + bdiag[i + 1] + 1;
630:     nz = bdiag[i] - bdiag[i + 1] - 1;
631:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
632:   }
633:   PetscFree2(rtmp, mwork);

635:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
636:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
637:   C->assembled           = PETSC_TRUE;

639:   PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n); /* from inverting diagonal blocks */
640:   return 0;
641: }

643: #if defined(PETSC_HAVE_SSE)

645:   #include PETSC_HAVE_SSE

 647: /* SSE version for when blocks are 4 by 4, using the natural ordering */
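/* These kernels use packed single-precision SSE (the *_PS/*_SS macros move 4 floats per XMM
   register), so this path presumably assumes MatScalar is a 4-byte real; it is compiled only
   when PETSC_HAVE_SSE is defined. */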
648: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info)
649: {
 650:   Mat          C = B;
 650:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
651:   int          i, j, n = a->mbs;
652:   int         *bj = b->j, *bjtmp, *pj;
653:   int          row;
654:   int         *ajtmpold, nz, *bi = b->i;
655:   int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
656:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
657:   MatScalar   *ba = b->a, *aa = a->a;
658:   int          nonzero       = 0;
659:   PetscBool    pivotinblocks = b->pivotinblocks;
660:   PetscReal    shift         = info->shiftamount;
661:   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

663:   allowzeropivot = PetscNot(A->erroriffailure);
664:   SSE_SCOPE_BEGIN;

668:   PetscMalloc1(16 * (n + 1), &rtmp);
670:   /*    if ((unsigned long)bj==(unsigned long)aj) { */
671:   /*      colscale = 4; */
672:   /*    } */
673:   for (i = 0; i < n; i++) {
674:     nz    = bi[i + 1] - bi[i];
675:     bjtmp = bj + bi[i];
676:     /* zero out the 4x4 block accumulators */
677:     /* zero out one register */
678:     XOR_PS(XMM7, XMM7);
679:     for (j = 0; j < nz; j++) {
680:       x = rtmp + 16 * bjtmp[j];
681:       /*        x = rtmp+4*bjtmp[j]; */
682:       SSE_INLINE_BEGIN_1(x)
683:       /* Copy zero register to memory locations */
684:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
685:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
686:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
687:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
688:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
689:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
690:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
691:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
692:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
693:       SSE_INLINE_END_1;
694:     }
695:     /* load in initial (unfactored row) */
696:     nz       = ai[i + 1] - ai[i];
697:     ajtmpold = aj + ai[i];
698:     v        = aa + 16 * ai[i];
699:     for (j = 0; j < nz; j++) {
700:       x = rtmp + 16 * ajtmpold[j];
701:       /*        x = rtmp+colscale*ajtmpold[j]; */
702:       /* Copy v block into x block */
703:       SSE_INLINE_BEGIN_2(v, x)
704:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
705:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
706:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

708:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
709:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

711:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
712:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

714:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
715:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

717:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
718:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

720:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
721:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

723:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
724:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

726:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
727:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
728:       SSE_INLINE_END_2;

730:       v += 16;
731:     }
732:     /*      row = (*bjtmp++)/4; */
733:     row = *bjtmp++;
734:     while (row < i) {
735:       pc = rtmp + 16 * row;
736:       SSE_INLINE_BEGIN_1(pc)
737:       /* Load block from lower triangle */
738:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
739:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
740:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

742:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
743:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

745:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
746:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

748:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
749:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

751:       /* Compare block to zero block */

753:       SSE_COPY_PS(XMM4, XMM7)
754:       SSE_CMPNEQ_PS(XMM4, XMM0)

756:       SSE_COPY_PS(XMM5, XMM7)
757:       SSE_CMPNEQ_PS(XMM5, XMM1)

759:       SSE_COPY_PS(XMM6, XMM7)
760:       SSE_CMPNEQ_PS(XMM6, XMM2)

762:       SSE_CMPNEQ_PS(XMM7, XMM3)

764:       /* Reduce the comparisons to one SSE register */
765:       SSE_OR_PS(XMM6, XMM7)
766:       SSE_OR_PS(XMM5, XMM4)
767:       SSE_OR_PS(XMM5, XMM6)
768:       SSE_INLINE_END_1;

770:       /* Reduce the one SSE register to an integer register for branching */
771:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
772:       MOVEMASK(nonzero, XMM5);

774:       /* If block is nonzero ... */
775:       if (nonzero) {
776:         pv = ba + 16 * diag_offset[row];
777:         PREFETCH_L1(&pv[16]);
778:         pj = bj + diag_offset[row] + 1;

780:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
781:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
782:         /* but the diagonal was inverted already */
783:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

785:         SSE_INLINE_BEGIN_2(pv, pc)
786:         /* Column 0, product is accumulated in XMM4 */
787:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
788:         SSE_SHUFFLE(XMM4, XMM4, 0x00)
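        /* LOAD_SS puts pv[0] in lane 0 and SHUFFLE with 0x00 broadcasts it to all four lanes,
           so the MULT_PS that follows scales the whole block column held in XMM0 by that scalar */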
789:         SSE_MULT_PS(XMM4, XMM0)

791:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
792:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
793:         SSE_MULT_PS(XMM5, XMM1)
794:         SSE_ADD_PS(XMM4, XMM5)

796:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
797:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
798:         SSE_MULT_PS(XMM6, XMM2)
799:         SSE_ADD_PS(XMM4, XMM6)

801:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
802:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
803:         SSE_MULT_PS(XMM7, XMM3)
804:         SSE_ADD_PS(XMM4, XMM7)

806:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
807:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

809:         /* Column 1, product is accumulated in XMM5 */
810:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
811:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
812:         SSE_MULT_PS(XMM5, XMM0)

814:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
815:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
816:         SSE_MULT_PS(XMM6, XMM1)
817:         SSE_ADD_PS(XMM5, XMM6)

819:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
820:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
821:         SSE_MULT_PS(XMM7, XMM2)
822:         SSE_ADD_PS(XMM5, XMM7)

824:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
825:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
826:         SSE_MULT_PS(XMM6, XMM3)
827:         SSE_ADD_PS(XMM5, XMM6)

829:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
830:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

832:         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

834:         /* Column 2, product is accumulated in XMM6 */
835:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
836:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
837:         SSE_MULT_PS(XMM6, XMM0)

839:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
840:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
841:         SSE_MULT_PS(XMM7, XMM1)
842:         SSE_ADD_PS(XMM6, XMM7)

844:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
845:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
846:         SSE_MULT_PS(XMM7, XMM2)
847:         SSE_ADD_PS(XMM6, XMM7)

849:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
850:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
851:         SSE_MULT_PS(XMM7, XMM3)
852:         SSE_ADD_PS(XMM6, XMM7)

854:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
855:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

857:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
858:         /* Column 3, product is accumulated in XMM0 */
859:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
860:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
861:         SSE_MULT_PS(XMM0, XMM7)

863:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
864:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
865:         SSE_MULT_PS(XMM1, XMM7)
866:         SSE_ADD_PS(XMM0, XMM1)

868:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
869:         SSE_SHUFFLE(XMM1, XMM1, 0x00)
870:         SSE_MULT_PS(XMM1, XMM2)
871:         SSE_ADD_PS(XMM0, XMM1)

873:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
874:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
875:         SSE_MULT_PS(XMM3, XMM7)
876:         SSE_ADD_PS(XMM0, XMM3)

878:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
879:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

881:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
882:         /* This is code to be maintained and read by humans after all. */
883:         /* Copy Multiplier Col 3 into XMM3 */
884:         SSE_COPY_PS(XMM3, XMM0)
885:         /* Copy Multiplier Col 2 into XMM2 */
886:         SSE_COPY_PS(XMM2, XMM6)
887:         /* Copy Multiplier Col 1 into XMM1 */
888:         SSE_COPY_PS(XMM1, XMM5)
889:         /* Copy Multiplier Col 0 into XMM0 */
890:         SSE_COPY_PS(XMM0, XMM4)
891:         SSE_INLINE_END_2;

893:         /* Update the row: */
894:         nz = bi[row + 1] - diag_offset[row] - 1;
895:         pv += 16;
896:         for (j = 0; j < nz; j++) {
897:           PREFETCH_L1(&pv[16]);
898:           x = rtmp + 16 * pj[j];
899:           /*            x = rtmp + 4*pj[j]; */

901:           /* X:=X-M*PV, One column at a time */
902:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
903:           SSE_INLINE_BEGIN_2(x, pv)
904:           /* Load First Column of X*/
905:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
906:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

908:           /* Matrix-Vector Product: */
909:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
910:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
911:           SSE_MULT_PS(XMM5, XMM0)
912:           SSE_SUB_PS(XMM4, XMM5)

914:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
915:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
916:           SSE_MULT_PS(XMM6, XMM1)
917:           SSE_SUB_PS(XMM4, XMM6)

919:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
920:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
921:           SSE_MULT_PS(XMM7, XMM2)
922:           SSE_SUB_PS(XMM4, XMM7)

924:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
925:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
926:           SSE_MULT_PS(XMM5, XMM3)
927:           SSE_SUB_PS(XMM4, XMM5)

929:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
930:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

932:           /* Second Column */
933:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
934:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

936:           /* Matrix-Vector Product: */
937:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
938:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
939:           SSE_MULT_PS(XMM6, XMM0)
940:           SSE_SUB_PS(XMM5, XMM6)

942:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
943:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
944:           SSE_MULT_PS(XMM7, XMM1)
945:           SSE_SUB_PS(XMM5, XMM7)

947:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
948:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
949:           SSE_MULT_PS(XMM4, XMM2)
950:           SSE_SUB_PS(XMM5, XMM4)

952:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
953:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
954:           SSE_MULT_PS(XMM6, XMM3)
955:           SSE_SUB_PS(XMM5, XMM6)

957:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
958:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

960:           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

962:           /* Third Column */
963:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
964:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

966:           /* Matrix-Vector Product: */
967:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
968:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
969:           SSE_MULT_PS(XMM7, XMM0)
970:           SSE_SUB_PS(XMM6, XMM7)

972:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
973:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
974:           SSE_MULT_PS(XMM4, XMM1)
975:           SSE_SUB_PS(XMM6, XMM4)

977:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
978:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
979:           SSE_MULT_PS(XMM5, XMM2)
980:           SSE_SUB_PS(XMM6, XMM5)

982:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
983:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
984:           SSE_MULT_PS(XMM7, XMM3)
985:           SSE_SUB_PS(XMM6, XMM7)

987:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
988:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

990:           /* Fourth Column */
991:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
992:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

994:           /* Matrix-Vector Product: */
995:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
996:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
997:           SSE_MULT_PS(XMM5, XMM0)
998:           SSE_SUB_PS(XMM4, XMM5)

1000:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1001:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1002:           SSE_MULT_PS(XMM6, XMM1)
1003:           SSE_SUB_PS(XMM4, XMM6)

1005:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1006:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1007:           SSE_MULT_PS(XMM7, XMM2)
1008:           SSE_SUB_PS(XMM4, XMM7)

1010:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1011:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1012:           SSE_MULT_PS(XMM5, XMM3)
1013:           SSE_SUB_PS(XMM4, XMM5)

1015:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1016:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1017:           SSE_INLINE_END_2;
1018:           pv += 16;
1019:         }
1020:         PetscLogFlops(128.0 * nz + 112.0);
1021:       }
1022:       row = *bjtmp++;
1023:       /*        row = (*bjtmp++)/4; */
1024:     }
1025:     /* finished row so stick it into b->a */
1026:     pv = ba + 16 * bi[i];
1027:     pj = bj + bi[i];
1028:     nz = bi[i + 1] - bi[i];

1030:     /* Copy x block back into pv block */
1031:     for (j = 0; j < nz; j++) {
1032:       x = rtmp + 16 * pj[j];
1033:       /*        x  = rtmp+4*pj[j]; */

1035:       SSE_INLINE_BEGIN_2(x, pv)
1036:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1037:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1038:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

1040:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1041:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

1043:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1044:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

1046:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1047:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

1049:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1050:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

1052:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1053:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1055:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1056:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

1058:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1059:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1060:       SSE_INLINE_END_2;
1061:       pv += 16;
1062:     }
1063:     /* invert diagonal block */
1064:     w = ba + 16 * diag_offset[i];
1065:     if (pivotinblocks) {
1066:       PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
1067:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1068:     } else {
1069:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1070:     }
1071:     /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1072:     /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
1073:   }

1075:   PetscFree(rtmp);

1077:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1078:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1079:   C->assembled           = PETSC_TRUE;

1081:   PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs);
1082:   /* Flop Count from inverting diagonal blocks */
1083:   SSE_SCOPE_END;
1084:   return 0;
1085: }
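/*
   The _usj variants below index the factor with unsigned short column indices (b->j cast to
   unsigned short *), which presumably halves the index storage and bandwidth when the number of
   block rows fits in 16 bits; otherwise they follow the same SSE kernels as above.
*/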

1087: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
1088: {
1089:   Mat             A = C;
1090:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1091:   int             i, j, n = a->mbs;
1092:   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1093:   unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
1094:   unsigned int    row;
1095:   int             nz, *bi = b->i;
1096:   int            *diag_offset = b->diag, *ai = a->i;
1097:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1098:   MatScalar      *ba = b->a, *aa = a->a;
1099:   int             nonzero       = 0;
1100:   PetscBool       pivotinblocks = b->pivotinblocks;
1101:   PetscReal       shift         = info->shiftamount;
1102:   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

1104:   allowzeropivot = PetscNot(A->erroriffailure);
1105:   SSE_SCOPE_BEGIN;

1109:   PetscMalloc1(16 * (n + 1), &rtmp);
1111:   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1112:   /*      colscale = 4; */
1113:   /*    } */

1115:   for (i = 0; i < n; i++) {
1116:     nz    = bi[i + 1] - bi[i];
1117:     bjtmp = bj + bi[i];
1118:     /* zero out the 4x4 block accumulators */
1119:     /* zero out one register */
1120:     XOR_PS(XMM7, XMM7);
1121:     for (j = 0; j < nz; j++) {
1122:       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1123:       /*        x = rtmp+4*bjtmp[j]; */
1124:       SSE_INLINE_BEGIN_1(x)
1125:       /* Copy zero register to memory locations */
1126:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1127:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1128:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1129:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1130:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1131:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1132:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1133:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1134:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1135:       SSE_INLINE_END_1;
1136:     }
1137:     /* load in initial (unfactored row) */
1138:     nz    = ai[i + 1] - ai[i];
1139:     ajtmp = aj + ai[i];
1140:     v     = aa + 16 * ai[i];
1141:     for (j = 0; j < nz; j++) {
1142:       x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1143:       /*        x = rtmp+colscale*ajtmp[j]; */
1144:       /* Copy v block into x block */
1145:       SSE_INLINE_BEGIN_2(v, x)
1146:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1147:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1148:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

1150:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1151:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

1153:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1154:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

1156:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1157:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

1159:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1160:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

1162:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1163:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

1165:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1166:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

1168:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1169:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1170:       SSE_INLINE_END_2;

1172:       v += 16;
1173:     }
1174:     /*      row = (*bjtmp++)/4; */
1175:     row = (unsigned int)(*bjtmp++);
1176:     while (row < i) {
1177:       pc = rtmp + 16 * row;
1178:       SSE_INLINE_BEGIN_1(pc)
1179:       /* Load block from lower triangle */
1180:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1181:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1182:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

1184:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1185:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

1187:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1188:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

1190:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1191:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

1193:       /* Compare block to zero block */

1195:       SSE_COPY_PS(XMM4, XMM7)
1196:       SSE_CMPNEQ_PS(XMM4, XMM0)

1198:       SSE_COPY_PS(XMM5, XMM7)
1199:       SSE_CMPNEQ_PS(XMM5, XMM1)

1201:       SSE_COPY_PS(XMM6, XMM7)
1202:       SSE_CMPNEQ_PS(XMM6, XMM2)

1204:       SSE_CMPNEQ_PS(XMM7, XMM3)

1206:       /* Reduce the comparisons to one SSE register */
1207:       SSE_OR_PS(XMM6, XMM7)
1208:       SSE_OR_PS(XMM5, XMM4)
1209:       SSE_OR_PS(XMM5, XMM6)
1210:       SSE_INLINE_END_1;

1212:       /* Reduce the one SSE register to an integer register for branching */
1213:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1214:       MOVEMASK(nonzero, XMM5);

1216:       /* If block is nonzero ... */
1217:       if (nonzero) {
1218:         pv = ba + 16 * diag_offset[row];
1219:         PREFETCH_L1(&pv[16]);
1220:         pj = bj + diag_offset[row] + 1;

1222:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1223:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1224:         /* but the diagonal was inverted already */
1225:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1227:         SSE_INLINE_BEGIN_2(pv, pc)
1228:         /* Column 0, product is accumulated in XMM4 */
1229:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1230:         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1231:         SSE_MULT_PS(XMM4, XMM0)

1233:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1234:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1235:         SSE_MULT_PS(XMM5, XMM1)
1236:         SSE_ADD_PS(XMM4, XMM5)

1238:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1239:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1240:         SSE_MULT_PS(XMM6, XMM2)
1241:         SSE_ADD_PS(XMM4, XMM6)

1243:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1244:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1245:         SSE_MULT_PS(XMM7, XMM3)
1246:         SSE_ADD_PS(XMM4, XMM7)

1248:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1249:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

1251:         /* Column 1, product is accumulated in XMM5 */
1252:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1253:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1254:         SSE_MULT_PS(XMM5, XMM0)

1256:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1257:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1258:         SSE_MULT_PS(XMM6, XMM1)
1259:         SSE_ADD_PS(XMM5, XMM6)

1261:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1262:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1263:         SSE_MULT_PS(XMM7, XMM2)
1264:         SSE_ADD_PS(XMM5, XMM7)

1266:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1267:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1268:         SSE_MULT_PS(XMM6, XMM3)
1269:         SSE_ADD_PS(XMM5, XMM6)

1271:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1272:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

1274:         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

1276:         /* Column 2, product is accumulated in XMM6 */
1277:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1278:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1279:         SSE_MULT_PS(XMM6, XMM0)

1281:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1282:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1283:         SSE_MULT_PS(XMM7, XMM1)
1284:         SSE_ADD_PS(XMM6, XMM7)

1286:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1287:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288:         SSE_MULT_PS(XMM7, XMM2)
1289:         SSE_ADD_PS(XMM6, XMM7)

1291:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1292:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293:         SSE_MULT_PS(XMM7, XMM3)
1294:         SSE_ADD_PS(XMM6, XMM7)

1296:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1297:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1299:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1300:         /* Column 3, product is accumulated in XMM0 */
1301:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1302:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1303:         SSE_MULT_PS(XMM0, XMM7)

1305:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1306:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1307:         SSE_MULT_PS(XMM1, XMM7)
1308:         SSE_ADD_PS(XMM0, XMM1)

1310:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1311:         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1312:         SSE_MULT_PS(XMM1, XMM2)
1313:         SSE_ADD_PS(XMM0, XMM1)

1315:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1316:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1317:         SSE_MULT_PS(XMM3, XMM7)
1318:         SSE_ADD_PS(XMM0, XMM3)

1320:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1321:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

1323:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1324:         /* This is code to be maintained and read by humans after all. */
1325:         /* Copy Multiplier Col 3 into XMM3 */
1326:         SSE_COPY_PS(XMM3, XMM0)
1327:         /* Copy Multiplier Col 2 into XMM2 */
1328:         SSE_COPY_PS(XMM2, XMM6)
1329:         /* Copy Multiplier Col 1 into XMM1 */
1330:         SSE_COPY_PS(XMM1, XMM5)
1331:         /* Copy Multiplier Col 0 into XMM0 */
1332:         SSE_COPY_PS(XMM0, XMM4)
1333:         SSE_INLINE_END_2;

1335:         /* Update the row: */
1336:         nz = bi[row + 1] - diag_offset[row] - 1;
1337:         pv += 16;
1338:         for (j = 0; j < nz; j++) {
1339:           PREFETCH_L1(&pv[16]);
1340:           x = rtmp + 16 * ((unsigned int)pj[j]);
1341:           /*            x = rtmp + 4*pj[j]; */

1343:           /* X:=X-M*PV, One column at a time */
1344:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1345:           SSE_INLINE_BEGIN_2(x, pv)
1346:           /* Load First Column of X*/
1347:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1348:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1350:           /* Matrix-Vector Product: */
1351:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1352:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1353:           SSE_MULT_PS(XMM5, XMM0)
1354:           SSE_SUB_PS(XMM4, XMM5)

1356:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1357:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1358:           SSE_MULT_PS(XMM6, XMM1)
1359:           SSE_SUB_PS(XMM4, XMM6)

1361:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1362:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1363:           SSE_MULT_PS(XMM7, XMM2)
1364:           SSE_SUB_PS(XMM4, XMM7)

1366:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1367:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1368:           SSE_MULT_PS(XMM5, XMM3)
1369:           SSE_SUB_PS(XMM4, XMM5)

1371:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1372:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1374:           /* Second Column */
1375:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1376:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1378:           /* Matrix-Vector Product: */
1379:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1380:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1381:           SSE_MULT_PS(XMM6, XMM0)
1382:           SSE_SUB_PS(XMM5, XMM6)

1384:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1385:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1386:           SSE_MULT_PS(XMM7, XMM1)
1387:           SSE_SUB_PS(XMM5, XMM7)

1389:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1390:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1391:           SSE_MULT_PS(XMM4, XMM2)
1392:           SSE_SUB_PS(XMM5, XMM4)

1394:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1395:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1396:           SSE_MULT_PS(XMM6, XMM3)
1397:           SSE_SUB_PS(XMM5, XMM6)

1399:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1400:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1402:           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

1404:           /* Third Column */
1405:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1406:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1408:           /* Matrix-Vector Product: */
1409:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1410:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1411:           SSE_MULT_PS(XMM7, XMM0)
1412:           SSE_SUB_PS(XMM6, XMM7)

1414:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1415:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1416:           SSE_MULT_PS(XMM4, XMM1)
1417:           SSE_SUB_PS(XMM6, XMM4)

1419:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1420:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1421:           SSE_MULT_PS(XMM5, XMM2)
1422:           SSE_SUB_PS(XMM6, XMM5)

1424:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1425:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1426:           SSE_MULT_PS(XMM7, XMM3)
1427:           SSE_SUB_PS(XMM6, XMM7)

1429:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1430:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1432:           /* Fourth Column */
1433:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1434:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

1436:           /* Matrix-Vector Product: */
1437:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1438:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1439:           SSE_MULT_PS(XMM5, XMM0)
1440:           SSE_SUB_PS(XMM4, XMM5)

1442:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1443:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1444:           SSE_MULT_PS(XMM6, XMM1)
1445:           SSE_SUB_PS(XMM4, XMM6)

1447:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1448:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1449:           SSE_MULT_PS(XMM7, XMM2)
1450:           SSE_SUB_PS(XMM4, XMM7)

1452:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1453:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1454:           SSE_MULT_PS(XMM5, XMM3)
1455:           SSE_SUB_PS(XMM4, XMM5)

1457:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1458:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1459:           SSE_INLINE_END_2;
1460:           pv += 16;
1461:         }
1462:         PetscLogFlops(128.0 * nz + 112.0);
1463:       }
1464:       row = (unsigned int)(*bjtmp++);
1465:       /*        row = (*bjtmp++)/4; */
1466:       /*        bjtmp++; */
1467:     }
1468:     /* finished row so stick it into b->a */
1469:     pv = ba + 16 * bi[i];
1470:     pj = bj + bi[i];
1471:     nz = bi[i + 1] - bi[i];

1473:     /* Copy x block back into pv block */
1474:     for (j = 0; j < nz; j++) {
1475:       x = rtmp + 16 * ((unsigned int)pj[j]);
1476:       /*        x  = rtmp+4*pj[j]; */

1478:       SSE_INLINE_BEGIN_2(x, pv)
1479:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1480:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1481:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

1483:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1484:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

1486:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1487:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

1489:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1490:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

1492:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1493:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

1495:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1496:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1498:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1499:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

1501:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1502:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1503:       SSE_INLINE_END_2;
1504:       pv += 16;
1505:     }
1506:     /* invert diagonal block */
1507:     w = ba + 16 * diag_offset[i];
1508:     if (pivotinblocks) {
1509:       PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
1510:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1511:     } else {
1512:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1513:     }
1514:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1515:   }

1517:   PetscFree(rtmp);

1519:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1520:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1521:   C->assembled           = PETSC_TRUE;

1523:   PetscLogFlops(1.333333333333 * 4 * 16 * b->mbs); /* bs = 4, bs2 = 16 for this 4x4 kernel */
1524:   /* Flop Count from inverting diagonal blocks */
1525:   SSE_SCOPE_END;
1526:   return 0;
1527: }
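
/* Editor's note: the "Matrix-Vector Product" SSE sequences in the kernel above implement,
   block by block, the update X := X - M * PV on 4x4 blocks in which four consecutive
   scalars form one block column (entry (r,c) at index r + 4*c). The plain-C sketch below
   is illustrative only and is deliberately left inside an #if 0 guard: it is not part of
   the original kernel, and the helper name is invented for exposition. */
#if 0
static void BlockUpdate4x4_Sketch(MatScalar x[16], const MatScalar m[16], const MatScalar pv[16])
{
  PetscInt r, c, k;

  /* For each column c of X: X(:,c) -= M * PV(:,c), matching the per-column SSE code */
  for (c = 0; c < 4; c++) {
    for (r = 0; r < 4; r++) {
      MatScalar s = 0.0;
      for (k = 0; k < 4; k++) s += m[r + 4 * k] * pv[k + 4 * c];
      x[r + 4 * c] -= s;
    }
  }
}
#endif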

1529: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1530: {
1531:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1532:   int             i, j, n = a->mbs;
1533:   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1534:   unsigned int    row;
1535:   int            *ajtmpold, nz, *bi = b->i;
1536:   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1537:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1538:   MatScalar      *ba = b->a, *aa = a->a;
1539:   int             nonzero       = 0;
1540:   PetscBool       pivotinblocks = b->pivotinblocks;
1541:   PetscReal       shift         = info->shiftamount;
1542:   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

1544:   allowzeropivot = PetscNot(A->erroriffailure);
1545:   SSE_SCOPE_BEGIN;

1549:   PetscMalloc1(16 * (n + 1), &rtmp);
1551:   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1552:   /*      colscale = 4; */
1553:   /*    } */
1554:   if ((unsigned long)bj == (unsigned long)aj) return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));

1556:   for (i = 0; i < n; i++) {
1557:     nz    = bi[i + 1] - bi[i];
1558:     bjtmp = bj + bi[i];
1559:     /* zero out the 4x4 block accumulators */
1560:     /* zero out one register */
1561:     XOR_PS(XMM7, XMM7);
1562:     for (j = 0; j < nz; j++) {
1563:       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1564:       /*        x = rtmp+4*bjtmp[j]; */
1565:       SSE_INLINE_BEGIN_1(x)
1566:       /* Copy zero register to memory locations */
1567:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1568:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1569:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1570:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1571:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1572:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1573:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1574:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1575:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1576:       SSE_INLINE_END_1;
1577:     }
1578:     /* load in initial (unfactored row) */
1579:     nz       = ai[i + 1] - ai[i];
1580:     ajtmpold = aj + ai[i];
1581:     v        = aa + 16 * ai[i];
1582:     for (j = 0; j < nz; j++) {
1583:       x = rtmp + 16 * ajtmpold[j];
1584:       /*        x = rtmp+colscale*ajtmpold[j]; */
1585:       /* Copy v block into x block */
1586:       SSE_INLINE_BEGIN_2(v, x)
1587:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1588:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1589:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

1591:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1592:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

1594:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1595:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

1597:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1598:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

1600:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1601:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

1603:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1604:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

1606:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1607:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

1609:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1610:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1611:       SSE_INLINE_END_2;

1613:       v += 16;
1614:     }
1615:     /*      row = (*bjtmp++)/4; */
1616:     row = (unsigned int)(*bjtmp++);
1617:     while (row < i) {
1618:       pc = rtmp + 16 * row;
1619:       SSE_INLINE_BEGIN_1(pc)
1620:       /* Load block from lower triangle */
1621:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1622:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1623:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

1625:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1626:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

1628:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1629:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

1631:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1632:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

1634:       /* Compare block to zero block */

1636:       SSE_COPY_PS(XMM4, XMM7)
1637:       SSE_CMPNEQ_PS(XMM4, XMM0)

1639:       SSE_COPY_PS(XMM5, XMM7)
1640:       SSE_CMPNEQ_PS(XMM5, XMM1)

1642:       SSE_COPY_PS(XMM6, XMM7)
1643:       SSE_CMPNEQ_PS(XMM6, XMM2)

1645:       SSE_CMPNEQ_PS(XMM7, XMM3)

1647:       /* Reduce the comparisons to one SSE register */
1648:       SSE_OR_PS(XMM6, XMM7)
1649:       SSE_OR_PS(XMM5, XMM4)
1650:       SSE_OR_PS(XMM5, XMM6)
1651:       SSE_INLINE_END_1;

1653:       /* Reduce the one SSE register to an integer register for branching */
1654:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1655:       MOVEMASK(nonzero, XMM5);

1657:       /* If block is nonzero ... */
1658:       if (nonzero) {
1659:         pv = ba + 16 * diag_offset[row];
1660:         PREFETCH_L1(&pv[16]);
1661:         pj = bj + diag_offset[row] + 1;

1663:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1664:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1665:         /* but the diagonal was inverted already */
1666:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1668:         SSE_INLINE_BEGIN_2(pv, pc)
1669:         /* Column 0, product is accumulated in XMM4 */
1670:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1671:         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1672:         SSE_MULT_PS(XMM4, XMM0)

1674:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1675:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1676:         SSE_MULT_PS(XMM5, XMM1)
1677:         SSE_ADD_PS(XMM4, XMM5)

1679:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1680:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1681:         SSE_MULT_PS(XMM6, XMM2)
1682:         SSE_ADD_PS(XMM4, XMM6)

1684:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1685:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1686:         SSE_MULT_PS(XMM7, XMM3)
1687:         SSE_ADD_PS(XMM4, XMM7)

1689:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1690:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

1692:         /* Column 1, product is accumulated in XMM5 */
1693:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1694:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1695:         SSE_MULT_PS(XMM5, XMM0)

1697:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1698:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1699:         SSE_MULT_PS(XMM6, XMM1)
1700:         SSE_ADD_PS(XMM5, XMM6)

1702:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1703:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1704:         SSE_MULT_PS(XMM7, XMM2)
1705:         SSE_ADD_PS(XMM5, XMM7)

1707:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1708:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1709:         SSE_MULT_PS(XMM6, XMM3)
1710:         SSE_ADD_PS(XMM5, XMM6)

1712:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1713:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

1715:         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

1717:         /* Column 2, product is accumulated in XMM6 */
1718:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1719:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1720:         SSE_MULT_PS(XMM6, XMM0)

1722:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1723:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1724:         SSE_MULT_PS(XMM7, XMM1)
1725:         SSE_ADD_PS(XMM6, XMM7)

1727:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1728:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1729:         SSE_MULT_PS(XMM7, XMM2)
1730:         SSE_ADD_PS(XMM6, XMM7)

1732:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1733:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1734:         SSE_MULT_PS(XMM7, XMM3)
1735:         SSE_ADD_PS(XMM6, XMM7)

1737:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1738:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1740:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1741:         /* Column 3, product is accumulated in XMM0 */
1742:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1743:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1744:         SSE_MULT_PS(XMM0, XMM7)

1746:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1747:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1748:         SSE_MULT_PS(XMM1, XMM7)
1749:         SSE_ADD_PS(XMM0, XMM1)

1751:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1752:         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1753:         SSE_MULT_PS(XMM1, XMM2)
1754:         SSE_ADD_PS(XMM0, XMM1)

1756:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1757:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1758:         SSE_MULT_PS(XMM3, XMM7)
1759:         SSE_ADD_PS(XMM0, XMM3)

1761:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1762:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

1764:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1765:         /* This is code to be maintained and read by humans after all. */
1766:         /* Copy Multiplier Col 3 into XMM3 */
1767:         SSE_COPY_PS(XMM3, XMM0)
1768:         /* Copy Multiplier Col 2 into XMM2 */
1769:         SSE_COPY_PS(XMM2, XMM6)
1770:         /* Copy Multiplier Col 1 into XMM1 */
1771:         SSE_COPY_PS(XMM1, XMM5)
1772:         /* Copy Multiplier Col 0 into XMM0 */
1773:         SSE_COPY_PS(XMM0, XMM4)
1774:         SSE_INLINE_END_2;

1776:         /* Update the row: */
1777:         nz = bi[row + 1] - diag_offset[row] - 1;
1778:         pv += 16;
1779:         for (j = 0; j < nz; j++) {
1780:           PREFETCH_L1(&pv[16]);
1781:           x = rtmp + 16 * ((unsigned int)pj[j]);
1782:           /*            x = rtmp + 4*pj[j]; */

1784:           /* X:=X-M*PV, One column at a time */
1785:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1786:           SSE_INLINE_BEGIN_2(x, pv)
1787:           /* Load First Column of X*/
1788:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1789:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1791:           /* Matrix-Vector Product: */
1792:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1793:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1794:           SSE_MULT_PS(XMM5, XMM0)
1795:           SSE_SUB_PS(XMM4, XMM5)

1797:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1798:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1799:           SSE_MULT_PS(XMM6, XMM1)
1800:           SSE_SUB_PS(XMM4, XMM6)

1802:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1803:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1804:           SSE_MULT_PS(XMM7, XMM2)
1805:           SSE_SUB_PS(XMM4, XMM7)

1807:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1808:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1809:           SSE_MULT_PS(XMM5, XMM3)
1810:           SSE_SUB_PS(XMM4, XMM5)

1812:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1813:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1815:           /* Second Column */
1816:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1817:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1819:           /* Matrix-Vector Product: */
1820:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1821:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1822:           SSE_MULT_PS(XMM6, XMM0)
1823:           SSE_SUB_PS(XMM5, XMM6)

1825:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1826:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1827:           SSE_MULT_PS(XMM7, XMM1)
1828:           SSE_SUB_PS(XMM5, XMM7)

1830:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1831:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1832:           SSE_MULT_PS(XMM4, XMM2)
1833:           SSE_SUB_PS(XMM5, XMM4)

1835:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1836:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1837:           SSE_MULT_PS(XMM6, XMM3)
1838:           SSE_SUB_PS(XMM5, XMM6)

1840:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1841:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1843:           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

1845:           /* Third Column */
1846:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1847:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1849:           /* Matrix-Vector Product: */
1850:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1851:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1852:           SSE_MULT_PS(XMM7, XMM0)
1853:           SSE_SUB_PS(XMM6, XMM7)

1855:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1856:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1857:           SSE_MULT_PS(XMM4, XMM1)
1858:           SSE_SUB_PS(XMM6, XMM4)

1860:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1861:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1862:           SSE_MULT_PS(XMM5, XMM2)
1863:           SSE_SUB_PS(XMM6, XMM5)

1865:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1866:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1867:           SSE_MULT_PS(XMM7, XMM3)
1868:           SSE_SUB_PS(XMM6, XMM7)

1870:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1871:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1873:           /* Fourth Column */
1874:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1875:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

1877:           /* Matrix-Vector Product: */
1878:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1879:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1880:           SSE_MULT_PS(XMM5, XMM0)
1881:           SSE_SUB_PS(XMM4, XMM5)

1883:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1884:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1885:           SSE_MULT_PS(XMM6, XMM1)
1886:           SSE_SUB_PS(XMM4, XMM6)

1888:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1889:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1890:           SSE_MULT_PS(XMM7, XMM2)
1891:           SSE_SUB_PS(XMM4, XMM7)

1893:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1894:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1895:           SSE_MULT_PS(XMM5, XMM3)
1896:           SSE_SUB_PS(XMM4, XMM5)

1898:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1899:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1900:           SSE_INLINE_END_2;
1901:           pv += 16;
1902:         }
1903:         PetscLogFlops(128.0 * nz + 112.0);
1904:       }
1905:       row = (unsigned int)(*bjtmp++);
1906:       /*        row = (*bjtmp++)/4; */
1907:       /*        bjtmp++; */
1908:     }
1909:     /* finished row so stick it into b->a */
1910:     pv = ba + 16 * bi[i];
1911:     pj = bj + bi[i];
1912:     nz = bi[i + 1] - bi[i];

1914:     /* Copy x block back into pv block */
1915:     for (j = 0; j < nz; j++) {
1916:       x = rtmp + 16 * ((unsigned int)pj[j]);
1917:       /*        x  = rtmp+4*pj[j]; */

1919:       SSE_INLINE_BEGIN_2(x, pv)
1920:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1921:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1922:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

1924:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1925:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

1927:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1928:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

1930:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1931:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

1933:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1934:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

1936:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1937:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1939:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1940:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

1942:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1943:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1944:       SSE_INLINE_END_2;
1945:       pv += 16;
1946:     }
1947:     /* invert diagonal block */
1948:     w = ba + 16 * diag_offset[i];
1949:     if (pivotinblocks) {
1950:       PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected);
1951:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1952:     } else {
1953:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1954:     }
1955:     /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1956:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1957:   }

1959:   PetscFree(rtmp);

1961:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1962:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1963:   C->assembled           = PETSC_TRUE;

1965:   PetscLogFlops(1.333333333333 * 4 * 16 * b->mbs); /* bs = 4, bs2 = 16 for this 4x4 kernel */
1966:   /* Flop Count from inverting diagonal blocks */
1967:   SSE_SCOPE_END;
1968:   return 0;
1969: }
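
/* Editor's note: the "Form Multiplier" sequence in the kernels above computes, for each
   pivot row r < i whose block is nonzero, the multiplier
       M := A_ir * inv(U_rr)
   where inv(U_rr) is the already-inverted 4x4 diagonal block stored at diag_offset[r],
   and M then overwrites the block in rtmp. The plain-C sketch below is illustrative only
   (hence the #if 0 guard); it is not part of the original kernels and the helper name is
   invented for exposition. It uses the same block layout as the SSE code: four
   consecutive scalars form one block column, so entry (r,c) lives at index r + 4*c. */
#if 0
static void FormMultiplier4x4_Sketch(MatScalar pc[16], const MatScalar invdiag[16])
{
  MatScalar tmp[16];
  PetscInt  r, c, k;

  for (c = 0; c < 4; c++) {
    for (r = 0; r < 4; r++) {
      MatScalar s = 0.0;
      for (k = 0; k < 4; k++) s += pc[r + 4 * k] * invdiag[k + 4 * c];
      tmp[r + 4 * c] = s;
    }
  }
  for (k = 0; k < 16; k++) pc[k] = tmp[k]; /* the multiplier replaces the block in rtmp */
}
#endif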

1971: #endif
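
/* Editor's note: the CMPNEQ/OR/MOVEMASK sequence in the kernels above is a vectorized
   test for "is this 4x4 block entirely zero?", used to skip the multiplier formation and
   row update when the block vanishes. A plain-C sketch of the same test is below; it is
   illustrative only (hence the #if 0 guard) and the helper name is invented. */
#if 0
static int BlockIsNonzero4x4_Sketch(const MatScalar x[16])
{
  PetscInt k;

  for (k = 0; k < 16; k++)
    if (x[k] != 0.0) return 1; /* any nonzero entry means the block must be eliminated */
  return 0;
}
#endif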