Actual source code: sbaijfact.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscis.h>
7: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F, PetscInt *nneg, PetscInt *nzero, PetscInt *npos)
8: {
9: Mat_SeqSBAIJ *fact = (Mat_SeqSBAIJ *)F->data;
10: MatScalar *dd = fact->a;
11: PetscInt mbs = fact->mbs, bs = F->rmap->bs, i, nneg_tmp, npos_tmp, *fi = fact->diag;
15: nneg_tmp = 0;
16: npos_tmp = 0;
17: for (i = 0; i < mbs; i++) {
18: if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
19: else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
20: fi++;
21: }
22: if (nneg) *nneg = nneg_tmp;
23: if (npos) *npos = npos_tmp;
24: if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
25: return 0;
26: }
28: /*
29: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
30: Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
31: */
32: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F, Mat A, IS perm, const MatFactorInfo *info)
33: {
34: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
35: const PetscInt *rip, *ai, *aj;
36: PetscInt i, mbs = a->mbs, *jutmp, bs = A->rmap->bs, bs2 = a->bs2;
37: PetscInt m, reallocs = 0, prow;
38: PetscInt *jl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
39: PetscReal f = info->fill;
40: PetscBool perm_identity;
42: /* check whether perm is the identity mapping */
43: ISIdentity(perm, &perm_identity);
44: ISGetIndices(perm, &rip);
46: if (perm_identity) { /* without permutation */
47: a->permute = PETSC_FALSE;
49: ai = a->i;
50: aj = a->j;
51: } else { /* non-trivial permutation */
52: a->permute = PETSC_TRUE;
54: MatReorderingSeqSBAIJ(A, perm);
56: ai = a->inew;
57: aj = a->jnew;
58: }
60: /* initialization */
61: PetscMalloc1(mbs + 1, &iu);
62: umax = (PetscInt)(f * ai[mbs] + 1);
63: umax += mbs + 1;
64: PetscMalloc1(umax, &ju);
65: iu[0] = mbs + 1;
66: juidx = mbs + 1; /* index for ju */
67: /* jl linked list for pivot row -- linked list for col index */
68: PetscMalloc2(mbs, &jl, mbs, &q);
69: for (i = 0; i < mbs; i++) {
70: jl[i] = mbs;
71: q[i] = 0;
72: }
74: /* for each row k */
75: for (k = 0; k < mbs; k++) {
76: for (i = 0; i < mbs; i++) q[i] = 0; /* to be removed! */
77: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
78: q[k] = mbs;
79: /* initialize nonzero structure of k-th row to row rip[k] of A */
80: jmin = ai[rip[k]] + 1; /* exclude diag[k] */
81: jmax = ai[rip[k] + 1];
82: for (j = jmin; j < jmax; j++) {
83: vj = rip[aj[j]]; /* col. value */
84: if (vj > k) {
85: qm = k;
86: do {
87: m = qm;
88: qm = q[m];
89: } while (qm < vj);
91: nzk++;
92: q[m] = vj;
93: q[vj] = qm;
94: } /* if (vj > k) */
95: } /* for (j=jmin; j<jmax; j++) */
97: /* modify nonzero structure of k-th row by computing fill-in
98: for each row i to be merged in */
99: prow = k;
100: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
102: while (prow < k) {
103: /* merge row prow into k-th row */
104: jmin = iu[prow] + 1;
105: jmax = iu[prow + 1];
106: qm = k;
107: for (j = jmin; j < jmax; j++) {
108: vj = ju[j];
109: do {
110: m = qm;
111: qm = q[m];
112: } while (qm < vj);
113: if (qm != vj) {
114: nzk++;
115: q[m] = vj;
116: q[vj] = qm;
117: qm = vj;
118: }
119: }
120: prow = jl[prow]; /* next pivot row */
121: }
123: /* add k to row list for first nonzero element in k-th row */
124: if (nzk > 0) {
125: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
126: jl[k] = jl[i];
127: jl[i] = k;
128: }
129: iu[k + 1] = iu[k] + nzk;
131: /* allocate more space to ju if needed */
132: if (iu[k + 1] > umax) {
133: /* estimate how much additional space we will need */
134: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
135: /* just double the memory each time */
136: maxadd = umax;
137: if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
138: umax += maxadd;
140: /* allocate a longer ju */
141: PetscMalloc1(umax, &jutmp);
142: PetscArraycpy(jutmp, ju, iu[k]);
143: PetscFree(ju);
144: ju = jutmp;
145: reallocs++; /* count how many times we realloc */
146: }
148: /* save nonzero structure of k-th row in ju */
149: i = k;
150: while (nzk--) {
151: i = q[i];
152: ju[juidx++] = i;
153: }
154: }
156: #if defined(PETSC_USE_INFO)
157: if (ai[mbs] != 0) {
158: PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
159: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af);
160: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
161: PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af);
162: PetscInfo(A, "for best performance.\n");
163: } else {
164: PetscInfo(A, "Empty matrix\n");
165: }
166: #endif
168: ISRestoreIndices(perm, &rip);
169: PetscFree2(jl, q);
171: /* put together the new matrix */
172: MatSeqSBAIJSetPreallocation(F, bs, MAT_SKIP_ALLOCATION, NULL);
174: b = (Mat_SeqSBAIJ *)(F)->data;
175: b->singlemalloc = PETSC_FALSE;
176: b->free_a = PETSC_TRUE;
177: b->free_ij = PETSC_TRUE;
179: PetscMalloc1((iu[mbs] + 1) * bs2, &b->a);
180: b->j = ju;
181: b->i = iu;
182: b->diag = NULL;
183: b->ilen = NULL;
184: b->imax = NULL;
185: b->row = perm;
187: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
189: PetscObjectReference((PetscObject)perm);
191: b->icol = perm;
192: PetscObjectReference((PetscObject)perm);
193: PetscMalloc1(bs * mbs + bs, &b->solve_work);
194: /* In b structure: Free imax, ilen, old a, old j.
195: Allocate idnew, solve_work, new a, new j */
196: b->maxnz = b->nz = iu[mbs];
198: (F)->info.factor_mallocs = reallocs;
199: (F)->info.fill_ratio_given = f;
200: if (ai[mbs] != 0) {
201: (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
202: } else {
203: (F)->info.fill_ratio_needed = 0.0;
204: }
205: MatSeqSBAIJSetNumericFactorization_inplace(F, perm_identity);
206: return 0;
207: }
208: /*
209: Symbolic U^T*D*U factorization for SBAIJ format.
210: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
211: */
212: #include <petscbt.h>
213: #include <../src/mat/utils/freespace.h>
214: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
215: {
216: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
217: Mat_SeqSBAIJ *b;
218: PetscBool perm_identity, missing;
219: PetscReal fill = info->fill;
220: const PetscInt *rip, *ai = a->i, *aj = a->j;
221: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow;
222: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
223: PetscInt nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr, *udiag;
224: PetscFreeSpaceList free_space = NULL, current_space = NULL;
225: PetscBT lnkbt;
228: MatMissingDiagonal(A, &missing, &i);
230: if (bs > 1) {
231: MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info);
232: return 0;
233: }
235: /* check whether perm is the identity mapping */
236: ISIdentity(perm, &perm_identity);
238: a->permute = PETSC_FALSE;
239: ISGetIndices(perm, &rip);
241: /* initialization */
242: PetscMalloc1(mbs + 1, &ui);
243: PetscMalloc1(mbs + 1, &udiag);
244: ui[0] = 0;
246: /* jl: linked list for storing indices of the pivot rows
247: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
248: PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols);
249: for (i = 0; i < mbs; i++) {
250: jl[i] = mbs;
251: il[i] = 0;
252: }
254: /* create and initialize a linked list for storing column indices of the active row k */
255: nlnk = mbs + 1;
256: PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt);
258: /* initial FreeSpace size is fill*(ai[mbs]+1) */
259: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space);
260: current_space = free_space;
262: for (k = 0; k < mbs; k++) { /* for each active row k */
263: /* initialize lnk by the column indices of row rip[k] of A */
264: nzk = 0;
265: ncols = ai[k + 1] - ai[k];
267: for (j = 0; j < ncols; j++) {
268: i = *(aj + ai[k] + j);
269: cols[j] = i;
270: }
271: PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt);
272: nzk += nlnk;
274: /* update lnk by computing fill-in for each pivot row to be merged in */
275: prow = jl[k]; /* 1st pivot row */
277: while (prow < k) {
278: nextprow = jl[prow];
279: /* merge prow into k-th row */
280: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
281: jmax = ui[prow + 1];
282: ncols = jmax - jmin;
283: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
284: PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt);
285: nzk += nlnk;
287: /* update il and jl for prow */
288: if (jmin < jmax) {
289: il[prow] = jmin;
290: j = *uj_ptr;
291: jl[prow] = jl[j];
292: jl[j] = prow;
293: }
294: prow = nextprow;
295: }
297: /* if free space is not available, make more free space */
298: if (current_space->local_remaining < nzk) {
299: i = mbs - k + 1; /* num of unfactored rows */
300: i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
301: PetscFreeSpaceGet(i, ¤t_space);
302: reallocs++;
303: }
305: /* copy data into free space, then initialize lnk */
306: PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt);
308: /* add the k-th row into il and jl */
309: if (nzk > 1) {
310: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
311: jl[k] = jl[i];
312: jl[i] = k;
313: il[k] = ui[k] + 1;
314: }
315: ui_ptr[k] = current_space->array;
317: current_space->array += nzk;
318: current_space->local_used += nzk;
319: current_space->local_remaining -= nzk;
321: ui[k + 1] = ui[k] + nzk;
322: }
324: ISRestoreIndices(perm, &rip);
325: PetscFree4(ui_ptr, il, jl, cols);
327: /* destroy list of free space and other temporary array(s) */
328: PetscMalloc1(ui[mbs] + 1, &uj);
329: PetscFreeSpaceContiguous_Cholesky(&free_space, uj, mbs, ui, udiag); /* store matrix factor */
330: PetscLLDestroy(lnk, lnkbt);
332: /* put together the new matrix in MATSEQSBAIJ format */
333: MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL);
335: b = (Mat_SeqSBAIJ *)fact->data;
336: b->singlemalloc = PETSC_FALSE;
337: b->free_a = PETSC_TRUE;
338: b->free_ij = PETSC_TRUE;
340: PetscMalloc1(ui[mbs] + 1, &b->a);
342: b->j = uj;
343: b->i = ui;
344: b->diag = udiag;
345: b->free_diag = PETSC_TRUE;
346: b->ilen = NULL;
347: b->imax = NULL;
348: b->row = perm;
349: b->icol = perm;
351: PetscObjectReference((PetscObject)perm);
352: PetscObjectReference((PetscObject)perm);
354: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
356: PetscMalloc1(mbs + 1, &b->solve_work);
358: b->maxnz = b->nz = ui[mbs];
360: fact->info.factor_mallocs = reallocs;
361: fact->info.fill_ratio_given = fill;
362: if (ai[mbs] != 0) {
363: fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
364: } else {
365: fact->info.fill_ratio_needed = 0.0;
366: }
367: #if defined(PETSC_USE_INFO)
368: if (ai[mbs] != 0) {
369: PetscReal af = fact->info.fill_ratio_needed;
370: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
371: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
372: PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
373: } else {
374: PetscInfo(A, "Empty matrix\n");
375: }
376: #endif
377: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
378: return 0;
379: }
381: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
382: {
383: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
384: Mat_SeqSBAIJ *b;
385: PetscBool perm_identity, missing;
386: PetscReal fill = info->fill;
387: const PetscInt *rip, *ai, *aj;
388: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow, d;
389: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
390: PetscInt nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr;
391: PetscFreeSpaceList free_space = NULL, current_space = NULL;
392: PetscBT lnkbt;
394: MatMissingDiagonal(A, &missing, &d);
397: /*
398: This code originally uses Modified Sparse Row (MSR) storage
399: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
400: Then it is rewritten so the factor B takes seqsbaij format. However the associated
401: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
402: thus the original code in MSR format is still used for these cases.
403: The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
404: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
405: */
406: if (bs > 1) {
407: MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info);
408: return 0;
409: }
411: /* check whether perm is the identity mapping */
412: ISIdentity(perm, &perm_identity);
414: a->permute = PETSC_FALSE;
415: ai = a->i;
416: aj = a->j;
417: ISGetIndices(perm, &rip);
419: /* initialization */
420: PetscMalloc1(mbs + 1, &ui);
421: ui[0] = 0;
423: /* jl: linked list for storing indices of the pivot rows
424: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
425: PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols);
426: for (i = 0; i < mbs; i++) {
427: jl[i] = mbs;
428: il[i] = 0;
429: }
431: /* create and initialize a linked list for storing column indices of the active row k */
432: nlnk = mbs + 1;
433: PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt);
435: /* initial FreeSpace size is fill*(ai[mbs]+1) */
436: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space);
437: current_space = free_space;
439: for (k = 0; k < mbs; k++) { /* for each active row k */
440: /* initialize lnk by the column indices of row rip[k] of A */
441: nzk = 0;
442: ncols = ai[rip[k] + 1] - ai[rip[k]];
443: for (j = 0; j < ncols; j++) {
444: i = *(aj + ai[rip[k]] + j);
445: cols[j] = rip[i];
446: }
447: PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt);
448: nzk += nlnk;
450: /* update lnk by computing fill-in for each pivot row to be merged in */
451: prow = jl[k]; /* 1st pivot row */
453: while (prow < k) {
454: nextprow = jl[prow];
455: /* merge prow into k-th row */
456: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
457: jmax = ui[prow + 1];
458: ncols = jmax - jmin;
459: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
460: PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt);
461: nzk += nlnk;
463: /* update il and jl for prow */
464: if (jmin < jmax) {
465: il[prow] = jmin;
467: j = *uj_ptr;
468: jl[prow] = jl[j];
469: jl[j] = prow;
470: }
471: prow = nextprow;
472: }
474: /* if free space is not available, make more free space */
475: if (current_space->local_remaining < nzk) {
476: i = mbs - k + 1; /* num of unfactored rows */
477: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
478: PetscFreeSpaceGet(i, ¤t_space);
479: reallocs++;
480: }
482: /* copy data into free space, then initialize lnk */
483: PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt);
485: /* add the k-th row into il and jl */
486: if (nzk - 1 > 0) {
487: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
488: jl[k] = jl[i];
489: jl[i] = k;
490: il[k] = ui[k] + 1;
491: }
492: ui_ptr[k] = current_space->array;
494: current_space->array += nzk;
495: current_space->local_used += nzk;
496: current_space->local_remaining -= nzk;
498: ui[k + 1] = ui[k] + nzk;
499: }
501: ISRestoreIndices(perm, &rip);
502: PetscFree4(ui_ptr, il, jl, cols);
504: /* destroy list of free space and other temporary array(s) */
505: PetscMalloc1(ui[mbs] + 1, &uj);
506: PetscFreeSpaceContiguous(&free_space, uj);
507: PetscLLDestroy(lnk, lnkbt);
509: /* put together the new matrix in MATSEQSBAIJ format */
510: MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL);
512: b = (Mat_SeqSBAIJ *)fact->data;
513: b->singlemalloc = PETSC_FALSE;
514: b->free_a = PETSC_TRUE;
515: b->free_ij = PETSC_TRUE;
517: PetscMalloc1(ui[mbs] + 1, &b->a);
519: b->j = uj;
520: b->i = ui;
521: b->diag = NULL;
522: b->ilen = NULL;
523: b->imax = NULL;
524: b->row = perm;
526: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
528: PetscObjectReference((PetscObject)perm);
529: b->icol = perm;
530: PetscObjectReference((PetscObject)perm);
531: PetscMalloc1(mbs + 1, &b->solve_work);
532: b->maxnz = b->nz = ui[mbs];
534: fact->info.factor_mallocs = reallocs;
535: fact->info.fill_ratio_given = fill;
536: if (ai[mbs] != 0) {
537: fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
538: } else {
539: fact->info.fill_ratio_needed = 0.0;
540: }
541: #if defined(PETSC_USE_INFO)
542: if (ai[mbs] != 0) {
543: PetscReal af = fact->info.fill_ratio_needed;
544: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
545: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
546: PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
547: } else {
548: PetscInfo(A, "Empty matrix\n");
549: }
550: #endif
551: MatSeqSBAIJSetNumericFactorization_inplace(fact, perm_identity);
552: return 0;
553: }
555: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
556: {
557: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
558: IS perm = b->row;
559: const PetscInt *ai, *aj, *perm_ptr, mbs = a->mbs, *bi = b->i, *bj = b->j;
560: PetscInt i, j;
561: PetscInt *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
562: PetscInt bs = A->rmap->bs, bs2 = a->bs2;
563: MatScalar *ba = b->a, *aa, *ap, *dk, *uik;
564: MatScalar *u, *diag, *rtmp, *rtmp_ptr;
565: MatScalar *work;
566: PetscInt *pivots;
567: PetscBool allowzeropivot, zeropivotdetected;
569: /* initialization */
570: PetscCalloc1(bs2 * mbs, &rtmp);
571: PetscMalloc2(mbs, &il, mbs, &jl);
572: allowzeropivot = PetscNot(A->erroriffailure);
574: il[0] = 0;
575: for (i = 0; i < mbs; i++) jl[i] = mbs;
577: PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work);
578: PetscMalloc1(bs, &pivots);
580: ISGetIndices(perm, &perm_ptr);
582: /* check permutation */
583: if (!a->permute) {
584: ai = a->i;
585: aj = a->j;
586: aa = a->a;
587: } else {
588: ai = a->inew;
589: aj = a->jnew;
590: PetscMalloc1(bs2 * ai[mbs], &aa);
591: PetscArraycpy(aa, a->a, bs2 * ai[mbs]);
592: PetscMalloc1(ai[mbs], &a2anew);
593: PetscArraycpy(a2anew, a->a2anew, ai[mbs]);
595: for (i = 0; i < mbs; i++) {
596: jmin = ai[i];
597: jmax = ai[i + 1];
598: for (j = jmin; j < jmax; j++) {
599: while (a2anew[j] != j) {
600: k = a2anew[j];
601: a2anew[j] = a2anew[k];
602: a2anew[k] = k;
603: for (k1 = 0; k1 < bs2; k1++) {
604: dk[k1] = aa[k * bs2 + k1];
605: aa[k * bs2 + k1] = aa[j * bs2 + k1];
606: aa[j * bs2 + k1] = dk[k1];
607: }
608: }
609: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
610: if (i > aj[j]) {
611: ap = aa + j * bs2; /* ptr to the beginning of j-th block of aa */
612: for (k = 0; k < bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
613: for (k = 0; k < bs; k++) { /* j-th block of aa <- dk^T */
614: for (k1 = 0; k1 < bs; k1++) *ap++ = dk[k + bs * k1];
615: }
616: }
617: }
618: }
619: PetscFree(a2anew);
620: }
622: /* for each row k */
623: for (k = 0; k < mbs; k++) {
624: /*initialize k-th row with elements nonzero in row perm(k) of A */
625: jmin = ai[perm_ptr[k]];
626: jmax = ai[perm_ptr[k] + 1];
628: ap = aa + jmin * bs2;
629: for (j = jmin; j < jmax; j++) {
630: vj = perm_ptr[aj[j]]; /* block col. index */
631: rtmp_ptr = rtmp + vj * bs2;
632: for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
633: }
635: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
636: PetscArraycpy(dk, rtmp + k * bs2, bs2);
637: i = jl[k]; /* first row to be added to k_th row */
639: while (i < k) {
640: nexti = jl[i]; /* next row to be added to k_th row */
642: /* compute multiplier */
643: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
645: /* uik = -inv(Di)*U_bar(i,k) */
646: diag = ba + i * bs2;
647: u = ba + ili * bs2;
648: PetscArrayzero(uik, bs2);
649: PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
651: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
652: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
653: PetscLogFlops(4.0 * bs * bs2);
655: /* update -U(i,k) */
656: PetscArraycpy(ba + ili * bs2, uik, bs2);
658: /* add multiple of row i to k-th row ... */
659: jmin = ili + 1;
660: jmax = bi[i + 1];
661: if (jmin < jmax) {
662: for (j = jmin; j < jmax; j++) {
663: /* rtmp += -U(i,k)^T * U_bar(i,j) */
664: rtmp_ptr = rtmp + bj[j] * bs2;
665: u = ba + j * bs2;
666: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
667: }
668: PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin));
670: /* ... add i to row list for next nonzero entry */
671: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
672: j = bj[jmin];
673: jl[i] = jl[j];
674: jl[j] = i; /* update jl */
675: }
676: i = nexti;
677: }
679: /* save nonzero entries in k-th row of U ... */
681: /* invert diagonal block */
682: diag = ba + k * bs2;
683: PetscArraycpy(diag, dk, bs2);
685: PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected);
686: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
688: jmin = bi[k];
689: jmax = bi[k + 1];
690: if (jmin < jmax) {
691: for (j = jmin; j < jmax; j++) {
692: vj = bj[j]; /* block col. index of U */
693: u = ba + j * bs2;
694: rtmp_ptr = rtmp + vj * bs2;
695: for (k1 = 0; k1 < bs2; k1++) {
696: *u++ = *rtmp_ptr;
697: *rtmp_ptr++ = 0.0;
698: }
699: }
701: /* ... add k to row list for first nonzero entry in k-th row */
702: il[k] = jmin;
703: i = bj[jmin];
704: jl[k] = jl[i];
705: jl[i] = k;
706: }
707: }
709: PetscFree(rtmp);
710: PetscFree2(il, jl);
711: PetscFree3(dk, uik, work);
712: PetscFree(pivots);
713: if (a->permute) PetscFree(aa);
715: ISRestoreIndices(perm, &perm_ptr);
717: C->ops->solve = MatSolve_SeqSBAIJ_N_inplace;
718: C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
719: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_inplace;
720: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_inplace;
722: C->assembled = PETSC_TRUE;
723: C->preallocated = PETSC_TRUE;
725: PetscLogFlops(1.3333 * bs * bs2 * b->mbs); /* from inverting diagonal blocks */
726: return 0;
727: }
729: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
730: {
731: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
732: PetscInt i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
733: PetscInt *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
734: PetscInt bs = A->rmap->bs, bs2 = a->bs2;
735: MatScalar *ba = b->a, *aa, *ap, *dk, *uik;
736: MatScalar *u, *diag, *rtmp, *rtmp_ptr;
737: MatScalar *work;
738: PetscInt *pivots;
739: PetscBool allowzeropivot, zeropivotdetected;
741: PetscCalloc1(bs2 * mbs, &rtmp);
742: PetscMalloc2(mbs, &il, mbs, &jl);
743: il[0] = 0;
744: for (i = 0; i < mbs; i++) jl[i] = mbs;
746: PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work);
747: PetscMalloc1(bs, &pivots);
748: allowzeropivot = PetscNot(A->erroriffailure);
750: ai = a->i;
751: aj = a->j;
752: aa = a->a;
754: /* for each row k */
755: for (k = 0; k < mbs; k++) {
756: /*initialize k-th row with elements nonzero in row k of A */
757: jmin = ai[k];
758: jmax = ai[k + 1];
759: ap = aa + jmin * bs2;
760: for (j = jmin; j < jmax; j++) {
761: vj = aj[j]; /* block col. index */
762: rtmp_ptr = rtmp + vj * bs2;
763: for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
764: }
766: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
767: PetscArraycpy(dk, rtmp + k * bs2, bs2);
768: i = jl[k]; /* first row to be added to k_th row */
770: while (i < k) {
771: nexti = jl[i]; /* next row to be added to k_th row */
773: /* compute multiplier */
774: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
776: /* uik = -inv(Di)*U_bar(i,k) */
777: diag = ba + i * bs2;
778: u = ba + ili * bs2;
779: PetscArrayzero(uik, bs2);
780: PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
782: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
783: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
784: PetscLogFlops(2.0 * bs * bs2);
786: /* update -U(i,k) */
787: PetscArraycpy(ba + ili * bs2, uik, bs2);
789: /* add multiple of row i to k-th row ... */
790: jmin = ili + 1;
791: jmax = bi[i + 1];
792: if (jmin < jmax) {
793: for (j = jmin; j < jmax; j++) {
794: /* rtmp += -U(i,k)^T * U_bar(i,j) */
795: rtmp_ptr = rtmp + bj[j] * bs2;
796: u = ba + j * bs2;
797: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
798: }
799: PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin));
801: /* ... add i to row list for next nonzero entry */
802: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
803: j = bj[jmin];
804: jl[i] = jl[j];
805: jl[j] = i; /* update jl */
806: }
807: i = nexti;
808: }
810: /* save nonzero entries in k-th row of U ... */
812: /* invert diagonal block */
813: diag = ba + k * bs2;
814: PetscArraycpy(diag, dk, bs2);
816: PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected);
817: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
819: jmin = bi[k];
820: jmax = bi[k + 1];
821: if (jmin < jmax) {
822: for (j = jmin; j < jmax; j++) {
823: vj = bj[j]; /* block col. index of U */
824: u = ba + j * bs2;
825: rtmp_ptr = rtmp + vj * bs2;
826: for (k1 = 0; k1 < bs2; k1++) {
827: *u++ = *rtmp_ptr;
828: *rtmp_ptr++ = 0.0;
829: }
830: }
832: /* ... add k to row list for first nonzero entry in k-th row */
833: il[k] = jmin;
834: i = bj[jmin];
835: jl[k] = jl[i];
836: jl[i] = k;
837: }
838: }
840: PetscFree(rtmp);
841: PetscFree2(il, jl);
842: PetscFree3(dk, uik, work);
843: PetscFree(pivots);
845: C->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
846: C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
847: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
848: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
849: C->assembled = PETSC_TRUE;
850: C->preallocated = PETSC_TRUE;
852: PetscLogFlops(1.3333 * bs * bs2 * b->mbs); /* from inverting diagonal blocks */
853: return 0;
854: }
856: /*
857: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
858: Version for blocks 2 by 2.
859: */
860: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C, Mat A, const MatFactorInfo *info)
861: {
862: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
863: IS perm = b->row;
864: const PetscInt *ai, *aj, *perm_ptr;
865: PetscInt i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
866: PetscInt *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
867: MatScalar *ba = b->a, *aa, *ap;
868: MatScalar *u, *diag, *rtmp, *rtmp_ptr, dk[4], uik[4];
869: PetscReal shift = info->shiftamount;
870: PetscBool allowzeropivot, zeropivotdetected;
872: allowzeropivot = PetscNot(A->erroriffailure);
874: /* initialization */
875: /* il and jl record the first nonzero element in each row of the accessing
876: window U(0:k, k:mbs-1).
877: jl: list of rows to be added to uneliminated rows
878: i>= k: jl(i) is the first row to be added to row i
879: i< k: jl(i) is the row following row i in some list of rows
880: jl(i) = mbs indicates the end of a list
881: il(i): points to the first nonzero element in columns k,...,mbs-1 of
882: row i of U */
883: PetscCalloc1(4 * mbs, &rtmp);
884: PetscMalloc2(mbs, &il, mbs, &jl);
885: il[0] = 0;
886: for (i = 0; i < mbs; i++) jl[i] = mbs;
888: ISGetIndices(perm, &perm_ptr);
890: /* check permutation */
891: if (!a->permute) {
892: ai = a->i;
893: aj = a->j;
894: aa = a->a;
895: } else {
896: ai = a->inew;
897: aj = a->jnew;
898: PetscMalloc1(4 * ai[mbs], &aa);
899: PetscArraycpy(aa, a->a, 4 * ai[mbs]);
900: PetscMalloc1(ai[mbs], &a2anew);
901: PetscArraycpy(a2anew, a->a2anew, ai[mbs]);
903: for (i = 0; i < mbs; i++) {
904: jmin = ai[i];
905: jmax = ai[i + 1];
906: for (j = jmin; j < jmax; j++) {
907: while (a2anew[j] != j) {
908: k = a2anew[j];
909: a2anew[j] = a2anew[k];
910: a2anew[k] = k;
911: for (k1 = 0; k1 < 4; k1++) {
912: dk[k1] = aa[k * 4 + k1];
913: aa[k * 4 + k1] = aa[j * 4 + k1];
914: aa[j * 4 + k1] = dk[k1];
915: }
916: }
917: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
918: if (i > aj[j]) {
919: ap = aa + j * 4; /* ptr to the beginning of the block */
920: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
921: ap[1] = ap[2];
922: ap[2] = dk[1];
923: }
924: }
925: }
926: PetscFree(a2anew);
927: }
929: /* for each row k */
930: for (k = 0; k < mbs; k++) {
931: /*initialize k-th row with elements nonzero in row perm(k) of A */
932: jmin = ai[perm_ptr[k]];
933: jmax = ai[perm_ptr[k] + 1];
934: ap = aa + jmin * 4;
935: for (j = jmin; j < jmax; j++) {
936: vj = perm_ptr[aj[j]]; /* block col. index */
937: rtmp_ptr = rtmp + vj * 4;
938: for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
939: }
941: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
942: PetscArraycpy(dk, rtmp + k * 4, 4);
943: i = jl[k]; /* first row to be added to k_th row */
945: while (i < k) {
946: nexti = jl[i]; /* next row to be added to k_th row */
948: /* compute multiplier */
949: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
951: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
952: diag = ba + i * 4;
953: u = ba + ili * 4;
954: uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
955: uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
956: uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
957: uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
959: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
960: dk[0] += uik[0] * u[0] + uik[1] * u[1];
961: dk[1] += uik[2] * u[0] + uik[3] * u[1];
962: dk[2] += uik[0] * u[2] + uik[1] * u[3];
963: dk[3] += uik[2] * u[2] + uik[3] * u[3];
965: PetscLogFlops(16.0 * 2.0);
967: /* update -U(i,k): ba[ili] = uik */
968: PetscArraycpy(ba + ili * 4, uik, 4);
970: /* add multiple of row i to k-th row ... */
971: jmin = ili + 1;
972: jmax = bi[i + 1];
973: if (jmin < jmax) {
974: for (j = jmin; j < jmax; j++) {
975: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
976: rtmp_ptr = rtmp + bj[j] * 4;
977: u = ba + j * 4;
978: rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
979: rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
980: rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
981: rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
982: }
983: PetscLogFlops(16.0 * (jmax - jmin));
985: /* ... add i to row list for next nonzero entry */
986: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
987: j = bj[jmin];
988: jl[i] = jl[j];
989: jl[j] = i; /* update jl */
990: }
991: i = nexti;
992: }
994: /* save nonzero entries in k-th row of U ... */
996: /* invert diagonal block */
997: diag = ba + k * 4;
998: PetscArraycpy(diag, dk, 4);
999: PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected);
1000: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1002: jmin = bi[k];
1003: jmax = bi[k + 1];
1004: if (jmin < jmax) {
1005: for (j = jmin; j < jmax; j++) {
1006: vj = bj[j]; /* block col. index of U */
1007: u = ba + j * 4;
1008: rtmp_ptr = rtmp + vj * 4;
1009: for (k1 = 0; k1 < 4; k1++) {
1010: *u++ = *rtmp_ptr;
1011: *rtmp_ptr++ = 0.0;
1012: }
1013: }
1015: /* ... add k to row list for first nonzero entry in k-th row */
1016: il[k] = jmin;
1017: i = bj[jmin];
1018: jl[k] = jl[i];
1019: jl[i] = k;
1020: }
1021: }
1023: PetscFree(rtmp);
1024: PetscFree2(il, jl);
1025: if (a->permute) PetscFree(aa);
1026: ISRestoreIndices(perm, &perm_ptr);
1028: C->ops->solve = MatSolve_SeqSBAIJ_2_inplace;
1029: C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1030: C->assembled = PETSC_TRUE;
1031: C->preallocated = PETSC_TRUE;
1033: PetscLogFlops(1.3333 * 8 * b->mbs); /* from inverting diagonal blocks */
1034: return 0;
1035: }
1037: /*
1038: Version for when blocks are 2 by 2 Using natural ordering
1039: */
1040: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
1041: {
1042: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1043: PetscInt i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
1044: PetscInt *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
1045: MatScalar *ba = b->a, *aa, *ap, dk[8], uik[8];
1046: MatScalar *u, *diag, *rtmp, *rtmp_ptr;
1047: PetscReal shift = info->shiftamount;
1048: PetscBool allowzeropivot, zeropivotdetected;
1050: allowzeropivot = PetscNot(A->erroriffailure);
1052: /* initialization */
1053: /* il and jl record the first nonzero element in each row of the accessing
1054: window U(0:k, k:mbs-1).
1055: jl: list of rows to be added to uneliminated rows
1056: i>= k: jl(i) is the first row to be added to row i
1057: i< k: jl(i) is the row following row i in some list of rows
1058: jl(i) = mbs indicates the end of a list
1059: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1060: row i of U */
1061: PetscCalloc1(4 * mbs, &rtmp);
1062: PetscMalloc2(mbs, &il, mbs, &jl);
1063: il[0] = 0;
1064: for (i = 0; i < mbs; i++) jl[i] = mbs;
1066: ai = a->i;
1067: aj = a->j;
1068: aa = a->a;
1070: /* for each row k */
1071: for (k = 0; k < mbs; k++) {
1072: /*initialize k-th row with elements nonzero in row k of A */
1073: jmin = ai[k];
1074: jmax = ai[k + 1];
1075: ap = aa + jmin * 4;
1076: for (j = jmin; j < jmax; j++) {
1077: vj = aj[j]; /* block col. index */
1078: rtmp_ptr = rtmp + vj * 4;
1079: for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
1080: }
1082: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1083: PetscArraycpy(dk, rtmp + k * 4, 4);
1084: i = jl[k]; /* first row to be added to k_th row */
1086: while (i < k) {
1087: nexti = jl[i]; /* next row to be added to k_th row */
1089: /* compute multiplier */
1090: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1092: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1093: diag = ba + i * 4;
1094: u = ba + ili * 4;
1095: uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
1096: uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
1097: uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
1098: uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
1100: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1101: dk[0] += uik[0] * u[0] + uik[1] * u[1];
1102: dk[1] += uik[2] * u[0] + uik[3] * u[1];
1103: dk[2] += uik[0] * u[2] + uik[1] * u[3];
1104: dk[3] += uik[2] * u[2] + uik[3] * u[3];
1106: PetscLogFlops(16.0 * 2.0);
1108: /* update -U(i,k): ba[ili] = uik */
1109: PetscArraycpy(ba + ili * 4, uik, 4);
1111: /* add multiple of row i to k-th row ... */
1112: jmin = ili + 1;
1113: jmax = bi[i + 1];
1114: if (jmin < jmax) {
1115: for (j = jmin; j < jmax; j++) {
1116: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1117: rtmp_ptr = rtmp + bj[j] * 4;
1118: u = ba + j * 4;
1119: rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
1120: rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
1121: rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
1122: rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
1123: }
1124: PetscLogFlops(16.0 * (jmax - jmin));
1126: /* ... add i to row list for next nonzero entry */
1127: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1128: j = bj[jmin];
1129: jl[i] = jl[j];
1130: jl[j] = i; /* update jl */
1131: }
1132: i = nexti;
1133: }
1135: /* save nonzero entries in k-th row of U ... */
1137: /* invert diagonal block */
1138: diag = ba + k * 4;
1139: PetscArraycpy(diag, dk, 4);
1140: PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected);
1141: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1143: jmin = bi[k];
1144: jmax = bi[k + 1];
1145: if (jmin < jmax) {
1146: for (j = jmin; j < jmax; j++) {
1147: vj = bj[j]; /* block col. index of U */
1148: u = ba + j * 4;
1149: rtmp_ptr = rtmp + vj * 4;
1150: for (k1 = 0; k1 < 4; k1++) {
1151: *u++ = *rtmp_ptr;
1152: *rtmp_ptr++ = 0.0;
1153: }
1154: }
1156: /* ... add k to row list for first nonzero entry in k-th row */
1157: il[k] = jmin;
1158: i = bj[jmin];
1159: jl[k] = jl[i];
1160: jl[i] = k;
1161: }
1162: }
1164: PetscFree(rtmp);
1165: PetscFree2(il, jl);
1167: C->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1168: C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1169: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1170: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1171: C->assembled = PETSC_TRUE;
1172: C->preallocated = PETSC_TRUE;
1174: PetscLogFlops(1.3333 * 8 * b->mbs); /* from inverting diagonal blocks */
1175: return 0;
1176: }
1178: /*
1179: Numeric U^T*D*U factorization for SBAIJ format.
1180: Version for blocks are 1 by 1.
1181: */
1182: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
1183: {
1184: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1185: IS ip = b->row;
1186: const PetscInt *ai, *aj, *rip;
1187: PetscInt *a2anew, i, j, mbs = a->mbs, *bi = b->i, *bj = b->j, *bcol;
1188: PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
1189: MatScalar *rtmp, *ba = b->a, *bval, *aa, dk, uikdi;
1190: PetscReal rs;
1191: FactorShiftCtx sctx;
1193: /* MatPivotSetUp(): initialize shift context sctx */
1194: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
1196: ISGetIndices(ip, &rip);
1197: if (!a->permute) {
1198: ai = a->i;
1199: aj = a->j;
1200: aa = a->a;
1201: } else {
1202: ai = a->inew;
1203: aj = a->jnew;
1204: nz = ai[mbs];
1205: PetscMalloc1(nz, &aa);
1206: a2anew = a->a2anew;
1207: bval = a->a;
1208: for (j = 0; j < nz; j++) aa[a2anew[j]] = *(bval++);
1209: }
1211: /* initialization */
1212: /* il and jl record the first nonzero element in each row of the accessing
1213: window U(0:k, k:mbs-1).
1214: jl: list of rows to be added to uneliminated rows
1215: i>= k: jl(i) is the first row to be added to row i
1216: i< k: jl(i) is the row following row i in some list of rows
1217: jl(i) = mbs indicates the end of a list
1218: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1219: row i of U */
1220: PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl);
1222: do {
1223: sctx.newshift = PETSC_FALSE;
1224: il[0] = 0;
1225: for (i = 0; i < mbs; i++) {
1226: rtmp[i] = 0.0;
1227: jl[i] = mbs;
1228: }
1230: for (k = 0; k < mbs; k++) {
1231: /*initialize k-th row by the perm[k]-th row of A */
1232: jmin = ai[rip[k]];
1233: jmax = ai[rip[k] + 1];
1234: bval = ba + bi[k];
1235: for (j = jmin; j < jmax; j++) {
1236: col = rip[aj[j]];
1237: rtmp[col] = aa[j];
1238: *bval++ = 0.0; /* for in-place factorization */
1239: }
1241: /* shift the diagonal of the matrix */
1242: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1244: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1245: dk = rtmp[k];
1246: i = jl[k]; /* first row to be added to k_th row */
1248: while (i < k) {
1249: nexti = jl[i]; /* next row to be added to k_th row */
1251: /* compute multiplier, update diag(k) and U(i,k) */
1252: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1253: uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
1254: dk += uikdi * ba[ili];
1255: ba[ili] = uikdi; /* -U(i,k) */
1257: /* add multiple of row i to k-th row */
1258: jmin = ili + 1;
1259: jmax = bi[i + 1];
1260: if (jmin < jmax) {
1261: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1262: PetscLogFlops(2.0 * (jmax - jmin));
1264: /* update il and jl for row i */
1265: il[i] = jmin;
1266: j = bj[jmin];
1267: jl[i] = jl[j];
1268: jl[j] = i;
1269: }
1270: i = nexti;
1271: }
1273: /* shift the diagonals when zero pivot is detected */
1274: /* compute rs=sum of abs(off-diagonal) */
1275: rs = 0.0;
1276: jmin = bi[k] + 1;
1277: nz = bi[k + 1] - jmin;
1278: if (nz) {
1279: bcol = bj + jmin;
1280: while (nz--) {
1281: rs += PetscAbsScalar(rtmp[*bcol]);
1282: bcol++;
1283: }
1284: }
1286: sctx.rs = rs;
1287: sctx.pv = dk;
1288: MatPivotCheck(C, A, info, &sctx, k);
1289: if (sctx.newshift) break; /* sctx.shift_amount is updated */
1290: dk = sctx.pv;
1292: /* copy data into U(k,:) */
1293: ba[bi[k]] = 1.0 / dk; /* U(k,k) */
1294: jmin = bi[k] + 1;
1295: jmax = bi[k + 1];
1296: if (jmin < jmax) {
1297: for (j = jmin; j < jmax; j++) {
1298: col = bj[j];
1299: ba[j] = rtmp[col];
1300: rtmp[col] = 0.0;
1301: }
1302: /* add the k-th row into il and jl */
1303: il[k] = jmin;
1304: i = bj[jmin];
1305: jl[k] = jl[i];
1306: jl[i] = k;
1307: }
1308: }
1309: } while (sctx.newshift);
1310: PetscFree3(rtmp, il, jl);
1311: if (a->permute) PetscFree(aa);
1313: ISRestoreIndices(ip, &rip);
1315: C->ops->solve = MatSolve_SeqSBAIJ_1_inplace;
1316: C->ops->solves = MatSolves_SeqSBAIJ_1_inplace;
1317: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1318: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_inplace;
1319: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_inplace;
1320: C->assembled = PETSC_TRUE;
1321: C->preallocated = PETSC_TRUE;
1323: PetscLogFlops(C->rmap->N);
1324: if (sctx.nshift) {
1325: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1326: PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1327: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1328: PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1329: }
1330: }
1331: return 0;
1332: }
1334: /*
1335: Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1336: Modified from MatCholeskyFactorNumeric_SeqAIJ()
1337: */
1338: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
1339: {
1340: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1341: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)B->data;
1342: PetscInt i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bdiag = b->diag, *bjtmp;
1343: PetscInt *ai = a->i, *aj = a->j, *ajtmp;
1344: PetscInt k, jmin, jmax, *c2r, *il, col, nexti, ili, nz;
1345: MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
1346: FactorShiftCtx sctx;
1347: PetscReal rs;
1348: MatScalar d, *v;
1350: PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &c2r);
1352: /* MatPivotSetUp(): initialize shift context sctx */
1353: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
1355: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1356: sctx.shift_top = info->zeropivot;
1358: PetscArrayzero(rtmp, mbs);
1360: for (i = 0; i < mbs; i++) {
1361: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1362: d = (aa)[a->diag[i]];
1363: rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1364: ajtmp = aj + ai[i] + 1; /* exclude diagonal */
1365: v = aa + ai[i] + 1;
1366: nz = ai[i + 1] - ai[i] - 1;
1367: for (j = 0; j < nz; j++) {
1368: rtmp[i] += PetscAbsScalar(v[j]);
1369: rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1370: }
1371: if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1372: }
1373: sctx.shift_top *= 1.1;
1374: sctx.nshift_max = 5;
1375: sctx.shift_lo = 0.;
1376: sctx.shift_hi = 1.;
1377: }
1379: /* allocate working arrays
1380: c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1381: il: for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1382: */
1383: do {
1384: sctx.newshift = PETSC_FALSE;
1386: for (i = 0; i < mbs; i++) c2r[i] = mbs;
1387: if (mbs) il[0] = 0;
1389: for (k = 0; k < mbs; k++) {
1390: /* zero rtmp */
1391: nz = bi[k + 1] - bi[k];
1392: bjtmp = bj + bi[k];
1393: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
1395: /* load in initial unfactored row */
1396: bval = ba + bi[k];
1397: jmin = ai[k];
1398: jmax = ai[k + 1];
1399: for (j = jmin; j < jmax; j++) {
1400: col = aj[j];
1401: rtmp[col] = aa[j];
1402: *bval++ = 0.0; /* for in-place factorization */
1403: }
1404: /* shift the diagonal of the matrix: ZeropivotApply() */
1405: rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */
1407: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1408: dk = rtmp[k];
1409: i = c2r[k]; /* first row to be added to k_th row */
1411: while (i < k) {
1412: nexti = c2r[i]; /* next row to be added to k_th row */
1414: /* compute multiplier, update diag(k) and U(i,k) */
1415: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1416: uikdi = -ba[ili] * ba[bdiag[i]]; /* diagonal(k) */
1417: dk += uikdi * ba[ili]; /* update diag[k] */
1418: ba[ili] = uikdi; /* -U(i,k) */
1420: /* add multiple of row i to k-th row */
1421: jmin = ili + 1;
1422: jmax = bi[i + 1];
1423: if (jmin < jmax) {
1424: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1425: /* update il and c2r for row i */
1426: il[i] = jmin;
1427: j = bj[jmin];
1428: c2r[i] = c2r[j];
1429: c2r[j] = i;
1430: }
1431: i = nexti;
1432: }
1434: /* copy data into U(k,:) */
1435: rs = 0.0;
1436: jmin = bi[k];
1437: jmax = bi[k + 1] - 1;
1438: if (jmin < jmax) {
1439: for (j = jmin; j < jmax; j++) {
1440: col = bj[j];
1441: ba[j] = rtmp[col];
1442: rs += PetscAbsScalar(ba[j]);
1443: }
1444: /* add the k-th row into il and c2r */
1445: il[k] = jmin;
1446: i = bj[jmin];
1447: c2r[k] = c2r[i];
1448: c2r[i] = k;
1449: }
1451: sctx.rs = rs;
1452: sctx.pv = dk;
1453: MatPivotCheck(B, A, info, &sctx, k);
1454: if (sctx.newshift) break;
1455: dk = sctx.pv;
1457: ba[bdiag[k]] = 1.0 / dk; /* U(k,k) */
1458: }
1459: } while (sctx.newshift);
1461: PetscFree3(rtmp, il, c2r);
1463: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1464: B->ops->solves = MatSolves_SeqSBAIJ_1;
1465: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1466: B->ops->matsolve = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1467: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1468: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1470: B->assembled = PETSC_TRUE;
1471: B->preallocated = PETSC_TRUE;
1473: PetscLogFlops(B->rmap->n);
1475: /* MatPivotView() */
1476: if (sctx.nshift) {
1477: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1478: PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
1479: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1480: PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1481: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1482: PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount);
1483: }
1484: }
1485: return 0;
1486: }
1488: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
1489: {
1490: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1491: PetscInt i, j, mbs = a->mbs;
1492: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
1493: PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
1494: MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
1495: PetscReal rs;
1496: FactorShiftCtx sctx;
1498: /* MatPivotSetUp(): initialize shift context sctx */
1499: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
1501: /* initialization */
1502: /* il and jl record the first nonzero element in each row of the accessing
1503: window U(0:k, k:mbs-1).
1504: jl: list of rows to be added to uneliminated rows
1505: i>= k: jl(i) is the first row to be added to row i
1506: i< k: jl(i) is the row following row i in some list of rows
1507: jl(i) = mbs indicates the end of a list
1508: il(i): points to the first nonzero element in U(i,k:mbs-1)
1509: */
1510: PetscMalloc1(mbs, &rtmp);
1511: PetscMalloc2(mbs, &il, mbs, &jl);
1513: do {
1514: sctx.newshift = PETSC_FALSE;
1515: il[0] = 0;
1516: for (i = 0; i < mbs; i++) {
1517: rtmp[i] = 0.0;
1518: jl[i] = mbs;
1519: }
1521: for (k = 0; k < mbs; k++) {
1522: /*initialize k-th row with elements nonzero in row perm(k) of A */
1523: nz = ai[k + 1] - ai[k];
1524: acol = aj + ai[k];
1525: aval = aa + ai[k];
1526: bval = ba + bi[k];
1527: while (nz--) {
1528: rtmp[*acol++] = *aval++;
1529: *bval++ = 0.0; /* for in-place factorization */
1530: }
1532: /* shift the diagonal of the matrix */
1533: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1535: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1536: dk = rtmp[k];
1537: i = jl[k]; /* first row to be added to k_th row */
1539: while (i < k) {
1540: nexti = jl[i]; /* next row to be added to k_th row */
1541: /* compute multiplier, update D(k) and U(i,k) */
1542: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1543: uikdi = -ba[ili] * ba[bi[i]];
1544: dk += uikdi * ba[ili];
1545: ba[ili] = uikdi; /* -U(i,k) */
1547: /* add multiple of row i to k-th row ... */
1548: jmin = ili + 1;
1549: nz = bi[i + 1] - jmin;
1550: if (nz > 0) {
1551: bcol = bj + jmin;
1552: bval = ba + jmin;
1553: PetscLogFlops(2.0 * nz);
1554: while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
1556: /* update il and jl for i-th row */
1557: il[i] = jmin;
1558: j = bj[jmin];
1559: jl[i] = jl[j];
1560: jl[j] = i;
1561: }
1562: i = nexti;
1563: }
1565: /* shift the diagonals when zero pivot is detected */
1566: /* compute rs=sum of abs(off-diagonal) */
1567: rs = 0.0;
1568: jmin = bi[k] + 1;
1569: nz = bi[k + 1] - jmin;
1570: if (nz) {
1571: bcol = bj + jmin;
1572: while (nz--) {
1573: rs += PetscAbsScalar(rtmp[*bcol]);
1574: bcol++;
1575: }
1576: }
1578: sctx.rs = rs;
1579: sctx.pv = dk;
1580: MatPivotCheck(C, A, info, &sctx, k);
1581: if (sctx.newshift) break; /* sctx.shift_amount is updated */
1582: dk = sctx.pv;
1584: /* copy data into U(k,:) */
1585: ba[bi[k]] = 1.0 / dk;
1586: jmin = bi[k] + 1;
1587: nz = bi[k + 1] - jmin;
1588: if (nz) {
1589: bcol = bj + jmin;
1590: bval = ba + jmin;
1591: while (nz--) {
1592: *bval++ = rtmp[*bcol];
1593: rtmp[*bcol++] = 0.0;
1594: }
1595: /* add k-th row into il and jl */
1596: il[k] = jmin;
1597: i = bj[jmin];
1598: jl[k] = jl[i];
1599: jl[i] = k;
1600: }
1601: } /* end of for (k = 0; k<mbs; k++) */
1602: } while (sctx.newshift);
1603: PetscFree(rtmp);
1604: PetscFree2(il, jl);
1606: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1607: C->ops->solves = MatSolves_SeqSBAIJ_1_inplace;
1608: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1609: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1610: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1612: C->assembled = PETSC_TRUE;
1613: C->preallocated = PETSC_TRUE;
1615: PetscLogFlops(C->rmap->N);
1616: if (sctx.nshift) {
1617: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1618: PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1619: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1620: PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
1621: }
1622: }
1623: return 0;
1624: }
1626: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A, IS perm, const MatFactorInfo *info)
1627: {
1628: Mat C;
1630: MatGetFactor(A, "petsc", MAT_FACTOR_CHOLESKY, &C);
1631: MatCholeskyFactorSymbolic(C, A, perm, info);
1632: MatCholeskyFactorNumeric(C, A, info);
1634: A->ops->solve = C->ops->solve;
1635: A->ops->solvetranspose = C->ops->solvetranspose;
1637: MatHeaderMerge(A, &C);
1638: return 0;
1639: }