Actual source code: baijsolvtrann.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: /* ----------------------------------------------------------- */
  5: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
  6: {
  7:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  8:   IS                 iscol = a->col, isrow = a->row;
  9:   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *vi;
 10:   PetscInt           i, nz, j;
 11:   const PetscInt     n = a->mbs, bs = A->rmap->bs, bs2 = a->bs2;
 12:   const MatScalar   *aa = a->a, *v;
 13:   PetscScalar       *x, *t, *ls;
 14:   const PetscScalar *b;

 16:   VecGetArrayRead(bb, &b);
 17:   VecGetArray(xx, &x);
 18:   t = a->solve_work;

 20:   ISGetIndices(isrow, &rout);
 21:   r = rout;
 22:   ISGetIndices(iscol, &cout);
 23:   c = cout;

 25:   /* copy the b into temp work space according to permutation */
 26:   for (i = 0; i < n; i++) {
 27:     for (j = 0; j < bs; j++) t[i * bs + j] = b[c[i] * bs + j];
 28:   }

 30:   /* forward solve the upper triangular transpose */
 31:   ls = a->solve_work + A->cmap->n;
 32:   for (i = 0; i < n; i++) {
 33:     PetscArraycpy(ls, t + i * bs, bs);
 34:     PetscKernel_w_gets_transA_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
 35:     v  = aa + bs2 * (a->diag[i] + 1);
 36:     vi = aj + a->diag[i] + 1;
 37:     nz = ai[i + 1] - a->diag[i] - 1;
 38:     while (nz--) {
 39:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (*vi++), v, t + i * bs);
 40:       v += bs2;
 41:     }
 42:   }

 44:   /* backward solve the lower triangular transpose */
 45:   for (i = n - 1; i >= 0; i--) {
 46:     v  = aa + bs2 * ai[i];
 47:     vi = aj + ai[i];
 48:     nz = a->diag[i] - ai[i];
 49:     while (nz--) {
 50:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (*vi++), v, t + i * bs);
 51:       v += bs2;
 52:     }
 53:   }

 55:   /* copy t into x according to permutation */
 56:   for (i = 0; i < n; i++) {
 57:     for (j = 0; j < bs; j++) x[bs * r[i] + j] = t[bs * i + j];
 58:   }

 60:   ISRestoreIndices(isrow, &rout);
 61:   ISRestoreIndices(iscol, &cout);
 62:   VecRestoreArrayRead(bb, &b);
 63:   VecRestoreArray(xx, &x);
 64:   PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
 65:   return 0;
 66: }

 68: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
 69: {
 70:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
 71:   IS                 iscol = a->col, isrow = a->row;
 72:   const PetscInt    *r, *c, *rout, *cout;
 73:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi, *diag = a->diag;
 74:   PetscInt           i, j, nz;
 75:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
 76:   const MatScalar   *aa = a->a, *v;
 77:   PetscScalar       *x, *t, *ls;
 78:   const PetscScalar *b;

 80:   VecGetArrayRead(bb, &b);
 81:   VecGetArray(xx, &x);
 82:   t = a->solve_work;

 84:   ISGetIndices(isrow, &rout);
 85:   r = rout;
 86:   ISGetIndices(iscol, &cout);
 87:   c = cout;

 89:   /* copy the b into temp work space according to permutation */
 90:   for (i = 0; i < n; i++) {
 91:     for (j = 0; j < bs; j++) t[i * bs + j] = b[c[i] * bs + j];
 92:   }

 94:   /* forward solve the upper triangular transpose */
 95:   ls = a->solve_work + A->cmap->n;
 96:   for (i = 0; i < n; i++) {
 97:     PetscArraycpy(ls, t + i * bs, bs);
 98:     PetscKernel_w_gets_transA_times_v(bs, ls, aa + bs2 * diag[i], t + i * bs);
 99:     v  = aa + bs2 * (diag[i] - 1);
100:     vi = aj + diag[i] - 1;
101:     nz = diag[i] - diag[i + 1] - 1;
102:     for (j = 0; j > -nz; j--) {
103:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (vi[j]), v, t + i * bs);
104:       v -= bs2;
105:     }
106:   }

108:   /* backward solve the lower triangular transpose */
109:   for (i = n - 1; i >= 0; i--) {
110:     v  = aa + bs2 * ai[i];
111:     vi = aj + ai[i];
112:     nz = ai[i + 1] - ai[i];
113:     for (j = 0; j < nz; j++) {
114:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (vi[j]), v, t + i * bs);
115:       v += bs2;
116:     }
117:   }

119:   /* copy t into x according to permutation */
120:   for (i = 0; i < n; i++) {
121:     for (j = 0; j < bs; j++) x[bs * r[i] + j] = t[bs * i + j];
122:   }

124:   ISRestoreIndices(isrow, &rout);
125:   ISRestoreIndices(iscol, &cout);
126:   VecRestoreArrayRead(bb, &b);
127:   VecRestoreArray(xx, &x);
128:   PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
129:   return 0;
130: }