Actual source code: aijhipsparseband.hip.cpp

  1: /*
  2:   AIJHIPSPARSE methods implemented with HIP kernels. Uses hipSparse/Thrust maps from AIJHIPSPARSE
  3:   Portions of this code are under:
  4:   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  5: */
  6: #define PETSC_SKIP_SPINLOCK

  8: #include <petscconf.h>
  9: #include <../src/mat/impls/aij/seq/aij.h>
 10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 11: #undef VecType
 12: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
 13: #define AIJBANDUSEGROUPS 1
 14: #if defined(AIJBANDUSEGROUPS)
 15:   #include <hip/hip_cooperative_groups.h>
 16: #endif

 18: /*
 19:   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

 21:   requires:
 22:      structurally symmetric: fix with transpose/column meta data
 23: */

 25: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSEBAND(Mat, Mat, IS, IS, const MatFactorInfo *);
 26: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSEBAND(Mat, Mat, const MatFactorInfo *);
 27: static PetscErrorCode MatSolve_SeqAIJHIPSPARSEBAND(Mat, Vec, Vec);

 29: /*
 30:   The GPU LU factor kernel
 31: */
 32: __global__ void __launch_bounds__(1024, 1) mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
 33: {
 34:   const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
 35:   const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
 36:   const PetscInt nloc_i = (nloc / Nblk + !!(nloc % Nblk)), start_i = field * nloc + blkIdx * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);

 38:   // set i (row+1)
 39:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
 40:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) {   // rows in block by thread y
 41:     if (rowb < end_i && threadIdx.x == 0) {
 42:       PetscInt i = rowb + 1, ni = (rowb > bw) ? bw + 1 : i, n1L = ni * (ni - 1) / 2, nug = i * bw, n2L = bw * ((rowb > bw) ? (rowb - bw) : 0), mi = bw + rowb + 1 - n, clip = (mi > 0) ? mi * (mi - 1) / 2 + mi : 0;
 43:       bi_csr[rowb + 1] = n1L + nug - clip + n2L + i;
 44:     }
 45:   }
 46: }
 47: // copy AIJ to AIJ_BAND
 48: __global__ void __launch_bounds__(1024, 1) mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[], const int ai_d[], const int aj_d[], const PetscScalar aa_d[], const int bi_csr[], PetscScalar ba_csr[])
 49: {
 50:   const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
 51:   const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
 52:   const PetscInt nloc_i = (nloc / Nblk + !!(nloc % Nblk)), start_i = field * nloc + blkIdx * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);

 54:   // zero B
 55:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
 56:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) {           // rows in block by thread y
 57:     if (rowb < end_i) {
 58:       PetscScalar   *batmp = ba_csr + bi_csr[rowb];
 59:       const PetscInt nzb   = bi_csr[rowb + 1] - bi_csr[rowb];
 60:       for (int j = threadIdx.x; j < nzb; j += blockDim.x) {
 61:         if (j < nzb) { batmp[j] = 0; }
 62:       }
 63:     }
 64:   }

 66:   // copy A into B with CSR format -- these two loops can be fused
 67:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
 68:     if (rowb < end_i) {
 69:       const PetscInt     rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
 70:       const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb > bw) ? rowb - bw : 0;
 71:       const PetscScalar *av    = aa_d + ai_d[rowa];
 72:       PetscScalar       *batmp = ba_csr + bi_csr[rowb];
 73:       /* load in initial (unfactored row) */
 74:       for (int j = threadIdx.x; j < nza; j += blockDim.x) {
 75:         if (j < nza) {
 76:           PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
 77:           PetscScalar vala = av[j];
 78:           batmp[idx]       = vala;
 79:         }
 80:       }
 81:     }
 82:   }
 83: }
 84: // print AIJ_BAND
 85: __global__ void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
 86: {
 87:   // debug
 88:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
 89:     printf("B (AIJ) n=%d:\n", (int)n);
 90:     for (int rowb = 0; rowb < n; rowb++) {
 91:       const PetscInt     nz    = bi_csr[rowb + 1] - bi_csr[rowb];
 92:       const PetscScalar *batmp = ba_csr + bi_csr[rowb];
 93:       for (int j = 0; j < nz; j++) printf("(%13.6e) ", PetscRealPart(batmp[j]));
 94:       printf(" bi=%d\n", bi_csr[rowb + 1]);
 95:     }
 96:   }
 97: }
 98: // Band LU kernel ---  ba_csr bi_csr
 99: __global__ void __launch_bounds__(1024, 1) mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
100: {
101:   const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
102:   const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
103:   const PetscInt start = field * nloc, end = start + nloc;
104: #if defined(AIJBANDUSEGROUPS)
105:   auto g = cooperative_groups::this_grid();
106: #endif
107:   // A22 panel update for each row A(1,:) and col A(:,1)
108:   for (int glbDD = start, locDD = 0; glbDD < end; glbDD++, locDD++) {
109:     PetscInt           tnzUd = bw, maxU = end - 1 - glbDD;                                        // we are chopping off the inter ears
110:     const PetscInt     nzUd = (tnzUd > maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
111:     PetscScalar       *pBdd   = ba_csr + bi_csr[glbDD] + dOffset;
112:     const PetscScalar *baUd   = pBdd + 1; // vector of data  U(i,i+1:end)
113:     const PetscScalar  Bdd    = *pBdd;
114:     const PetscInt     offset = blkIdx * blockDim.y + threadIdx.y, inc = Nblk * blockDim.y;
115:     if (threadIdx.x == 0) {
116:       for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
117:         const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi - glbDD);                // cuts off just the first (global) block
118:         PetscScalar   *Aid = ba_csr + bi_csr[myi] + kIdx;
119:         *Aid               = *Aid / Bdd;
120:       }
121:     }
122:     __syncthreads(); // synch on threadIdx.x only
123:     for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
124:       const PetscInt    bwi = myi > bw ? bw : myi, kIdx = bwi - (myi - glbDD); // cuts off just the first (global) block
125:       PetscScalar      *Aid = ba_csr + bi_csr[myi] + kIdx;
126:       PetscScalar      *Aij = Aid + 1;
127:       const PetscScalar Lid = *Aid;
128:       for (int jIdx = threadIdx.x; jIdx < nzUd; jIdx += blockDim.x) { Aij[jIdx] -= Lid * baUd[jIdx]; }
129:     }
130: #if defined(AIJBANDUSEGROUPS)
131:     if (use_group_sync) {
132:       g.sync();
133:     } else {
134:       __syncthreads();
135:     }
136: #else
137:     __syncthreads();
138: #endif
139:   } /* endof for (i=0; i<n; i++) { */
140: }

142: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSEBAND(Mat B, Mat A, const MatFactorInfo *info)
143: {
144:   Mat_SeqAIJ                    *b                   = (Mat_SeqAIJ *)B->data;
145:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

148:   Mat_SeqAIJHIPSPARSE           *hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)A->spptr;
149:   Mat_SeqAIJHIPSPARSEMultStruct *matstructA;
150:   CsrMatrix                     *matrixA;
151:   const PetscInt                 n = A->rmap->n, *ic, *r;
152:   const int                     *ai_d, *aj_d;
153:   const PetscScalar             *aa_d;
154:   PetscScalar                   *ba_t = hipsparseTriFactors->a_band_d;
155:   int                           *bi_t = hipsparseTriFactors->i_band_d;
156:   int                            Ni = 10, team_size = 9, Nf = 1, nVec = 56, nconcurrent = 1, nsm = -1; // Nf is batch size - not used

158:   if (A->rmap->n == 0) return 0;
159:   // hipsparse setup
161:   matstructA = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat; //  matstruct->cprowIndices
163:   matrixA = (CsrMatrix *)matstructA->mat;

166:   // get data
167:   ic   = thrust::raw_pointer_cast(hipsparseTriFactors->cpermIndices->data());
168:   ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
169:   aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
170:   aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
171:   r    = thrust::raw_pointer_cast(hipsparseTriFactors->rpermIndices->data());

173:   WaitForHIP();
174:   PetscLogGpuTimeBegin();
175:   {
176:     int bw = (int)(2. * (double)n - 1. - (double)(PetscSqrtReal(1. + 4. * ((double)n * (double)n - (double)b->nz)) + PETSC_MACHINE_EPSILON)) / 2, bm1 = bw - 1, nl = n / Nf;
177: #if !defined(AIJBANDUSEGROUPS)
178:     Ni = 1 / nconcurrent;
179:     Ni = 1;
180: #else
181:     if (!hipsparseTriFactors->init_dev_prop) {
182:       int gpuid;
183:       hipsparseTriFactors->init_dev_prop = PETSC_TRUE;
184:       hipGetDevice(&gpuid);
185:       hipGetDeviceProperties(&hipsparseTriFactors->dev_prop, gpuid);
186:     }
187:     nsm = hipsparseTriFactors->dev_prop.multiProcessorCount;
188:     Ni  = nsm / Nf / nconcurrent;
189: #endif
190:     team_size = bw / Ni + !!(bw % Ni);
191:     nVec      = PetscMin(bw, 1024 / team_size);
192:     PetscInfo(A, "Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n", bw, Ni, nconcurrent, Nf, nsm, team_size, nVec);
193:     {
194:       dim3 dimBlockTeam(nVec, team_size);
195:       dim3 dimBlockLeague(Nf, Ni);
196:       hipLaunchKernelGGL(mat_lu_factor_band_copy_aij_aij, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
197:       PetscHIPCheckLaunch; // does a sync
198: #if defined(AIJBANDUSEGROUPS)
199:       if (Ni > 1) {
200:         void *kernelArgs[] = {(void *)&n, (void *)&bw, (void *)&bi_t, (void *)&ba_t, (void *)&nsm};
201:         hipLaunchCooperativeKernel((void *)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
202:       } else {
203:         hipLaunchKernelGGL(mat_lu_factor_band, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bw, bi_t, ba_t, NULL);
204:       }
205: #else
206:       hipLaunchKernelGGL(mat_lu_factor_band, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bw, bi_t, ba_t, NULL);
207: #endif
208:       PetscHIPCheckLaunch; // does a sync
209: #if defined(PETSC_USE_LOG)
210:       PetscLogGpuFlops((PetscLogDouble)Nf * (bm1 * (bm1 + 1) * (PetscLogDouble)(2 * bm1 + 1) / 3 + (PetscLogDouble)2 * (nl - bw) * bw * bw + (PetscLogDouble)nl * (nl + 1) / 2));
211: #endif
212:     }
213:   }
214:   PetscLogGpuTimeEnd();
215:   /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJHIPSPARSE */
216:   B->ops->solve             = MatSolve_SeqAIJHIPSPARSEBAND;
217:   B->ops->solvetranspose    = NULL; /* need transpose */
218:   B->ops->matsolve          = NULL;
219:   B->ops->matsolvetranspose = NULL;
220:   return 0;
221: }

223: PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSEBAND(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
224: {
225:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b;
226:   IS                             isicol;
227:   const PetscInt                *ic, *ai = a->i, *aj = a->j;
228:   PetscScalar                   *ba_t;
229:   int                           *bi_t;
230:   PetscInt                       i, n = A->rmap->n, Nf = 1; /* Nf batch size - not used */
231:   PetscInt                       nzBcsr, bwL, bwU;
232:   PetscBool                      missing;
233:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

236:   MatMissingDiagonal(A, &missing, &i);
239:   MatGetOption(A, MAT_STRUCTURALLY_SYMMETRIC, &missing);
241:   ISInvertPermutation(iscol, PETSC_DECIDE, &isicol);
242:   ISGetIndices(isicol, &ic);
243:   MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL);
244:   PetscLogObjectParent((PetscObject)B, (PetscObject)isicol);
245:   b = (Mat_SeqAIJ *)(B)->data;

247:   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
248:   bwL = bwU = 0;
249:   for (int rwb = 0; rwb < n; rwb++) {
250:     const PetscInt rwa = ic[rwb], anz = ai[rwb + 1] - ai[rwb], *ajtmp = aj + ai[rwb];
251:     for (int j = 0; j < anz; j++) {
252:       PetscInt colb = ic[ajtmp[j]];
253:       if (colb < rwa) { // L
254:         if (rwa - colb > bwL) bwL = rwa - colb;
255:       } else {
256:         if (colb - rwa > bwU) bwU = colb - rwa;
257:       }
258:     }
259:   }
260:   ISRestoreIndices(isicol, &ic);
261:   /* only support structurally symmetric, but it might work */
263:   MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
264:   nzBcsr   = n + (2 * n - 1) * bwU - bwU * bwU;
265:   b->maxnz = b->nz         = nzBcsr;
266:   hipsparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
267:   PetscInfo(A, "Matrix Bandwidth = %" PetscInt_FMT ", nnz = %" PetscInt_FMT "\n", bwL, b->nz);
268:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
269:   hipMalloc(&ba_t, (b->nz + 1) * sizeof(PetscScalar)); // include a place for flops
270:   hipMalloc(&bi_t, (n + 1) * sizeof(int));
271:   hipsparseTriFactors->a_band_d = ba_t;
272:   hipsparseTriFactors->i_band_d = bi_t;
273:   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
274:   PetscLogObjectMemory((PetscObject)B, (nzBcsr + 1) * (sizeof(PetscInt) + sizeof(PetscScalar)));
275:   {
276:     dim3 dimBlockTeam(1, 128);
277:     dim3 dimBlockLeague(Nf, 1);
278:     hipLaunchKernelGGL(mat_lu_factor_band_init_set_i, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bwU, bi_t);
279:   }
280:   PetscHIPCheckLaunch; // does a sync

282:   // setup data
283:   if (!hipsparseTriFactors->rpermIndices) {
284:     const PetscInt *r;
285:     ISGetIndices(isrow, &r);
286:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
287:     hipsparseTriFactors->rpermIndices->assign(r, r + n);
288:     ISRestoreIndices(isrow, &r);
289:     PetscLogCpuToGpu(n * sizeof(PetscInt));
290:   }
291:   /* upper triangular indices */
292:   if (!hipsparseTriFactors->cpermIndices) {
293:     const PetscInt *c;
294:     ISGetIndices(isicol, &c);
295:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
296:     hipsparseTriFactors->cpermIndices->assign(c, c + n);
297:     ISRestoreIndices(isicol, &c);
298:     PetscLogCpuToGpu(n * sizeof(PetscInt));
299:   }

301:   /* put together the new matrix */
302:   b->free_a       = PETSC_FALSE;
303:   b->free_ij      = PETSC_FALSE;
304:   b->singlemalloc = PETSC_FALSE;
305:   b->ilen         = NULL;
306:   b->imax         = NULL;
307:   b->row          = isrow;
308:   b->col          = iscol;
309:   PetscObjectReference((PetscObject)isrow);
310:   PetscObjectReference((PetscObject)iscol);
311:   b->icol = isicol;
312:   PetscMalloc1(n + 1, &b->solve_work);

314:   B->factortype            = MAT_FACTOR_LU;
315:   B->info.factor_mallocs   = 0;
316:   B->info.fill_ratio_given = 0;

318:   if (ai[n]) B->info.fill_ratio_needed = ((PetscReal)(nzBcsr)) / ((PetscReal)ai[n]);
319:   else B->info.fill_ratio_needed = 0.0;
320: #if defined(PETSC_USE_INFO)
321:   if (ai[n] != 0) {
322:     PetscReal af = B->info.fill_ratio_needed;
323:     PetscInfo(A, "Band fill ratio %g\n", (double)af);
324:   } else PetscInfo(A, "Empty matrix\n");
325: #endif
326:   if (a->inode.size) PetscInfo(A, "Warning: using inodes in band solver.\n");
327:   MatSeqAIJCheckInode_FactorLU(B);
328:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSEBAND;
329:   B->offloadmask          = PETSC_OFFLOAD_GPU;

331:   return 0;
332: }

334: /* Use -pc_factor_mat_solver_type hipsparseband */
335: PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse_band(Mat A, MatSolverType *type)
336: {
337:   *type = MATSOLVERHIPSPARSEBAND;
338:   return 0;
339: }

341: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse_band(Mat A, MatFactorType ftype, Mat *B)
342: {
343:   PetscInt n = A->rmap->n;

345:   MatCreate(PetscObjectComm((PetscObject)A), B);
346:   MatSetSizes(*B, n, n, n, n);
347:   (*B)->factortype     = ftype;
348:   (*B)->canuseordering = PETSC_TRUE;
349:   MatSetType(*B, MATSEQAIJHIPSPARSE);

351:   if (ftype == MAT_FACTOR_LU) {
352:     MatSetBlockSizesFromMats(*B, A, A);
353:     (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJHIPSPARSE;
354:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSEBAND;
355:     PetscStrallocpy(MATORDERINGRCM, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
356:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSEBAND Matrix Types");

358:   MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
359:   PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse_band);
360:   return 0;
361: }

363: #define WARP_SIZE 32 // to be consistent with Nvidia terminology. WARP == Wavefront
364: template <typename T>
365: __forceinline__ __device__ T wreduce(T a)
366: {
367:   T b;
368: #pragma unroll
369:   for (int i = WARP_SIZE / 2; i >= 1; i = i >> 1) {
370:     b = __shfl_down(0xffffffff, a, i);
371:     a += b;
372:   }
373:   return a;
374: }
375: // reduce in a block, returns result in thread 0
376: template <typename T, int BLOCK_SIZE>
377: __device__ T breduce(T a)
378: {
379:   constexpr int     NWARP = BLOCK_SIZE / WARP_SIZE;
380:   __shared__ double buf[NWARP];
381:   int               wid    = threadIdx.x / WARP_SIZE;
382:   int               laneid = threadIdx.x % WARP_SIZE;
383:   T                 b      = wreduce<T>(a);
384:   if (laneid == 0) buf[wid] = b;
385:   __syncthreads();
386:   if (wid == 0) {
387:     if (threadIdx.x < NWARP) a = buf[threadIdx.x];
388:     else a = 0;
389:     for (int i = (NWARP + 1) / 2; i >= 1; i = i >> 1) { a += __shfl_down(0xffffffff, a, i); }
390:   }
391:   return a;
392: }

394: // Band LU kernel ---  ba_csr bi_csr
395: template <int BLOCK_SIZE>
396: __global__ void __launch_bounds__(256, 1) mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
397: {
398:   const PetscInt     Nf = gridDim.x, nloc = n / Nf, field = blockIdx.x, start = field * nloc, end = start + nloc, chopnz = bw * (bw + 1) / 2, blocknz = (2 * bw + 1) * nloc, blocknz_0 = blocknz - chopnz;
399:   const PetscScalar *pLi;
400:   const int          tid = threadIdx.x;

402:   /* Next, solve L */
403:   pLi = ba_csr + (field == 0 ? 0 : blocknz_0 + (field - 1) * blocknz + bw); // diagonal (0,0) in field
404:   for (int glbDD = start, locDD = 0; glbDD < end; glbDD++, locDD++) {
405:     const PetscInt col = locDD < bw ? start : (glbDD - bw);
406:     PetscScalar    t   = 0;
407:     for (int j = col + tid, idx = tid; j < glbDD; j += blockDim.x, idx += blockDim.x) { t += pLi[idx] * x[j]; }
408: #if defined(PETSC_USE_COMPLEX)
409:     PetscReal   tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
410:     PetscScalar tt(breduce<PetscReal, BLOCK_SIZE>(tr), breduce<PetscReal, BLOCK_SIZE>(ti));
411:     t = tt;
412: #else
413:     t = breduce<PetscReal, BLOCK_SIZE>(t);
414: #endif
415:     if (threadIdx.x == 0) x[glbDD] -= t; // /1.0
416:     __syncthreads();
417:     // inc
418:     pLi += glbDD - col;                           // get to diagonal
419:     if (glbDD > n - 1 - bw) pLi += n - 1 - glbDD; // skip over U, only last block has funny offset
420:     else pLi += bw;
421:     pLi += 1;                                                   // skip to next row
422:     if (field > 0 && (locDD + 1) < bw) pLi += bw - (locDD + 1); // skip padding at beginning (ear)
423:   }
424:   /* Then, solve U */
425:   pLi = ba_csr + Nf * blocknz - 2 * chopnz - 1;                            // end of real data on block (diagonal)
426:   if (field != Nf - 1) pLi -= blocknz_0 + (Nf - 2 - field) * blocknz + bw; // diagonal of last local row

428:   for (int glbDD = end - 1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
429:     const PetscInt col = (locDD < bw) ? end - 1 : glbDD + bw; // end of row in U
430:     PetscScalar    t   = 0;
431:     for (int j = col - tid, idx = tid; j > glbDD; j -= blockDim.x, idx += blockDim.x) { t += pLi[-idx] * x[j]; }
432: #if defined(PETSC_USE_COMPLEX)
433:     PetscReal   tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
434:     PetscScalar tt(breduce<PetscReal, BLOCK_SIZE>(tr), breduce<PetscReal, BLOCK_SIZE>(ti));
435:     t = tt;
436: #else
437:     t = breduce<PetscReal, BLOCK_SIZE>(PetscRealPart(t));
438: #endif
439:     pLi -= col - glbDD; // diagonal
440:     if (threadIdx.x == 0) {
441:       x[glbDD] -= t;
442:       x[glbDD] /= pLi[0];
443:     }
444:     __syncthreads();
445:     // inc past L to start of previous U
446:     pLi -= bw + 1;
447:     if (glbDD < bw) pLi += bw - glbDD;                                    // overshot in top left corner
448:     if (((locDD + 1) < bw) && field != Nf - 1) pLi -= (bw - (locDD + 1)); // skip past right corner
449:   }
450: }

452: static PetscErrorCode MatSolve_SeqAIJHIPSPARSEBAND(Mat A, Vec bb, Vec xx)
453: {
454:   const PetscScalar                    *barray;
455:   PetscScalar                          *xarray;
456:   thrust::device_ptr<const PetscScalar> bGPU;
457:   thrust::device_ptr<PetscScalar>       xGPU;
458:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
459:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
460:   PetscInt                              n = A->rmap->n, nz = hipsparseTriFactors->nnz, Nf = 1;                                                                                 // Nf is batch size - not used
461:   PetscInt                              bw = (int)(2. * (double)n - 1. - (double)(PetscSqrtReal(1. + 4. * ((double)n * (double)n - (double)nz)) + PETSC_MACHINE_EPSILON)) / 2; // quadric formula for bandwidth

463:   if (A->rmap->n == 0) return 0;

465:   /* Get the GPU pointers */
466:   VecHIPGetArrayWrite(xx, &xarray);
467:   VecHIPGetArrayRead(bb, &barray);
468:   xGPU = thrust::device_pointer_cast(xarray);
469:   bGPU = thrust::device_pointer_cast(barray);

471:   PetscLogGpuTimeBegin();
472:   /* First, reorder with the row permutation */
473:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
474:   constexpr int block = 128;
475:   hipLaunchKernelGGL(HIP_KERNEL_NAME(mat_solve_band<block>), dim3(Nf), dim3(block), 0, 0, n, bw, hipsparseTriFactors->a_band_d, tempGPU->data().get());
476:   PetscHIPCheckLaunch; // does a sync

478:   /* Last, reorder with the column permutation */
479:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

481:   VecHIPRestoreArrayRead(bb, &barray);
482:   VecHIPRestoreArrayWrite(xx, &xarray);
483:   PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
484:   PetscLogGpuTimeEnd();

486:   return 0;
487: }