Actual source code: aijhipsparse.hip.cpp

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the HIPSPARSE library.
  4:   Portions of this code are under:
  5:   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  6: */
  7: #define PETSC_SKIP_SPINLOCK

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
 16: #include <thrust/adjacent_difference.h>
 17: #include <thrust/iterator/transform_iterator.h>
 18: #if PETSC_CPP_VERSION >= 14
 19:   #define PETSC_HAVE_THRUST_ASYNC 1
 20:   #include <thrust/async/for_each.h>
 21: #endif
 22: #include <thrust/iterator/constant_iterator.h>
 23: #include <thrust/iterator/discard_iterator.h>
 24: #include <thrust/binary_search.h>
 25: #include <thrust/remove.h>
 26: #include <thrust/sort.h>
 27: #include <thrust/unique.h>

 29: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
 30: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
 31: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
 32: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};

 34: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 35: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 36: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 37: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 38: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 39: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 40: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 41: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 42: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 43: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 44: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
 45: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
 46: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
 47: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 48: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 49: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 50: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 51: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 52: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 53: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
 54: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 55: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
 56: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
 57: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
 58: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **);
 59: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
 60: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
 61: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
 62: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 63: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
 64: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 65: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

 67: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat, Mat, Mat, PetscBool, PetscBool);
 68: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
 69: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
 70: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse_band(Mat, MatFactorType, Mat *);

 72: /*
 73: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
 74: {
 75:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 78:   hipsparsestruct->stream = stream;
 79:   hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream);
 80:   return 0;
 81: }

 83: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
 84: {
 85:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 88:   if (hipsparsestruct->handle != handle) {
 89:     if (hipsparsestruct->handle) hipsparseDestroy(hipsparsestruct->handle);
 90:     hipsparsestruct->handle = handle;
 91:   }
 92:   hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE);
 93:   return 0;
 94: }

 96: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
 97: {
 98:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
 99:   PetscBool            flg;

101:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
102:   if (!flg || !hipsparsestruct) return 0;
103:   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104:   return 0;
105: }
106: */

108: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109: {
110:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

112:   switch (op) {
113:   case MAT_HIPSPARSE_MULT:
114:     hipsparsestruct->format = format;
115:     break;
116:   case MAT_HIPSPARSE_ALL:
117:     hipsparsestruct->format = format;
118:     break;
119:   default:
120:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
121:   }
122:   return 0;
123: }

125: /*@
126:    MatHIPSPARSESetFormat - Sets the storage format of `MATSEQAIJHIPSPARSE` matrices for a particular
127:    operation. Only the `MatMult()` operation can use different GPU storage formats.

129:    Not Collective

131:    Input Parameters:
132: +  A - Matrix of type `MATSEQAIJHIPSPARSE`
133: .  op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`. `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
134: -  format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)

138:    Level: intermediate

140: .seealso: `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141: @*/
142: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
143: {
145:   PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
146:   return 0;
147: }
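
/* Example usage (a minimal sketch, not part of the original source): select the ELL
   storage format for MatMult() on a MATSEQAIJHIPSPARSE matrix

     PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
     PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, MAT_HIPSPARSE_ELL));

   or, equivalently, pass -mat_hipsparse_mult_storage_format ell at runtime. */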

149: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
150: {
151:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

153:   hipsparsestruct->use_cpu_solve = use_cpu;
154:   return 0;
155: }

157: /*@
158:    MatHIPSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

160:    Input Parameters:
161: +  A - Matrix of type `MATSEQAIJHIPSPARSE`
162: -  use_cpu - set flag for using the built-in CPU `MatSolve()`

166:    Notes:
167:    The hipSPARSE LU solver currently computes the factors with the built-in CPU method
168:    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
169:    Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

171:    Level: intermediate

173: .seealso: `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174: @*/
175: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
176: {
178:   PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
179:   return 0;
180: }
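
/* Example usage (a minimal sketch, not part of the original source): keep the triangular
   solves of the factorization on the CPU

     PetscCall(MatHIPSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or, equivalently, pass -mat_hipsparse_use_cpu_solve at runtime (see
   MatSetFromOptions_SeqAIJHIPSPARSE() below). */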

182: PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
183: {
184:   switch (op) {
185:   case MAT_FORM_EXPLICIT_TRANSPOSE:
186:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
187:     if (A->form_explicit_transpose && !flg) MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
188:     A->form_explicit_transpose = flg;
189:     break;
190:   default:
191:     MatSetOption_SeqAIJ(A, op, flg);
192:     break;
193:   }
194:   return 0;
195: }

197: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
198: {
199:   PetscBool            row_identity, col_identity;
200:   Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
201:   IS                   isrow = b->row, iscol = b->col;
202:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

204:   MatSeqAIJHIPSPARSECopyFromGPU(A);
205:   MatLUFactorNumeric_SeqAIJ(B, A, info);
206:   B->offloadmask = PETSC_OFFLOAD_CPU;
207:   /* determine which version of MatSolve needs to be used. */
208:   ISIdentity(isrow, &row_identity);
209:   ISIdentity(iscol, &col_identity);
210:   if (!hipsparsestruct->use_cpu_solve) {
211:     if (row_identity && col_identity) {
212:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
213:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
214:     } else {
215:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
216:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
217:     }
218:   }
219:   B->ops->matsolve          = NULL;
220:   B->ops->matsolvetranspose = NULL;

222:   /* get the triangular factors */
223:   if (!hipsparsestruct->use_cpu_solve) { MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B); }
224:   return 0;
225: }
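
/* Annotation (not part of the original source): the numeric LU factorization above is performed by
   the CPU kernel MatLUFactorNumeric_SeqAIJ(); only the triangular solves are dispatched to the GPU
   (MatSolve_SeqAIJHIPSPARSE and variants) unless use_cpu_solve is set, in which case the default
   SeqAIJ solve routines remain in place. */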

227: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
228: {
229:   MatHIPSPARSEStorageFormat format;
230:   PetscBool                 flg;
231:   Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

233:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
234:   if (A->factortype == MAT_FACTOR_NONE) {
235:     PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg);
236:     if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format);
237:     PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg);
238:     if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format);
239:     PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg);
240:     if (flg) MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve);
241:     PetscCall(
242:       PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
243:     /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
245:     PetscCall(
246:       PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
248:     /*
249:     PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg);
251:     */
252:   }
253:   PetscOptionsHeadEnd();
254:   return 0;
255: }
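
/* For reference (not part of the original source), the options registered above can be combined
   on the command line, e.g.

     -mat_type seqaijhipsparse -mat_hipsparse_storage_format ell -mat_hipsparse_use_cpu_solve

   PetscOptionsEnum() matches the string value against MatHIPSPARSEStorageFormats[],
   MatHIPSPARSESpMVAlgorithms[] and MatHIPSPARSESpMMAlgorithms[] declared at the top of this file. */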

257: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
258: {
259:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
260:   PetscInt                            n                   = A->rmap->n;
261:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
262:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
263:   const PetscInt                     *ai = a->i, *aj = a->j, *vi;
264:   const MatScalar                    *aa = a->a, *v;
265:   PetscInt                           *AiLo, *AjLo;
266:   PetscInt                            i, nz, nzLower, offset, rowOffset;

268:   if (!n) return 0;
269:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
270:     try {
271:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
272:       nzLower = n + ai[n] - ai[1];
273:       if (!loTriFactor) {
274:         PetscScalar *AALo;
275:         hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar));

277:         /* Allocate Space for the lower triangular matrix */
278:         hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt));
279:         hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt));

281:         /* Fill the lower triangular matrix */
282:         AiLo[0]   = (PetscInt)0;
283:         AiLo[n]   = nzLower;
284:         AjLo[0]   = (PetscInt)0;
285:         AALo[0]   = (MatScalar)1.0;
286:         v         = aa;
287:         vi        = aj;
288:         offset    = 1;
289:         rowOffset = 1;
290:         for (i = 1; i < n; i++) {
291:           nz = ai[i + 1] - ai[i];
292:           /* additional 1 for the term on the diagonal */
293:           AiLo[i] = rowOffset;
294:           rowOffset += nz + 1;

296:           PetscArraycpy(&(AjLo[offset]), vi, nz);
297:           PetscArraycpy(&(AALo[offset]), v, nz);
298:           offset += nz;
299:           AjLo[offset] = (PetscInt)i;
300:           AALo[offset] = (MatScalar)1.0;
301:           offset += 1;
302:           v += nz;
303:           vi += nz;
304:         }

306:         /* allocate space for the triangular factor information */
307:         PetscNew(&loTriFactor);
308:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
309:         /* Create the matrix description */
310:         hipsparseCreateMatDescr(&loTriFactor->descr);
311:         hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
312:         hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
313:         hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER);
314:         hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT);

316:         /* set the operation */
317:         loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

319:         /* set the matrix */
320:         loTriFactor->csrMat                 = new CsrMatrix;
321:         loTriFactor->csrMat->num_rows       = n;
322:         loTriFactor->csrMat->num_cols       = n;
323:         loTriFactor->csrMat->num_entries    = nzLower;
324:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
325:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
326:         loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

328:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
329:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
330:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

332:         /* Create the solve analysis information */
333:         PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
334:         hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo);
335:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
336:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
337:         hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);

339:         /* perform the solve analysis */
340:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
341:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

343:         WaitForHIP();
344:         PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

346:         /* assign the pointer */
347:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
348:         loTriFactor->AA_h                                           = AALo;
349:         hipHostFree(AiLo);
350:         hipHostFree(AjLo);
351:         PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar));
352:       } else { /* update values only */
353:         if (!loTriFactor->AA_h) hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar));
354:         /* Fill the lower triangular matrix */
355:         loTriFactor->AA_h[0] = 1.0;
356:         v                    = aa;
357:         vi                   = aj;
358:         offset               = 1;
359:         for (i = 1; i < n; i++) {
360:           nz = ai[i + 1] - ai[i];
361:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
362:           offset += nz;
363:           loTriFactor->AA_h[offset] = 1.0;
364:           offset += 1;
365:           v += nz;
366:         }
367:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
368:         PetscLogCpuToGpu(nzLower * sizeof(PetscScalar));
369:       }
370:     } catch (char *ex) {
371:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
372:     }
373:   }
374:   return 0;
375: }

377: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
378: {
379:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
380:   PetscInt                            n                   = A->rmap->n;
381:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
382:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
383:   const PetscInt                     *aj = a->j, *adiag = a->diag, *vi;
384:   const MatScalar                    *aa = a->a, *v;
385:   PetscInt                           *AiUp, *AjUp;
386:   PetscInt                            i, nz, nzUpper, offset;

388:   if (!n) return 0;
389:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
390:     try {
391:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
392:       nzUpper = adiag[0] - adiag[n];
393:       if (!upTriFactor) {
394:         PetscScalar *AAUp;
395:         hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar));

397:         /* Allocate Space for the upper triangular matrix */
398:         hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt));
399:         hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt));

401:         /* Fill the upper triangular matrix */
402:         AiUp[0] = (PetscInt)0;
403:         AiUp[n] = nzUpper;
404:         offset  = nzUpper;
405:         for (i = n - 1; i >= 0; i--) {
406:           v  = aa + adiag[i + 1] + 1;
407:           vi = aj + adiag[i + 1] + 1;
408:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
409:           offset -= (nz + 1);               /* decrement the offset */

411:           /* first, set the diagonal elements */
412:           AjUp[offset] = (PetscInt)i;
413:           AAUp[offset] = (MatScalar)1. / v[nz];
414:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

416:           PetscArraycpy(&(AjUp[offset + 1]), vi, nz);
417:           PetscArraycpy(&(AAUp[offset + 1]), v, nz);
418:         }

420:         /* allocate space for the triangular factor information */
421:         PetscNew(&upTriFactor);
422:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

424:         /* Create the matrix description */
425:         hipsparseCreateMatDescr(&upTriFactor->descr);
426:         hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
427:         hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
428:         hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER);
429:         hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT);

431:         /* set the operation */
432:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

434:         /* set the matrix */
435:         upTriFactor->csrMat                 = new CsrMatrix;
436:         upTriFactor->csrMat->num_rows       = n;
437:         upTriFactor->csrMat->num_cols       = n;
438:         upTriFactor->csrMat->num_entries    = nzUpper;
439:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
440:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
441:         upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
442:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
443:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
444:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

446:         /* Create the solve analysis information */
447:         PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
448:         hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo);
449:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
450:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
451:         hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);

453:         /* perform the solve analysis */
454:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
455:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

457:         WaitForHIP();
458:         PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

460:         /* assign the pointer */
461:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
462:         upTriFactor->AA_h                                           = AAUp;
463:         hipHostFree(AiUp);
464:         hipHostFree(AjUp);
465:         PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar));
466:       } else {
467:         if (!upTriFactor->AA_h) hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar));
468:         /* Fill the upper triangular matrix */
469:         offset = nzUpper;
470:         for (i = n - 1; i >= 0; i--) {
471:           v  = aa + adiag[i + 1] + 1;
472:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
473:           offset -= (nz + 1);               /* decrement the offset */

475:           /* first, set the diagonal elements */
476:           upTriFactor->AA_h[offset] = 1. / v[nz];
477:           PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz);
478:         }
479:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
480:         PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar));
481:       }
482:     } catch (char *ex) {
483:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
484:     }
485:   }
486:   return 0;
487: }
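
/* Annotation (not part of the original source): the two BuildILU*TriMatrix() routines above unpack
   the combined SeqAIJ factor into separate host CSR copies of the unit-diagonal lower factor and
   the non-unit-diagonal upper factor, upload them into the thrust arrays of their CsrMatrix
   structures, and run the csrsv buffer-size/analysis phase once; on later calls with the same
   sparsity pattern only the values are refilled and re-uploaded. */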

489: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
490: {
491:   PetscBool                      row_identity, col_identity;
492:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
493:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
494:   IS                             isrow = a->row, iscol = a->icol;
495:   PetscInt                       n = A->rmap->n;

498:   MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A);
499:   MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A);

501:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
502:   hipsparseTriFactors->nnz = a->nz;

504:   A->offloadmask = PETSC_OFFLOAD_BOTH;
505:   /* lower triangular indices */
506:   ISIdentity(isrow, &row_identity);
507:   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
508:     const PetscInt *r;

510:     ISGetIndices(isrow, &r);
511:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
512:     hipsparseTriFactors->rpermIndices->assign(r, r + n);
513:     ISRestoreIndices(isrow, &r);
514:     PetscLogCpuToGpu(n * sizeof(PetscInt));
515:   }
516:   /* upper triangular indices */
517:   ISIdentity(iscol, &col_identity);
518:   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
519:     const PetscInt *c;

521:     ISGetIndices(iscol, &c);
522:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
523:     hipsparseTriFactors->cpermIndices->assign(c, c + n);
524:     ISRestoreIndices(iscol, &c);
525:     PetscLogCpuToGpu(n * sizeof(PetscInt));
526:   }
527:   return 0;
528: }
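
/* Annotation (not part of the original source): after the factors are staged on the GPU, this
   routine also allocates the temporary work vector used between the lower and upper triangular
   solves and, when the row/column orderings are not the identity, uploads the permutation indices
   (rpermIndices/cpermIndices) so that MatSolve_SeqAIJHIPSPARSE() can permute the right-hand side
   and solution on the device. */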

530: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
531: {
532:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
533:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
534:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
535:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
536:   PetscInt                           *AiUp, *AjUp;
537:   PetscScalar                        *AAUp;
538:   PetscScalar                        *AALo;
539:   PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
540:   Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data;
541:   const PetscInt                     *ai = b->i, *aj = b->j, *vj;
542:   const MatScalar                    *aa = b->a, *v;

544:   if (!n) return 0;
545:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
546:     try {
547:       hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar));
548:       hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar));
549:       if (!upTriFactor && !loTriFactor) {
550:         /* Allocate Space for the upper triangular matrix */
551:         hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt));
552:         hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt));

554:         /* Fill the upper triangular matrix */
555:         AiUp[0] = (PetscInt)0;
556:         AiUp[n] = nzUpper;
557:         offset  = 0;
558:         for (i = 0; i < n; i++) {
559:           /* set the pointers */
560:           v  = aa + ai[i];
561:           vj = aj + ai[i];
562:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

564:           /* first, set the diagonal elements */
565:           AjUp[offset] = (PetscInt)i;
566:           AAUp[offset] = (MatScalar)1.0 / v[nz];
567:           AiUp[i]      = offset;
568:           AALo[offset] = (MatScalar)1.0 / v[nz];

570:           offset += 1;
571:           if (nz > 0) {
572:             PetscArraycpy(&(AjUp[offset]), vj, nz);
573:             PetscArraycpy(&(AAUp[offset]), v, nz);
574:             for (j = offset; j < offset + nz; j++) {
575:               AAUp[j] = -AAUp[j];
576:               AALo[j] = AAUp[j] / v[nz];
577:             }
578:             offset += nz;
579:           }
580:         }

582:         /* allocate space for the triangular factor information */
583:         PetscNew(&upTriFactor);
584:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

586:         /* Create the matrix description */
587:         hipsparseCreateMatDescr(&upTriFactor->descr);
588:         hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
589:         hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
590:         hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER);
591:         hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT);

593:         /* set the matrix */
594:         upTriFactor->csrMat                 = new CsrMatrix;
595:         upTriFactor->csrMat->num_rows       = A->rmap->n;
596:         upTriFactor->csrMat->num_cols       = A->cmap->n;
597:         upTriFactor->csrMat->num_entries    = a->nz;
598:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
599:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
600:         upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
601:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
602:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
603:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

605:         /* set the operation */
606:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

608:         /* Create the solve analysis information */
609:         PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
610:         hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo);
611:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
612:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
613:         hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);

615:         /* perform the solve analysis */
616:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
617:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

619:         WaitForHIP();
620:         PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

622:         /* assign the pointer */
623:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

625:         /* allocate space for the triangular factor information */
626:         PetscNew(&loTriFactor);
627:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

629:         /* Create the matrix description */
630:         hipsparseCreateMatDescr(&loTriFactor->descr);
631:         hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
632:         hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
633:         hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER);
634:         hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT);

636:         /* set the operation */
637:         loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;

639:         /* set the matrix */
640:         loTriFactor->csrMat                 = new CsrMatrix;
641:         loTriFactor->csrMat->num_rows       = A->rmap->n;
642:         loTriFactor->csrMat->num_cols       = A->cmap->n;
643:         loTriFactor->csrMat->num_entries    = a->nz;
644:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
645:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
646:         loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
647:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
648:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
649:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

651:         /* Create the solve analysis information */
652:         PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
653:         hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo);
654:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
655:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
656:         hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);

658:         /* perform the solve analysis */
659:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
660:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

662:         WaitForHIP();
663:         PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

665:         /* assign the pointer */
666:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

668:         PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)));
669:         hipHostFree(AiUp);
670:         hipHostFree(AjUp);
671:       } else {
672:         /* Fill the upper triangular matrix */
673:         offset = 0;
674:         for (i = 0; i < n; i++) {
675:           /* set the pointers */
676:           v  = aa + ai[i];
677:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

679:           /* first, set the diagonal elements */
680:           AAUp[offset] = 1.0 / v[nz];
681:           AALo[offset] = 1.0 / v[nz];

683:           offset += 1;
684:           if (nz > 0) {
685:             PetscArraycpy(&(AAUp[offset]), v, nz);
686:             for (j = offset; j < offset + nz; j++) {
687:               AAUp[j] = -AAUp[j];
688:               AALo[j] = AAUp[j] / v[nz];
689:             }
690:             offset += nz;
691:           }
692:         }
695:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
696:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
697:         PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar));
698:       }
699:       hipHostFree(AAUp);
700:       hipHostFree(AALo);
701:     } catch (char *ex) {
702:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
703:     }
704:   }
705:   return 0;
706: }
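
/* Annotation (not part of the original source): for ICC only the upper triangle is stored, so the
   same CSR pattern (AiUp/AjUp) is reused for both factors: the "upper" factor is flagged with a
   unit diagonal, while the "lower" factor stores the scaled values AALo and is solved with
   HIPSPARSE_OPERATION_TRANSPOSE so that it acts as the transpose of the upper pattern. */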

708: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
709: {
710:   PetscBool                      perm_identity;
711:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
712:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
713:   IS                             ip                  = a->row;
714:   PetscInt                       n                   = A->rmap->n;

717:   MatSeqAIJHIPSPARSEBuildICCTriMatrices(A);
718:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
719:   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;

721:   A->offloadmask = PETSC_OFFLOAD_BOTH;
722:   /* lower triangular indices */
723:   ISIdentity(ip, &perm_identity);
724:   if (!perm_identity) {
725:     IS              iip;
726:     const PetscInt *irip, *rip;

728:     ISInvertPermutation(ip, PETSC_DECIDE, &iip);
729:     ISGetIndices(iip, &irip);
730:     ISGetIndices(ip, &rip);
731:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
732:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
733:     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
734:     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
735:     ISRestoreIndices(iip, &irip);
736:     ISDestroy(&iip);
737:     ISRestoreIndices(ip, &rip);
738:     PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
739:   }
740:   return 0;
741: }

743: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
744: {
745:   PetscBool   perm_identity;
746:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
747:   IS          ip = b->row;

749:   MatSeqAIJHIPSPARSECopyFromGPU(A);
750:   MatCholeskyFactorNumeric_SeqAIJ(B, A, info);
751:   B->offloadmask = PETSC_OFFLOAD_CPU;
752:   /* determine which version of MatSolve needs to be used. */
753:   ISIdentity(ip, &perm_identity);
754:   if (perm_identity) {
755:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
756:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
757:     B->ops->matsolve          = NULL;
758:     B->ops->matsolvetranspose = NULL;
759:   } else {
760:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE;
761:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE;
762:     B->ops->matsolve          = NULL;
763:     B->ops->matsolvetranspose = NULL;
764:   }

766:   /* get the triangular factors */
767:   MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B);
768:   return 0;
769: }

771: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
772: {
773:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
774:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
775:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
776:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
777:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
778:   hipsparseIndexBase_t                indexBase;
779:   hipsparseMatrixType_t               matrixType;
780:   hipsparseFillMode_t                 fillMode;
781:   hipsparseDiagType_t                 diagType;

783:   /* allocate space for the transpose of the lower triangular factor */
784:   PetscNew(&loTriFactorT);
785:   loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

787:   /* set the matrix descriptors of the lower triangular factor */
788:   matrixType = hipsparseGetMatType(loTriFactor->descr);
789:   indexBase  = hipsparseGetMatIndexBase(loTriFactor->descr);
790:   fillMode   = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
791:   diagType   = hipsparseGetMatDiagType(loTriFactor->descr);

793:   /* Create the matrix description */
794:   hipsparseCreateMatDescr(&loTriFactorT->descr);
795:   hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase);
796:   hipsparseSetMatType(loTriFactorT->descr, matrixType);
797:   hipsparseSetMatFillMode(loTriFactorT->descr, fillMode);
798:   hipsparseSetMatDiagType(loTriFactorT->descr, diagType);

800:   /* set the operation */
801:   loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

803:   /* allocate GPU space for the CSC of the lower triangular factor*/
804:   loTriFactorT->csrMat                 = new CsrMatrix;
805:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
806:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
807:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
808:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
809:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
810:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

812:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
813:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
814: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
815:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
816:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
817:                                                   loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
818:   hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize);
819: #endif
820: */
821:   PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);

823:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows,
824:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
825:                           loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
826:                           loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
827: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
828:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
829:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
830: #else
831:                                        loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
832: #endif

834:   WaitForHIP();
835:   PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);

837:   /* Create the solve analysis information */
838:   PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
839:   hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo);
840:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp,
841:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
842:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
843:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
844:                                 &loTriFactorT->solveBufferSize));
845:   hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize);

847:   /* perform the solve analysis */
848:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp,
849:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
850:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
851:                            loTriFactorT->csrMat->column_indices->data().get(),
852:                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

854:   WaitForHIP();
855:   PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

857:   /* assign the pointer */
858:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

860:   /*********************************************/
861:   /* Now the Transpose of the Upper Tri Factor */
862:   /*********************************************/

864:   /* allocate space for the transpose of the upper triangular factor */
865:   PetscNew(&upTriFactorT);
866:   upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

868:   /* set the matrix descriptors of the upper triangular factor */
869:   matrixType = hipsparseGetMatType(upTriFactor->descr);
870:   indexBase  = hipsparseGetMatIndexBase(upTriFactor->descr);
871:   fillMode   = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
872:   diagType   = hipsparseGetMatDiagType(upTriFactor->descr);

874:   /* Create the matrix description */
875:   hipsparseCreateMatDescr(&upTriFactorT->descr);
876:   hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase);
877:   hipsparseSetMatType(upTriFactorT->descr, matrixType);
878:   hipsparseSetMatFillMode(upTriFactorT->descr, fillMode);
879:   hipsparseSetMatDiagType(upTriFactorT->descr, diagType);

881:   /* set the operation */
882:   upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

884:   /* allocate GPU space for the CSC of the upper triangular factor*/
885:   upTriFactorT->csrMat                 = new CsrMatrix;
886:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
887:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
888:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
889:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
890:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
891:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

893:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
894:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
895: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
896:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
897:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
898:                                                   upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
899:   hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize);
900: #endif
901: */
902:   PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
903:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows,
904:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
905:                           upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
906:                           upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
907: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
908:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
909:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
910: #else
911:                                        upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
912: #endif

914:   WaitForHIP();
915:   PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);

917:   /* Create the solve analysis information */
918:   PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
919:   hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo);
920:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp,
921:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
922:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
923:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
924:                                  &upTriFactorT->solveBufferSize));
925:   hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize);

927:   /* perform the solve analysis */
928:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp,
929:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
930:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
931:                            upTriFactorT->csrMat->column_indices->data().get(),
932:                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

934:   WaitForHIP();
935:   PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

937:   /* assign the pointer */
938:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
939:   return 0;
940: }
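
/* Annotation (not part of the original source): the routine above builds explicit CSC copies of
   both triangular factors with hipsparse_csr2csc() and runs a separate csrsv analysis on them, so
   that MatSolveTranspose_SeqAIJHIPSPARSE() and its natural-ordering variant can perform the
   transposed triangular solves without asking hipSPARSE to transpose on the fly. */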

942: struct PetscScalarToPetscInt {
943:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
944: };

946: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
947: {
948:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
949:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
950:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
951:   hipsparseIndexBase_t           indexBase;

953:   MatSeqAIJHIPSPARSECopyToGPU(A);
954:   matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
956:   matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
958:   if (A->transupdated) return 0;
959:   PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
960:   PetscLogGpuTimeBegin();
961:   if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
962:   if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
963:     matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
964:     hipsparseCreateMatDescr(&matstructT->descr);
965:     indexBase = hipsparseGetMatIndexBase(matstruct->descr);
966:     hipsparseSetMatIndexBase(matstructT->descr, indexBase);
967:     hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);

969:     /* set alpha and beta */
970:     hipMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));
971:     hipMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar));
972:     hipMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));
973:     hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
974:     hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
975:     hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);

977:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
978:       CsrMatrix *matrixT      = new CsrMatrix;
979:       matstructT->mat         = matrixT;
980:       matrixT->num_rows       = A->cmap->n;
981:       matrixT->num_cols       = A->rmap->n;
982:       matrixT->num_entries    = a->nz;
983:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
984:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
985:       matrixT->values         = new THRUSTARRAY(a->nz);

987:       if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
988:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

990:       PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
991:                                             indexBase, hipsparse_scalartype));
992:     } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
993:       CsrMatrix *temp  = new CsrMatrix;
994:       CsrMatrix *tempT = new CsrMatrix;
995:       /* First convert HYB to CSR */
996:       temp->num_rows       = A->rmap->n;
997:       temp->num_cols       = A->cmap->n;
998:       temp->num_entries    = a->nz;
999:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1000:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1001:       temp->values         = new THRUSTARRAY(a->nz);

1003:       hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());

1005:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1006:       tempT->num_rows       = A->rmap->n;
1007:       tempT->num_cols       = A->cmap->n;
1008:       tempT->num_entries    = a->nz;
1009:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1010:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1011:       tempT->values         = new THRUSTARRAY(a->nz);

1013:       PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1014:                                            tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

1016:       /* Last, convert CSC to HYB */
1017:       hipsparseHybMat_t hybMat;
1018:       hipsparseCreateHybMat(&hybMat);
1019:       hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1020:       hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);

1022:       /* assign the pointer */
1023:       matstructT->mat = hybMat;
1024:       A->transupdated = PETSC_TRUE;
1025:       /* delete temporaries */
1026:       if (tempT) {
1027:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1028:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1029:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1030:         delete (CsrMatrix *)tempT;
1031:       }
1032:       if (temp) {
1033:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1034:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1035:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1036:         delete (CsrMatrix *)temp;
1037:       }
1038:     }
1039:   }
1040:   if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* the transpose mat struct may already be present; update its data */
1041:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1042:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1051:     if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1052:       hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1053:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1054:       PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
1055:     }
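    /* Build the csr2csc_i permutation once: run csr2csc on the sequence 0,1,2,... so that, for each entry of
       the transpose, the resulting "value" is the index of the corresponding entry in A's CSR value array.
       Subsequent transpose updates then reduce to the single permuted copy below instead of another csr2csc. */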
1056:     if (!hipsparsestruct->csr2csc_i) {
1057:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1058:       thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0);

1060:       indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1061:       if (matrix->num_entries) {
1062:         /* This routine is known to give errors with CUDA-11 but works fine with CUDA-10.
1063:            This still needs to be verified for ROCm.
1064:         */
1065:         PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1066:                                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1067:       } else {
1068:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1069:       }

1071:       hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1072:       thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt());
1073:     }
1074:     PetscCallThrust(
1075:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1076:   }
1077:   PetscLogGpuTimeEnd();
1078:   PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
1079:   /* the compressed row indices are not used for matTranspose */
1080:   matstructT->cprowIndices = NULL;
1081:   /* assign the pointer */
1082:   ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1083:   A->transupdated                                 = PETSC_TRUE;
1084:   return 0;
1085: }

1087: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1088: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1089: {
1090:   PetscInt                              n = xx->map->n;
1091:   const PetscScalar                    *barray;
1092:   PetscScalar                          *xarray;
1093:   thrust::device_ptr<const PetscScalar> bGPU;
1094:   thrust::device_ptr<PetscScalar>       xGPU;
1095:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1096:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1097:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1098:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1100:   /* Analyze the matrix and create the transpose ... on the fly */
1101:   if (!loTriFactorT && !upTriFactorT) {
1102:     MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A);
1103:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1104:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1105:   }

1107:   /* Get the GPU pointers */
1108:   VecHIPGetArrayWrite(xx, &xarray);
1109:   VecHIPGetArrayRead(bb, &barray);
1110:   xGPU = thrust::device_pointer_cast(xarray);
1111:   bGPU = thrust::device_pointer_cast(barray);

1113:   PetscLogGpuTimeBegin();
1114:   /* First, reorder with the row permutation */
1115:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

1117:   /* Next, solve U */
1118:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1119:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1121:   /* Then, solve L */
1122:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1123:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1125:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1126:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

1128:   /* Copy the temporary to the full solution. */
1129:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

1131:   /* restore */
1132:   VecHIPRestoreArrayRead(bb, &barray);
1133:   VecHIPRestoreArrayWrite(xx, &xarray);
1134:   PetscLogGpuTimeEnd();
1135:   PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
1136:   return 0;
1137: }

1139: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1140: {
1141:   const PetscScalar                  *barray;
1142:   PetscScalar                        *xarray;
1143:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1144:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1145:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1146:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1148:   /* Analyze the matrix and create the transpose ... on the fly */
1149:   if (!loTriFactorT && !upTriFactorT) {
1150:     MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A);
1151:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1152:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1153:   }

1155:   /* Get the GPU pointers */
1156:   VecHIPGetArrayWrite(xx, &xarray);
1157:   VecHIPGetArrayRead(bb, &barray);

1159:   PetscLogGpuTimeBegin();
1160:   /* First, solve U */
1161:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1162:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1164:   /* Then, solve L */
1165:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1166:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1168:   /* restore */
1169:   VecHIPRestoreArrayRead(bb, &barray);
1170:   VecHIPRestoreArrayWrite(xx, &xarray);
1171:   PetscLogGpuTimeEnd();
1172:   PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
1173:   return 0;
1174: }
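
/* Solve A x = b using the cached L and U factors: apply the row permutation to b, run the lower and then the
   upper triangular solve with hipsparseXcsrsv_solve(), and finish by applying the column permutation. */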

1176: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1177: {
1178:   const PetscScalar                    *barray;
1179:   PetscScalar                          *xarray;
1180:   thrust::device_ptr<const PetscScalar> bGPU;
1181:   thrust::device_ptr<PetscScalar>       xGPU;
1182:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1183:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1184:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1185:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1187:   /* Get the GPU pointers */
1188:   VecHIPGetArrayWrite(xx, &xarray);
1189:   VecHIPGetArrayRead(bb, &barray);
1190:   xGPU = thrust::device_pointer_cast(xarray);
1191:   bGPU = thrust::device_pointer_cast(barray);

1193:   PetscLogGpuTimeBegin();
1194:   /* First, reorder with the row permutation */
1195:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

1197:   /* Next, solve L */
1198:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1199:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1201:   /* Then, solve U */
1202:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1203:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1205:   /* Last, reorder with the column permutation */
1206:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

1208:   VecHIPRestoreArrayRead(bb, &barray);
1209:   VecHIPRestoreArrayWrite(xx, &xarray);
1210:   PetscLogGpuTimeEnd();
1211:   PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
1212:   return 0;
1213: }

1215: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1216: {
1217:   const PetscScalar                  *barray;
1218:   PetscScalar                        *xarray;
1219:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1220:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1221:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1222:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1224:   /* Get the GPU pointers */
1225:   VecHIPGetArrayWrite(xx, &xarray);
1226:   VecHIPGetArrayRead(bb, &barray);

1228:   PetscLogGpuTimeBegin();
1229:   /* First, solve L */
1230:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1231:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1233:   /* Next, solve U */
1234:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1235:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1237:   VecHIPRestoreArrayRead(bb, &barray);
1238:   VecHIPRestoreArrayWrite(xx, &xarray);
1239:   PetscLogGpuTimeEnd();
1240:   PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
1241:   return 0;
1242: }

1244: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1245: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0 */
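/* The ILU(0) factors share a single in-place factored CSR array (fs->csrVal): spMatDescr_L views it as unit
   lower triangular and spMatDescr_U as non-unit upper triangular, so the two SpSV solves below read the same
   device data. */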
1246: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1247: {
1248:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1249:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1250:   const PetscScalar             *barray;
1251:   PetscScalar                   *xarray;

1253:   VecHIPGetArrayWrite(x, &xarray);
1254:   VecHIPGetArrayRead(b, &barray);
1255:   PetscLogGpuTimeBegin();

1257:   /* Solve L*y = b */
1258:   hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1259:   hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1260:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                                     /* L Y = X */
1261:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!

1263:   /* Solve U*x = y */
1264:   hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1265:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1266:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1268:   VecHIPRestoreArrayRead(b, &barray);
1269:   VecHIPRestoreArrayWrite(x, &xarray);

1271:   PetscLogGpuTimeEnd();
1272:   PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1273:   return 0;
1274: }

1276: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1277: {
1278:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1279:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1280:   const PetscScalar             *barray;
1281:   PetscScalar                   *xarray;

1283:   if (!fs->createdTransposeSpSVDescr) { /* First call to MatSolveTranspose(): create the SpSV descriptors for the transposed solves */
1284:     hipsparseSpSV_createDescr(&fs->spsvDescr_Lt);
1285:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1286:                                                 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1288:     hipsparseSpSV_createDescr(&fs->spsvDescr_Ut);
1289:     hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut);
1290:     hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1291:     hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut);
1292:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1293:   }

1295:   if (!fs->updatedTransposeSpSVAnalysis) {
1296:     hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);

1298:     hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut);
1299:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1300:   }

1302:   VecHIPGetArrayWrite(x, &xarray);
1303:   VecHIPGetArrayRead(b, &barray);
1304:   PetscLogGpuTimeBegin();

1306:   /* Solve Ut*y = b */
1307:   hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1308:   hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1309:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1310:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));

1312:   /* Solve Lt*x = y */
1313:   hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1314:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1315:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1317:   VecHIPRestoreArrayRead(b, &barray);
1318:   VecHIPRestoreArrayWrite(x, &xarray);
1319:   PetscLogGpuTimeEnd();
1320:   PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1321:   return 0;
1322: }

1324: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1325: {
1326:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1327:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1328:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1329:   CsrMatrix                     *Acsr;
1330:   PetscInt                       m, nz;
1331:   PetscBool                      flg;

1333:   if (PetscDefined(USE_DEBUG)) {
1334:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1336:   }

1338:   /* Copy A's value to fact */
1339:   m  = fact->rmap->n;
1340:   nz = aij->nz;
1341:   MatSeqAIJHIPSPARSECopyToGPU(A);
1342:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1343:   hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);

1345:   /* Factorize fact inplace */
1346:   if (m)
1347:     PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1348:                                           fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1349:   if (PetscDefined(USE_DEBUG)) {
1350:     int               numerical_zero;
1351:     hipsparseStatus_t status;
1352:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1353:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1354:   }

1356:   /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1357:   hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);

1359:   hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U);

1361:   /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1362:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1364:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1365:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
1366:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1367:   fact->ops->matsolve          = NULL;
1368:   fact->ops->matsolvetranspose = NULL;
1369:   PetscLogGpuFlops(fs->numericFactFlops);
1370:   return 0;
1371: }

1373: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1374: {
1375:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1376:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1377:   PetscInt                       m, nz;

1379:   if (PetscDefined(USE_DEBUG)) {
1380:     PetscInt  i;
1381:     PetscBool flg, missing;

1383:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1386:     MatMissingDiagonal(A, &missing, &i);
1388:   }

1390:   /* Free the old stale stuff */
1391:   MatSeqAIJHIPSPARSETriFactors_Reset(&fs);

1393:   /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on the host,
1394:      though they will not be used; we allocate them just for easy debugging.
1395:    */
1396:   MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);

1398:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1399:   fact->factortype             = MAT_FACTOR_ILU;
1400:   fact->info.factor_mallocs    = 0;
1401:   fact->info.fill_ratio_given  = info->fill;
1402:   fact->info.fill_ratio_needed = 1.0;

1404:   aij->row = NULL;
1405:   aij->col = NULL;

1407:   /* ====================================================================== */
1408:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1409:   /* We'll do in-place factorization on fact                                */
1410:   /* ====================================================================== */
1411:   const int *Ai, *Aj;

1413:   m  = fact->rmap->n;
1414:   nz = aij->nz;

1416:   hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1417:   hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1418:   hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1419:   MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1420:   hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1421:   hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);

1423:   /* ====================================================================== */
1424:   /* Create descriptors for M, L, U                                         */
1425:   /* ====================================================================== */
1426:   hipsparseFillMode_t fillMode;
1427:   hipsparseDiagType_t diagType;

1429:   hipsparseCreateMatDescr(&fs->matDescr_M);
1430:   hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO);
1431:   hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL);

1433:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1434:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1435:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1436:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1437:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1438:   */
1439:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1440:   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1441:   hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
1442:   hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1443:   hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));

1445:   fillMode = HIPSPARSE_FILL_MODE_UPPER;
1446:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1447:   hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
1448:   hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1449:   hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));

1451:   /* ========================================================================= */
1452:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1453:   /* ========================================================================= */
1454:   hipsparseCreateCsrilu02Info(&fs->ilu0Info_M);
1455:   if (m)
1456:     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1457:                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

1459:   hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1460:   hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);

1462:   hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype);
1463:   hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype);

1465:   hipsparseSpSV_createDescr(&fs->spsvDescr_L);
1466:   hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);

1468:   hipsparseSpSV_createDescr(&fs->spsvDescr_U);
1469:   hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U);

1471:   /* It appears spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1472:      To save memory, we make factBuffer_M share with the bigger of the two.
1473:    */
1474:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1475:     hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1476:     fs->spsvBuffer_L = fs->factBuffer_M;
1477:     hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U);
1478:   } else {
1479:     hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M));
1480:     fs->spsvBuffer_U = fs->factBuffer_M;
1481:     hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1482:   }

1484:   /* ========================================================================== */
1485:   /* Perform analysis of ilu0 on M, SpSV on L and U                             */
1486:   /* The lower (upper) triangular part of M has the same sparsity pattern as L (U) */
1487:   /* ========================================================================== */
1488:   int structural_zero;

1490:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1491:   if (m)
1492:     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1493:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1494:   if (PetscDefined(USE_DEBUG)) {
1495:     /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1496:     hipsparseStatus_t status;
1497:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1499:   }

1501:   /* Estimate FLOPs of the numeric factorization */
1502:   {
1503:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1504:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1505:     PetscLogDouble flops = 0.0;

1507:     MatMarkDiagonal_SeqAIJ(A);
1508:     Ai    = Aseq->i;
1509:     Adiag = Aseq->diag;
1510:     for (PetscInt i = 0; i < m; i++) {
1511:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1512:         nzRow  = Ai[i + 1] - Ai[i];
1513:         nzLeft = Adiag[i] - Ai[i];
1514:         /* We eliminate the nonzeros left of the diagonal one at a time (nzLeft below is estimated as (nzRow-1)/2).
1515:           Eliminating the j-th such entry updates the nzRow - j + 1 entries from it to the end of the row, at one
1516:           multiplication and one addition each: sum_{j=1..nzLeft} 2*(nzRow - j + 1) = nzLeft*(2*nzRow - nzLeft + 1). */
1517:         nzLeft = (nzRow - 1) / 2;
1518:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1519:       }
1520:     }
1521:     fs->numericFactFlops = flops;
1522:   }
1523:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1524:   return 0;
1525: }

1527: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1528: {
1529:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1530:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1531:   const PetscScalar             *barray;
1532:   PetscScalar                   *xarray;

1534:   VecHIPGetArrayWrite(x, &xarray);
1535:   VecHIPGetArrayRead(b, &barray);
1536:   PetscLogGpuTimeBegin();

1538:   /* Solve L*y = b */
1539:   hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1540:   hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1541:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1542:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1544:   /* Solve Lt*x = y */
1545:   hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1546:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1547:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1549:   VecHIPRestoreArrayRead(b, &barray);
1550:   VecHIPRestoreArrayWrite(x, &xarray);

1552:   PetscLogGpuTimeEnd();
1553:   PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1554:   return 0;
1555: }

1557: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1558: {
1559:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1560:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1561:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1562:   CsrMatrix                     *Acsr;
1563:   PetscInt                       m, nz;
1564:   PetscBool                      flg;

1566:   if (PetscDefined(USE_DEBUG)) {
1567:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1569:   }

1571:   /* Copy A's value to fact */
1572:   m  = fact->rmap->n;
1573:   nz = aij->nz;
1574:   MatSeqAIJHIPSPARSECopyToGPU(A);
1575:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1576:   hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);

1578:   /* Factorize fact inplace */
1579:   /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1580:      The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1581:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1582:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1583:    */
1584:   if (m) hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1585:   if (PetscDefined(USE_DEBUG)) {
1586:     int               numerical_zero;
1587:     hipsparseStatus_t status;
1588:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1589:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1590:   }

1592:   hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);

1594:   /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1595:     ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1596:   */
1597:   hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);

1599:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1600:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
1601:   fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
1602:   fact->ops->matsolve          = NULL;
1603:   fact->ops->matsolvetranspose = NULL;
1604:   PetscLogGpuFlops(fs->numericFactFlops);
1605:   return 0;
1606: }

1608: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1609: {
1610:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1611:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1612:   PetscInt                       m, nz;

1614:   if (PetscDefined(USE_DEBUG)) {
1615:     PetscInt  i;
1616:     PetscBool flg, missing;

1618:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1621:     MatMissingDiagonal(A, &missing, &i);
1623:   }

1625:   /* Free the old stale stuff */
1626:   MatSeqAIJHIPSPARSETriFactors_Reset(&fs);

1628:   /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on the host,
1629:      though they will not be used; we allocate them just for easy debugging.
1630:    */
1631:   MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);

1633:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1634:   fact->factortype             = MAT_FACTOR_ICC;
1635:   fact->info.factor_mallocs    = 0;
1636:   fact->info.fill_ratio_given  = info->fill;
1637:   fact->info.fill_ratio_needed = 1.0;

1639:   aij->row = NULL;
1640:   aij->col = NULL;

1642:   /* ====================================================================== */
1643:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1644:   /* We'll do in-place factorization on fact                                */
1645:   /* ====================================================================== */
1646:   const int *Ai, *Aj;

1648:   m  = fact->rmap->n;
1649:   nz = aij->nz;

1651:   hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1652:   hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1653:   hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1654:   MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1655:   hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1656:   hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);

1658:   /* ====================================================================== */
1659:   /* Create mat descriptors for M, L                                        */
1660:   /* ====================================================================== */
1661:   hipsparseFillMode_t fillMode;
1662:   hipsparseDiagType_t diagType;

1664:   hipsparseCreateMatDescr(&fs->matDescr_M);
1665:   hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO);
1666:   hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL);

1668:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1669:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1670:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1671:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1672:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1673:   */
1674:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1675:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1676:   hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
1677:   hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1678:   hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));

1680:   /* ========================================================================= */
1681:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1682:   /* ========================================================================= */
1683:   hipsparseCreateCsric02Info(&fs->ic0Info_M);
1684:   if (m) hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M);

1686:   hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1687:   hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);

1689:   hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype);
1690:   hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype);

1692:   hipsparseSpSV_createDescr(&fs->spsvDescr_L);
1693:   hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);

1695:   hipsparseSpSV_createDescr(&fs->spsvDescr_Lt);
1696:   hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt);

1698:   /* To save device memory, we make the factorization buffer share with one of the solve buffers.
1699:      See also the comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1700:    */
1701:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1702:     hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1703:     fs->spsvBuffer_L = fs->factBuffer_M;
1704:     hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1705:   } else {
1706:     hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M));
1707:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1708:     hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1709:   }

1711:   /* ========================================================================== */
1712:   /* Perform analysis of ic0 on M                                               */
1713:   /* The lower triangular part of M has the same sparsity pattern as L          */
1714:   /* ========================================================================== */
1715:   int structural_zero;

1717:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1718:   if (m) hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1719:   if (PetscDefined(USE_DEBUG)) {
1720:     hipsparseStatus_t status;
1721:     /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1722:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1724:   }

1726:   /* Estimate FLOPs of the numeric factorization */
1727:   {
1728:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1729:     PetscInt      *Ai, nzRow, nzLeft;
1730:     PetscLogDouble flops = 0.0;

1732:     Ai = Aseq->i;
1733:     for (PetscInt i = 0; i < m; i++) {
1734:       nzRow = Ai[i + 1] - Ai[i];
1735:       if (nzRow > 1) {
1736:         /* We eliminate the nonzeros left of the diagonal one at a time (nzLeft below is taken as (nzRow-1)/2).
1737:           Eliminating the j-th such entry updates the nzRow - j + 1 entries from it to the end of the row, at one
1738:           multiplication and one addition each: sum_{j=1..nzLeft} 2*(nzRow - j + 1) = nzLeft*(2*nzRow - nzLeft + 1). */
1739:         nzLeft = (nzRow - 1) / 2;
1740:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1741:       }
1742:     }
1743:     fs->numericFactFlops = flops;
1744:   }
1745:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1746:   return 0;
1747: }
1748: #endif
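
/* Dispatch: with ROCm >= 4.5, when factorization is requested on the device, the row/column permutations are
   the identity, and no fill is requested, use the device ILU(0) path above; otherwise fall back to the host
   symbolic factorization and the csrsv-based triangular solves. */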

1750: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1751: {
1752:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1754: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1755:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1756:   if (hipsparseTriFactors->factorizeOnDevice) {
1757:     ISIdentity(isrow, &row_identity);
1758:     ISIdentity(iscol, &col_identity);
1759:   }
1760:   if (!info->levels && row_identity && col_identity) MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info);
1761:   else
1762: #endif
1763:   {
1764:     MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1765:     MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1766:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1767:   }
1768:   return 0;
1769: }

1771: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1772: {
1773:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1775:   MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1776:   MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1777:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1778:   return 0;
1779: }

1781: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1782: {
1783:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1785: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1786:   PetscBool perm_identity = PETSC_FALSE;
1787:   if (hipsparseTriFactors->factorizeOnDevice) ISIdentity(perm, &perm_identity);
1788:   if (!info->levels && perm_identity) MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info);
1789:   else
1790: #endif
1791:   {
1792:     MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1793:     MatICCFactorSymbolic_SeqAIJ(B, A, perm, info);
1794:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1795:   }
1796:   return 0;
1797: }

1799: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1800: {
1801:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1803:   MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1804:   MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info);
1805:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1806:   return 0;
1807: }

1809: PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1810: {
1811:   *type = MATSOLVERHIPSPARSE;
1812:   return 0;
1813: }

1815: /*MC
1816:   MATSOLVERHIPSPARSE = "hipsparse" - A matrix solver type providing triangular solvers for sequential matrices
1817:   of type `MATSEQAIJHIPSPARSE` on a single GPU. The currently supported
1818:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1819:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1820:   hipSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
1821:   algorithms are not recommended. This class does NOT support direct solver operations.

1823:   Level: beginner
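
  A minimal usage sketch (hypothetical driver code; `ksp` is an assumed, already configured `KSP` whose operator
  is a `MATSEQAIJHIPSPARSE` matrix):
.vb
    PC pc;
    KSPGetPC(ksp, &pc);
    PCSetType(pc, PCILU);                              /* incomplete LU preconditioner */
    PCFactorSetMatSolverType(pc, MATSOLVERHIPSPARSE);  /* use the hipSPARSE triangular solves */
.ve
  or, equivalently, run with -pc_type ilu -pc_factor_mat_solver_type hipsparse on the command line.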

1825: .seealso: `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1826: M*/

1828: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1829: {
1830:   PetscInt  n = A->rmap->n;
1831:   PetscBool factOnDevice, factOnHost;
1832:   char     *prefix;
1833:   char      factPlace[32] = "device"; /* the default */

1835:   MatCreate(PetscObjectComm((PetscObject)A), B);
1836:   MatSetSizes(*B, n, n, n, n);
1837:   (*B)->factortype = ftype;
1838:   MatSetType(*B, MATSEQAIJHIPSPARSE);

1840:   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1841:   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1842:   PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL);
1843:   PetscOptionsEnd();
1844:   PetscStrcasecmp("device", factPlace, &factOnDevice);
1845:   PetscStrcasecmp("host", factPlace, &factOnHost);
1847:   ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

1849:   if (A->boundtocpu && A->bindingpropagates) MatBindToCPU(*B, PETSC_TRUE);
1850:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1851:     MatSetBlockSizesFromMats(*B, A, A);
1852:     if (!A->boundtocpu) {
1853:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1854:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1855:     } else {
1856:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1857:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1858:     }
1859:     PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1860:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]);
1861:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
1862:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1863:     if (!A->boundtocpu) {
1864:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1865:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1866:     } else {
1867:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1868:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1869:     }
1870:     PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
1871:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]);
1872:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

1874:   MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1875:   (*B)->canuseordering = PETSC_TRUE;
1876:   PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse);
1877:   return 0;
1878: }

1880: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1881: {
1882:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
1883:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1884: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1885:   Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1886: #endif

1888:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1889:     PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0);
1890:     if (A->factortype == MAT_FACTOR_NONE) {
1891:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1892:       hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost);
1893:     }
1894: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1895:     else if (fs->csrVal) {
1896:       /* We have a factorized matrix on device and are able to copy it to host */
1897:       hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost);
1898:     }
1899: #endif
1900:     else
1901:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1902:     PetscLogGpuToCpu(a->nz * sizeof(PetscScalar));
1903:     PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0);
1904:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1905:   }
1906:   return 0;
1907: }
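
/* Host-side array access helpers: MatSeqAIJGetArray/GetArrayRead copy values back from the device when needed,
   MatSeqAIJGetArrayWrite hands out the host array without copying, and the plain and write Restore variants
   mark the host copy as authoritative (PETSC_OFFLOAD_CPU). */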

1909: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1910: {
1911:   MatSeqAIJHIPSPARSECopyFromGPU(A);
1912:   *array = ((Mat_SeqAIJ *)A->data)->a;
1913:   return 0;
1914: }

1916: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1917: {
1918:   A->offloadmask = PETSC_OFFLOAD_CPU;
1919:   *array         = NULL;
1920:   return 0;
1921: }

1923: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1924: {
1925:   MatSeqAIJHIPSPARSECopyFromGPU(A);
1926:   *array = ((Mat_SeqAIJ *)A->data)->a;
1927:   return 0;
1928: }

1930: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1931: {
1932:   *array = NULL;
1933:   return 0;
1934: }

1936: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1937: {
1938:   *array = ((Mat_SeqAIJ *)A->data)->a;
1939:   return 0;
1940: }

1942: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1943: {
1944:   A->offloadmask = PETSC_OFFLOAD_CPU;
1945:   *array         = NULL;
1946:   return 0;
1947: }

1949: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1950: {
1951:   Mat_SeqAIJHIPSPARSE *cusp;
1952:   CsrMatrix           *matrix;

1954:   MatSeqAIJHIPSPARSECopyToGPU(A);
1956:   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1958:   matrix = (CsrMatrix *)cusp->mat->mat;

1960:   if (i) {
1961: #if !defined(PETSC_USE_64BIT_INDICES)
1962:     *i = matrix->row_offsets->data().get();
1963: #else
1964:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE does not support 64-bit indices");
1965: #endif
1966:   }
1967:   if (j) {
1968: #if !defined(PETSC_USE_64BIT_INDICES)
1969:     *j = matrix->column_indices->data().get();
1970: #else
1971:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE does not support 64-bit indices");
1972: #endif
1973:   }
1974:   if (a) *a = matrix->values->data().get();
1975:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
1976:   return 0;
1977: }

1979: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
1980: {
1981:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1982:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
1983:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
1984:   PetscBool                      both            = PETSC_TRUE;
1985:   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;

1988:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1989:     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
1990:       CsrMatrix *matrix;
1991:       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

1994:       PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
1995:       matrix->values->assign(a->a, a->a + a->nz);
1996:       WaitForHIP();
1997:       PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar));
1998:       PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
1999:       MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE);
2000:     } else {
2001:       PetscInt nnz;
2002:       PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
2003:       MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format);
2004:       MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
2005:       delete hipsparsestruct->workVector;
2006:       delete hipsparsestruct->rowoffsets_gpu;
2007:       hipsparsestruct->workVector     = NULL;
2008:       hipsparsestruct->rowoffsets_gpu = NULL;
2009:       try {
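        /* With compressed row storage only rows that contain nonzeros are kept: ii holds the compressed row
           offsets and ridx maps each kept row back to its global row index. */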
2010:         if (a->compressedrow.use) {
2011:           m    = a->compressedrow.nrows;
2012:           ii   = a->compressedrow.i;
2013:           ridx = a->compressedrow.rindex;
2014:         } else {
2015:           m    = A->rmap->n;
2016:           ii   = a->i;
2017:           ridx = NULL;
2018:         }
2020:         if (!a->a) {
2021:           nnz  = ii[m];
2022:           both = PETSC_FALSE;
2023:         } else nnz = a->nz;

2026:         /* create hipsparse matrix */
2027:         hipsparsestruct->nrows = m;
2028:         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
2029:         hipsparseCreateMatDescr(&matstruct->descr);
2030:         hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO);
2031:         hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);

2033:         hipMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));
2034:         hipMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar));
2035:         hipMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));
2036:         hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
2037:         hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
2038:         hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
2039:         hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE);

2041:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2042:         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2043:           /* set the matrix */
2044:           CsrMatrix *mat      = new CsrMatrix;
2045:           mat->num_rows       = m;
2046:           mat->num_cols       = A->cmap->n;
2047:           mat->num_entries    = nnz;
2048:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2049:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2050:           mat->values         = new THRUSTARRAY(nnz);
2051:           mat->row_offsets->assign(ii, ii + m + 1);
2052:           mat->column_indices->assign(a->j, a->j + nnz);
2053:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2055:           /* assign the pointer */
2056:           matstruct->mat = mat;
2057:           if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2058:             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2059:                                                   HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2060:           }
2061:         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2062:           CsrMatrix *mat      = new CsrMatrix;
2063:           mat->num_rows       = m;
2064:           mat->num_cols       = A->cmap->n;
2065:           mat->num_entries    = nnz;
2066:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2067:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2068:           mat->values         = new THRUSTARRAY(nnz);
2069:           mat->row_offsets->assign(ii, ii + m + 1);
2070:           mat->column_indices->assign(a->j, a->j + nnz);
2071:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2073:           hipsparseHybMat_t hybMat;
2074:           hipsparseCreateHybMat(&hybMat);
2075:           hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2076:           hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2077:           /* assign the pointer */
2078:           matstruct->mat = hybMat;

2080:           if (mat) {
2081:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2082:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2083:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2084:             delete (CsrMatrix *)mat;
2085:           }
2086:         }

2088:         /* assign the compressed row indices */
2089:         if (a->compressedrow.use) {
2090:           hipsparsestruct->workVector = new THRUSTARRAY(m);
2091:           matstruct->cprowIndices     = new THRUSTINTARRAY(m);
2092:           matstruct->cprowIndices->assign(ridx, ridx + m);
2093:           tmp = m;
2094:         } else {
2095:           hipsparsestruct->workVector = NULL;
2096:           matstruct->cprowIndices     = NULL;
2097:           tmp                         = 0;
2098:         }
2099:         PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar));

2101:         /* assign the pointer */
2102:         hipsparsestruct->mat = matstruct;
2103:       } catch (char *ex) {
2104:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2105:       }
2106:       WaitForHIP();
2107:       PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
2108:       hipsparsestruct->nonzerostate = A->nonzerostate;
2109:     }
2110:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2111:   }
2112:   return 0;
2113: }
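
/*
  The copy above is driven by A->offloadmask: operations that modify the matrix on the host
  (MatSetValues(), MatScale(), MatAXPY(), ...) leave the up-to-date values on the CPU, and the
  next device operation re-synchronizes. A rough sketch of a typical triggering sequence
  (illustrative only; row, col, v, x, y are placeholders):

    PetscCall(MatSetValues(A, 1, &row, 1, &col, &v, INSERT_VALUES)); // host data changes
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatMult(A, x, y)); // calls MatSeqAIJHIPSPARSECopyToGPU() before the SpMV
*/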

2115: struct VecHIPPlusEquals {
2116:   template <typename Tuple>
2117:   __host__ __device__ void operator()(Tuple t)
2118:   {
2119:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2120:   }
2121: };

2123: struct VecHIPEquals {
2124:   template <typename Tuple>
2125:   __host__ __device__ void operator()(Tuple t)
2126:   {
2127:     thrust::get<1>(t) = thrust::get<0>(t);
2128:   }
2129: };

2131: struct VecHIPEqualsReverse {
2132:   template <typename Tuple>
2133:   __host__ __device__ void operator()(Tuple t)
2134:   {
2135:     thrust::get<0>(t) = thrust::get<1>(t);
2136:   }
2137: };
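
/*
  The functors above are applied elementwise through thrust::for_each() over zipped iterators,
  e.g. to gather from or scatter to a compressed work vector via a permutation iterator. A sketch
  of the pattern, mirroring the usage further below (names are placeholders):

    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(work->begin(),
                       thrust::make_permutation_iterator(xarr, cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(work->begin(),
                       thrust::make_permutation_iterator(xarr, cprowIndices->begin()))) + cprowIndices->size(),
                     VecHIPEqualsReverse());
*/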

2139: struct MatMatHipsparse {
2140:   PetscBool             cisdense;
2141:   PetscScalar          *Bt;
2142:   Mat                   X;
2143:   PetscBool             reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
2144:   PetscLogDouble        flops;
2145:   CsrMatrix            *Bcsr;
2146:   hipsparseSpMatDescr_t matSpBDescr;
2147:   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
2148:   hipsparseDnMatDescr_t matBDescr;
2149:   hipsparseDnMatDescr_t matCDescr;
2150:   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2151: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2152:   void *dBuffer4, *dBuffer5;
2153: #endif
2154:   size_t                 mmBufferSize;
2155:   void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2156:   hipsparseSpGEMMDescr_t spgemmDesc;
2157: };

2159: static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
2160: {
2161:   MatMatHipsparse *mmdata = (MatMatHipsparse *)data;

2163:   hipFree(mmdata->Bt);
2164:   delete mmdata->Bcsr;
2165:   if (mmdata->matSpBDescr) hipsparseDestroySpMat(mmdata->matSpBDescr);
2166:   if (mmdata->matBDescr) hipsparseDestroyDnMat(mmdata->matBDescr);
2167:   if (mmdata->matCDescr) hipsparseDestroyDnMat(mmdata->matCDescr);
2168:   if (mmdata->spgemmDesc) hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc);
2169: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2170:   if (mmdata->dBuffer4) hipFree(mmdata->dBuffer4);
2171:   if (mmdata->dBuffer5) hipFree(mmdata->dBuffer5);
2172: #endif
2173:   if (mmdata->mmBuffer) hipFree(mmdata->mmBuffer);
2174:   if (mmdata->mmBuffer2) hipFree(mmdata->mmBuffer2);
2175:   MatDestroy(&mmdata->X);
2176:   PetscFree(data);
2177:   return 0;
2178: }

2180: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2181: {
2182:   Mat_Product                   *product = C->product;
2183:   Mat                            A, B;
2184:   PetscInt                       m, n, blda, clda;
2185:   PetscBool                      flg, biship;
2186:   Mat_SeqAIJHIPSPARSE           *cusp;
2187:   hipsparseOperation_t           opA;
2188:   const PetscScalar             *barray;
2189:   PetscScalar                   *carray;
2190:   MatMatHipsparse               *mmdata;
2191:   Mat_SeqAIJHIPSPARSEMultStruct *mat;
2192:   CsrMatrix                     *csrmat;

2194:   MatCheckProduct(C, 1);
2196:   mmdata = (MatMatHipsparse *)product->data;
2197:   A      = product->A;
2198:   B      = product->B;
2199:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
2201:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU;
2202:      instead of silently accepting the wrong answer, I prefer to raise an error */
2204:   MatSeqAIJHIPSPARSECopyToGPU(A);
2205:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2206:   switch (product->type) {
2207:   case MATPRODUCT_AB:
2208:   case MATPRODUCT_PtAP:
2209:     mat = cusp->mat;
2210:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2211:     m   = A->rmap->n;
2212:     n   = B->cmap->n;
2213:     break;
2214:   case MATPRODUCT_AtB:
2215:     if (!A->form_explicit_transpose) {
2216:       mat = cusp->mat;
2217:       opA = HIPSPARSE_OPERATION_TRANSPOSE;
2218:     } else {
2219:       MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
2220:       mat = cusp->matTranspose;
2221:       opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2222:     }
2223:     m = A->cmap->n;
2224:     n = B->cmap->n;
2225:     break;
2226:   case MATPRODUCT_ABt:
2227:   case MATPRODUCT_RARt:
2228:     mat = cusp->mat;
2229:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2230:     m   = A->rmap->n;
2231:     n   = B->rmap->n;
2232:     break;
2233:   default:
2234:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2235:   }
2237:   csrmat = (CsrMatrix *)mat->mat;
2238:   /* if the user passed a CPU matrix, copy the data to the GPU */
2239:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship);
2240:   if (!biship) { MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B); }
2241:   MatDenseHIPGetArrayRead(B, &barray);
2242:   MatDenseGetLDA(B, &blda);
2243:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2244:     MatDenseHIPGetArrayWrite(mmdata->X, &carray);
2245:     MatDenseGetLDA(mmdata->X, &clda);
2246:   } else {
2247:     MatDenseHIPGetArrayWrite(C, &carray);
2248:     MatDenseGetLDA(C, &clda);
2249:   }

2251:   PetscLogGpuTimeBegin();
2252:   hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2253:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2254:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2255:     size_t mmBufferSize;
2256:     if (mmdata->initialized && mmdata->Blda != blda) {
2257:       hipsparseDestroyDnMat(mmdata->matBDescr);
2258:       mmdata->matBDescr = NULL;
2259:     }
2260:     if (!mmdata->matBDescr) {
2261:       hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COLUMN);
2262:       mmdata->Blda = blda;
2263:     }
2264:     if (mmdata->initialized && mmdata->Clda != clda) {
2265:       hipsparseDestroyDnMat(mmdata->matCDescr);
2266:       mmdata->matCDescr = NULL;
2267:     }
2268:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2269:       hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COLUMN);
2270:       mmdata->Clda = clda;
2271:     }
2272:     if (!mat->matDescr) {
2273:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2274:                                             HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2275:     }
2276:     hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2277:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2278:       hipFree(mmdata->mmBuffer);
2279:       hipMalloc(&mmdata->mmBuffer, mmBufferSize);
2280:       mmdata->mmBufferSize = mmBufferSize;
2281:     }
2282:     mmdata->initialized = PETSC_TRUE;
2283:   } else {
2284:     /* to be safe, always update pointers of the mats */
2285:     hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get());
2286:     hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray);
2287:     hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray);
2288:   }

2290:   /* do hipsparseSpMM, which supports transpose on B */
2291:   hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);

2293:   PetscLogGpuTimeEnd();
2294:   PetscLogGpuFlops(n * 2.0 * csrmat->num_entries);
2295:   MatDenseHIPRestoreArrayRead(B, &barray);
2296:   if (product->type == MATPRODUCT_RARt) {
2297:     MatDenseHIPRestoreArrayWrite(mmdata->X, &carray);
2298:     MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE);
2299:   } else if (product->type == MATPRODUCT_PtAP) {
2300:     MatDenseHIPRestoreArrayWrite(mmdata->X, &carray);
2301:     MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE);
2302:   } else MatDenseHIPRestoreArrayWrite(C, &carray);
2303:   if (mmdata->cisdense) MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C);
2304:   if (!biship) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
2305:   return 0;
2306: }
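
/*
  The sparse-times-dense kernel above is normally reached through the MatProduct interface
  (or the MatMatMult() convenience wrapper). A minimal sketch, assuming A is MATSEQAIJHIPSPARSE
  and B is MATSEQDENSE or MATSEQDENSEHIP (illustrative only):

    Mat C;
    PetscCall(MatProductCreate(A, B, NULL, &C));
    PetscCall(MatProductSetType(C, MATPRODUCT_AB));
    PetscCall(MatProductSetFromOptions(C));
    PetscCall(MatProductSymbolic(C)); // dispatches to MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP
    PetscCall(MatProductNumeric(C));  // dispatches to this routine
*/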

2308: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2309: {
2310:   Mat_Product         *product = C->product;
2311:   Mat                  A, B;
2312:   PetscInt             m, n;
2313:   PetscBool            cisdense, flg;
2314:   MatMatHipsparse     *mmdata;
2315:   Mat_SeqAIJHIPSPARSE *cusp;

2317:   MatCheckProduct(C, 1);
2319:   A = product->A;
2320:   B = product->B;
2321:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
2323:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2325:   switch (product->type) {
2326:   case MATPRODUCT_AB:
2327:     m = A->rmap->n;
2328:     n = B->cmap->n;
2329:     break;
2330:   case MATPRODUCT_AtB:
2331:     m = A->cmap->n;
2332:     n = B->cmap->n;
2333:     break;
2334:   case MATPRODUCT_ABt:
2335:     m = A->rmap->n;
2336:     n = B->rmap->n;
2337:     break;
2338:   case MATPRODUCT_PtAP:
2339:     m = B->cmap->n;
2340:     n = B->cmap->n;
2341:     break;
2342:   case MATPRODUCT_RARt:
2343:     m = B->rmap->n;
2344:     n = B->rmap->n;
2345:     break;
2346:   default:
2347:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2348:   }
2349:   MatSetSizes(C, m, n, m, n);
2350:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2351:   PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense);
2352:   MatSetType(C, MATSEQDENSEHIP);

2354:   /* product data */
2355:   PetscNew(&mmdata);
2356:   mmdata->cisdense = cisdense;
2357:   /* for these products we need intermediate storage */
2358:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2359:     MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X);
2360:     MatSetType(mmdata->X, MATSEQDENSEHIP);
2361:     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2362:     if (product->type == MATPRODUCT_RARt) MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n);
2363:     else MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n);
2364:   }
2365:   C->product->data       = mmdata;
2366:   C->product->destroy    = MatDestroy_MatMatHipsparse;
2367:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2368:   return 0;
2369: }

2371: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2372: {
2373:   Mat_Product                   *product = C->product;
2374:   Mat                            A, B;
2375:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2376:   Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
2377:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2378:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2379:   PetscBool                      flg;
2380:   MatProductType                 ptype;
2381:   MatMatHipsparse               *mmdata;
2382:   hipsparseSpMatDescr_t          BmatSpDescr;
2383:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2385:   MatCheckProduct(C, 1);
2387:   PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg);
2389:   mmdata = (MatMatHipsparse *)C->product->data;
2390:   A      = product->A;
2391:   B      = product->B;
2392:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2393:     mmdata->reusesym = PETSC_FALSE;
2394:     Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2396:     Cmat = Ccusp->mat;
2398:     Ccsr = (CsrMatrix *)Cmat->mat;
2400:     goto finalize;
2401:   }
2402:   if (!c->nz) goto finalize;
2403:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
2405:   PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg);
2409:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2410:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2411:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2415:   MatSeqAIJHIPSPARSECopyToGPU(A);
2416:   MatSeqAIJHIPSPARSECopyToGPU(B);

2418:   ptype = product->type;
2419:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2420:     ptype = MATPRODUCT_AB;
2422:   }
2423:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2424:     ptype = MATPRODUCT_AB;
2426:   }
2427:   switch (ptype) {
2428:   case MATPRODUCT_AB:
2429:     Amat = Acusp->mat;
2430:     Bmat = Bcusp->mat;
2431:     break;
2432:   case MATPRODUCT_AtB:
2433:     Amat = Acusp->matTranspose;
2434:     Bmat = Bcusp->mat;
2435:     break;
2436:   case MATPRODUCT_ABt:
2437:     Amat = Acusp->mat;
2438:     Bmat = Bcusp->matTranspose;
2439:     break;
2440:   default:
2441:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2442:   }
2443:   Cmat = Ccusp->mat;
2447:   Acsr = (CsrMatrix *)Amat->mat;
2448:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2449:   Ccsr = (CsrMatrix *)Cmat->mat;
2453:   PetscLogGpuTimeBegin();
2454: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2455:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2456:   hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE);
2457:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2458:   hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2459:   #else
2460:   hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2461:   hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2462:   #endif
2463: #else
2464:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2465:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2466:                                           Ccsr->column_indices->data().get()));
2467: #endif
2468:   PetscLogGpuFlops(mmdata->flops);
2469:   WaitForHIP();
2470:   PetscLogGpuTimeEnd();
2471:   C->offloadmask = PETSC_OFFLOAD_GPU;
2472: finalize:
2473:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2474:   PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz);
2475:   PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n");
2476:   PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax);
2477:   c->reallocs = 0;
2478:   C->info.mallocs += 0;
2479:   C->info.nz_unneeded = 0;
2480:   C->assembled = C->was_assembled = PETSC_TRUE;
2481:   C->num_ass++;
2482:   return 0;
2483: }

2485: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2486: {
2487:   Mat_Product                   *product = C->product;
2488:   Mat                            A, B;
2489:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2490:   Mat_SeqAIJ                    *a, *b, *c;
2491:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2492:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2493:   PetscInt                       i, j, m, n, k;
2494:   PetscBool                      flg;
2495:   MatProductType                 ptype;
2496:   MatMatHipsparse               *mmdata;
2497:   PetscLogDouble                 flops;
2498:   PetscBool                      biscompressed, ciscompressed;
2499: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2500:   int64_t               C_num_rows1, C_num_cols1, C_nnz1;
2501:   hipsparseSpMatDescr_t BmatSpDescr;
2502: #else
2503:   int cnz;
2504: #endif
2505:   hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2507:   MatCheckProduct(C, 1);
2509:   A = product->A;
2510:   B = product->B;
2511:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
2513:   PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg);
2515:   a = (Mat_SeqAIJ *)A->data;
2516:   b = (Mat_SeqAIJ *)B->data;
2517:   /* product data */
2518:   PetscNew(&mmdata);
2519:   C->product->data    = mmdata;
2520:   C->product->destroy = MatDestroy_MatMatHipsparse;

2522:   MatSeqAIJHIPSPARSECopyToGPU(A);
2523:   MatSeqAIJHIPSPARSECopyToGPU(B);
2524:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2525:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;

2529:   ptype = product->type;
2530:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2531:     ptype                                          = MATPRODUCT_AB;
2532:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2533:   }
2534:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2535:     ptype                                          = MATPRODUCT_AB;
2536:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2537:   }
2538:   biscompressed = PETSC_FALSE;
2539:   ciscompressed = PETSC_FALSE;
2540:   switch (ptype) {
2541:   case MATPRODUCT_AB:
2542:     m    = A->rmap->n;
2543:     n    = B->cmap->n;
2544:     k    = A->cmap->n;
2545:     Amat = Acusp->mat;
2546:     Bmat = Bcusp->mat;
2547:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2548:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2549:     break;
2550:   case MATPRODUCT_AtB:
2551:     m = A->cmap->n;
2552:     n = B->cmap->n;
2553:     k = A->rmap->n;
2554:     MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
2555:     Amat = Acusp->matTranspose;
2556:     Bmat = Bcusp->mat;
2557:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2558:     break;
2559:   case MATPRODUCT_ABt:
2560:     m = A->rmap->n;
2561:     n = B->rmap->n;
2562:     k = A->cmap->n;
2563:     MatSeqAIJHIPSPARSEFormExplicitTranspose(B);
2564:     Amat = Acusp->mat;
2565:     Bmat = Bcusp->matTranspose;
2566:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2567:     break;
2568:   default:
2569:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2570:   }

2572:   /* create hipsparse matrix */
2573:   MatSetSizes(C, m, n, m, n);
2574:   MatSetType(C, MATSEQAIJHIPSPARSE);
2575:   c     = (Mat_SeqAIJ *)C->data;
2576:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2577:   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2578:   Ccsr  = new CsrMatrix;

2580:   c->compressedrow.use = ciscompressed;
2581:   if (c->compressedrow.use) { /* if a is in compressed row format, then c will also be in compressed row format */
2582:     c->compressedrow.nrows = a->compressedrow.nrows;
2583:     PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex);
2584:     PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows);
2585:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2586:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2587:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2588:   } else {
2589:     c->compressedrow.nrows  = 0;
2590:     c->compressedrow.i      = NULL;
2591:     c->compressedrow.rindex = NULL;
2592:     Ccusp->workVector       = NULL;
2593:     Cmat->cprowIndices      = NULL;
2594:   }
2595:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2596:   Ccusp->mat        = Cmat;
2597:   Ccusp->mat->mat   = Ccsr;
2598:   Ccsr->num_rows    = Ccusp->nrows;
2599:   Ccsr->num_cols    = n;
2600:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2601:   hipsparseCreateMatDescr(&Cmat->descr);
2602:   hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO);
2603:   hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
2604:   hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
2605:   hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
2606:   hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
2607:   hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
2608:   hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
2609:   hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
2610:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raises errors in different calls when matrices have zero rows/columns! */
2611:     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2612:     c->nz                = 0;
2613:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2614:     Ccsr->values         = new THRUSTARRAY(c->nz);
2615:     goto finalizesym;
2616:   }

2620:   Acsr = (CsrMatrix *)Amat->mat;
2621:   if (!biscompressed) {
2622:     Bcsr        = (CsrMatrix *)Bmat->mat;
2623:     BmatSpDescr = Bmat->matDescr;
2624:   } else { /* we need to use row offsets for the full matrix */
2625:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2626:     Bcsr                 = new CsrMatrix;
2627:     Bcsr->num_rows       = B->rmap->n;
2628:     Bcsr->num_cols       = cBcsr->num_cols;
2629:     Bcsr->num_entries    = cBcsr->num_entries;
2630:     Bcsr->column_indices = cBcsr->column_indices;
2631:     Bcsr->values         = cBcsr->values;
2632:     if (!Bcusp->rowoffsets_gpu) {
2633:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2634:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2635:       PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
2636:     }
2637:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2638:     mmdata->Bcsr      = Bcsr;
2639:     if (Bcsr->num_rows && Bcsr->num_cols) {
2640:       hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
2641:     }
2642:     BmatSpDescr = mmdata->matSpBDescr;
2643:   }
2646:   /* precompute flops count */
2647:   if (ptype == MATPRODUCT_AB) {
2648:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2649:       const PetscInt st = a->i[i];
2650:       const PetscInt en = a->i[i + 1];
2651:       for (j = st; j < en; j++) {
2652:         const PetscInt brow = a->j[j];
2653:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2654:       }
2655:     }
2656:   } else if (ptype == MATPRODUCT_AtB) {
2657:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2658:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2659:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2660:       flops += (2. * anzi) * bnzi;
2661:     }
2662:   } else flops = 0.; /* TODO */
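  /*
     In words: for C = A*B every nonzero a(i,k) contributes one multiply and one add per nonzero in
     row k of B, so flops = sum_i sum_{k in row i of A} 2*nnz(B(k,:)); for C = A^T*B row i of A is
     paired with row i of B, so flops = sum_i 2*nnz(A(i,:))*nnz(B(i,:)).
  */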

2664:   mmdata->flops = flops;
2665:   PetscLogGpuTimeBegin();
2666: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2667:   hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE);
2668:   hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
2669:   hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc);
2670:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2671:   {
2672:     /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2673:        We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2674:     */
2675:     void *dBuffer1 = NULL;
2676:     void *dBuffer2 = NULL;
2677:     void *dBuffer3 = NULL;
2678:     /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2679:     size_t bufferSize1 = 0;
2680:     size_t bufferSize2 = 0;
2681:     size_t bufferSize3 = 0;
2682:     size_t bufferSize4 = 0;
2683:     size_t bufferSize5 = 0;

2685:     /*----------------------------------------------------------------------*/
2686:     /* ask bufferSize1 bytes for external memory */
2687:     hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2688:     hipMalloc((void **)&dBuffer1, bufferSize1);
2689:     /* inspect the matrices A and B to understand the memory requirement for the next step */
2690:     hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2691:     /*----------------------------------------------------------------------*/
2692:     hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2693:     hipMalloc((void **)&dBuffer2, bufferSize2);
2694:     hipMalloc((void **)&dBuffer3, bufferSize3);
2695:     hipMalloc((void **)&mmdata->dBuffer4, bufferSize4);
2696:     hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2697:     hipFree(dBuffer1);
2698:     hipFree(dBuffer2);
2699:     /*----------------------------------------------------------------------*/
2700:     /* get matrix C non-zero entries C_nnz1 */
2701:     hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2702:     c->nz = (PetscInt)C_nnz1;
2703:     /* allocate matrix C */
2704:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2705:     hipPeekAtLastError(); /* catch out of memory errors */
2706:     Ccsr->values = new THRUSTARRAY(c->nz);
2707:     hipPeekAtLastError(); /* catch out of memory errors */
2708:     /* update matC with the new pointers */
2709:     hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2710:     /*----------------------------------------------------------------------*/
2711:     hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2712:     hipMalloc((void **)&mmdata->dBuffer5, bufferSize5);
2713:     hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2714:     hipFree(dBuffer3);
2715:     hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2716:     PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024);
2717:   }
2718:   #else
2719:   size_t bufSize2;
2720:   /* ask bufferSize bytes for external memory */
2721:   hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2722:   hipMalloc((void **)&mmdata->mmBuffer2, bufSize2);
2723:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2724:   hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2725:   /* ask bufferSize again bytes for external memory */
2726:   hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2727:   /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2728:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2729:      it only appears in the workEstimation calls, yet it seems to be needed in compute, so the address
2730:      is probably stored in the descriptor! What a messy API... */
2731:   hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize);
2732:   /* compute the intermediate product of A * B */
2733:   hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2734:   /* get matrix C non-zero entries C_nnz1 */
2735:   hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2736:   c->nz = (PetscInt)C_nnz1;
2737:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2738:                       mmdata->mmBufferSize / 1024));
2739:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2740:   hipPeekAtLastError(); /* catch out of memory errors */
2741:   Ccsr->values = new THRUSTARRAY(c->nz);
2742:   hipPeekAtLastError(); /* catch out of memory errors */
2743:   hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2744:   hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2745:   #endif
2746: #else
2747:   hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST);
2748:   PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2749:                                           Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2750:   c->nz = cnz;
2751:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2752:   hipPeekAtLastError(); /* catch out of memory errors */
2753:   Ccsr->values = new THRUSTARRAY(c->nz);
2754:   hipPeekAtLastError(); /* catch out of memory errors */

2756:   hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE);
2757:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2758:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
2759:      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2760:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2761:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2762:                                           Ccsr->column_indices->data().get()));
2763: #endif
2764:   PetscLogGpuFlops(mmdata->flops);
2765:   PetscLogGpuTimeEnd();
2766: finalizesym:
2767:   c->singlemalloc = PETSC_FALSE;
2768:   c->free_a       = PETSC_TRUE;
2769:   c->free_ij      = PETSC_TRUE;
2770:   PetscMalloc1(m + 1, &c->i);
2771:   PetscMalloc1(c->nz, &c->j);
2772:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2773:     PetscInt      *d_i = c->i;
2774:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2775:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2776:     ii = *Ccsr->row_offsets;
2777:     jj = *Ccsr->column_indices;
2778:     if (ciscompressed) d_i = c->compressedrow.i;
2779:     hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2780:     hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2781:   } else {
2782:     PetscInt *d_i = c->i;
2783:     if (ciscompressed) d_i = c->compressedrow.i;
2784:     hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2785:     hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2786:   }
2787:   if (ciscompressed) { /* need to expand host row offsets */
2788:     PetscInt r = 0;
2789:     c->i[0]    = 0;
2790:     for (k = 0; k < c->compressedrow.nrows; k++) {
2791:       const PetscInt next = c->compressedrow.rindex[k];
2792:       const PetscInt old  = c->compressedrow.i[k];
2793:       for (; r < next; r++) c->i[r + 1] = old;
2794:     }
2795:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2796:   }
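  /*
     Small worked example of the expansion above (illustrative): with m = 5, nonzero rows
     rindex = {1, 3} and compressed offsets {0, 2, 5}, the loop produces the full offsets
     c->i = {0, 0, 2, 2, 5, 5}; empty rows simply repeat the previous offset.
  */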
2797:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
2798:   PetscMalloc1(m, &c->ilen);
2799:   PetscMalloc1(m, &c->imax);
2800:   c->maxnz         = c->nz;
2801:   c->nonzerorowcnt = 0;
2802:   c->rmax          = 0;
2803:   for (k = 0; k < m; k++) {
2804:     const PetscInt nn = c->i[k + 1] - c->i[k];
2805:     c->ilen[k] = c->imax[k] = nn;
2806:     c->nonzerorowcnt += (PetscInt) !!nn;
2807:     c->rmax = PetscMax(c->rmax, nn);
2808:   }
2809:   MatMarkDiagonal_SeqAIJ(C);
2810:   PetscMalloc1(c->nz, &c->a);
2811:   Ccsr->num_entries = c->nz;

2813:   C->nonzerostate++;
2814:   PetscLayoutSetUp(C->rmap);
2815:   PetscLayoutSetUp(C->cmap);
2816:   Ccusp->nonzerostate = C->nonzerostate;
2817:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2818:   C->preallocated     = PETSC_TRUE;
2819:   C->assembled        = PETSC_FALSE;
2820:   C->was_assembled    = PETSC_FALSE;
2821:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2822:     mmdata->reusesym = PETSC_TRUE;
2823:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2824:   }
2825:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2826:   return 0;
2827: }
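
/*
  Sparse-sparse products are usually requested through MatMatMult() (or MatProductCreate()
  followed by the symbolic/numeric phases above). A minimal sketch, assuming A and B are both
  MATSEQAIJHIPSPARSE (illustrative only):

    Mat C;
    PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic + numeric on the GPU
    // ... later, same sparsity pattern but new values in A and/or B ...
    PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // numeric phase only
*/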

2829: /* handles sparse or dense B */
2830: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2831: {
2832:   Mat_Product *product = mat->product;
2833:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2835:   MatCheckProduct(mat, 1);
2836:   PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense);
2837:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp);
2838:   if (product->type == MATPRODUCT_ABC) {
2839:     Ciscusp = PETSC_FALSE;
2840:     if (!product->C->boundtocpu) PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp);
2841:   }
2842:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2843:     PetscBool usecpu = PETSC_FALSE;
2844:     switch (product->type) {
2845:     case MATPRODUCT_AB:
2846:       if (product->api_user) {
2847:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2848:         PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
2849:         PetscOptionsEnd();
2850:       } else {
2851:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2852:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
2853:         PetscOptionsEnd();
2854:       }
2855:       break;
2856:     case MATPRODUCT_AtB:
2857:       if (product->api_user) {
2858:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2859:         PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
2860:         PetscOptionsEnd();
2861:       } else {
2862:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2863:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
2864:         PetscOptionsEnd();
2865:       }
2866:       break;
2867:     case MATPRODUCT_PtAP:
2868:       if (product->api_user) {
2869:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2870:         PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
2871:         PetscOptionsEnd();
2872:       } else {
2873:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2874:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
2875:         PetscOptionsEnd();
2876:       }
2877:       break;
2878:     case MATPRODUCT_RARt:
2879:       if (product->api_user) {
2880:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2881:         PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
2882:         PetscOptionsEnd();
2883:       } else {
2884:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2885:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
2886:         PetscOptionsEnd();
2887:       }
2888:       break;
2889:     case MATPRODUCT_ABC:
2890:       if (product->api_user) {
2891:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2892:         PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
2893:         PetscOptionsEnd();
2894:       } else {
2895:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2896:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
2897:         PetscOptionsEnd();
2898:       }
2899:       break;
2900:     default:
2901:       break;
2902:     }
2903:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2904:   }
2905:   /* dispatch */
2906:   if (isdense) {
2907:     switch (product->type) {
2908:     case MATPRODUCT_AB:
2909:     case MATPRODUCT_AtB:
2910:     case MATPRODUCT_ABt:
2911:     case MATPRODUCT_PtAP:
2912:     case MATPRODUCT_RARt:
2913:       if (product->A->boundtocpu) MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2914:       else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2915:       break;
2916:     case MATPRODUCT_ABC:
2917:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2918:       break;
2919:     default:
2920:       break;
2921:     }
2922:   } else if (Biscusp && Ciscusp) {
2923:     switch (product->type) {
2924:     case MATPRODUCT_AB:
2925:     case MATPRODUCT_AtB:
2926:     case MATPRODUCT_ABt:
2927:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2928:       break;
2929:     case MATPRODUCT_PtAP:
2930:     case MATPRODUCT_RARt:
2931:     case MATPRODUCT_ABC:
2932:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2933:       break;
2934:     default:
2935:       break;
2936:     }
2937:   } else MatProductSetFromOptions_SeqAIJ(mat); /* fallback for AIJ */
2938:   return 0;
2939: }
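
/*
  The options handled above allow forcing a given product back onto the CPU kernels at runtime,
  e.g. (illustrative; the names may be prepended by the matrix options prefix):

    -matmatmult_backend_cpu               MatMatMult() API
    -mattransposematmult_backend_cpu      MatTransposeMatMult() API
    -mat_product_algorithm_backend_cpu    MatProduct API (AB, AtB, PtAP, RARt, ABC)
*/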

2941: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2942: {
2943:   MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE);
2944:   return 0;
2945: }

2947: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2948: {
2949:   MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE);
2950:   return 0;
2951: }

2953: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2954: {
2955:   MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE);
2956:   return 0;
2957: }

2959: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2960: {
2961:   MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE);
2962:   return 0;
2963: }

2965: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2966: {
2967:   MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE);
2968:   return 0;
2969: }

2971: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2972: {
2973:   int i = blockIdx.x * blockDim.x + threadIdx.x;
2974:   if (i < n) y[idx[i]] += x[i];
2975: }

2977: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2978: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2979: {
2980:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2981:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2982:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
2983:   PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
2984:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2985:   PetscBool                      compressed;
2986:   PetscInt                       nx, ny;

2989:   if (!a->nz) {
2990:     if (!yy) VecSet_SeqHIP(zz, 0);
2991:     else VecCopy_SeqHIP(yy, zz);
2992:     return 0;
2993:   }
2994:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2995:   MatSeqAIJHIPSPARSECopyToGPU(A);
2996:   if (!trans) {
2997:     matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
2999:   } else {
3000:     if (herm || !A->form_explicit_transpose) {
3001:       opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3002:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3003:     } else {
3004:       if (!hipsparsestruct->matTranspose) MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
3005:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3006:     }
3007:   }
3008:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3009:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3010:   try {
3011:     VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray);
3012:     if (yy == zz) VecHIPGetArray(zz, &zarray); /* read & write zz, so need to get uptodate zarray on GPU */
3013:     else VecHIPGetArrayWrite(zz, &zarray);     /* write zz, so no need to init zarray on GPU */

3015:     PetscLogGpuTimeBegin();
3016:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3017:       /* z = A x + beta y.
3018:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3019:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3020:       */
3021:       xptr = xarray;
3022:       dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3023:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3024:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3025:           allocated to accommodate different uses. So we get the length info directly from mat.
3026:        */
3027:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3028:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3029:         nx             = mat->num_cols;
3030:         ny             = mat->num_rows;
3031:       }
3032:     } else {
3033:       /* z = A^T x + beta y
3034:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3035:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3036:        */
3037:       xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3038:       dptr = zarray;
3039:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3040:       if (compressed) { /* Scatter x to work vector */
3041:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3042:         thrust::for_each(
3043: #if PetscDefined(HAVE_THRUST_ASYNC)
3044:           thrust::hip::par.on(PetscDefaultHipStream),
3045: #endif
3046:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3047:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3048:       }
3049:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3050:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3051:         nx             = mat->num_rows;
3052:         ny             = mat->num_cols;
3053:       }
3054:     }
3055:     /* csr_spmv does y = alpha op(A) x + beta y */
3056:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3057: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
3059:       if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3060:         hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype);
3061:         hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype);
3062:         PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3063:                                                     &matstruct->hipSpMV[opA].spmvBufferSize));
3064:         hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize);
3065:         matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3066:       } else {
3067:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3068:         hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr);
3069:         hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr);
3070:       }
3071:       PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3072:                                        matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3073: #else
3074:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3075:       hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr);
3076: #endif
3077:     } else {
3078:       if (hipsparsestruct->nrows) {
3079:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3080:         hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr);
3081:       }
3082:     }
3083:     PetscLogGpuTimeEnd();

3085:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3086:       if (yy) {                                   /* MatMultAdd: zz = A*xx + yy */
3087:         if (compressed) {                         /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3088:           VecCopy_SeqHIP(yy, zz);      /* zz = yy */
3089:         } else if (zz != yy) {                    /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3090:           VecAXPY_SeqHIP(zz, 1.0, yy); /* zz += yy */
3091:         }
3092:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3093:         VecSet_SeqHIP(zz, 0);
3094:       }

3096:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3097:       if (compressed) {
3098:         PetscLogGpuTimeBegin();
3099:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3100:            and when that event goes out of scope its destructor calls hipStreamSynchronize() on the stream. One would have to store all events to
3101:            prevent that, so I just use a ScatterAdd kernel instead.
3102:          */
3103: #if 0
3104:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3105:         thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3106:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3107:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3108:                          VecHIPPlusEquals());
3109: #else
3110:         PetscInt n = matstruct->cprowIndices->size();
3111:         hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3112: #endif
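        /* For reference, a sketch of what the ScatterAdd kernel launched above is assumed to compute (the kernel itself
           is defined earlier in this file):
             for (i = 0; i < n; i++) zarray[cprowIndices[i]] += workVector[i];
           i.e. the SpMV result computed for the nonzero (compressed) rows is added back into the full-length output array. */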
3113:         PetscLogGpuTimeEnd();
3114:       }
3115:     } else {
3116:       if (yy && yy != zz) { VecAXPY_SeqHIP(zz, 1.0, yy); /* zz += yy */ }
3117:     }
3118:     VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray);
3119:     if (yy == zz) VecHIPRestoreArray(zz, &zarray);
3120:     else VecHIPRestoreArrayWrite(zz, &zarray);
3121:   } catch (char *ex) {
3122:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3123:   }
3124:   if (yy) PetscLogGpuFlops(2.0 * a->nz);
3125:   else PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt);
3126:   return 0;
3127: }

3129: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3130: {
3131:   MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE);
3132:   return 0;
3133: }

3135: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3136: {
3137:   PetscObjectState     onnz = A->nonzerostate;
3138:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

3140:   MatAssemblyEnd_SeqAIJ(A, mode);
3141:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3142:     PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
3143:     hipFree(cusp->deviceMat);
3144:     cusp->deviceMat = NULL;
3145:   }
3146:   return 0;
3147: }

3149: /* --------------------------------------------------------------------------------*/
3150: /*@
3151:    MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATSEQAIJHIPSPARSE` (compressed row) format.
3152:    This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3153:    For good matrix assembly performance the user should preallocate the matrix storage by setting
3154:    the parameter nz (or the array nnz).  By setting these parameters accurately,
3155:    performance during matrix assembly can be increased by more than a factor of 50.

3157:    Collective

3159:    Input Parameters:
3160: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3161: .  m - number of rows
3162: .  n - number of columns
3163: .  nz - number of nonzeros per row (same for all rows)
3164: -  nnz - array containing the number of nonzeros in the various rows
3165:          (possibly different for each row) or NULL

3167:    Output Parameter:
3168: .  A - the matrix

3170:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3171:    `MatXXXXSetPreallocation()` paradigm instead of calling this routine directly.
3172:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3174:    Notes:
3175:    If nnz is given then nz is ignored

3177:    The AIJ format (also called the Yale sparse matrix format or
3178:    compressed row storage), is fully compatible with standard Fortran 77
3179:    storage.  That is, the stored row and column indices can begin at
3180:    either one (as in Fortran) or zero.  See the users' manual for details.

3182:    Specify the preallocated storage with either nz or nnz (not both).
3183:    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3184:    allocation.  For large problems you MUST preallocate memory or you
3185:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3187:    By default, this format uses inodes (identical nodes) when possible, to
3188:    improve numerical efficiency of matrix-vector products and solves. We
3189:    search for consecutive rows with the same nonzero structure, thereby
3190:    reusing matrix information to achieve increased efficiency.

3192:    Level: intermediate

3194: .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3195: @*/
3196: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3197: {
3198:   MatCreate(comm, A);
3199:   MatSetSizes(*A, m, n, m, n);
3200:   MatSetType(*A, MATSEQAIJHIPSPARSE);
3201:   MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz);
3202:   return 0;
3203: }
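/* A minimal usage sketch (not part of this source) of the MatCreate()/MatSetType()/preallocation paradigm recommended
   in the manual page above, assuming a square matrix with at most 3 nonzeros per row:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, m, m, m, m));
     PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A, 3, NULL));
     // ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd();
     // subsequent MatMult()/MatSolve() calls are executed on the GPU through HIPSPARSE
     PetscCall(MatDestroy(&A));
*/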

3205: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3206: {
3207:   if (A->factortype == MAT_FACTOR_NONE) MatSeqAIJHIPSPARSE_Destroy((Mat_SeqAIJHIPSPARSE **)&A->spptr);
3208:   else MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr);
3209:   PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3210:   PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL);
3211:   PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL);
3212:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL);
3213:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL);
3214:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL);
3215:   PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
3216:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3217:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3218:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL);
3219:   MatDestroy_SeqAIJ(A);
3220:   return 0;
3221: }

3223: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3224: {
3225:   MatDuplicate_SeqAIJ(A, cpvalues, B);
3226:   MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B);
3227:   return 0;
3228: }

3230: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3231: {
3232:   Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3233:   Mat_SeqAIJHIPSPARSE *cy;
3234:   Mat_SeqAIJHIPSPARSE *cx;
3235:   PetscScalar         *ay;
3236:   const PetscScalar   *ax;
3237:   CsrMatrix           *csry, *csrx;

3239:   cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3240:   cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3241:   if (X->ops->axpy != Y->ops->axpy) {
3242:     MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE);
3243:     MatAXPY_SeqAIJ(Y, a, X, str);
3244:     return 0;
3245:   }
3246:   /* if we are here, it means both matrices are bound to GPU */
3247:   MatSeqAIJHIPSPARSECopyToGPU(Y);
3248:   MatSeqAIJHIPSPARSECopyToGPU(X);
3251:   csry = (CsrMatrix *)cy->mat->mat;
3252:   csrx = (CsrMatrix *)cx->mat->mat;
3253:   /* see if we can turn this into a hipblas axpy */
3254:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3255:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3256:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3257:     if (eq) str = SAME_NONZERO_PATTERN;
3258:   }
3259:   /* spgeam is buggy with one column */
3260:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3261:   if (str == SUBSET_NONZERO_PATTERN) {
3262:     PetscScalar b = 1.0;
3263: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3264:     size_t bufferSize;
3265:     void  *buffer;
3266: #endif

3268:     MatSeqAIJHIPSPARSEGetArrayRead(X, &ax);
3269:     MatSeqAIJHIPSPARSEGetArray(Y, &ay);
3270:     hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST);
3271: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3272:     PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3273:                                                        csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3274:     hipMalloc(&buffer, bufferSize);
3275:     PetscLogGpuTimeBegin();
3276:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3277:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3278:     PetscLogGpuFlops(x->nz + y->nz);
3279:     PetscLogGpuTimeEnd();
3280:     hipFree(buffer);
3281: #else
3282:     PetscLogGpuTimeBegin();
3283:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3284:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3285:     PetscLogGpuFlops(x->nz + y->nz);
3286:     PetscLogGpuTimeEnd();
3287: #endif
3288:     hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE);
3289:     MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax);
3290:     MatSeqAIJHIPSPARSERestoreArray(Y, &ay);
3291:     MatSeqAIJInvalidateDiagonal(Y);
3292:   } else if (str == SAME_NONZERO_PATTERN) {
3293:     hipblasHandle_t hipblasv2handle;
3294:     PetscBLASInt    one = 1, bnz = 1;

3296:     MatSeqAIJHIPSPARSEGetArrayRead(X, &ax);
3297:     MatSeqAIJHIPSPARSEGetArray(Y, &ay);
3298:     PetscHIPBLASGetHandle(&hipblasv2handle);
3299:     PetscBLASIntCast(x->nz, &bnz);
3300:     PetscLogGpuTimeBegin();
3301:     hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one);
3302:     PetscLogGpuFlops(2.0 * bnz);
3303:     PetscLogGpuTimeEnd();
3304:     MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax);
3305:     MatSeqAIJHIPSPARSERestoreArray(Y, &ay);
3306:     MatSeqAIJInvalidateDiagonal(Y);
3307:   } else {
3308:     MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE);
3309:     MatAXPY_SeqAIJ(Y, a, X, str);
3310:   }
3311:   return 0;
3312: }

3314: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3315: {
3316:   Mat_SeqAIJ     *y = (Mat_SeqAIJ *)Y->data;
3317:   PetscScalar    *ay;
3318:   hipblasHandle_t hipblasv2handle;
3319:   PetscBLASInt    one = 1, bnz = 1;

3321:   MatSeqAIJHIPSPARSEGetArray(Y, &ay);
3322:   PetscHIPBLASGetHandle(&hipblasv2handle);
3323:   PetscBLASIntCast(y->nz, &bnz);
3324:   PetscLogGpuTimeBegin();
3325:   hipblasXscal(hipblasv2handle, bnz, &a, ay, one);
3326:   PetscLogGpuFlops(bnz);
3327:   PetscLogGpuTimeEnd();
3328:   MatSeqAIJHIPSPARSERestoreArray(Y, &ay);
3329:   MatSeqAIJInvalidateDiagonal(Y);
3330:   return 0;
3331: }

3333: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3334: {
3335:   PetscBool   both = PETSC_FALSE;
3336:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3338:   if (A->factortype == MAT_FACTOR_NONE) {
3339:     Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3340:     if (spptr->mat) {
3341:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3342:       if (matrix->values) {
3343:         both = PETSC_TRUE;
3344:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3345:       }
3346:     }
3347:     if (spptr->matTranspose) {
3348:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3349:       if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
3350:     }
3351:   }
3352:   //MatZeroEntries_SeqAIJ(A);
3353:   PetscArrayzero(a->a, a->i[A->rmap->n]);
3354:   MatSeqAIJInvalidateDiagonal(A);
3355:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3356:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3357:   return 0;
3358: }

3360: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3361: {
3362:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3364:   if (A->factortype != MAT_FACTOR_NONE) {
3365:     A->boundtocpu = flg;
3366:     return 0;
3367:   }
3368:   if (flg) {
3369:     MatSeqAIJHIPSPARSECopyFromGPU(A);

3371:     A->ops->scale                     = MatScale_SeqAIJ;
3372:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3373:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3374:     A->ops->mult                      = MatMult_SeqAIJ;
3375:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3376:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3377:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3378:     A->ops->multhermitiantranspose    = NULL;
3379:     A->ops->multhermitiantransposeadd = NULL;
3380:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3381:     PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps));
3382:     PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3383:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL);
3384:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL);
3385:     PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3386:     PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3387:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL);
3388:   } else {
3389:     A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
3390:     A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
3391:     A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
3392:     A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
3393:     A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
3394:     A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
3395:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3396:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3397:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3398:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3399:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3400:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3401:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3402:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3403:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3404:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3405:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3406:     PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE);
3407:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE);
3408:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE);
3409:     PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE);
3410:     PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE);
3411:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE);
3412:   }
3413:   A->boundtocpu = flg;
3414:   if (flg && a->inode.size) a->inode.use = PETSC_TRUE;
3415:   else a->inode.use = PETSC_FALSE;

3417:   return 0;
3418: }

3420: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3421: {
3422:   Mat B;

3424:   PetscDeviceInitialize(PETSC_DEVICE_HIP); /* first use of HIPSPARSE may be via MatConvert */
3425:   if (reuse == MAT_INITIAL_MATRIX) {
3426:     MatDuplicate(A, MAT_COPY_VALUES, newmat);
3427:   } else if (reuse == MAT_REUSE_MATRIX) {
3428:     MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
3429:   }
3430:   B = *newmat;
3431:   PetscFree(B->defaultvectype);
3432:   PetscStrallocpy(VECHIP, &B->defaultvectype);
3433:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3434:     if (B->factortype == MAT_FACTOR_NONE) {
3435:       Mat_SeqAIJHIPSPARSE *spptr;
3436:       PetscNew(&spptr);
3437:       hipsparseCreate(&spptr->handle);
3438:       hipsparseSetStream(spptr->handle, PetscDefaultHipStream);
3439:       spptr->format = MAT_HIPSPARSE_CSR;
3440: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3441:       spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3442: #else
3443:       spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3444: #endif
3445:       spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default; only supports a column-major dense matrix B */
3446:       //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

3448:       B->spptr = spptr;
3449:     } else {
3450:       Mat_SeqAIJHIPSPARSETriFactors *spptr;

3452:       PetscNew(&spptr);
3453:       hipsparseCreate(&spptr->handle);
3454:       hipsparseSetStream(spptr->handle, PetscDefaultHipStream);
3455:       B->spptr = spptr;
3456:     }
3457:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3458:   }
3459:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJHIPSPARSE;
3460:   B->ops->destroy        = MatDestroy_SeqAIJHIPSPARSE;
3461:   B->ops->setoption      = MatSetOption_SeqAIJHIPSPARSE;
3462:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
3463:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJHIPSPARSE;
3464:   B->ops->duplicate      = MatDuplicate_SeqAIJHIPSPARSE;

3466:   MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE);
3467:   PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE);
3468:   PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE);
3469: #if defined(PETSC_HAVE_HYPRE)
3470:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE);
3471: #endif
3472:   PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE);
3473:   return 0;
3474: }

3476: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
3477: {
3478:   MatCreate_SeqAIJ(B);
3479:   MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B);
3480:   return 0;
3481: }

3483: /*
3484:    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices.

3486:    A matrix type whose data resides on AMD GPUs. These matrices can be stored in
3487:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later on Nvidia devices.
3488:    All matrix calculations are performed on AMD/Nvidia GPUs using the HIPSPARSE library.

3490:    Options Database Keys:
3491: +  -mat_type aijhipsparse - sets the matrix type to "seqaijhipsparse" during a call to MatSetFromOptions()
3492: .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3493: .  -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3494: -  -mat_hipsparse_use_cpu_solve - Do MatSolve on CPU

3496:   Level: beginner

3498: .seealso: `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3499: */
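/* Example (a sketch, not part of this source) of selecting this type and its storage format at run time, assuming the
   application calls MatSetFromOptions() on its matrices:

     ./app -mat_type aijhipsparse -mat_hipsparse_storage_format ell

   The storage format can also be set programmatically with MatHIPSPARSESetFormat(). */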

3501: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3502: {
3503:   MatSolverTypeRegister(MATSOLVERHIPSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse_band);
3504:   MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse);
3505:   MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse);
3506:   MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse);
3507:   MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse);

3509:   return 0;
3510: }

3512: static PetscErrorCode MatResetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat)
3513: {
3514:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)mat->spptr;

3516:   if (!cusp) return 0;
3517:   delete cusp->cooPerm;
3518:   delete cusp->cooPerm_a;
3519:   cusp->cooPerm   = NULL;
3520:   cusp->cooPerm_a = NULL;
3521:   if (cusp->use_extended_coo) {
3522:     hipFree(cusp->jmap_d);
3523:     hipFree(cusp->perm_d);
3524:   }
3525:   cusp->use_extended_coo = PETSC_FALSE;
3526:   return 0;
3527: }

3529: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **hipsparsestruct)
3530: {
3531:   if (*hipsparsestruct) {
3532:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->mat, (*hipsparsestruct)->format);
3533:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->matTranspose, (*hipsparsestruct)->format);
3534:     delete (*hipsparsestruct)->workVector;
3535:     delete (*hipsparsestruct)->rowoffsets_gpu;
3536:     delete (*hipsparsestruct)->cooPerm;
3537:     delete (*hipsparsestruct)->cooPerm_a;
3538:     delete (*hipsparsestruct)->csr2csc_i;
3539:     if ((*hipsparsestruct)->handle) hipsparseDestroy((*hipsparsestruct)->handle);
3540:     if ((*hipsparsestruct)->jmap_d) hipFree((*hipsparsestruct)->jmap_d);
3541:     if ((*hipsparsestruct)->perm_d) hipFree((*hipsparsestruct)->perm_d);
3542:     PetscFree(*hipsparsestruct);
3543:   }
3544:   return 0;
3545: }

3547: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3548: {
3549:   if (*mat) {
3550:     delete (*mat)->values;
3551:     delete (*mat)->column_indices;
3552:     delete (*mat)->row_offsets;
3553:     delete *mat;
3554:     *mat = 0;
3555:   }
3556:   return 0;
3557: }

3559: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3560: {
3561:   if (*trifactor) {
3562:     if ((*trifactor)->descr) hipsparseDestroyMatDescr((*trifactor)->descr);
3563:     if ((*trifactor)->solveInfo) hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo);
3564:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3565:     if ((*trifactor)->solveBuffer) hipFree((*trifactor)->solveBuffer);
3566:     if ((*trifactor)->AA_h) hipHostFree((*trifactor)->AA_h);
3567:     if ((*trifactor)->csr2cscBuffer) hipFree((*trifactor)->csr2cscBuffer);
3568:     PetscFree(*trifactor);
3569:   }
3570:   return 0;
3571: }

3573: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3574: {
3575:   CsrMatrix *mat;

3577:   if (*matstruct) {
3578:     if ((*matstruct)->mat) {
3579:       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3580:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3581:         hipsparseDestroyHybMat(hybMat);
3582:       } else {
3583:         mat = (CsrMatrix *)(*matstruct)->mat;
3584:         CsrMatrix_Destroy(&mat);
3585:       }
3586:     }
3587:     if ((*matstruct)->descr) hipsparseDestroyMatDescr((*matstruct)->descr);
3588:     delete (*matstruct)->cprowIndices;
3589:     if ((*matstruct)->alpha_one) hipFree((*matstruct)->alpha_one);
3590:     if ((*matstruct)->beta_zero) hipFree((*matstruct)->beta_zero);
3591:     if ((*matstruct)->beta_one) hipFree((*matstruct)->beta_one);

3593:     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3594:     if (mdata->matDescr) hipsparseDestroySpMat(mdata->matDescr);
3595:     for (int i = 0; i < 3; i++) {
3596:       if (mdata->hipSpMV[i].initialized) {
3597:         hipFree(mdata->hipSpMV[i].spmvBuffer);
3598:         hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr);
3599:         hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr);
3600:       }
3601:     }
3602:     delete *matstruct;
3603:     *matstruct = NULL;
3604:   }
3605:   return 0;
3606: }

3608: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3609: {
3610:   Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

3612:   if (fs) {
3613:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);
3614:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);
3615:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);
3616:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);
3617:     delete fs->rpermIndices;
3618:     delete fs->cpermIndices;
3619:     delete fs->workVector;
3620:     fs->rpermIndices = NULL;
3621:     fs->cpermIndices = NULL;
3622:     fs->workVector   = NULL;
3623:     if (fs->a_band_d) hipFree(fs->a_band_d);
3624:     if (fs->i_band_d) hipFree(fs->i_band_d);
3625:     fs->init_dev_prop = PETSC_FALSE;
3626: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3627:     hipFree(fs->csrRowPtr);
3628:     hipFree(fs->csrColIdx);
3629:     hipFree(fs->csrVal);
3630:     hipFree(fs->X);
3631:     hipFree(fs->Y);
3632:     // hipFree(fs->factBuffer_M); /* Not needed since factBuffer_M shares storage with one of spsvBuffer_L/U */
3633:     hipFree(fs->spsvBuffer_L);
3634:     hipFree(fs->spsvBuffer_U);
3635:     hipFree(fs->spsvBuffer_Lt);
3636:     hipFree(fs->spsvBuffer_Ut);
3637:     hipsparseDestroyMatDescr(fs->matDescr_M);
3638:     if (fs->spMatDescr_L) hipsparseDestroySpMat(fs->spMatDescr_L);
3639:     if (fs->spMatDescr_U) hipsparseDestroySpMat(fs->spMatDescr_U);
3640:     hipsparseSpSV_destroyDescr(fs->spsvDescr_L);
3641:     hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt);
3642:     hipsparseSpSV_destroyDescr(fs->spsvDescr_U);
3643:     hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut);
3644:     if (fs->dnVecDescr_X) hipsparseDestroyDnVec(fs->dnVecDescr_X);
3645:     if (fs->dnVecDescr_Y) hipsparseDestroyDnVec(fs->dnVecDescr_Y);
3646:     hipsparseDestroyCsrilu02Info(fs->ilu0Info_M);
3647:     hipsparseDestroyCsric02Info(fs->ic0Info_M);

3649:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3650:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3651: #endif
3652:   }
3653:   return 0;
3654: }

3656: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3657: {
3658:   hipsparseHandle_t handle;

3660:   if (*trifactors) {
3661:     MatSeqAIJHIPSPARSETriFactors_Reset(trifactors);
3662:     if ((handle = (*trifactors)->handle)) hipsparseDestroy(handle);
3663:     PetscFree(*trifactors);
3664:   }
3665:   return 0;
3666: }

3668: struct IJCompare {
3669:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3670:   {
3671:     if (t1.get<0>() < t2.get<0>()) return true;
3672:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3673:     return false;
3674:   }
3675: };

3677: struct IJEqual {
3678:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3679:   {
3680:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3681:     return true;
3682:   }
3683: };

3685: struct IJDiff {
3686:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3687: };

3689: struct IJSum {
3690:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3691: };

3693: PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3694: {
3695:   Mat_SeqAIJHIPSPARSE                  *cusp      = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3696:   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
3697:   THRUSTARRAY                          *cooPerm_v = NULL;
3698:   thrust::device_ptr<const PetscScalar> d_v;
3699:   CsrMatrix                            *matrix;
3700:   PetscInt                              n;

3704:   if (!cusp->cooPerm) {
3705:     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
3706:     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
3707:     return 0;
3708:   }
3709:   matrix = (CsrMatrix *)cusp->mat->mat;
3711:   if (!v) {
3712:     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3713:     goto finalize;
3714:   }
3715:   n = cusp->cooPerm->size();
3716:   if (isHipMem(v)) d_v = thrust::device_pointer_cast(v);
3717:   else {
3718:     cooPerm_v = new THRUSTARRAY(n);
3719:     cooPerm_v->assign(v, v + n);
3720:     d_v = cooPerm_v->data();
3721:     PetscLogCpuToGpu(n * sizeof(PetscScalar));
3722:   }
3723:   PetscLogGpuTimeBegin();
3724:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3725:     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them first */
3726:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3727:       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3728:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3729:         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3730:         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3731:       */
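      /* A small worked example of the reduce_by_key() call below: with the permuted values vbit = [v0,v1,v2,v3,v4,v5]
         and keys cooPerm_a = [0,0,1,2,3,4], the call produces cooPerm_w = [v0+v1, v2, v3, v4, v5], one accumulated
         value per unique nonzero, which the transform() that follows adds into matrix->values. */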
3732:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3733:       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
3734:       delete cooPerm_w;
3735:     } else {
3736:       /* all nonzeros in d_v[] are unique entries */
3737:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3738:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3739:       thrust::for_each(zibit, zieit, VecHIPPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
3740:     }
3741:   } else {
3742:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3743:       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3744:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3745:     } else {
3746:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3747:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3748:       thrust::for_each(zibit, zieit, VecHIPEquals());
3749:     }
3750:   }
3751:   PetscLogGpuTimeEnd();
3752: finalize:
3753:   delete cooPerm_v;
3754:   A->offloadmask = PETSC_OFFLOAD_GPU;
3755:   PetscObjectStateIncrease((PetscObject)A);
3756:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3757:   PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz);
3758:   PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n");
3759:   PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax);
3760:   a->reallocs = 0;
3761:   A->info.mallocs += 0;
3762:   A->info.nz_unneeded = 0;
3763:   A->assembled = A->was_assembled = PETSC_TRUE;
3764:   A->num_ass++;
3765:   return 0;
3766: }

3768: PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3769: {
3770:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

3773:   if (!cusp) return 0;
3774:   if (destroy) {
3775:     MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format);
3776:     delete cusp->csr2csc_i;
3777:     cusp->csr2csc_i = NULL;
3778:   }
3779:   A->transupdated = PETSC_FALSE;
3780:   return 0;
3781: }

3783: PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
3784: {
3785:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3786:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
3787:   PetscInt             cooPerm_n, nzr = 0;

3789:   PetscLayoutSetUp(A->rmap);
3790:   PetscLayoutSetUp(A->cmap);
3791:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3792:   if (n != cooPerm_n) {
3793:     delete cusp->cooPerm;
3794:     delete cusp->cooPerm_a;
3795:     cusp->cooPerm   = NULL;
3796:     cusp->cooPerm_a = NULL;
3797:   }
3798:   if (n) {
3799:     thrust::device_ptr<PetscInt> d_i, d_j;
3800:     PetscInt                    *d_raw_i, *d_raw_j;
3801:     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
3802:     PetscMemType                 imtype, jmtype;

3804:     PetscGetMemType(coo_i, &imtype);
3805:     if (PetscMemTypeHost(imtype)) {
3806:       hipMalloc(&d_raw_i, sizeof(PetscInt) * n);
3807:       hipMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, hipMemcpyHostToDevice);
3808:       d_i        = thrust::device_pointer_cast(d_raw_i);
3809:       free_raw_i = PETSC_TRUE;
3810:       PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
3811:     } else {
3812:       d_i = thrust::device_pointer_cast(coo_i);
3813:     }

3815:     PetscGetMemType(coo_j, &jmtype);
3816:     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic() passes device coo_i[] and host coo_j[]!
3817:       hipMalloc(&d_raw_j, sizeof(PetscInt) * n);
3818:       hipMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, hipMemcpyHostToDevice);
3819:       d_j        = thrust::device_pointer_cast(d_raw_j);
3820:       free_raw_j = PETSC_TRUE;
3821:       PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
3822:     } else {
3823:       d_j = thrust::device_pointer_cast(coo_j);
3824:     }

3826:     THRUSTINTARRAY ii(A->rmap->n);

3828:     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
3829:     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
3830:     /* Ex.
3831:       n = 6
3832:       coo_i = [3,3,1,4,1,4]
3833:       coo_j = [3,2,2,5,2,6]
3834:     */
3835:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
3836:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

3838:     PetscLogGpuTimeBegin();
3839:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3840:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3841:     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
3842:     THRUSTINTARRAY w(d_j, d_j + n);
3843:     /*
3844:       d_i     = [1,1,3,3,4,4]
3845:       d_j     = [2,2,2,3,5,6]
3846:       cooPerm = [2,4,1,0,3,5]
3847:     */
3848:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3849:     /*
3850:       d_i     = [1,3,3,4,4,x]
3851:                             ^ekey
3852:       d_j     = [2,2,3,5,6,x]
3853:                            ^nekey
3854:     */
3855:     if (nekey == ekey) { /* all entries are unique */
3856:       delete cusp->cooPerm_a;
3857:       cusp->cooPerm_a = NULL;
3858:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3859:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3860:       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3861:       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3862:       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a hipMemcpy */
3863:       w[0]                  = 0;
3864:       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
3865:       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3866:     }
3867:     thrust::counting_iterator<PetscInt> search_begin(0);
3868:     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3869:                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3870:                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3871:     PetscLogGpuTimeEnd();

3873:     MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i);
3874:     a->singlemalloc = PETSC_FALSE;
3875:     a->free_a       = PETSC_TRUE;
3876:     a->free_ij      = PETSC_TRUE;
3877:     PetscMalloc1(A->rmap->n + 1, &a->i);
3878:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3879:     hipMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), hipMemcpyDeviceToHost);
3880:     a->nz = a->maxnz = a->i[A->rmap->n];
3881:     a->rmax          = 0;
3882:     PetscMalloc1(a->nz, &a->a);
3883:     PetscMalloc1(a->nz, &a->j);
3884:     hipMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), hipMemcpyDeviceToHost);
3885:     if (!a->ilen) PetscMalloc1(A->rmap->n, &a->ilen);
3886:     if (!a->imax) PetscMalloc1(A->rmap->n, &a->imax);
3887:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3888:       const PetscInt nnzr = a->i[i + 1] - a->i[i];
3889:       nzr += (PetscInt) !!(nnzr);
3890:       a->ilen[i] = a->imax[i] = nnzr;
3891:       a->rmax                 = PetscMax(a->rmax, nnzr);
3892:     }
3893:     a->nonzerorowcnt = nzr;
3894:     A->preallocated  = PETSC_TRUE;
3895:     PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt));
3896:     MatMarkDiagonal_SeqAIJ(A);
3897:   } else MatSeqAIJSetPreallocation(A, 0, NULL);
3898:   MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
3899:   /* We want to allocate the HIPSPARSE struct for matvec now.
3900:      The code is so convoluted now that I prefer to copy zeros */
3901:   PetscArrayzero(a->a, a->nz);
3902:   MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6);
3903:   A->offloadmask = PETSC_OFFLOAD_CPU;
3904:   MatSeqAIJHIPSPARSECopyToGPU(A);
3905:   MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
3906:   return 0;
3907: }

3909: PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3910: {
3911:   Mat_SeqAIJ          *seq;
3912:   Mat_SeqAIJHIPSPARSE *dev;
3913:   PetscBool            coo_basic = PETSC_TRUE;
3914:   PetscMemType         mtype     = PETSC_MEMTYPE_DEVICE;

3916:   MatResetPreallocationCOO_SeqAIJ(mat);
3917:   MatResetPreallocationCOO_SeqAIJHIPSPARSE(mat);
3918:   if (coo_i) {
3919:     PetscGetMemType(coo_i, &mtype);
3920:     if (PetscMemTypeHost(mtype)) {
3921:       for (PetscCount k = 0; k < coo_n; k++) {
3922:         if (coo_i[k] < 0 || coo_j[k] < 0) {
3923:           coo_basic = PETSC_FALSE;
3924:           break;
3925:         }
3926:       }
3927:     }
3928:   }

3930:   if (coo_basic) { /* i,j are on device or do not contain negative indices */
3931:     MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j);
3932:   } else {
3933:     MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j);
3934:     mat->offloadmask = PETSC_OFFLOAD_CPU;
3935:     MatSeqAIJHIPSPARSECopyToGPU(mat);
3936:     seq = static_cast<Mat_SeqAIJ *>(mat->data);
3937:     dev = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
3938:     hipMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount));
3939:     hipMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
3940:     hipMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount));
3941:     hipMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), hipMemcpyHostToDevice);
3942:     dev->use_extended_coo = PETSC_TRUE;
3943:   }
3944:   return 0;
3945: }

3947: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3948: {
3949:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
3950:   const PetscCount grid_size = gridDim.x * blockDim.x;
3951:   for (; i < nnz; i += grid_size) {
3952:     PetscScalar sum = 0.0;
3953:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3954:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3955:   }
3956: }
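/* How the kernel above uses the extended COO metadata (as implied by its loop structure; the device copies jmap_d and
   perm_d are filled in MatSetPreallocationCOO_SeqAIJHIPSPARSE() above): for the i-th nonzero of the assembled matrix,
   the user-provided COO values that map to it are kv[perm[jmap[i]]], ..., kv[perm[jmap[i+1]-1]], and their sum is
   written (INSERT_VALUES) or accumulated (ADD_VALUES) into a[i]. */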

3958: PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3959: {
3960:   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
3961:   Mat_SeqAIJHIPSPARSE *dev  = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3962:   PetscCount           Annz = seq->nz;
3963:   PetscMemType         memtype;
3964:   const PetscScalar   *v1 = v;
3965:   PetscScalar         *Aa;

3967:   if (dev->use_extended_coo) {
3968:     PetscGetMemType(v, &memtype);
3969:     if (PetscMemTypeHost(memtype)) { /* if the user gave v[] on the host, copy it to the device first */
3970:       hipMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar));
3971:       hipMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice);
3972:     }

3974:     if (imode == INSERT_VALUES) MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa);
3975:     else MatSeqAIJHIPSPARSEGetArray(A, &Aa);

3977:     if (Annz) {
3978:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
3979:       hipPeekAtLastError();
3980:     }

3982:     if (imode == INSERT_VALUES) MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa);
3983:     else MatSeqAIJHIPSPARSERestoreArray(A, &Aa);

3985:     if (PetscMemTypeHost(memtype)) hipFree((void *)v1);
3986:   } else {
3987:     MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(A, v, imode);
3988:   }
3989:   return 0;
3990: }

3992: /*@C
3993:     MatSeqAIJHIPSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJHIPSPARSE matrices.
3994:     Not Collective

3996:     Input Parameters:
3997: +   A - the matrix
3998: -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

4000:     Output Parameters:
4001: +   ia - the CSR row pointers
4002: -   ja - the CSR column indices

4004:     Level: developer

4006:     Notes:
4007:       When compressed is true, the CSR structure does not contain empty rows

4009: .seealso: `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
4010: @*/
4011: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4012: {
4013:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4014:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
4015:   CsrMatrix           *csr;

4018:   if (!i || !j) return 0;
4021:   MatSeqAIJHIPSPARSECopyToGPU(A);
4023:   csr = (CsrMatrix *)cusp->mat->mat;
4024:   if (i) {
4025:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4026:       if (!cusp->rowoffsets_gpu) {
4027:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4028:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4029:         PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
4030:       }
4031:       *i = cusp->rowoffsets_gpu->data().get();
4032:     } else *i = csr->row_offsets->data().get();
4033:   }
4034:   if (j) *j = csr->column_indices->data().get();
4035:   return 0;
4036: }

4038: /*@C
4039:     MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJHIPSPARSEGetIJ()
4040:     Not Collective

4042:     Input Parameters:
4043: +   A - the matrix
4044: -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

4046:     Output Parameters:
4047: +   ia - the CSR row pointers
4048: -   ja - the CSR column indices

4050:     Level: developer

4052: .seealso: `MatSeqAIJHIPSPARSEGetIJ()`
4053: @*/
4054: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4055: {
4058:   if (i) *i = NULL;
4059:   if (j) *j = NULL;
4060:   return 0;
4061: }
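/* A usage sketch (not part of this source) for the accessor pair above, assuming A is a MATSEQAIJHIPSPARSE matrix:

     const int *i, *j;
     PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &i, &j));
     // i and j are device pointers to the (uncompressed) CSR row offsets and column indices;
     // they may be passed to HIP kernels or hipSPARSE calls, but must not be freed or modified
     PetscCall(MatSeqAIJHIPSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
*/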

4063: /*@C
4064:    MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJHIPSPARSE matrix is stored
4065:    Not Collective

4067:    Input Parameter:
4068: .   A - a MATSEQAIJHIPSPARSE matrix

4070:    Output Parameter:
4071: .   a - pointer to the device data

4073:    Level: developer

4075:    Notes: may trigger a host-to-device copy if the up-to-date matrix data is on the host

4077: .seealso: `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
4078: @*/
4079: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4080: {
4081:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4082:   CsrMatrix           *csr;

4088:   MatSeqAIJHIPSPARSECopyToGPU(A);
4090:   csr = (CsrMatrix *)cusp->mat->mat;
4092:   *a = csr->values->data().get();
4093:   return 0;
4094: }

4096: /*@C
4097:    MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJHIPSPARSEGetArrayRead()
4098:    Not Collective

4100:    Input Parameter:
4101: .   A - a MATSEQAIJHIPSPARSE matrix

4103:    Output Parameter:
4104: .   a - pointer to the device data

4106:    Level: developer

4108: .seealso: `MatSeqAIJHIPSPARSEGetArrayRead()`
4109: @*/
4110: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4111: {
4115:   *a = NULL;
4116:   return 0;
4117: }

4119: /*@C
4120:    MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJHIPSPARSE matrix is stored
4121:    Not Collective

4123:    Input Parameter:
4124: .   A - a MATSEQAIJHIPSPARSE matrix

4126:    Output Parameter:
4127: .   a - pointer to the device data

4129:    Level: developer

4131:    Notes: may trigger a host-to-device copy if the up-to-date matrix data is on the host

4133: .seealso: `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
4134: @*/
4135: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar **a)
4136: {
4137:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4138:   CsrMatrix           *csr;

4144:   MatSeqAIJHIPSPARSECopyToGPU(A);
4146:   csr = (CsrMatrix *)cusp->mat->mat;
4148:   *a             = csr->values->data().get();
4149:   A->offloadmask = PETSC_OFFLOAD_GPU;
4150:   MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE);
4151:   return 0;
4152: }
4153: /*@C
4154:    MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJHIPSPARSEGetArray()

4156:    Not Collective

4158:    Input Parameter:
4159: .   A - a MATSEQAIJHIPSPARSE matrix

4161:    Output Parameter:
4162: .   a - pointer to the device data

4164:    Level: developer

4166: .seealso: `MatSeqAIJHIPSPARSEGetArray()`
4167: @*/
4168: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar **a)
4169: {
4173:   MatSeqAIJInvalidateDiagonal(A);
4174:   PetscObjectStateIncrease((PetscObject)A);
4175:   *a = NULL;
4176:   return 0;
4177: }
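/* A usage sketch (not part of this source) of the read-write accessor pair above; MyDeviceKernel and nz are
   hypothetical here and stand for a user-provided HIP kernel that updates the nz stored values in place:

     PetscScalar *a;
     PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &a));
     hipLaunchKernelGGL(MyDeviceKernel, dim3((nz + 255) / 256), dim3(256), 0, 0, nz, a); // modify values on the device
     PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &a)); // increases the object state and invalidates the cached diagonal
*/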

4179: /*@C
4180:    MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJHIPSPARSE matrix is stored

4182:    Not Collective

4184:    Input Parameter:
4185: .   A - a MATSEQAIJHIPSPARSE matrix

4187:    Output Parameter:
4188: .   a - pointer to the device data

4190:    Level: developer

4192:    Notes: does not trigger a host-to-device copy; the device data is simply marked as the valid copy

4194: .seealso: `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4195: @*/
4196: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4197: {
4198:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4199:   CsrMatrix           *csr;

4206:   csr = (CsrMatrix *)cusp->mat->mat;
4208:   *a             = csr->values->data().get();
4209:   A->offloadmask = PETSC_OFFLOAD_GPU;
4210:   MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE);
4211:   return 0;
4212: }

4214: /*@C
4215:    MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJHIPSPARSEGetArrayWrite()

4217:    Not Collective

4219:    Input Parameter:
4220: .   A - a MATSEQAIJHIPSPARSE matrix

4222:    Output Parameter:
4223: .   a - pointer to the device data

4225:    Level: developer

4227: .seealso: `MatSeqAIJHIPSPARSEGetArrayWrite()`
4228: @*/
4229: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4230: {
4234:   MatSeqAIJInvalidateDiagonal(A);
4235:   PetscObjectStateIncrease((PetscObject)A);
4236:   *a = NULL;
4237:   return 0;
4238: }

4240: struct IJCompare4 {
4241:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4242:   {
4243:     if (t1.get<0>() < t2.get<0>()) return true;
4244:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4245:     return false;
4246:   }
4247: };

4249: struct Shift {
4250:   int _shift;

4252:   Shift(int shift) : _shift(shift) { }
4253:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4254: };

4256: /* merges two SeqAIJHIPSPARSE matrices A and B side by side, so each row of C is the corresponding row of A followed by that of B; C = [A, B], i.e. the [A';B']' operation in MATLAB notation */
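/* For example (a sketch): if A is 2x2 and B is 2x3, C is 2x5 and row i of C holds the nonzeros of row i of A followed
   by those of row i of B with their column indices shifted by A->cmap->n = 2, which is what the Shift(A->cmap->n)
   transform iterator below implements. */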
4257: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4258: {
4259:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4260:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4261:   Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4262:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
4263:   PetscInt                       Annz, Bnnz;
4264:   PetscInt                       i, m, n, zero = 0;

4275:   if (reuse == MAT_INITIAL_MATRIX) {
4276:     m = A->rmap->n;
4277:     n = A->cmap->n + B->cmap->n;
4278:     MatCreate(PETSC_COMM_SELF, C);
4279:     MatSetSizes(*C, m, n, m, n);
4280:     MatSetType(*C, MATSEQAIJHIPSPARSE);
4281:     c                       = (Mat_SeqAIJ *)(*C)->data;
4282:     Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4283:     Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
4284:     Ccsr                    = new CsrMatrix;
4285:     Cmat->cprowIndices      = NULL;
4286:     c->compressedrow.use    = PETSC_FALSE;
4287:     c->compressedrow.nrows  = 0;
4288:     c->compressedrow.i      = NULL;
4289:     c->compressedrow.rindex = NULL;
4290:     Ccusp->workVector       = NULL;
4291:     Ccusp->nrows            = m;
4292:     Ccusp->mat              = Cmat;
4293:     Ccusp->mat->mat         = Ccsr;
4294:     Ccsr->num_rows          = m;
4295:     Ccsr->num_cols          = n;
4296:     hipsparseCreateMatDescr(&Cmat->descr);
4297:     hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO);
4298:     hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
4299:     hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
4300:     hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
4301:     hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
4302:     hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
4303:     hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
4304:     hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
4305:     MatSeqAIJHIPSPARSECopyToGPU(A);
4306:     MatSeqAIJHIPSPARSECopyToGPU(B);

4310:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4311:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4312:     Annz                 = (PetscInt)Acsr->column_indices->size();
4313:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4314:     c->nz                = Annz + Bnnz;
4315:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4316:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4317:     Ccsr->values         = new THRUSTARRAY(c->nz);
4318:     Ccsr->num_entries    = c->nz;
4319:     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4320:     if (c->nz) {
4321:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4322:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4323:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4324:       THRUSTINTARRAY32 *Aroff, *Broff;

4326:       if (a->compressedrow.use) { /* need full row offset */
4327:         if (!Acusp->rowoffsets_gpu) {
4328:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4329:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4330:           PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
4331:         }
4332:         Aroff = Acusp->rowoffsets_gpu;
4333:       } else Aroff = Acsr->row_offsets;
4334:       if (b->compressedrow.use) { /* need full row offset */
4335:         if (!Bcusp->rowoffsets_gpu) {
4336:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4337:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4338:           PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
4339:         }
4340:         Broff = Bcusp->rowoffsets_gpu;
4341:       } else Broff = Bcsr->row_offsets;
4342:       PetscLogGpuTimeBegin();
4343:       hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO);
4344:       hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO);
4345:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4346:       auto Aperm = thrust::make_constant_iterator(1);
4347:       auto Bperm = thrust::make_constant_iterator(0);
4348:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4349:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4350:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4351:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4352:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4353:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4354:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4355:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4356:       auto p1    = Ccusp->cooPerm->begin();
4357:       auto p2    = Ccusp->cooPerm->begin();
4358:       thrust::advance(p2, Annz);
4359:       thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4());
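      /* Hypothetical example (one row; A has columns {0,2}, B has column {1}, A->cmap->n = 3):
           A stream: (0,0,a00,1) (0,2,a02,1)   B stream after the shift: (0,4,b01,0)
           merged C entries: (0,0,a00) (0,2,a02) (0,4,b01)   wPerm = {1,1,0} */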
4360:       auto cci = thrust::make_counting_iterator(zero);
4361:       auto cce = thrust::make_counting_iterator(c->nz);
4362: #if 0 // thrust::partition_copy errored on Summit with CUDA 11.1.0; use the copy_if/remove_copy_if pair below instead
4363:       thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>());
4364: #else
4365:       auto pred = thrust::identity<int>();
4366:       thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred);
4367:       thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred);
4368: #endif
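      /* cooPerm[i] (i < Annz) is now the position in C of A's i-th stored entry, and cooPerm[Annz + j] the position of
         B's j-th entry; the reuse branch below scatters fresh values of A and B into C through this map. */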
4369:       hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO);
4370:       PetscLogGpuTimeEnd();
4371:       delete wPerm;
4372:       delete Acoo;
4373:       delete Bcoo;
4374:       delete Ccoo;
4375:       hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);

4377:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B form explicit transposes, generate C's transpose too */
4378:         MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
4379:         MatSeqAIJHIPSPARSEFormExplicitTranspose(B);
4380:         PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4381:         Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4382:         CsrMatrix                     *CcsrT = new CsrMatrix;
4383:         CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4384:         CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4386:         (*C)->form_explicit_transpose = PETSC_TRUE;
4387:         (*C)->transupdated            = PETSC_TRUE;
4388:         Ccusp->rowoffsets_gpu         = NULL;
4389:         CmatT->cprowIndices           = NULL;
4390:         CmatT->mat                    = CcsrT;
4391:         CcsrT->num_rows               = n;
4392:         CcsrT->num_cols               = m;
4393:         CcsrT->num_entries            = c->nz;
4394:         CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
4395:         CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
4396:         CcsrT->values                 = new THRUSTARRAY(c->nz);

4398:         PetscLogGpuTimeBegin();
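        /* C^T is A^T stacked on top of B^T: copy A^T's row offsets, step back one slot (A^T's last offset, nnz(A),
           coincides with B^T's first shifted offset), then append B^T's offsets shifted by nnz(A); the column indices
           and values are plain concatenations. */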
4399:         auto rT = CcsrT->row_offsets->begin();
4400:         if (AT) {
4401:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4402:           thrust::advance(rT, -1);
4403:         }
4404:         if (BT) {
4405:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4406:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4407:           thrust::copy(titb, tite, rT);
4408:         }
4409:         auto cT = CcsrT->column_indices->begin();
4410:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4411:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4412:         auto vT = CcsrT->values->begin();
4413:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4414:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4415:         PetscLogGpuTimeEnd();

4417:         hipsparseCreateMatDescr(&CmatT->descr);
4418:         hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO);
4419:         hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
4420:         hipMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar));
4421:         hipMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar));
4422:         hipMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));
4423:         hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
4424:         hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
4425:         hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);

4427:         hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
4428:         Ccusp->matTranspose = CmatT;
4429:       }
4430:     }
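    /* Mirror C's sparsity pattern on the host: fill the usual SeqAIJ i/j arrays and per-row counts from the GPU CSR data;
       the numerical values are allocated but left on the GPU only (offloadmask is set to PETSC_OFFLOAD_GPU at the end). */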

4432:     c->singlemalloc = PETSC_FALSE;
4433:     c->free_a       = PETSC_TRUE;
4434:     c->free_ij      = PETSC_TRUE;
4435:     PetscMalloc1(m + 1, &c->i);
4436:     PetscMalloc1(c->nz, &c->j);
4437:     if (PetscDefined(USE_64BIT_INDICES)) { /* convert 32-bit to 64-bit indices on the GPU, then copy to the host (lazy) */
4438:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4439:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4440:       ii = *Ccsr->row_offsets;
4441:       jj = *Ccsr->column_indices;
4442:       hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
4443:       hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
4444:     } else {
4445:       hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
4446:       hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
4447:     }
4448:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
4449:     PetscMalloc1(m, &c->ilen);
4450:     PetscMalloc1(m, &c->imax);
4451:     c->maxnz         = c->nz;
4452:     c->nonzerorowcnt = 0;
4453:     c->rmax          = 0;
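    /* derive per-row lengths, the nonzero-row count, and the maximum row length from the row offsets just copied back */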
4454:     for (i = 0; i < m; i++) {
4455:       const PetscInt nn = c->i[i + 1] - c->i[i];
4456:       c->ilen[i] = c->imax[i] = nn;
4457:       c->nonzerorowcnt += (PetscInt) !!nn;
4458:       c->rmax = PetscMax(c->rmax, nn);
4459:     }
4460:     MatMarkDiagonal_SeqAIJ(*C);
4461:     PetscMalloc1(c->nz, &c->a);
4462:     (*C)->nonzerostate++;
4463:     PetscLayoutSetUp((*C)->rmap);
4464:     PetscLayoutSetUp((*C)->cmap);
4465:     Ccusp->nonzerostate = (*C)->nonzerostate;
4466:     (*C)->preallocated  = PETSC_TRUE;
4467:   } else {
4469:     c = (Mat_SeqAIJ *)(*C)->data;
4470:     if (c->nz) {
4471:       Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4475:       MatSeqAIJHIPSPARSECopyToGPU(A);
4476:       MatSeqAIJHIPSPARSECopyToGPU(B);
4479:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4480:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4481:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
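      /* Sparsity pattern unchanged from a previous call: only refresh the numerical values, scattering A's and B's current
         values into C through the stored cooPerm map (its first nnz(A) slots address A's entries in C, the rest address B's). */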
4487:       auto pmid = Ccusp->cooPerm->begin();
4488:       thrust::advance(pmid, Acsr->num_entries);
4489:       PetscLogGpuTimeBegin();
4490:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4491:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4492:       thrust::for_each(zibait, zieait, VecHIPEquals());
4493:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4494:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4495:       thrust::for_each(zibbit, ziebit, VecHIPEquals());
4496:       MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE);
4497:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
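        /* refresh C^T's values too: they are simply A^T's values followed by B^T's, matching the layout built at assembly time */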
4499:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4500:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4501:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4502:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4503:         auto       vT    = CcsrT->values->begin();
4504:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4505:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4506:         (*C)->transupdated = PETSC_TRUE;
4507:       }
4508:       PetscLogGpuTimeEnd();
4509:     }
4510:   }
4511:   PetscObjectStateIncrease((PetscObject)*C);
4512:   (*C)->assembled     = PETSC_TRUE;
4513:   (*C)->was_assembled = PETSC_FALSE;
4514:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4515:   return 0;
4516: }

4518: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4519: {
4520:   bool               dmem;
4521:   const PetscScalar *av;

4523:   dmem = isHipMem(v);
4524:   MatSeqAIJHIPSPARSEGetArrayRead(A, &av);
4525:   if (n && idx) {
4526:     THRUSTINTARRAY widx(n);
4527:     widx.assign(idx, idx + n);
4528:     PetscLogCpuToGpu(n * sizeof(PetscInt));
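    /* Gather av[idx[k]] into v for k = 0,...,n-1 using a permutation iterator: if v is device memory, gather straight into it;
       otherwise gather into the temporary device array w and copy the result back to the host. */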

4530:     THRUSTARRAY                    *w = NULL;
4531:     thrust::device_ptr<PetscScalar> dv;
4532:     if (dmem) dv = thrust::device_pointer_cast(v);
4533:     else {
4534:       w  = new THRUSTARRAY(n);
4535:       dv = w->data();
4536:     }
4537:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4539:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4540:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4541:     thrust::for_each(zibit, zieit, VecHIPEquals());
4542:     if (w) hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
4543:     delete w;
4544:   } else hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost);

4546:   if (!dmem) PetscLogGpuToCpu(n * sizeof(PetscScalar)); /* data was copied from the GPU back to host memory */
4547:   MatSeqAIJHIPSPARSERestoreArrayRead(A, &av);
4548:   return 0;
4549: }