// Actual source code: aijhipsparse.hip.cpp
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #define PETSC_SKIP_SPINLOCK
9: #include <petscconf.h>
10: #include <../src/mat/impls/aij/seq/aij.h>
11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
12: #include <../src/vec/vec/impls/dvecimpl.h>
13: #include <petsc/private/vecimpl.h>
14: #undef VecType
15: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
16: #include <thrust/adjacent_difference.h>
17: #include <thrust/iterator/transform_iterator.h>
18: #if PETSC_CPP_VERSION >= 14
19: #define PETSC_HAVE_THRUST_ASYNC 1
20: #include <thrust/async/for_each.h>
21: #endif
22: #include <thrust/iterator/constant_iterator.h>
23: #include <thrust/iterator/discard_iterator.h>
24: #include <thrust/binary_search.h>
25: #include <thrust/remove.h>
26: #include <thrust/sort.h>
27: #include <thrust/unique.h>
29: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
30: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
31: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
32: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
34: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
35: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
36: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
37: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
38: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
39: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
40: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
41: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
42: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
43: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
44: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
45: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
46: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
47: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
48: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
49: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
50: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
51: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
52: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
53: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
54: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
55: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
56: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
57: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
58: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **);
59: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
60: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
61: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
62: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
63: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
64: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
65: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
67: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat, Mat, Mat, PetscBool, PetscBool);
68: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
69: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
70: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse_band(Mat, MatFactorType, Mat *);
72: /*
73: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
74: {
75: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
78: hipsparsestruct->stream = stream;
79: hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream);
80: return 0;
81: }
83: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
84: {
85: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
88: if (hipsparsestruct->handle != handle) {
89: if (hipsparsestruct->handle) hipsparseDestroy(hipsparsestruct->handle);
90: hipsparsestruct->handle = handle;
91: }
92: hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE);
93: return 0;
94: }
96: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
97: {
98: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
99: PetscBool flg;
101: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
102: if (!flg || !hipsparsestruct) return 0;
103: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104: return 0;
105: }
106: */
108: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109: {
110: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
112: switch (op) {
113: case MAT_HIPSPARSE_MULT:
114: hipsparsestruct->format = format;
115: break;
116: case MAT_HIPSPARSE_ALL:
117: hipsparsestruct->format = format;
118: break;
119: default:
120: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
121: }
122: return 0;
123: }
125: /*@
126: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
127: operation. Only the `MatMult()` operation can use different GPU storage formats
129: Not Collective
131: Input Parameters:
132: + A - Matrix of type `MATSEQAIJHIPSPARSE`
133: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`. `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
134: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
136: Output Parameter:
138: Level: intermediate
140: .seealso: `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141: @*/
PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
{
  /* Dispatch to the type-specific implementation registered under "MatHIPSPARSESetFormat_C";
     PetscTryMethod is a no-op if A's type does not provide that method. */
  PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
  return 0;
}
149: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
150: {
151: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
153: hipsparsestruct->use_cpu_solve = use_cpu;
154: return 0;
155: }
157: /*@
158: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
160: Input Parameters:
161: + A - Matrix of type `MATSEQAIJHIPSPARSE`
162: - use_cpu - set flag for using the built-in CPU `MatSolve()`
164: Output Parameter:
166: Notes:
167: The hipSparse LU solver currently computes the factors with the built-in CPU method
168: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
Use this method to specify whether the solve is done on the CPU or on the GPU (GPU is the default).
171: Level: intermediate
173: .seealso: `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174: @*/
PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  /* Dispatch to the type-specific implementation registered under "MatHIPSPARSESetUseCPUSolve_C";
     PetscTryMethod is a no-op if A's type does not provide that method. */
  PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  return 0;
}
182: PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
183: {
184: switch (op) {
185: case MAT_FORM_EXPLICIT_TRANSPOSE:
186: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
187: if (A->form_explicit_transpose && !flg) MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
188: A->form_explicit_transpose = flg;
189: break;
190: default:
191: MatSetOption_SeqAIJ(A, op, flg);
192: break;
193: }
194: return 0;
195: }
/* Numeric LU factorization for SEQAIJHIPSPARSE: the factorization itself is done
   on the CPU (MatLUFactorNumeric_SeqAIJ); this wrapper first syncs A's values back
   from the GPU, then selects the appropriate GPU (or CPU) MatSolve variants for B,
   and finally pushes the triangular factors to the device for GPU solves. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscBool            row_identity, col_identity;
  Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
  IS                   isrow = b->row, iscol = b->col;
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

  /* A may have been modified on the GPU; the CPU factorization below reads host arrays */
  MatSeqAIJHIPSPARSECopyFromGPU(A);
  MatLUFactorNumeric_SeqAIJ(B, A, info);
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors now live (only) on the host */
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(isrow, &row_identity);
  ISIdentity(iscol, &col_identity);
  if (!hipsparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* natural ordering: no permutation vectors needed in the solve */
      B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!hipsparsestruct->use_cpu_solve) { MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B); }
  return 0;
}
227: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
228: {
229: MatHIPSPARSEStorageFormat format;
230: PetscBool flg;
231: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
233: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
234: if (A->factortype == MAT_FACTOR_NONE) {
235: PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg);
236: if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format);
237: PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg);
238: if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format);
239: PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg);
240: if (flg) MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve);
241: PetscCall(
242: PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
243: /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
245: PetscCall(
246: PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
248: /*
249: PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg);
251: */
252: }
253: PetscOptionsHeadEnd();
254: return 0;
255: }
/* Builds (first call) or refreshes (subsequent calls) the device-side lower
   triangular factor L used by the hipSPARSE triangular solve.

   The host factor (Mat_SeqAIJ a->i/a->j/a->a) stores each row's strictly-lower
   entries in a->i[i]..a->i[i+1]; L is repacked here in CSR form with an explicit
   unit diagonal appended at the end of every row. On first build the structure
   (row offsets, column indices) is created and the csrsv analysis phase is run;
   afterwards only the numerical values are re-uploaded. Work is skipped unless
   the up-to-date data lives on the CPU (offloadmask UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  const PetscInt                     *ai = a->i, *aj = a->j, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiLo, *AjLo;
  PetscInt                            i, nz, nzLower, offset, rowOffset;

  if (!n) return 0;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 contributes only its unit diagonal; rows 1..n-1 contribute ai[i+1]-ai[i] entries each, hence ai[n]-ai[1] */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;
        /* pinned host buffers so the thrust assign()s below are fast host->device copies */
        hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar));

        /* Allocate Space for the lower triangular matrix */
        hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt));
        hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append its unit diagonal */
          PetscArraycpy(&(AjLo[offset]), vi, nz);
          PetscArraycpy(&(AALo[offset]), v, nz);
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;
          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscNew(&loTriFactor);
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        hipsparseCreateMatDescr(&loTriFactor->descr);
        hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
        hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
        hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER);
        hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT);

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat                 = new CsrMatrix;
        loTriFactor->csrMat->num_rows       = n;
        loTriFactor->csrMat->num_cols       = n;
        loTriFactor->csrMat->num_entries    = nzLower;
        loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

        /* upload the CSR arrays to the device */
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
        hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo);
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        WaitForHIP();
        PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for fast numerical-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        hipHostFree(AiLo);
        hipHostFree(AjLo);
        PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar));
      } else { /* update values only */
        if (!loTriFactor->AA_h) hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscLogCpuToGpu(nzLower * sizeof(PetscScalar));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  return 0;
}
/* Builds (first call) or refreshes (subsequent calls) the device-side upper
   triangular factor U used by the hipSPARSE triangular solve.

   In the host factor, a->diag[i] indexes row i's diagonal, and the U part of
   row i occupies adiag[i+1]+1 .. adiag[i]-1 (stored back to front), so rows
   are walked from n-1 down to 0. Each row is emitted with the inverted pivot
   (1/v[nz]) first, followed by its strictly-upper entries; the diag type is
   NON_UNIT. On first build the structure is created and the csrsv analysis is
   run; afterwards only the values are re-uploaded. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  const PetscInt                     *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiUp, *AjUp;
  PetscInt                            i, nz, nzUpper, offset;

  if (!n) return 0;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;
        /* pinned host buffers so the thrust assign()s below are fast host->device copies */
        hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar));

        /* Allocate Space for the upper triangular matrix */
        hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt));
        hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* v[nz] holds the stored inverse pivot; see the factored-aij layout */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscArraycpy(&(AjUp[offset + 1]), vi, nz);
          PetscArraycpy(&(AAUp[offset + 1]), v, nz);
        }

        /* allocate space for the triangular factor information */
        PetscNew(&upTriFactor);
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        hipsparseCreateMatDescr(&upTriFactor->descr);
        hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
        hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
        hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER);
        hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT);

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat                 = new CsrMatrix;
        upTriFactor->csrMat->num_rows       = n;
        upTriFactor->csrMat->num_cols       = n;
        upTriFactor->csrMat->num_entries    = nzUpper;
        upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
        hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo);
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        WaitForHIP();
        PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for fast numerical-only updates; index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        hipHostFree(AiUp);
        hipHostFree(AjUp);
        PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar));
      } else { /* update values only */
        if (!upTriFactor->AA_h) hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  return 0;
}
/* Pushes both (I)LU triangular factors of A to the GPU (building them and running
   the csrsv analysis on first use), allocates the shared work vector for the
   two-stage solve, and caches the row/column permutations on the device when the
   orderings are not the identity. Marks A as offloaded on both host and device. */
static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscBool                      row_identity, col_identity;
  Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  IS                             isrow = a->row, iscol = a->icol;
  PetscInt                       n = A->rmap->n;

  MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A);
  MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A);

  /* scratch vector holding the intermediate result between the L- and U-solves */
  if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
  hipsparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ISIdentity(isrow, &row_identity);
  if (!row_identity && !hipsparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ISGetIndices(isrow, &r);
    hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->rpermIndices->assign(r, r + n);
    ISRestoreIndices(isrow, &r);
    PetscLogCpuToGpu(n * sizeof(PetscInt));
  }
  /* upper triangular indices */
  ISIdentity(iscol, &col_identity);
  if (!col_identity && !hipsparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ISGetIndices(iscol, &c);
    hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->cpermIndices->assign(c, c + n);
    ISRestoreIndices(iscol, &c);
    PetscLogCpuToGpu(n * sizeof(PetscInt));
  }
  return 0;
}
/* Builds (first call) or refreshes (subsequent calls) the device-side triangular
   factors for an ICC (Cholesky) factorization. Only the upper factor structure
   (AiUp/AjUp) is stored on the GPU; it is uploaded twice with different values:
   once as U (solved NON_TRANSPOSE) and once as the "lower" factor, which reuses
   the same upper-triangular pattern but is solved with OPERATION_TRANSPOSE,
   i.e. the L-solve is realized as a U^T-solve. On value-only updates just the
   two value arrays are rebuilt and re-uploaded. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  PetscInt                           *AiUp, *AjUp;
  PetscScalar                        *AAUp;
  PetscScalar                        *AALo;
  PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ views here;
     presumably the ICC factor stores its arrays in SBAIJ layout — verify against the factorization code */
  Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                     *ai = b->i, *aj = b->j, *vj;
  const MatScalar                    *aa = b->a, *v;

  if (!n) return 0;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar));
      hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt));
        hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the stored diagonal of row i; both factors get the inverted pivot there */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscArraycpy(&(AjUp[offset]), vj, nz);
            PetscArraycpy(&(AAUp[offset]), v, nz);
            /* off-diagonals: U gets -v, the (transposed) lower factor gets -v scaled by the pivot */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscNew(&upTriFactor);
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        hipsparseCreateMatDescr(&upTriFactor->descr);
        hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
        hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
        hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER);
        /* NOTE(review): declared unit-diagonal although 1/d(i) is stored at the diagonal
           positions — the solve kernels presumably skip the stored diagonal; confirm */
        hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT);

        /* set the matrix */
        upTriFactor->csrMat                 = new CsrMatrix;
        upTriFactor->csrMat->num_rows       = A->rmap->n;
        upTriFactor->csrMat->num_cols       = A->cmap->n;
        upTriFactor->csrMat->num_entries    = a->nz;
        upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
        hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo);
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        WaitForHIP();
        PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscNew(&loTriFactor);
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        hipsparseCreateMatDescr(&loTriFactor->descr);
        hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO);
        hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
        /* UPPER + TRANSPOSE below: the lower solve is performed as U^T x = b on the same sparsity pattern */
        hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER);
        hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT);

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat                 = new CsrMatrix;
        loTriFactor->csrMat->num_rows       = A->rmap->n;
        loTriFactor->csrMat->num_cols       = A->cmap->n;
        loTriFactor->csrMat->num_entries    = a->nz;
        loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
        /* same pattern as U, but the AALo values */
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
        hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo);
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        WaitForHIP();
        PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)));
        hipHostFree(AiUp);
        hipHostFree(AjUp);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscArraycpy(&(AAUp[offset]), v, nz);
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar));
      }
      hipHostFree(AAUp);
      hipHostFree(AALo);
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  return 0;
}
708: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
709: {
710: PetscBool perm_identity;
711: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
712: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
713: IS ip = a->row;
714: PetscInt n = A->rmap->n;
717: MatSeqAIJHIPSPARSEBuildICCTriMatrices(A);
718: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
719: hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
721: A->offloadmask = PETSC_OFFLOAD_BOTH;
722: /* lower triangular indices */
723: ISIdentity(ip, &perm_identity);
724: if (!perm_identity) {
725: IS iip;
726: const PetscInt *irip, *rip;
728: ISInvertPermutation(ip, PETSC_DECIDE, &iip);
729: ISGetIndices(iip, &irip);
730: ISGetIndices(ip, &rip);
731: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
732: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
733: hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
734: hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
735: ISRestoreIndices(iip, &irip);
736: ISDestroy(&iip);
737: ISRestoreIndices(ip, &rip);
738: PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
739: }
740: return 0;
741: }
743: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
744: {
745: PetscBool perm_identity;
746: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
747: IS ip = b->row;
749: MatSeqAIJHIPSPARSECopyFromGPU(A);
750: MatCholeskyFactorNumeric_SeqAIJ(B, A, info);
751: B->offloadmask = PETSC_OFFLOAD_CPU;
752: /* determine which version of MatSolve needs to be used. */
753: ISIdentity(ip, &perm_identity);
754: if (perm_identity) {
755: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
756: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
757: B->ops->matsolve = NULL;
758: B->ops->matsolvetranspose = NULL;
759: } else {
760: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
761: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
762: B->ops->matsolve = NULL;
763: B->ops->matsolvetranspose = NULL;
764: }
766: /* get the triangular factors */
767: MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B);
768: return 0;
769: }
771: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
772: {
773: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
774: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
775: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
776: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
777: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
778: hipsparseIndexBase_t indexBase;
779: hipsparseMatrixType_t matrixType;
780: hipsparseFillMode_t fillMode;
781: hipsparseDiagType_t diagType;
783: /* allocate space for the transpose of the lower triangular factor */
784: PetscNew(&loTriFactorT);
785: loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
787: /* set the matrix descriptors of the lower triangular factor */
788: matrixType = hipsparseGetMatType(loTriFactor->descr);
789: indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
790: fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
791: diagType = hipsparseGetMatDiagType(loTriFactor->descr);
793: /* Create the matrix description */
794: hipsparseCreateMatDescr(&loTriFactorT->descr);
795: hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase);
796: hipsparseSetMatType(loTriFactorT->descr, matrixType);
797: hipsparseSetMatFillMode(loTriFactorT->descr, fillMode);
798: hipsparseSetMatDiagType(loTriFactorT->descr, diagType);
800: /* set the operation */
801: loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
803: /* allocate GPU space for the CSC of the lower triangular factor*/
804: loTriFactorT->csrMat = new CsrMatrix;
805: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
806: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
807: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
808: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
809: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
810: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
812: /* compute the transpose of the lower triangular factor, i.e. the CSC */
813: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
814: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
815: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
816: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
817: loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
818: hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize);
819: #endif
820: */
821: PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
823: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows,
824: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
825: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
826: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
827: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
828: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
829: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
830: #else
831: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
832: #endif
834: WaitForHIP();
835: PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
837: /* Create the solve analysis information */
838: PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
839: hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo);
840: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp,
841: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
842: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
843: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
844: &loTriFactorT->solveBufferSize));
845: hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize);
847: /* perform the solve analysis */
848: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp,
849: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
850: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
851: loTriFactorT->csrMat->column_indices->data().get(),
852: loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
854: WaitForHIP();
855: PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
857: /* assign the pointer */
858: ((Mat_SeqAIJHIPSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
860: /*********************************************/
861: /* Now the Transpose of the Upper Tri Factor */
862: /*********************************************/
864: /* allocate space for the transpose of the upper triangular factor */
865: PetscNew(&upTriFactorT);
866: upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
868: /* set the matrix descriptors of the upper triangular factor */
869: matrixType = hipsparseGetMatType(upTriFactor->descr);
870: indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
871: fillMode = hipsparseGetMatFillMode(upTriFactor->descr)== HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
872: diagType = hipsparseGetMatDiagType(upTriFactor->descr);
874: /* Create the matrix description */
875: hipsparseCreateMatDescr(&upTriFactorT->descr);
876: hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase);
877: hipsparseSetMatType(upTriFactorT->descr, matrixType);
878: hipsparseSetMatFillMode(upTriFactorT->descr, fillMode);
879: hipsparseSetMatDiagType(upTriFactorT->descr, diagType);
881: /* set the operation */
882: upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
884: /* allocate GPU space for the CSC of the upper triangular factor*/
885: upTriFactorT->csrMat = new CsrMatrix;
886: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
887: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
888: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
889: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
890: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
891: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
893: /* compute the transpose of the upper triangular factor, i.e. the CSC */
894: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
895: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
896: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
897: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
898: upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
899: hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize);
900: #endif
901: */
902: PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
903: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows,
904: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
905: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
906: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
907: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
908: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
909: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
910: #else
911: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
912: #endif
914: WaitForHIP();
915: PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
917: /* Create the solve analysis information */
918: PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
919: hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo);
920: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp,
921: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
922: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
923: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
924: &upTriFactorT->solveBufferSize));
925: hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize);
927: /* perform the solve analysis */
928: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp,
929: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
930: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
931: upTriFactorT->csrMat->column_indices->data().get(),
932: upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
934: WaitForHIP();
935: PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0);
937: /* assign the pointer */
938: ((Mat_SeqAIJHIPSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
939: return 0;
940: }
942: struct PetscScalarToPetscInt {
943: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
944: };
/* Build (or refresh) an explicit transpose of the matrix on the GPU and cache
   it in hipsparsestruct->matTranspose. The structure (row offsets / column
   indices) is created once; on later calls only the values are refreshed via
   the cached csr2csc_i permutation. */
static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  hipsparseIndexBase_t indexBase;

  /* make sure the host data is on the device before transposing it */
  MatSeqAIJHIPSPARSECopyToGPU(A);
  matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;

  matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;

  /* cached transpose is current: nothing to do */
  if (A->transupdated) return 0;
  PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
  PetscLogGpuTimeBegin();
  /* non-CSR formats rebuild the transpose from scratch */
  if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
  if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
    matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
    hipsparseCreateMatDescr(&matstructT->descr);
    indexBase = hipsparseGetMatIndexBase(matstruct->descr);
    hipsparseSetMatIndexBase(matstructT->descr, indexBase);
    hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);

    /* set alpha and beta (device-resident scalars used by SpMV calls) */
    hipMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));
    hipMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar));
    hipMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));
    hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
    hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
    hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);

    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
      /* allocate the transpose with row/column dimensions swapped */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* 32-bit copy of the host row offsets, needed by csr2csc below */
      if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

      PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                                            indexBase, hipsparse_scalartype));
    } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
      /* HYB/ELL has no direct transpose: go HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                                           tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

      /* Last, convert CSC to HYB */
      hipsparseHybMat_t hybMat;
      hipsparseCreateHybMat(&hybMat);
      hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
      hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
    }
  }
  if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
    }
    if (!hipsparsestruct->csr2csc_i) {
      /* run csr2csc once on the sequence 0,1,2,...: the transposed "values"
         are then the CSR->CSC permutation, which we cache as csr2csc_i so
         later value refreshes are a single permuted copy */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0);

      indexBase = hipsparseGetMatIndexBase(matstruct->descr);
      if (matrix->num_entries) {
        /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
           Need to verify this for ROCm.
        */
        PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
      } else {
        /* empty matrix: row offsets are all just the index base */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt());
    }
    /* refresh the transpose's values by gathering through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscLogGpuTimeEnd();
  PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  return 0;
}
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
/* Transposed triangular solve with row/column permutation: permute b, solve
   with the transposed upper factor then the transposed lower factor, and
   apply the column permutation to produce x. The transposed factors are
   built lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecHIPGetArrayWrite(xx, &xarray);
  VecHIPGetArrayRead(bb, &barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation: x <- b[rperm] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U: x -> tempGPU */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: tempGPU -> x */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  VecHIPRestoreArrayRead(bb, &barray);
  VecHIPRestoreArrayWrite(xx, &xarray);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
  return 0;
}
1139: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1140: {
1141: const PetscScalar *barray;
1142: PetscScalar *xarray;
1143: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1144: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1145: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1146: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1148: /* Analyze the matrix and create the transpose ... on the fly */
1149: if (!loTriFactorT && !upTriFactorT) {
1150: MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A);
1151: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1152: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1153: }
1155: /* Get the GPU pointers */
1156: VecHIPGetArrayWrite(xx, &xarray);
1157: VecHIPGetArrayRead(bb, &barray);
1159: PetscLogGpuTimeBegin();
1160: /* First, solve U */
1161: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1162: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1164: /* Then, solve L */
1165: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1166: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1168: /* restore */
1169: VecHIPRestoreArrayRead(bb, &barray);
1170: VecHIPRestoreArrayWrite(xx, &xarray);
1171: PetscLogGpuTimeEnd();
1172: PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
1173: return 0;
1174: }
/* Triangular solve with row/column permutation: permute b into the work
   vector, solve with L then U, and scatter through the column permutation
   into x. Used when the factorization ordering is not the identity. */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  /* Get the GPU pointers */
  VecHIPGetArrayWrite(xx, &xarray);
  VecHIPGetArrayRead(bb, &barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation: tempGPU <- b[rperm] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                           loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: x -> tempGPU */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                           upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

  VecHIPRestoreArrayRead(bb, &barray);
  VecHIPRestoreArrayWrite(xx, &xarray);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
  return 0;
}
1215: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1216: {
1217: const PetscScalar *barray;
1218: PetscScalar *xarray;
1219: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1220: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1221: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1222: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1224: /* Get the GPU pointers */
1225: VecHIPGetArrayWrite(xx, &xarray);
1226: VecHIPGetArrayRead(bb, &barray);
1228: PetscLogGpuTimeBegin();
1229: /* First, solve L */
1230: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1231: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1233: /* Next, solve U */
1234: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1235: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1237: VecHIPRestoreArrayRead(bb, &barray);
1238: VecHIPRestoreArrayWrite(xx, &xarray);
1239: PetscLogGpuTimeEnd();
1240: PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n);
1241: return 0;
1242: }
1244: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1245: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
1246: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1247: {
1248: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1249: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1250: const PetscScalar *barray;
1251: PetscScalar *xarray;
1253: VecHIPGetArrayWrite(x, &xarray);
1254: VecHIPGetArrayRead(b, &barray);
1255: PetscLogGpuTimeBegin();
1257: /* Solve L*y = b */
1258: hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1259: hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1260: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1261: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1263: /* Solve U*x = y */
1264: hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1265: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1266: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1268: VecHIPRestoreArrayRead(b, &barray);
1269: VecHIPRestoreArrayWrite(x, &xarray);
1271: PetscLogGpuTimeEnd();
1272: PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1273: return 0;
1274: }
1276: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1277: {
1278: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1279: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1280: const PetscScalar *barray;
1281: PetscScalar *xarray;
1283: if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1284: hipsparseSpSV_createDescr(&fs->spsvDescr_Lt);
1285: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1286: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1288: hipsparseSpSV_createDescr(&fs->spsvDescr_Ut);
1289: hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut);
1290: hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1291: hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut);
1292: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1293: }
1295: if (!fs->updatedTransposeSpSVAnalysis) {
1296: hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);
1298: hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut);
1299: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1300: }
1302: VecHIPGetArrayWrite(x, &xarray);
1303: VecHIPGetArrayRead(b, &barray);
1304: PetscLogGpuTimeBegin();
1306: /* Solve Ut*y = b */
1307: hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1308: hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1309: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1310: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1312: /* Solve Lt*x = y */
1313: hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1314: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1315: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1317: VecHIPRestoreArrayRead(b, &barray);
1318: VecHIPRestoreArrayWrite(x, &xarray);
1319: PetscLogGpuTimeEnd();
1320: PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1321: return 0;
1322: }
1324: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1325: {
1326: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1327: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1328: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1329: CsrMatrix *Acsr;
1330: PetscInt m, nz;
1331: PetscBool flg;
1333: if (PetscDefined(USE_DEBUG)) {
1334: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1336: }
1338: /* Copy A's value to fact */
1339: m = fact->rmap->n;
1340: nz = aij->nz;
1341: MatSeqAIJHIPSPARSECopyToGPU(A);
1342: Acsr = (CsrMatrix *)Acusp->mat->mat;
1343: hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1345: /* Factorize fact inplace */
1346: if (m)
1347: PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1348: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1349: if (PetscDefined(USE_DEBUG)) {
1350: int numerical_zero;
1351: hipsparseStatus_t status;
1352: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1353: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1354: }
1356: /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1357: hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);
1359: hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U);
1361: /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1362: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1364: fact->offloadmask = PETSC_OFFLOAD_GPU;
1365: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0;
1366: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1367: fact->ops->matsolve = NULL;
1368: fact->ops->matsolvetranspose = NULL;
1369: PetscLogGpuFlops(fs->numericFactFlops);
1370: return 0;
1371: }
1373: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1374: {
1375: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1376: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1377: PetscInt m, nz;
1379: if (PetscDefined(USE_DEBUG)) {
1380: PetscInt i;
1381: PetscBool flg, missing;
1383: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1386: MatMissingDiagonal(A, &missing, &i);
1388: }
1390: /* Free the old stale stuff */
1391: MatSeqAIJHIPSPARSETriFactors_Reset(&fs);
1393: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1394: but they will not be used. Allocate them just for easy debugging.
1395: */
1396: MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);
1398: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1399: fact->factortype = MAT_FACTOR_ILU;
1400: fact->info.factor_mallocs = 0;
1401: fact->info.fill_ratio_given = info->fill;
1402: fact->info.fill_ratio_needed = 1.0;
1404: aij->row = NULL;
1405: aij->col = NULL;
1407: /* ====================================================================== */
1408: /* Copy A's i, j to fact and also allocate the value array of fact. */
1409: /* We'll do in-place factorization on fact */
1410: /* ====================================================================== */
1411: const int *Ai, *Aj;
1413: m = fact->rmap->n;
1414: nz = aij->nz;
1416: hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1417: hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1418: hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1419: MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1420: hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1421: hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1423: /* ====================================================================== */
1424: /* Create descriptors for M, L, U */
1425: /* ====================================================================== */
1426: hipsparseFillMode_t fillMode;
1427: hipsparseDiagType_t diagType;
1429: hipsparseCreateMatDescr(&fs->matDescr_M);
1430: hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO);
1431: hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL);
1433: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1434: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1435: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1436: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1437: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1438: */
1439: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1440: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1441: hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
1442: hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1443: hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));
1445: fillMode = HIPSPARSE_FILL_MODE_UPPER;
1446: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1447: hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
1448: hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1449: hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));
1451: /* ========================================================================= */
1452: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1453: /* ========================================================================= */
1454: hipsparseCreateCsrilu02Info(&fs->ilu0Info_M);
1455: if (m)
1456: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1457: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1459: hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1460: hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);
1462: hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype);
1463: hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype);
1465: hipsparseSpSV_createDescr(&fs->spsvDescr_L);
1466: hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);
1468: hipsparseSpSV_createDescr(&fs->spsvDescr_U);
1469: hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U);
1471: /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1472: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1473: */
1474: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1475: hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1476: fs->spsvBuffer_L = fs->factBuffer_M;
1477: hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U);
1478: } else {
1479: hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M));
1480: fs->spsvBuffer_U = fs->factBuffer_M;
1481: hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1482: }
1484: /* ========================================================================== */
1485: /* Perform analysis of ilu0 on M, SpSv on L and U */
1486: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1487: /* ========================================================================== */
1488: int structural_zero;
1490: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1491: if (m)
1492: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1493: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1494: if (PetscDefined(USE_DEBUG)) {
1495: /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1496: hipsparseStatus_t status;
1497: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1499: }
1501: /* Estimate FLOPs of the numeric factorization */
1502: {
1503: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1504: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1505: PetscLogDouble flops = 0.0;
1507: MatMarkDiagonal_SeqAIJ(A);
1508: Ai = Aseq->i;
1509: Adiag = Aseq->diag;
1510: for (PetscInt i = 0; i < m; i++) {
1511: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1512: nzRow = Ai[i + 1] - Ai[i];
1513: nzLeft = Adiag[i] - Ai[i];
1514: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1515: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1516: */
1517: nzLeft = (nzRow - 1) / 2;
1518: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1519: }
1520: }
1521: fs->numericFactFlops = flops;
1522: }
1523: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1524: return 0;
1525: }
1527: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1528: {
1529: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1530: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1531: const PetscScalar *barray;
1532: PetscScalar *xarray;
1534: VecHIPGetArrayWrite(x, &xarray);
1535: VecHIPGetArrayRead(b, &barray);
1536: PetscLogGpuTimeBegin();
1538: /* Solve L*y = b */
1539: hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1540: hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1541: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1542: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1544: /* Solve Lt*x = y */
1545: hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1546: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1547: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1549: VecHIPRestoreArrayRead(b, &barray);
1550: VecHIPRestoreArrayWrite(x, &xarray);
1552: PetscLogGpuTimeEnd();
1553: PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1554: return 0;
1555: }
1557: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1558: {
1559: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1560: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1561: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1562: CsrMatrix *Acsr;
1563: PetscInt m, nz;
1564: PetscBool flg;
1566: if (PetscDefined(USE_DEBUG)) {
1567: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1569: }
1571: /* Copy A's value to fact */
1572: m = fact->rmap->n;
1573: nz = aij->nz;
1574: MatSeqAIJHIPSPARSECopyToGPU(A);
1575: Acsr = (CsrMatrix *)Acusp->mat->mat;
1576: hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1578: /* Factorize fact inplace */
1579: /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1580: The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1581: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1582: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1583: */
1584: if (m) hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1585: if (PetscDefined(USE_DEBUG)) {
1586: int numerical_zero;
1587: hipsparseStatus_t status;
1588: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1589: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1590: }
1592: hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);
1594: /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1595: ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1596: */
1597: hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);
1599: fact->offloadmask = PETSC_OFFLOAD_GPU;
1600: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0;
1601: fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0;
1602: fact->ops->matsolve = NULL;
1603: fact->ops->matsolvetranspose = NULL;
1604: PetscLogGpuFlops(fs->numericFactFlops);
1605: return 0;
1606: }
1608: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1609: {
1610: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1611: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1612: PetscInt m, nz;
1614: if (PetscDefined(USE_DEBUG)) {
1615: PetscInt i;
1616: PetscBool flg, missing;
1618: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
1621: MatMissingDiagonal(A, &missing, &i);
1623: }
1625: /* Free the old stale stuff */
1626: MatSeqAIJHIPSPARSETriFactors_Reset(&fs);
1628: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1629: but they will not be used. Allocate them just for easy debugging.
1630: */
1631: MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);
1633: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1634: fact->factortype = MAT_FACTOR_ICC;
1635: fact->info.factor_mallocs = 0;
1636: fact->info.fill_ratio_given = info->fill;
1637: fact->info.fill_ratio_needed = 1.0;
1639: aij->row = NULL;
1640: aij->col = NULL;
1642: /* ====================================================================== */
1643: /* Copy A's i, j to fact and also allocate the value array of fact. */
1644: /* We'll do in-place factorization on fact */
1645: /* ====================================================================== */
1646: const int *Ai, *Aj;
1648: m = fact->rmap->n;
1649: nz = aij->nz;
1651: hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1652: hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1653: hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1654: MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1655: hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1656: hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream);
1658: /* ====================================================================== */
1659: /* Create mat descriptors for M, L */
1660: /* ====================================================================== */
1661: hipsparseFillMode_t fillMode;
1662: hipsparseDiagType_t diagType;
1664: hipsparseCreateMatDescr(&fs->matDescr_M);
1665: hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO);
1666: hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL);
1668: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1669: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1670: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1671: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1672: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1673: */
1674: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1675: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1676: hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
1677: hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1678: hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));
1680: /* ========================================================================= */
1681: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1682: /* ========================================================================= */
1683: hipsparseCreateCsric02Info(&fs->ic0Info_M);
1684: if (m) hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M);
1686: hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1687: hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);
1689: hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype);
1690: hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype);
1692: hipsparseSpSV_createDescr(&fs->spsvDescr_L);
1693: hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);
1695: hipsparseSpSV_createDescr(&fs->spsvDescr_Lt);
1696: hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt);
1698: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1699: See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1700: */
1701: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1702: hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1703: fs->spsvBuffer_L = fs->factBuffer_M;
1704: hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1705: } else {
1706: hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M));
1707: fs->spsvBuffer_Lt = fs->factBuffer_M;
1708: hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1709: }
1711: /* ========================================================================== */
1712: /* Perform analysis of ic0 on M */
1713: /* The lower triangular part of M has the same sparsity pattern as L */
1714: /* ========================================================================== */
1715: int structural_zero;
1717: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1718: if (m) hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1719: if (PetscDefined(USE_DEBUG)) {
1720: hipsparseStatus_t status;
1721: /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1722: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1724: }
1726: /* Estimate FLOPs of the numeric factorization */
1727: {
1728: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1729: PetscInt *Ai, nzRow, nzLeft;
1730: PetscLogDouble flops = 0.0;
1732: Ai = Aseq->i;
1733: for (PetscInt i = 0; i < m; i++) {
1734: nzRow = Ai[i + 1] - Ai[i];
1735: if (nzRow > 1) {
1736: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1737: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1738: */
1739: nzLeft = (nzRow - 1) / 2;
1740: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1741: }
1742: }
1743: fs->numericFactFlops = flops;
1744: }
1745: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1746: return 0;
1747: }
1748: #endif
1750: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1751: {
1752: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1754: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1755: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1756: if (hipsparseTriFactors->factorizeOnDevice) {
1757: ISIdentity(isrow, &row_identity);
1758: ISIdentity(iscol, &col_identity);
1759: }
1760: if (!info->levels && row_identity && col_identity) MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info);
1761: else
1762: #endif
1763: {
1764: MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1765: MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1766: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1767: }
1768: return 0;
1769: }
1771: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1772: {
1773: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1775: MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1776: MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1777: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1778: return 0;
1779: }
1781: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1782: {
1783: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1785: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1786: PetscBool perm_identity = PETSC_FALSE;
1787: if (hipsparseTriFactors->factorizeOnDevice) ISIdentity(perm, &perm_identity);
1788: if (!info->levels && perm_identity) MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info);
1789: else
1790: #endif
1791: {
1792: MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1793: MatICCFactorSymbolic_SeqAIJ(B, A, perm, info);
1794: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1795: }
1796: return 0;
1797: }
1799: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1800: {
1801: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1803: MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors);
1804: MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info);
1805: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1806: return 0;
1807: }
1809: PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1810: {
1811: *type = MATSOLVERHIPSPARSE;
1812: return 0;
1813: }
1815: /*MC
1816: MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for seq matrices
1817: on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1820: HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1821: algorithms are not recommended. This class does NOT support direct solver operations.
1823: Level: beginner
1825: .seealso: `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1826: M*/
/* MatGetFactor() backend for the "hipsparse" solver type on MATSEQAIJHIPSPARSE matrices.

   Creates the (square, n x n) factor matrix B, installs the symbolic-factorization
   callbacks appropriate for the requested factor type, and records whether the
   numeric factorization should run on the host or on the device (user-selectable
   via -mat_factor_bind_factorization; default "device"). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  MatCreate(PetscObjectComm((PetscObject)A), B);
  MatSetSizes(*B, n, n, n, n);
  (*B)->factortype = ftype;
  MatSetType(*B, MATSEQAIJHIPSPARSE);

  /* query the user's choice of factorization location under the factor's options prefix
     (fall back to A's prefix when the factor has none) */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL);
  PetscOptionsEnd();
  PetscStrcasecmp("device", factPlace, &factOnDevice);
  PetscStrcasecmp("host", factPlace, &factOnHost);

  /* spptr was allocated by MatSetType(MATSEQAIJHIPSPARSE) above; stash the choice there */
  ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) MatBindToCPU(*B, PETSC_TRUE);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    MatSetBlockSizesFromMats(*B, A, A);
    /* CPU-bound matrices use the plain SeqAIJ symbolic routines; otherwise the HIPSPARSE ones */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete variants */
    PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
    PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]);
    PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
    PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]);
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

  /* skip host-side preallocation; factorization allocates storage itself */
  MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
  (*B)->canuseordering = PETSC_TRUE;
  PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse);
  return 0;
}
/* Copy the matrix values from the device back to the host CSR arrays.

   Only acts when the up-to-date copy lives exclusively on the GPU
   (offloadmask == PETSC_OFFLOAD_GPU); afterwards both copies are valid
   (PETSC_OFFLOAD_BOTH). Only values are transferred -- the sparsity
   pattern is assumed unchanged on the device. */
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  /* spptr holds tri-factors (not a Mat_SeqAIJHIPSPARSE) when A is a factored matrix */
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
#endif

  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0);
    if (A->factortype == MAT_FACTOR_NONE) {
      /* unfactored matrix: values live in the CSR structure managed by hipsparsestruct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost);
    }
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost);
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscLogGpuToCpu(a->nz * sizeof(PetscScalar));
    PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
1909: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1910: {
1911: MatSeqAIJHIPSPARSECopyFromGPU(A);
1912: *array = ((Mat_SeqAIJ *)A->data)->a;
1913: return 0;
1914: }
1916: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1917: {
1918: A->offloadmask = PETSC_OFFLOAD_CPU;
1919: *array = NULL;
1920: return 0;
1921: }
1923: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1924: {
1925: MatSeqAIJHIPSPARSECopyFromGPU(A);
1926: *array = ((Mat_SeqAIJ *)A->data)->a;
1927: return 0;
1928: }
1930: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1931: {
1932: *array = NULL;
1933: return 0;
1934: }
1936: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1937: {
1938: *array = ((Mat_SeqAIJ *)A->data)->a;
1939: return 0;
1940: }
1942: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1943: {
1944: A->offloadmask = PETSC_OFFLOAD_CPU;
1945: *array = NULL;
1946: return 0;
1947: }
1949: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1950: {
1951: Mat_SeqAIJHIPSPARSE *cusp;
1952: CsrMatrix *matrix;
1954: MatSeqAIJHIPSPARSECopyToGPU(A);
1956: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1958: matrix = (CsrMatrix *)cusp->mat->mat;
1960: if (i) {
1961: #if !defined(PETSC_USE_64BIT_INDICES)
1962: *i = matrix->row_offsets->data().get();
1963: #else
1964: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
1965: #endif
1966: }
1967: if (j) {
1968: #if !defined(PETSC_USE_64BIT_INDICES)
1969: *j = matrix->column_indices->data().get();
1970: #else
1971: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
1972: #endif
1973: }
1974: if (a) *a = matrix->values->data().get();
1975: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
1976: return 0;
1977: }
/* Copy (or build) the device-side representation of a SeqAIJ matrix.

   Two paths:
   - nonzero pattern unchanged and CSR format: upload values only (cheap);
   - otherwise: destroy and rebuild the whole device structure (CSR, or
     ELL/HYB converted from a temporary CSR), including the compressed-row
     index vector when the host matrix uses compressed-row storage.

   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host
   has no value array yet (a->a == NULL), in which case only the pattern was
   uploaded and the mask is left as-is. */
PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
  Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
  PetscBool                      both            = PETSC_TRUE;
  PetscInt                       m = A->rmap->n, *ii, *ridx, tmp;

  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

      PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
      matrix->values->assign(a->a, a->a + a->nz);
      WaitForHIP();
      PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar));
      PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
      /* values changed, so any cached explicit transpose values are stale */
      MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE);
    } else {
      PetscInt nnz;
      PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
      /* pattern changed (or non-CSR format): rebuild the device structure from scratch */
      MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format);
      MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
      delete hipsparsestruct->workVector;
      delete hipsparsestruct->rowoffsets_gpu;
      hipsparsestruct->workVector     = NULL;
      hipsparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick row structure: compressed rows skip empty rows entirely */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }

        /* no host values yet: upload pattern only, don't claim both copies are valid */
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;

        /* create hipsparse matrix */
        hipsparsestruct->nrows = m;
        matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
        hipsparseCreateMatDescr(&matstruct->descr);
        hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO);
        hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);

        /* device-resident scalar constants (1, 0, 1) used with HIPSPARSE_POINTER_MODE_DEVICE */
        hipMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));
        hipMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar));
        hipMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));
        hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
        hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
        hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
        hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values      = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
          if (mat->num_rows) { /* hipsparse errors on empty matrices! */
            PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                                  HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
          }
        } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
          /* build a temporary CSR, convert it to HYB/ELL on the device, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values      = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          hipsparseHybMat_t hybMat;
          hipsparseCreateHybMat(&hybMat);
          hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
          hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          hipsparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices     = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          hipsparsestruct->workVector = NULL;
          matstruct->cprowIndices     = NULL;
          tmp                         = 0;
        }
        PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar));

        /* assign the pointer */
        hipsparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
      }
      WaitForHIP();
      PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0);
      hipsparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
2115: struct VecHIPPlusEquals {
2116: template <typename Tuple>
2117: __host__ __device__ void operator()(Tuple t)
2118: {
2119: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2120: }
2121: };
/* Thrust functor: copy element 0 of the zipped tuple into element 1 (y = x). */
struct VecHIPEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
/* Thrust functor: copy element 1 of the zipped tuple into element 0 (x = y),
   i.e. the reverse direction of VecHIPEquals. */
struct VecHIPEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
/* Per-product workspace attached to C->product->data for sparse matrix-matrix
   products (SpMM/SpGEMM) computed through hipSPARSE. Freed by MatDestroy_MatMatHipsparse(). */
struct MatMatHipsparse {
  PetscBool      cisdense;  /* was C originally MATSEQDENSE (CPU)? if so, convert back after the product */
  PetscScalar   *Bt;        /* device buffer for an explicit transpose of B, when needed */
  Mat            X;         /* intermediate dense result for PtAP/RARt (X = A*P or A*R^T) */
  PetscBool      reusesym;  /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;     /* flop count precomputed in the symbolic phase, logged in numeric */
  CsrMatrix     *Bcsr;      /* uncompressed view of B's CSR when B uses compressed-row storage */
  hipsparseSpMatDescr_t matSpBDescr; /* sparse descriptor for Bcsr above */
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  hipsparseDnMatDescr_t matBDescr;   /* dense descriptor for B (SpMM path) */
  hipsparseDnMatDescr_t matCDescr;   /* dense descriptor for C or X (SpMM path) */
  PetscInt              Blda, Clda;  /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  void *dBuffer4, *dBuffer5; /* buffers kept alive for hipsparseSpGEMMreuse_compute */
#endif
  size_t mmBufferSize;
  void  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  hipsparseSpGEMMDescr_t spgemmDesc;
};
/* Destructor for the MatMatHipsparse workspace (installed as C->product->destroy).
   Releases every device buffer, hipSPARSE descriptor, and the intermediate
   matrix X; finally frees the struct itself. hipFree(NULL) is a no-op, so the
   unconditional frees are safe. */
static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
{
  MatMatHipsparse *mmdata = (MatMatHipsparse *)data;

  hipFree(mmdata->Bt);
  delete mmdata->Bcsr;
  if (mmdata->matSpBDescr) hipsparseDestroySpMat(mmdata->matSpBDescr);
  if (mmdata->matBDescr) hipsparseDestroyDnMat(mmdata->matBDescr);
  if (mmdata->matCDescr) hipsparseDestroyDnMat(mmdata->matCDescr);
  if (mmdata->spgemmDesc) hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc);
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  if (mmdata->dBuffer4) hipFree(mmdata->dBuffer4);
  if (mmdata->dBuffer5) hipFree(mmdata->dBuffer5);
#endif
  if (mmdata->mmBuffer) hipFree(mmdata->mmBuffer);
  if (mmdata->mmBuffer2) hipFree(mmdata->mmBuffer2);
  MatDestroy(&mmdata->X);
  PetscFree(data);
  return 0;
}
/* Numeric phase of C = op(A) * op(B) with A sparse (SeqAIJHIPSPARSE) and B dense.
   Handles AB, AtB, ABt directly via hipsparseSpMM; PtAP and RARt first compute
   the intermediate X = A*B (resp. A*B^T) and then finish with a dense-dense
   multiply. Dense descriptors are cached in mmdata and rebuilt only when a
   leading dimension changes. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  PetscInt                       m, n, blda, clda;
  PetscBool                      flg, biship;
  Mat_SeqAIJHIPSPARSE           *cusp;
  hipsparseOperation_t           opA;
  const PetscScalar             *barray;
  PetscScalar                   *carray;
  MatMatHipsparse               *mmdata;
  Mat_SeqAIJHIPSPARSEMultStruct *mat;
  CsrMatrix                     *csrmat;

  MatCheckProduct(C, 1);
  mmdata = (MatMatHipsparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  MatSeqAIJHIPSPARSECopyToGPU(A);
  cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  /* pick which stored matrix (A or its explicit transpose) and which hipSPARSE op to use,
     plus the result dimensions m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either ask hipSPARSE to transpose on the fly, or use a stored explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = HIPSPARSE_OPERATION_TRANSPOSE;
    } else {
      MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
      mat = cusp->matTranspose;
      opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship);
  if (!biship) { MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B); }
  MatDenseHIPGetArrayRead(B, &barray);
  MatDenseGetLDA(B, &blda);
  /* PtAP/RARt write into the intermediate X first; everything else writes into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    MatDenseHIPGetArrayWrite(mmdata->X, &carray);
    MatDenseGetLDA(mmdata->X, &clda);
  } else {
    MatDenseHIPGetArrayWrite(C, &carray);
    MatDenseGetLDA(C, &clda);
  }

  PetscLogGpuTimeBegin();
  hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      hipsparseDestroyDnMat(mmdata->matBDescr);
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COLUMN);
      mmdata->Blda = blda;
    }
    if (mmdata->initialized && mmdata->Clda != clda) {
      hipsparseDestroyDnMat(mmdata->matCDescr);
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COLUMN);
      mmdata->Clda = clda;
    }
    if (!mat->matDescr) {
      PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                            HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
    }
    hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    /* only regrow the workspace; a larger-than-needed buffer is fine */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      hipFree(mmdata->mmBuffer);
      hipMalloc(&mmdata->mmBuffer, mmBufferSize);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get());
    hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray);
    hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray);
  }

  /* do hipsparseSpMM, which supports transpose on B */
  hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);

  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(n * 2.0 * csrmat->num_entries);
  MatDenseHIPRestoreArrayRead(B, &barray);
  /* finish PtAP/RARt with a dense-dense multiply against B; restore the write access on the output */
  if (product->type == MATPRODUCT_RARt) {
    MatDenseHIPRestoreArrayWrite(mmdata->X, &carray);
    MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE);
  } else if (product->type == MATPRODUCT_PtAP) {
    MatDenseHIPRestoreArrayWrite(mmdata->X, &carray);
    MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE);
  } else MatDenseHIPRestoreArrayWrite(C, &carray);
  /* convert back to the types the user originally supplied */
  if (mmdata->cisdense) MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C);
  if (!biship) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
  return 0;
}
/* Symbolic phase of C = op(A) * op(B) with A sparse and B dense: compute the
   result sizes, set C's type to dense-on-GPU, create the per-product workspace
   (including the intermediate X for PtAP/RARt), and install the numeric callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
{
  Mat_Product         *product = C->product;
  Mat                  A, B;
  PetscInt             m, n;
  PetscBool            cisdense, flg;
  MatMatHipsparse     *mmdata;
  Mat_SeqAIJHIPSPARSE *cusp;

  MatCheckProduct(C, 1);
  A = product->A;
  B = product->B;
  PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
  cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  MatSetSizes(C, m, n, m, n);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense);
  MatSetType(C, MATSEQDENSEHIP);

  /* product data */
  PetscNew(&mmdata);
  mmdata->cisdense = cisdense;
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X);
    MatSetType(mmdata->X, MATSEQDENSEHIP);
    /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n);
    else MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n);
  }
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatHipsparse;
  C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
  return 0;
}
/* Numeric phase of sparse-sparse C = op(A) * op(B), both SeqAIJHIPSPARSE.
   The sparsity of C was fixed in the symbolic phase; here only the values are
   (re)computed via hipSPARSE SpGEMM (SpGEMMreuse on HIP >= 5.1, csr_spgemm on
   older HIP). When reusesym is set, the values were already computed during
   the symbolic phase and only the bookkeeping at the end is performed. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscBool                      flg;
  MatProductType                 ptype;
  MatMatHipsparse               *mmdata;
  hipsparseSpMatDescr_t          BmatSpDescr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  MatCheckProduct(C, 1);
  PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg);
  mmdata = (MatMatHipsparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
    Cmat             = Ccusp->mat;
    Ccsr             = (CsrMatrix *)Cmat->mat;
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
  PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg);
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  MatSeqAIJHIPSPARSECopyToGPU(A);
  MatSeqAIJHIPSPARSECopyToGPU(B);

  /* symmetric operands let us replace a transpose product by a plain AB,
     matching the choice made in the symbolic phase */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
  }
  /* transpose products use the stored explicit transposes (built during symbolic) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscLogGpuTimeBegin();
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE);
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  /* pattern was fixed in symbolic; recompute values only */
  hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
#else
  hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
#endif
#else
  /* pre-5.0 HIP: legacy csrgemm-style interface */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscLogGpuFlops(mmdata->flops);
  WaitForHIP();
  PetscLogGpuTimeEnd();
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz);
  PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n");
  PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax);
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  return 0;
}
2485: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2486: {
2487: Mat_Product *product = C->product;
2488: Mat A, B;
2489: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2490: Mat_SeqAIJ *a, *b, *c;
2491: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2492: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2493: PetscInt i, j, m, n, k;
2494: PetscBool flg;
2495: MatProductType ptype;
2496: MatMatHipsparse *mmdata;
2497: PetscLogDouble flops;
2498: PetscBool biscompressed, ciscompressed;
2499: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2500: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2501: hipsparseSpMatDescr_t BmatSpDescr;
2502: #else
2503: int cnz;
2504: #endif
2505: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2507: MatCheckProduct(C, 1);
2509: A = product->A;
2510: B = product->B;
2511: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg);
2513: PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg);
2515: a = (Mat_SeqAIJ *)A->data;
2516: b = (Mat_SeqAIJ *)B->data;
2517: /* product data */
2518: PetscNew(&mmdata);
2519: C->product->data = mmdata;
2520: C->product->destroy = MatDestroy_MatMatHipsparse;
2522: MatSeqAIJHIPSPARSECopyToGPU(A);
2523: MatSeqAIJHIPSPARSECopyToGPU(B);
2524: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2525: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2529: ptype = product->type;
2530: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2531: ptype = MATPRODUCT_AB;
2532: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2533: }
2534: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2535: ptype = MATPRODUCT_AB;
2536: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2537: }
2538: biscompressed = PETSC_FALSE;
2539: ciscompressed = PETSC_FALSE;
2540: switch (ptype) {
2541: case MATPRODUCT_AB:
2542: m = A->rmap->n;
2543: n = B->cmap->n;
2544: k = A->cmap->n;
2545: Amat = Acusp->mat;
2546: Bmat = Bcusp->mat;
2547: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2548: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2549: break;
2550: case MATPRODUCT_AtB:
2551: m = A->cmap->n;
2552: n = B->cmap->n;
2553: k = A->rmap->n;
2554: MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
2555: Amat = Acusp->matTranspose;
2556: Bmat = Bcusp->mat;
2557: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2558: break;
2559: case MATPRODUCT_ABt:
2560: m = A->rmap->n;
2561: n = B->rmap->n;
2562: k = A->cmap->n;
2563: MatSeqAIJHIPSPARSEFormExplicitTranspose(B);
2564: Amat = Acusp->mat;
2565: Bmat = Bcusp->matTranspose;
2566: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2567: break;
2568: default:
2569: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2570: }
2572: /* create hipsparse matrix */
2573: MatSetSizes(C, m, n, m, n);
2574: MatSetType(C, MATSEQAIJHIPSPARSE);
2575: c = (Mat_SeqAIJ *)C->data;
2576: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2577: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
2578: Ccsr = new CsrMatrix;
2580: c->compressedrow.use = ciscompressed;
2581: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2582: c->compressedrow.nrows = a->compressedrow.nrows;
2583: PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex);
2584: PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows);
2585: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2586: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2587: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2588: } else {
2589: c->compressedrow.nrows = 0;
2590: c->compressedrow.i = NULL;
2591: c->compressedrow.rindex = NULL;
2592: Ccusp->workVector = NULL;
2593: Cmat->cprowIndices = NULL;
2594: }
2595: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2596: Ccusp->mat = Cmat;
2597: Ccusp->mat->mat = Ccsr;
2598: Ccsr->num_rows = Ccusp->nrows;
2599: Ccsr->num_cols = n;
2600: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2601: hipsparseCreateMatDescr(&Cmat->descr);
2602: hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO);
2603: hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
2604: hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
2605: hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
2606: hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
2607: hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
2608: hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
2609: hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
2610: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2611: thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2612: c->nz = 0;
2613: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2614: Ccsr->values = new THRUSTARRAY(c->nz);
2615: goto finalizesym;
2616: }
2620: Acsr = (CsrMatrix *)Amat->mat;
2621: if (!biscompressed) {
2622: Bcsr = (CsrMatrix *)Bmat->mat;
2623: BmatSpDescr = Bmat->matDescr;
2624: } else { /* we need to use row offsets for the full matrix */
2625: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2626: Bcsr = new CsrMatrix;
2627: Bcsr->num_rows = B->rmap->n;
2628: Bcsr->num_cols = cBcsr->num_cols;
2629: Bcsr->num_entries = cBcsr->num_entries;
2630: Bcsr->column_indices = cBcsr->column_indices;
2631: Bcsr->values = cBcsr->values;
2632: if (!Bcusp->rowoffsets_gpu) {
2633: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2634: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2635: PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
2636: }
2637: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2638: mmdata->Bcsr = Bcsr;
2639: if (Bcsr->num_rows && Bcsr->num_cols) {
2640: hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
2641: }
2642: BmatSpDescr = mmdata->matSpBDescr;
2643: }
2646: /* precompute flops count */
2647: if (ptype == MATPRODUCT_AB) {
2648: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2649: const PetscInt st = a->i[i];
2650: const PetscInt en = a->i[i + 1];
2651: for (j = st; j < en; j++) {
2652: const PetscInt brow = a->j[j];
2653: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2654: }
2655: }
2656: } else if (ptype == MATPRODUCT_AtB) {
2657: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2658: const PetscInt anzi = a->i[i + 1] - a->i[i];
2659: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2660: flops += (2. * anzi) * bnzi;
2661: }
2662: } else flops = 0.; /* TODO */
2664: mmdata->flops = flops;
2665: PetscLogGpuTimeBegin();
2666: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2667: hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE);
2668: hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
2669: hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc);
2670: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2671: {
2672: /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2673: We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2674: */
2675: void *dBuffer1 = NULL;
2676: void *dBuffer2 = NULL;
2677: void *dBuffer3 = NULL;
2678: /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2679: size_t bufferSize1 = 0;
2680: size_t bufferSize2 = 0;
2681: size_t bufferSize3 = 0;
2682: size_t bufferSize4 = 0;
2683: size_t bufferSize5 = 0;
2685: /*----------------------------------------------------------------------*/
2686: /* ask bufferSize1 bytes for external memory */
2687: hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2688: hipMalloc((void **)&dBuffer1, bufferSize1);
2689: /* inspect the matrices A and B to understand the memory requirement for the next step */
2690: hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2691: /*----------------------------------------------------------------------*/
2692: hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2693: hipMalloc((void **)&dBuffer2, bufferSize2);
2694: hipMalloc((void **)&dBuffer3, bufferSize3);
2695: hipMalloc((void **)&mmdata->dBuffer4, bufferSize4);
2696: hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2697: hipFree(dBuffer1);
2698: hipFree(dBuffer2);
2699: /*----------------------------------------------------------------------*/
2700: /* get matrix C non-zero entries C_nnz1 */
2701: hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2702: c->nz = (PetscInt)C_nnz1;
2703: /* allocate matrix C */
2704: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2705: hipPeekAtLastError(); /* catch out of memory errors */
2706: Ccsr->values = new THRUSTARRAY(c->nz);
2707: hipPeekAtLastError(); /* catch out of memory errors */
2708: /* update matC with the new pointers */
2709: hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2710: /*----------------------------------------------------------------------*/
2711: hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2712: hipMalloc((void **)&mmdata->dBuffer5, bufferSize5);
2713: hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2714: hipFree(dBuffer3);
2715: hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2716: PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024);
2717: }
2718: #else
2719: size_t bufSize2;
2720: /* ask bufferSize bytes for external memory */
2721: hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2722: hipMalloc((void **)&mmdata->mmBuffer2, bufSize2);
2723: /* inspect the matrices A and B to understand the memory requirement for the next step */
2724: hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2725: /* ask bufferSize again bytes for external memory */
2726: hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2727: /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2728: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2729: it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2730: is stored in the descriptor! What a messy API... */
2731: hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize);
2732: /* compute the intermediate product of A * B */
2733: hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2734: /* get matrix C non-zero entries C_nnz1 */
2735: hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2736: c->nz = (PetscInt)C_nnz1;
2737: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2738: mmdata->mmBufferSize / 1024));
2739: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2740: hipPeekAtLastError(); /* catch out of memory errors */
2741: Ccsr->values = new THRUSTARRAY(c->nz);
2742: hipPeekAtLastError(); /* catch out of memory errors */
2743: hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2744: hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2745: #endif
2746: #else
2747: hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST);
2748: PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2749: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2750: c->nz = cnz;
2751: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2752: hipPeekAtLastError(); /* catch out of memory errors */
2753: Ccsr->values = new THRUSTARRAY(c->nz);
2754: hipPeekAtLastError(); /* catch out of memory errors */
2756: hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE);
2757: /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2758: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2759: D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2760: PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2761: Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2762: Ccsr->column_indices->data().get()));
2763: #endif
2764: PetscLogGpuFlops(mmdata->flops);
2765: PetscLogGpuTimeEnd();
2766: finalizesym:
2767: c->singlemalloc = PETSC_FALSE;
2768: c->free_a = PETSC_TRUE;
2769: c->free_ij = PETSC_TRUE;
2770: PetscMalloc1(m + 1, &c->i);
2771: PetscMalloc1(c->nz, &c->j);
2772: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2773: PetscInt *d_i = c->i;
2774: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2775: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2776: ii = *Ccsr->row_offsets;
2777: jj = *Ccsr->column_indices;
2778: if (ciscompressed) d_i = c->compressedrow.i;
2779: hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2780: hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2781: } else {
2782: PetscInt *d_i = c->i;
2783: if (ciscompressed) d_i = c->compressedrow.i;
2784: hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2785: hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
2786: }
2787: if (ciscompressed) { /* need to expand host row offsets */
2788: PetscInt r = 0;
2789: c->i[0] = 0;
2790: for (k = 0; k < c->compressedrow.nrows; k++) {
2791: const PetscInt next = c->compressedrow.rindex[k];
2792: const PetscInt old = c->compressedrow.i[k];
2793: for (; r < next; r++) c->i[r + 1] = old;
2794: }
2795: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2796: }
2797: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
2798: PetscMalloc1(m, &c->ilen);
2799: PetscMalloc1(m, &c->imax);
2800: c->maxnz = c->nz;
2801: c->nonzerorowcnt = 0;
2802: c->rmax = 0;
2803: for (k = 0; k < m; k++) {
2804: const PetscInt nn = c->i[k + 1] - c->i[k];
2805: c->ilen[k] = c->imax[k] = nn;
2806: c->nonzerorowcnt += (PetscInt) !!nn;
2807: c->rmax = PetscMax(c->rmax, nn);
2808: }
2809: MatMarkDiagonal_SeqAIJ(C);
2810: PetscMalloc1(c->nz, &c->a);
2811: Ccsr->num_entries = c->nz;
2813: C->nonzerostate++;
2814: PetscLayoutSetUp(C->rmap);
2815: PetscLayoutSetUp(C->cmap);
2816: Ccusp->nonzerostate = C->nonzerostate;
2817: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2818: C->preallocated = PETSC_TRUE;
2819: C->assembled = PETSC_FALSE;
2820: C->was_assembled = PETSC_FALSE;
2821: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2822: mmdata->reusesym = PETSC_TRUE;
2823: C->offloadmask = PETSC_OFFLOAD_GPU;
2824: }
2825: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2826: return 0;
2827: }
/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  MatCheckProduct(mat, 1);
  /* Classify the operands: is B dense? Are B (and C, for ABC products) HIPSPARSE matrices
     that are usable on the GPU? A matrix bound to the CPU disqualifies the GPU backend. */
  PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense);
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp);
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp);
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* Let the user force the CPU backend via options. The option name depends on the product
       type and on whether the product was created through the legacy API (MatMatMult etc.,
       indicated by product->api_user) or through the MatProduct API. */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    /* Forcing the CPU backend simply makes both flags false so we fall through to the AIJ path below */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized GPU kernel for these; compose them from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else MatProductSetFromOptions_SeqAIJ(mat); /* fallback for AIJ */
  return 0;
}
2941: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2942: {
2943: MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE);
2944: return 0;
2945: }
2947: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2948: {
2949: MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE);
2950: return 0;
2951: }
2953: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2954: {
2955: MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE);
2956: return 0;
2957: }
2959: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2960: {
2961: MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE);
2962: return 0;
2963: }
2965: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2966: {
2967: MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE);
2968: return 0;
2969: }
2971: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2972: {
2973: int i = blockIdx.x * blockDim.x + threadIdx.x;
2974: if (i < n) y[idx[i]] += x[i];
2975: }
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
  PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
  hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool compressed;
  PetscInt nx, ny;

  /* Empty matrix: op(A) x is identically zero, so z is 0 (MatMult) or a copy of y (MatMultAdd) */
  if (!a->nz) {
    if (!yy) VecSet_SeqHIP(zz, 0);
    else VecCopy_SeqHIP(yy, zz);
    return 0;
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  MatSeqAIJHIPSPARSECopyToGPU(A);
  /* Select which GPU matrix to multiply with: the plain matrix, the plain matrix with a
     (conjugate-)transpose operation flag, or the explicitly stored transpose (built on demand) */
  if (!trans) {
    matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    } else {
      if (!hipsparsestruct->matTranspose) MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
  try {
    VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray);
    if (yy == zz) VecHIPGetArray(zz, &zarray); /* read & write zz, so need to get uptodate zarray on GPU */
    else VecHIPGetArrayWrite(zz, &zarray); /* write zz, so no need to init zarray on GPU */

    PetscLogGpuTimeBegin();
    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        /* gather x entries at the nonzero-row indices into the (shorter) work vector */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::hip::par.on(PetscDefaultHipStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
      }
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
    }
    /* csr_spmv does y = alpha op(A) x + beta y */
    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
      /* Generic SpMV API: dense-vector descriptors and the work buffer are created once per
         operation type and cached in matstruct->hipSpMV[opA]; later calls only swap pointers */
      if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
        hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype);
        hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype);
        PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
                                                    &matstruct->hipSpMV[opA].spmvBufferSize));
        hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize);
        matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr);
        hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr);
      }
      PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
                                       matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
#else
      /* Legacy CSR SpMV API for older hipSPARSE */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr);
#endif
    } else {
      /* non-CSR storage: HYB format path */
      if (hipsparsestruct->nrows) {
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
        hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr);
      }
    }
    PetscLogGpuTimeEnd();

    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      /* Post-process the compressed result: the SpMV above wrote into the short work vector,
         so z must first be seeded (with y or 0) and then scatter-added */
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          VecCopy_SeqHIP(yy, zz); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          VecAXPY_SeqHIP(zz, 1.0, yy); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        VecSet_SeqHIP(zz, 0);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscLogGpuTimeBegin();
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecHIPPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
#endif
        PetscLogGpuTimeEnd();
      }
    } else {
      /* transpose path: A^T x is full length, only need to add y when z != y (beta already handled z == y) */
      if (yy && yy != zz) { VecAXPY_SeqHIP(zz, 1.0, yy); /* zz += yy */ }
    }
    VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray);
    if (yy == zz) VecHIPRestoreArray(zz, &zarray);
    else VecHIPRestoreArrayWrite(zz, &zarray);
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for multiply-add; without y, the first add per nonzero row is free */
  if (yy) PetscLogGpuFlops(2.0 * a->nz);
  else PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt);
  return 0;
}
3129: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3130: {
3131: MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE);
3132: return 0;
3133: }
3135: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3136: {
3137: PetscObjectState onnz = A->nonzerostate;
3138: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3140: MatAssemblyEnd_SeqAIJ(A, mode);
3141: if (onnz != A->nonzerostate && cusp->deviceMat) {
3142: PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
3143: hipFree(cusp->deviceMat);
3144: cusp->deviceMat = NULL;
3145: }
3146: return 0;
3147: }
3149: /* --------------------------------------------------------------------------------*/
3150: /*@
3151: MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3152:    This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3153: For good matrix assembly performance the user should preallocate the matrix storage by setting
3154: the parameter nz (or the array nnz). By setting these parameters accurately,
3155: performance during matrix assembly can be increased by more than a factor of 50.
3157: Collective
3159: Input Parameters:
3160: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3161: . m - number of rows
3162: . n - number of columns
3163: . nz - number of nonzeros per row (same for all rows)
3164: - nnz - array containing the number of nonzeros in the various rows
3165: (possibly different for each row) or NULL
3167: Output Parameter:
3168: . A - the matrix
3170: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3171:    `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3172: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3174: Notes:
3175: If nnz is given then nz is ignored
3177: The AIJ format (also called the Yale sparse matrix format or
3178: compressed row storage), is fully compatible with standard Fortran 77
3179: storage. That is, the stored row and column indices can begin at
3180: either one (as in Fortran) or zero. See the users' manual for details.
3182: Specify the preallocated storage with either nz or nnz (not both).
3183: Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3184: allocation. For large problems you MUST preallocate memory or you
3185: will get TERRIBLE performance, see the users' manual chapter on matrices.
3187: By default, this format uses inodes (identical nodes) when possible, to
3188: improve numerical efficiency of matrix-vector products and solves. We
3189: search for consecutive rows with the same nonzero structure, thereby
3190: reusing matrix information to achieve increased efficiency.
3192: Level: intermediate
3194: .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3195: @*/
3196: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3197: {
3198: MatCreate(comm, A);
3199: MatSetSizes(*A, m, n, m, n);
3200: MatSetType(*A, MATSEQAIJHIPSPARSE);
3201: MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz);
3202: return 0;
3203: }
3205: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3206: {
3207: if (A->factortype == MAT_FACTOR_NONE) MatSeqAIJHIPSPARSE_Destroy((Mat_SeqAIJHIPSPARSE **)&A->spptr);
3208: else MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr);
3209: PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3210: PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL);
3211: PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL);
3212: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL);
3213: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL);
3214: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL);
3215: PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
3216: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3217: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3218: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL);
3219: MatDestroy_SeqAIJ(A);
3220: return 0;
3221: }
3223: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3224: {
3225: MatDuplicate_SeqAIJ(A, cpvalues, B);
3226: MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B);
3227: return 0;
3228: }
/* Compute Y = Y + a*X, keeping the work on the GPU whenever possible.
   Strategy:
     - identical nonzero pattern           -> single hipBLAS axpy on the value arrays
     - X's pattern a subset of Y's pattern -> hipsparse csrgeam (Y = 1*Y + a*X) in place
     - anything else (or CPU-bound input)  -> fall back to the host MatAXPY_SeqAIJ */
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJHIPSPARSE *cy;
  Mat_SeqAIJHIPSPARSE *cx;
  PetscScalar *ay;
  const PetscScalar *ax;
  CsrMatrix *csry, *csrx;

  cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mismatched axpy implementations: at least one matrix is bound to the CPU,
       so invalidate Y's cached transpose and defer to the host kernel */
    MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE);
    MatAXPY_SeqAIJ(Y, a, X, str);
    return 0;
  }
  /* if we are here, it means both matrices are bound to GPU */
  MatSeqAIJHIPSPARSECopyToGPU(Y);
  MatSeqAIJHIPSPARSECopyToGPU(X);
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a hipblas axpy: equal nnz counts and (checked on
     the device) identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in the geam: Y = b*Y + a*X */
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    size_t bufferSize;
    void *buffer;
#endif

    MatSeqAIJHIPSPARSEGetArrayRead(X, &ax);
    MatSeqAIJHIPSPARSEGetArray(Y, &ay);
    /* a and b are host scalars here, so temporarily switch the pointer mode */
    hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST);
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* newer hipSPARSE requires an explicit user-provided work buffer for csrgeam */
    PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                       csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    hipMalloc(&buffer, bufferSize);
    PetscLogGpuTimeBegin();
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscLogGpuFlops(x->nz + y->nz);
    PetscLogGpuTimeEnd();
    hipFree(buffer);
#else
    PetscLogGpuTimeBegin();
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscLogGpuFlops(x->nz + y->nz);
    PetscLogGpuTimeEnd();
#endif
    /* restore the mode expected by the rest of this file */
    hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE);
    MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax);
    MatSeqAIJHIPSPARSERestoreArray(Y, &ay);
    MatSeqAIJInvalidateDiagonal(Y);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays line up entry for entry, so ay += a*ax */
    hipblasHandle_t hipblasv2handle;
    PetscBLASInt one = 1, bnz = 1;

    MatSeqAIJHIPSPARSEGetArrayRead(X, &ax);
    MatSeqAIJHIPSPARSEGetArray(Y, &ay);
    PetscHIPBLASGetHandle(&hipblasv2handle);
    PetscBLASIntCast(x->nz, &bnz);
    PetscLogGpuTimeBegin();
    hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one);
    PetscLogGpuFlops(2.0 * bnz);
    PetscLogGpuTimeEnd();
    MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax);
    MatSeqAIJHIPSPARSERestoreArray(Y, &ay);
    MatSeqAIJInvalidateDiagonal(Y);
  } else {
    /* different nonzero patterns: host fallback */
    MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE);
    MatAXPY_SeqAIJ(Y, a, X, str);
  }
  return 0;
}
3314: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3315: {
3316: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3317: PetscScalar *ay;
3318: hipblasHandle_t hipblasv2handle;
3319: PetscBLASInt one = 1, bnz = 1;
3321: MatSeqAIJHIPSPARSEGetArray(Y, &ay);
3322: PetscHIPBLASGetHandle(&hipblasv2handle);
3323: PetscBLASIntCast(y->nz, &bnz);
3324: PetscLogGpuTimeBegin();
3325: hipblasXscal(hipblasv2handle, bnz, &a, ay, one);
3326: PetscLogGpuFlops(bnz);
3327: PetscLogGpuTimeEnd();
3328: MatSeqAIJHIPSPARSERestoreArray(Y, &ay);
3329: MatSeqAIJInvalidateDiagonal(Y);
3330: return 0;
3331: }
/* Zero all stored values of A. Existing device value arrays (including the cached
   transpose) are zeroed with thrust so that, when present, host and device copies
   end up in agreement and the offload mask can be set to PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
{
  PetscBool both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values zeroed too, so both copies will be valid */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
    }
  }
  //MatZeroEntries_SeqAIJ(A);
  /* zero the host value array (a->i[A->rmap->n] is the total nonzero count) */
  PetscArrayzero(a->a, a->i[A->rmap->n]);
  MatSeqAIJInvalidateDiagonal(A);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  return 0;
}
/* Bind A to the CPU (flg = PETSC_TRUE) or to the GPU (flg = PETSC_FALSE) by swapping
   the Mat operation table, the Mat_SeqAIJ array accessors, and the composed methods.
   Factored matrices carry no such tables here, so only the flag is recorded for them. */
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    return 0;
  }
  if (flg) {
    /* moving to the CPU: make sure the host copy is current first */
    MatSeqAIJHIPSPARSECopyFromGPU(A);

    A->ops->scale = MatScale_SeqAIJ;
    A->ops->axpy = MatAXPY_SeqAIJ;
    A->ops->zeroentries = MatZeroEntries_SeqAIJ;
    A->ops->mult = MatMult_SeqAIJ;
    A->ops->multadd = MatMultAdd_SeqAIJ;
    A->ops->multtranspose = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
    /* clear the device array accessors installed in the GPU branch below */
    PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps));
    PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
    PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL);
    PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL);
    PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
    PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
    PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL);
  } else {
    /* moving to the GPU: install the HIPSPARSE kernels and device array accessors */
    A->ops->scale = MatScale_SeqAIJHIPSPARSE;
    A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE;
    A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE;
    A->ops->mult = MatMult_SeqAIJHIPSPARSE;
    A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE;
    A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE;
    a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
    a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
    a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
    a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
    a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
    a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
    a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
    PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE);
    PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE);
    PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE);
    PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE);
    PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE);
    PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE);
  }
  A->boundtocpu = flg;
  /* inodes are a host-only optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size) a->inode.use = PETSC_TRUE;
  else a->inode.use = PETSC_FALSE;

  return 0;
}
/* Convert a MATSEQAIJ matrix to MATSEQAIJHIPSPARSE. Allocates the GPU-side
   bookkeeping structure (Mat_SeqAIJHIPSPARSE, or the TriFactors variant for
   factored matrices), creates the hipsparse handle on PETSc's default stream,
   installs the HIPSPARSE operation table, and composes the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscDeviceInitialize(PETSC_DEVICE_HIP); /* first use of HIPSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(A, MAT_COPY_VALUES, newmat);
  } else if (reuse == MAT_REUSE_MATRIX) {
    MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
  }
  B = *newmat;
  /* vectors compatible with this matrix type live on the GPU by default */
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECHIP, &B->defaultvectype);
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJHIPSPARSE *spptr;
      PetscNew(&spptr);
      hipsparseCreate(&spptr->handle);
      hipsparseSetStream(spptr->handle, PetscDefaultHipStream);
      spptr->format = MAT_HIPSPARSE_CSR;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
      spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
#else
      spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

      B->spptr = spptr;
    } else {
      /* factored matrices use the triangular-factors structure instead */
      Mat_SeqAIJHIPSPARSETriFactors *spptr;

      PetscNew(&spptr);
      hipsparseCreate(&spptr->handle);
      hipsparseSetStream(spptr->handle, PetscDefaultHipStream);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE;
  B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE;
  B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
  B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE;
  B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE;

  /* start out bound to the GPU; this installs the device kernels and composed methods */
  MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE);
  PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE);
  PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE);
#if defined(PETSC_HAVE_HYPRE)
  PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE);
#endif
  PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE);
  return 0;
}
/* Type constructor for MATSEQAIJHIPSPARSE: build a plain SeqAIJ matrix, then
   convert it in place to the HIPSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  MatCreate_SeqAIJ(B);
  MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B);
  return 0;
}
3483: /*
3484: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices.
   A matrix type whose data resides on AMD GPUs. These matrices can be in either
3487: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later on Nvidia devices.
3488: All matrix calculations are performed on AMD/Nvidia GPUs using the HIPSPARSE library.
   Options Database Keys:
+  -mat_type aijhipsparse - sets the matrix type to "seqaijhipsparse" during a call to MatSetFromOptions()
.  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_hipsparse_use_cpu_solve - Do MatSolve on CPU
3496: Level: beginner
3498: .seealso: `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3499: */
3501: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3502: {
3503: MatSolverTypeRegister(MATSOLVERHIPSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse_band);
3504: MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse);
3505: MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse);
3506: MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse);
3507: MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse);
3509: return 0;
3510: }
3512: static PetscErrorCode MatResetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat)
3513: {
3514: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)mat->spptr;
3516: if (!cusp) return 0;
3517: delete cusp->cooPerm;
3518: delete cusp->cooPerm_a;
3519: cusp->cooPerm = NULL;
3520: cusp->cooPerm_a = NULL;
3521: if (cusp->use_extended_coo) {
3522: hipFree(cusp->jmap_d);
3523: hipFree(cusp->perm_d);
3524: }
3525: cusp->use_extended_coo = PETSC_FALSE;
3526: return 0;
3527: }
/* Destroy the whole Mat_SeqAIJHIPSPARSE structure: both mult structures (matrix
   and cached transpose), the thrust work/COO arrays, the hipsparse handle, and
   the extended-COO device maps. *hipsparsestruct is freed and left NULL by PetscFree. */
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **hipsparsestruct)
{
  if (*hipsparsestruct) {
    MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->mat, (*hipsparsestruct)->format);
    MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->matTranspose, (*hipsparsestruct)->format);
    delete (*hipsparsestruct)->workVector;
    delete (*hipsparsestruct)->rowoffsets_gpu;
    delete (*hipsparsestruct)->cooPerm;
    delete (*hipsparsestruct)->cooPerm_a;
    delete (*hipsparsestruct)->csr2csc_i;
    if ((*hipsparsestruct)->handle) hipsparseDestroy((*hipsparsestruct)->handle);
    if ((*hipsparsestruct)->jmap_d) hipFree((*hipsparsestruct)->jmap_d);
    if ((*hipsparsestruct)->perm_d) hipFree((*hipsparsestruct)->perm_d);
    PetscFree(*hipsparsestruct);
  }
  return 0;
}
3547: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3548: {
3549: if (*mat) {
3550: delete (*mat)->values;
3551: delete (*mat)->column_indices;
3552: delete (*mat)->row_offsets;
3553: delete *mat;
3554: *mat = 0;
3555: }
3556: return 0;
3557: }
/* Free one triangular-factor structure: the hipsparse matrix descriptor, the
   csrsv solve info, the CSR data, and the device/host work buffers.
   NOTE: this is a C++ overload of MatSeqAIJHIPSPARSEMultStruct_Destroy() that is
   selected when the argument is a TriFactorStruct. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
{
  if (*trifactor) {
    if ((*trifactor)->descr) hipsparseDestroyMatDescr((*trifactor)->descr);
    if ((*trifactor)->solveInfo) hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo);
    CsrMatrix_Destroy(&(*trifactor)->csrMat);
    if ((*trifactor)->solveBuffer) hipFree((*trifactor)->solveBuffer);
    if ((*trifactor)->AA_h) hipHostFree((*trifactor)->AA_h); /* pinned host buffer */
    if ((*trifactor)->csr2cscBuffer) hipFree((*trifactor)->csr2cscBuffer);
    PetscFree(*trifactor);
  }
  return 0;
}
/* Free a mult structure. The stored matrix is either a hipsparse HYB/ELL handle
   or a CsrMatrix, depending on the storage format; afterwards the descriptors,
   compressed-row indices, device scalar constants, and per-operation SpMV
   descriptors/buffers are released. *matstruct is set to NULL. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
        hipsparseDestroyHybMat(hybMat);
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) hipsparseDestroyMatDescr((*matstruct)->descr);
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used with HIPSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) hipFree((*matstruct)->alpha_one);
    if ((*matstruct)->beta_zero) hipFree((*matstruct)->beta_zero);
    if ((*matstruct)->beta_one) hipFree((*matstruct)->beta_one);

    Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) hipsparseDestroySpMat(mdata->matDescr);
    /* one cached SpMV setup per operation variant (see the hipSpMV array) */
    for (int i = 0; i < 3; i++) {
      if (mdata->hipSpMV[i].initialized) {
        hipFree(mdata->hipSpMV[i].spmvBuffer);
        hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr);
        hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr);
      }
    }
    delete *matstruct;
    *matstruct = NULL;
  }
  return 0;
}
/* Release all factorization data held by a TriFactors structure (but not the
   structure itself or its hipsparse handle), so it can be reused for a new
   factorization. With HIP >= 4.5 this also frees the SpSV-based solve state. */
PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

  if (fs) {
    /* lower/upper factors and their transposes (TriFactorStruct overload) */
    MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);
    MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);
    MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);
    MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    /* band-factorization device arrays */
    if (fs->a_band_d) hipFree(fs->a_band_d);
    if (fs->i_band_d) hipFree(fs->i_band_d);
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* SpSV-based solve path: factor CSR arrays, work vectors, and solve buffers */
    hipFree(fs->csrRowPtr);
    hipFree(fs->csrColIdx);
    hipFree(fs->csrVal);
    hipFree(fs->X);
    hipFree(fs->Y);
    // hipFree(fs->factBuffer_M); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    hipFree(fs->spsvBuffer_L);
    hipFree(fs->spsvBuffer_U);
    hipFree(fs->spsvBuffer_Lt);
    hipFree(fs->spsvBuffer_Ut);
    hipsparseDestroyMatDescr(fs->matDescr_M);
    if (fs->spMatDescr_L) hipsparseDestroySpMat(fs->spMatDescr_L);
    if (fs->spMatDescr_U) hipsparseDestroySpMat(fs->spMatDescr_U);
    hipsparseSpSV_destroyDescr(fs->spsvDescr_L);
    hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt);
    hipsparseSpSV_destroyDescr(fs->spsvDescr_U);
    hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut);
    if (fs->dnVecDescr_X) hipsparseDestroyDnVec(fs->dnVecDescr_X);
    if (fs->dnVecDescr_Y) hipsparseDestroyDnVec(fs->dnVecDescr_Y);
    hipsparseDestroyCsrilu02Info(fs->ilu0Info_M);
    hipsparseDestroyCsric02Info(fs->ic0Info_M);

    fs->createdTransposeSpSVDescr = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  return 0;
}
3656: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3657: {
3658: hipsparseHandle_t handle;
3660: if (*trifactors) {
3661: MatSeqAIJHIPSPARSETriFactors_Reset(trifactors);
3662: if ((handle = (*trifactors)->handle)) hipsparseDestroy(handle);
3663: PetscFree(*trifactors);
3664: }
3665: return 0;
3666: }
3668: struct IJCompare {
3669: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3670: {
3671: if (t1.get<0>() < t2.get<0>()) return true;
3672: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3673: return false;
3674: }
3675: };
3677: struct IJEqual {
3678: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3679: {
3680: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3681: return true;
3682: }
3683: };
3685: struct IJDiff {
3686: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3687: };
3689: struct IJSum {
3690: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3691: };
/* Insert/add the COO values v[] into the device CSR value array, using the
   permutation (cooPerm) and, when duplicates exist, the segment map (cooPerm_a)
   built by MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(). v == NULL with
   INSERT_VALUES zeroes the values. Finishes with the bookkeeping of a shortened
   MatAssemblyEnd_SeqAIJ, leaving the data valid on the GPU. */
PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix *matrix;
  PetscInt n;

  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just finish assembly */
    MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
    return 0;
  }
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  /* get a device pointer to v[], copying host data to a temporary device array if needed */
  if (isHipMem(v)) d_v = thrust::device_pointer_cast(v);
  else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscLogCpuToGpu(n * sizeof(PetscScalar));
  }
  PetscLogGpuTimeBegin();
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add these them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      /* first reduce the duplicates into cooPerm_w, then add that to the existing values */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecHIPPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries with INSERT_VALUES: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecHIPEquals());
    }
  }
  PetscLogGpuTimeEnd();
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the values were set on the device only */
  PetscObjectStateIncrease((PetscObject)A);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz);
  PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n");
  PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax);
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  return 0;
}
3768: PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3769: {
3770: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3773: if (!cusp) return 0;
3774: if (destroy) {
3775: MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format);
3776: delete cusp->csr2csc_i;
3777: cusp->csr2csc_i = NULL;
3778: }
3779: A->transupdated = PETSC_FALSE;
3780: return 0;
3781: }
/* Preallocate from COO indices on the device ("basic" path, no negative indices).
   Builds, with thrust: cooPerm (permutation from user COO order to sorted CSR
   order), cooPerm_a (map from sorted position to unique-nonzero index, present
   only when duplicates exist), and the host CSR structure (a->i, a->j). Worked
   examples are inlined below. Finishes by allocating the GPU matrix. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt cooPerm_n, nzr = 0;

  PetscLayoutSetUp(A->rmap);
  PetscLayoutSetUp(A->cmap);
  /* drop stale permutation arrays if the number of COO entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt *d_raw_i, *d_raw_j;
    PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType imtype, jmtype;

    /* get device pointers for coo_i[]/coo_j[], copying host arrays to the device if needed */
    PetscGetMemType(coo_i, &imtype);
    if (PetscMemTypeHost(imtype)) {
      hipMalloc(&d_raw_i, sizeof(PetscInt) * n);
      hipMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, hipMemcpyHostToDevice);
      d_i = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscGetMemType(coo_j, &jmtype);
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      hipMalloc(&d_raw_j, sizeof(PetscInt) * n);
      hipMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, hipMemcpyHostToDevice);
      d_j = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscLogGpuTimeBegin();
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);
    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
    /*
      d_i = [1,3,3,4,4,x]
                        ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a hipMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());                    /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>());         /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscLogGpuTimeEnd();

    /* rebuild the host CSR arrays from the device results */
    MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i);
    a->singlemalloc = PETSC_FALSE;
    a->free_a = PETSC_TRUE;
    a->free_ij = PETSC_TRUE;
    PetscMalloc1(A->rmap->n + 1, &a->i);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    hipMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), hipMemcpyDeviceToHost);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscMalloc1(a->nz, &a->a);
    PetscMalloc1(a->nz, &a->j);
    hipMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), hipMemcpyDeviceToHost);
    if (!a->ilen) PetscMalloc1(A->rmap->n, &a->ilen);
    if (!a->imax) PetscMalloc1(A->rmap->n, &a->imax);
    /* fill per-row lengths and count nonempty rows */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt));
    MatMarkDiagonal_SeqAIJ(A);
  } else MatSeqAIJSetPreallocation(A, 0, NULL);
  MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
  /* We want to allocate the HIPSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscArrayzero(a->a, a->nz);
  MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  MatSeqAIJHIPSPARSECopyToGPU(A);
  MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE);
  return 0;
}
/* COO preallocation entry point. Chooses between two implementations:
   - "basic": indices already on the device, or host indices with no negative
     entries, handled fully on the GPU by the _Basic routine above;
   - "extended": host indices containing negative entries (ignored entries /
     remote contributions); handled by the host SeqAIJ routine, after which the
     jmap/perm maps it builds are mirrored to the device for MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ *seq;
  Mat_SeqAIJHIPSPARSE *dev;
  PetscBool coo_basic = PETSC_TRUE;
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  /* discard any state left over from a previous COO preallocation */
  MatResetPreallocationCOO_SeqAIJ(mat);
  MatResetPreallocationCOO_SeqAIJHIPSPARSE(mat);
  if (coo_i) {
    PetscGetMemType(coo_i, &mtype);
    if (PetscMemTypeHost(mtype)) {
      /* scan host indices for negative entries, which the basic path cannot handle */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j);
  } else {
    MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j);
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    MatSeqAIJHIPSPARSECopyToGPU(mat);
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
    /* copy the host-built jmap (nonzero -> COO entries) and perm maps to the device */
    hipMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount));
    hipMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
    hipMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount));
    hipMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), hipMemcpyHostToDevice);
    dev->use_extended_coo = PETSC_TRUE;
  }
  return 0;
}
/* Device kernel for the extended COO path: for each nonzero i of the matrix,
   sum the COO input values kv[perm[k]] for k in [jmap[i], jmap[i+1]) — i.e. all
   COO entries mapping to that nonzero — and either overwrite (INSERT_VALUES) or
   accumulate into a[i]. Threads cover nnz with a stride of the total grid size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
/* Set/add the matrix values from a COO value array v[] (host or device memory), using the
   preallocation information computed by MatSetPreallocationCOO_SeqAIJHIPSPARSE(). */
PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *dev  = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  if (dev->use_extended_coo) {
    PetscGetMemType(v, &memtype);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      hipMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar));
      hipMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice);
    }

    /* INSERT_VALUES overwrites every entry, so write-only access suffices; ADD_VALUES must read old values */
    if (imode == INSERT_VALUES) MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa);
    else MatSeqAIJHIPSPARSEGetArray(A, &Aa);

    if (Annz) {
      /* one thread per CSR nonzero, 256 threads per block (grid-stride kernel handles any size) */
      hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      hipPeekAtLastError();
    }

    if (imode == INSERT_VALUES) MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa);
    else MatSeqAIJHIPSPARSERestoreArray(A, &Aa);

    if (PetscMemTypeHost(memtype)) hipFree((void *)v1); /* free the temporary device staging copy */
  } else {
    MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(A, v, imode);
  }
  return 0;
}
/*@C
  MatSeqAIJHIPSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJHIPSPARSE matrices.

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers (device memory)
- j - the CSR column indices (device memory)

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix           *csr;

  if (!i || !j) return 0; /* caller must request both pointers */
  MatSeqAIJHIPSPARSECopyToGPU(A); /* make sure the device CSR copy is up to date */
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  return 0;
}
4038: /*@C
4039: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJHIPSPARSEGetIJ()
4040: Not collective
4042: Input Parameters:
4043: + A - the matrix
4044: - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4046: Output Parameters:
4047: + ia - the CSR row pointers
4048: - ja - the CSR column indices
4050: Level: developer
4052: .seealso: `MatSeqAIJHIPSPARSEGetIJ()`
4053: @*/
4054: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4055: {
4058: if (i) *i = NULL;
4059: if (j) *j = NULL;
4060: return 0;
4061: }
/*@C
  MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJHIPSPARSE matrix is stored

  Not Collective

  Input Parameter:
. A - a MATSEQAIJHIPSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  MatSeqAIJHIPSPARSECopyToGPU(A); /* ensure the device values are current */
  csr = (CsrMatrix *)cusp->mat->mat;
  *a  = csr->values->data().get(); /* raw device pointer into the CSR value array */
  return 0;
}
4096: /*@C
4097: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJHIPSPARSEGetArrayRead()
4098: Not Collective
4100: Input Parameter:
4101: . A - a MATSEQAIJHIPSPARSE matrix
4103: Output Parameter:
4104: . a - pointer to the device data
4106: Level: developer
4108: .seealso: `MatSeqAIJHIPSPARSEGetArrayRead()`
4109: @*/
4110: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4111: {
4115: *a = NULL;
4116: return 0;
4117: }
/*@C
  MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJHIPSPARSE matrix is stored

  Not Collective

  Input Parameter:
. A - a MATSEQAIJHIPSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  MatSeqAIJHIPSPARSECopyToGPU(A); /* caller may read, so the device values must be current */
  csr = (CsrMatrix *)cusp->mat->mat;
  *a  = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: the device copy becomes the valid one */
  MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE); /* transpose values would become stale */
  return 0;
}
4153: /*@C
4154: MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJHIPSPARSEGetArray()
4156: Not Collective
4158: Input Parameter:
4159: . A - a MATSEQAIJHIPSPARSE matrix
4161: Output Parameter:
4162: . a - pointer to the device data
4164: Level: developer
4166: .seealso: `MatSeqAIJHIPSPARSEGetArray()`
4167: @*/
4168: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar **a)
4169: {
4173: MatSeqAIJInvalidateDiagonal(A);
4174: PetscObjectStateIncrease((PetscObject)A);
4175: *a = NULL;
4176: return 0;
4177: }
/*@C
  MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJHIPSPARSE matrix is stored

  Not Collective

  Input Parameter:
. A - a MATSEQAIJHIPSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  /* no MatSeqAIJHIPSPARSECopyToGPU here: the caller promises to overwrite the values */
  csr = (CsrMatrix *)cusp->mat->mat;
  *a  = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* after the write, the device copy is the valid one */
  MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE); /* transpose values would become stale */
  return 0;
}
4214: /*@C
4215: MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJHIPSPARSEGetArrayWrite()
4217: Not Collective
4219: Input Parameter:
4220: . A - a MATSEQAIJHIPSPARSE matrix
4222: Output Parameter:
4223: . a - pointer to the device data
4225: Level: developer
4227: .seealso: `MatSeqAIJHIPSPARSEGetArrayWrite()`
4228: @*/
4229: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4230: {
4234: MatSeqAIJInvalidateDiagonal(A);
4235: PetscObjectStateIncrease((PetscObject)A);
4236: *a = NULL;
4237: return 0;
4238: }
4240: struct IJCompare4 {
4241: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4242: {
4243: if (t1.get<0>() < t2.get<0>()) return true;
4244: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4245: return false;
4246: }
4247: };
4249: struct Shift {
4250: int _shift;
4252: Shift(int shift) : _shift(shift) { }
4253: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4254: };
/* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation,
   i.e. C = [A B]. With MAT_INITIAL_MATRIX the full structure is built on the device (and a host mirror of the
   (i,j) structure is kept); with MAT_REUSE_MATRIX only the values are scattered through the cached cooPerm. */
PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscInt                       Annz, Bnnz;
  PetscInt                       i, m, n, zero = 0;

  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C of size m x (nA + nB) and fill the hipSPARSE structures directly */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    MatCreate(PETSC_COMM_SELF, C);
    MatSetSizes(*C, m, n, m, n);
    MatSetType(*C, MATSEQAIJHIPSPARSE);
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    hipsparseCreateMatDescr(&Cmat->descr);
    hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO);
    hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
    /* device-resident scalar constants used as alpha/beta in hipSPARSE calls */
    hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
    hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
    hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
    hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
    hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
    hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
    MatSeqAIJHIPSPARSECopyToGPU(A);
    MatSeqAIJHIPSPARSECopyToGPU(B);
    Acsr  = (CsrMatrix *)Acusp->mat->mat;
    Bcsr  = (CsrMatrix *)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records where A's and B's entries land in C; reused by the MAT_REUSE_MATRIX branch below */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* strategy: convert A and B to COO, merge the two streams in (i,j) order, convert back to CSR */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) { /* lazily build/cache uncompressed row offsets on device */
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscLogGpuTimeBegin();
      hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO);
      hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1); /* tags entries that come from A */
      auto Bperm = thrust::make_constant_iterator(0); /* tags entries that come from B */
      /* B's columns are shifted right by A's column count since C = [A B] */
      auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      /* (i,j)-ordered merge of the two tagged COO streams into C's arrays */
      thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4());
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>());
#else
      /* positions tagged 1 (A's entries) go to cooPerm[0..Annz); positions tagged 0 (B's) to cooPerm[Annz..) */
      auto pred = thrust::identity<int>();
      thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred);
      thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred);
#endif
      hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO);
      PetscLogGpuTimeEnd();
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
      hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);

      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        MatSeqAIJHIPSPARSEFormExplicitTranspose(A);
        MatSeqAIJHIPSPARSEFormExplicitTranspose(B);
        PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
        CsrMatrix                     *CcsrT = new CsrMatrix;
        CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;
        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscLogGpuTimeBegin();
        /* C^T = [A^T; B^T] stacked by rows: concatenate row offsets (B's shifted by A's nnz), columns, values */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* step back so B's first shifted offset overwrites A's duplicated last offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscLogGpuTimeEnd();

        hipsparseCreateMatDescr(&CmatT->descr);
        hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO);
        hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL);
        hipMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar));
        hipMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar));
        hipMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));
        hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);
        hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice);
        hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice);

        hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype);
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the (i,j) structure on the host so the usual SeqAIJ inspection routines work */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscMalloc1(m + 1, &c->i);
    PetscMalloc1(c->nz, &c->j);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
      hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
    } else {
      hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
      hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost);
    }
    PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
    PetscMalloc1(m, &c->ilen);
    PetscMalloc1(m, &c->imax);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* fill the per-row length bookkeeping from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    MatMarkDiagonal_SeqAIJ(*C);
    PetscMalloc1(c->nz, &c->a);
    (*C)->nonzerostate++;
    PetscLayoutSetUp((*C)->rmap);
    PetscLayoutSetUp((*C)->cmap);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure is unchanged, scatter only the new values through the cached cooPerm */
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
      MatSeqAIJHIPSPARSECopyToGPU(A);
      MatSeqAIJHIPSPARSECopyToGPU(B);
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscLogGpuTimeBegin();
      /* copy A's values into C at positions cooPerm[0..Annz), B's at cooPerm[Annz..) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecHIPEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecHIPEquals());
      MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C^T values by concatenating A^T and B^T values */
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscLogGpuTimeEnd();
    }
  }
  PetscObjectStateIncrease((PetscObject)*C);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* everything was built/updated on the device */
  return 0;
}
4518: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4519: {
4520: bool dmem;
4521: const PetscScalar *av;
4523: dmem = isHipMem(v);
4524: MatSeqAIJHIPSPARSEGetArrayRead(A, &av);
4525: if (n && idx) {
4526: THRUSTINTARRAY widx(n);
4527: widx.assign(idx, idx + n);
4528: PetscLogCpuToGpu(n * sizeof(PetscInt));
4530: THRUSTARRAY *w = NULL;
4531: thrust::device_ptr<PetscScalar> dv;
4532: if (dmem) dv = thrust::device_pointer_cast(v);
4533: else {
4534: w = new THRUSTARRAY(n);
4535: dv = w->data();
4536: }
4537: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4539: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4540: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4541: thrust::for_each(zibit, zieit, VecHIPEquals());
4542: if (w) hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
4543: delete w;
4544: } else hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost);
4546: if (!dmem) PetscLogCpuToGpu(n * sizeof(PetscScalar));
4547: MatSeqAIJHIPSPARSERestoreArrayRead(A, &av);
4548: return 0;
4549: }