Actual source code: densehip.hip.cpp
1: /*
2: Defines the matrix operations for sequential dense with HIP
3: Portions of this code are under:
4: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
5: */
6: #include <petscpkg_version.h>
7: #include <../src/mat/impls/dense/seq/dense.h>
8: #include <petsc/private/hipvecimpl.h>
9: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
10: #include <hipsolver/hipsolver.h>
11: #else
12: #include <hipsolver.h>
13: #endif
14: #include <thrust/device_ptr.h>
15: #include <thrust/functional.h>
16: #include <thrust/iterator/counting_iterator.h>
17: #include <thrust/iterator/transform_iterator.h>
18: #include <thrust/iterator/permutation_iterator.h>
19: #include <thrust/transform.h>
20: #include <thrust/device_vector.h>
22: #if defined(PETSC_USE_COMPLEX)
23: #if defined(PETSC_USE_REAL_SINGLE)
24: #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h) hipsolverCpotrf((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (g), (h))
25: #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f) hipsolverCpotrf_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
26: #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k) hipsolverCpotrs((a), (b), (c), (d), (hipComplex *)(e), (f), (hipComplex *)(g), (h), (hipComplex *)(i), (j), (k))
27: #define hipsolverDnXpotri(a, b, c, d, e, f, g, h) hipsolverCpotri((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (g), (h))
28: #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f) hipsolverCpotri_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
29: #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i) hipsolverCsytrf((a), (b), (c), (hipComplex *)(d), (e), (f), (hipComplex *)(g), (h), (i))
30: #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e) hipsolverCsytrf_bufferSize((a), (b), (hipComplex *)(c), (d), (e))
31: #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i) hipsolverCgetrf((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (g), (h), (i))
32: #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f) hipsolverCgetrf_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
33: #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverCgetrs((a), (b), (c), (d), (hipComplex *)(e), (f), (g), (hipComplex *)(h), (i), (hipComplex *)(j), (k), (l))
34: #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f) hipsolverCgeqrf_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
35: #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i) hipsolverCgeqrf((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (hipComplex *)(g), (h), (i))
36: #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverCunmqr_bufferSize((a), (b), (c), (d), (e), (f), (hipComplex *)(g), (h), (hipComplex *)(i), (hipComplex *)(j), (k), (l))
37: #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) hipsolverCunmqr((a), (b), (c), (d), (e), (f), (hipComplex *)(g), (h), (hipComplex *)(i), (hipComplex *)(j), (k), (hipComplex *)(l), (m), (n))
38: #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) hipblasCtrsm((a), (b), (c), (d), (e), (f), (g), (hipComplex *)(h), (hipComplex *)(i), (j), (hipComplex *)(k), (l))
39: #else /* complex double */
40: #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h) hipsolverZpotrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (g), (h))
41: #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f) hipsolverZpotrf_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
42: #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k) hipsolverZpotrs((a), (b), (c), (d), (hipDoubleComplex *)(e), (f), (hipDoubleComplex *)(g), (h), (hipDoubleComplex *)(i), (j), (k))
43: #define hipsolverDnXpotri(a, b, c, d, e, f, g, h) hipsolverZpotri((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (g), (h))
44: #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f) hipsolverZpotri_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
45: #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i) hipsolverZsytrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (f), (hipDoubleComplex *)(g), (h), (i))
46: #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e) hipsolverZsytrf_bufferSize((a), (b), (hipDoubleComplex *)(c), (d), (e))
47: #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i) hipsolverZgetrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (g), (h), (i))
48: #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f) hipsolverZgetrf_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
49: #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverZgetrs((a), (b), (c), (d), (hipDoubleComplex *)(e), (f), (g), (hipDoubleComplex *)(h), (i), (hipDoubleComplex *)(j), (k), (l))
50: #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f) hipsolverZgeqrf_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
51: #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i) hipsolverZgeqrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (hipDoubleComplex *)(g), (h), (i))
52: #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverZunmqr_bufferSize((a), (b), (c), (d), (e), (f), (hipDoubleComplex *)(g), (h), (hipDoubleComplex *)(i), (hipDoubleComplex *)(j), (k), (l))
53: #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) hipsolverZunmqr((a), (b), (c), (d), (e), (f), (hipDoubleComplex *)(g), (h), (hipDoubleComplex *)(i), (hipDoubleComplex *)(j), (k), (hipDoubleComplex *)(l), (m), (n))
54: #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) hipblasZtrsm((a), (b), (c), (d), (e), (f), (g), (hipDoubleComplex *)(h), (hipDoubleComplex *)(i), (j), (hipDoubleComplex *)(k), (l))
55: #endif
56: #else /* real single */
57: #if defined(PETSC_USE_REAL_SINGLE)
58: #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h) hipsolverSpotrf((a), (b), (c), (float *)(d), (e), (float *)(f), (g), (h))
59: #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f) hipsolverSpotrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
60: #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k) hipsolverSpotrs((a), (b), (c), (d), (float *)(e), (f), (float *)(g), (h), (float *)(i), (j), (k))
61: #define hipsolverDnXpotri(a, b, c, d, e, f, g, h) hipsolverSpotri((a), (b), (c), (float *)(d), (e), (float *)(f), (g), (h))
62: #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f) hipsolverSpotri_bufferSize((a), (b), (c), (float *)(d), (e), (f))
63: #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i) hipsolverSsytrf((a), (b), (c), (float *)(d), (e), (f), (float *)(g), (h), (i))
64: #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e) hipsolverSsytrf_bufferSize((a), (b), (float *)(c), (d), (e))
65: #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i) hipsolverSgetrf((a), (b), (c), (float *)(d), (e), (float *)(f), (g), (h), (i))
66: #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f) hipsolverSgetrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
#define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverSgetrs((a), (b), (c), (d), (float *)(e), (f), (g), (float *)(h), (i), (float *)(j), (k), (l))
68: #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f) hipsolverSgeqrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
69: #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i) hipsolverSgeqrf((a), (b), (c), (float *)(d), (e), (float *)(f), (float *)(g), (h), (i))
70: #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverSormqr_bufferSize((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (l))
71: #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) hipsolverSormqr((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (float *)(l), (m), (n))
72: #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) hipblasStrsm((a), (b), (c), (d), (e), (f), (g), (float *)(h), (float *)(i), (j), (float *)(k), (l))
73: #else /* real double */
74: #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h) hipsolverDpotrf((a), (b), (c), (double *)(d), (e), (double *)(f), (g), (h))
75: #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f) hipsolverDpotrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
76: #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k) hipsolverDpotrs((a), (b), (c), (d), (double *)(e), (f), (double *)(g), (h), (double *)(i), (j), (k))
77: #define hipsolverDnXpotri(a, b, c, d, e, f, g, h) hipsolverDpotri((a), (b), (c), (double *)(d), (e), (double *)(f), (g), (h))
78: #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f) hipsolverDpotri_bufferSize((a), (b), (c), (double *)(d), (e), (f))
79: #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i) hipsolverDsytrf((a), (b), (c), (double *)(d), (e), (f), (double *)(g), (h), (i))
80: #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e) hipsolverDsytrf_bufferSize((a), (b), (double *)(c), (d), (e))
81: #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i) hipsolverDgetrf((a), (b), (c), (double *)(d), (e), (double *)(f), (g), (h), (i))
82: #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f) hipsolverDgetrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
83: #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverDgetrs((a), (b), (c), (d), (double *)(e), (f), (g), (double *)(h), (i), (double *)(j), (k), (l))
84: #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f) hipsolverDgeqrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
85: #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i) hipsolverDgeqrf((a), (b), (c), (double *)(d), (e), (double *)(f), (double *)(g), (h), (i))
86: #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverDormqr_bufferSize((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (l))
87: #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) hipsolverDormqr((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (double *)(l), (m), (n))
88: #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) hipblasDtrsm((a), (b), (c), (d), (e), (f), (g), (double *)(h), (double *)(i), (j), (double *)(k), (l))
89: #endif
90: #endif
/* Device-side data for a MATSEQDENSEHIP matrix; hung off A->spptr */
typedef struct {
  PetscScalar *d_v;                 /* pointer to the matrix on the GPU */
  PetscBool    user_alloc;          /* PETSC_TRUE when d_v was supplied by the user and must not be hipFree'd */
  PetscScalar *unplacedarray;       /* if one called MatHIPDensePlaceArray(), this is where it stashed the original */
  PetscBool    unplaced_user_alloc; /* ownership flag saved alongside unplacedarray */
  /* factorization support */
  PetscHipBLASInt *d_fact_ipiv; /* device pivots */
  PetscScalar     *d_fact_tau;  /* device QR tau vector */
  PetscScalar     *d_fact_work; /* device workspace */
  PetscHipBLASInt  fact_lwork;  /* size (in scalars) of d_fact_work */
  PetscHipBLASInt *d_fact_info; /* device info */
  /* workspace */
  Vec workvec; /* device scratch vector used by the solve drivers */
} Mat_SeqDenseHIP;
/* Allocate (or adopt) the device-side storage of a MATSEQDENSEHIP matrix.
   If d_data is NULL, PETSc hipMalloc's lda*n scalars and zeroes them;
   otherwise the user's device pointer is adopted and never freed here.
   Silently does nothing for non-HIP matrix types. */
PetscErrorCode MatSeqDenseHIPSetPreallocation(Mat A, PetscScalar *d_data)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscBool        iship;

  PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &iship);
  if (!iship) return 0;
  /* it may happen CPU preallocation has not been performed */
  PetscLayoutSetUp(A->rmap);
  PetscLayoutSetUp(A->cmap);
  if (cA->lda <= 0) cA->lda = A->rmap->n; /* default: tight column stride */
  if (!dA->user_alloc) hipFree(dA->d_v);  /* drop any previous PETSc-owned buffer */
  if (!d_data) { /* petsc-allocated storage */
    size_t sz;
    PetscIntMultError(cA->lda, A->cmap->n, NULL); /* overflow check only; result discarded */
    sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
    hipMalloc((void **)&dA->d_v, sz);
    hipMemset(dA->d_v, 0, sz);
    dA->user_alloc = PETSC_FALSE;
  } else { /* user-allocated storage */
    dA->d_v        = d_data;
    dA->user_alloc = PETSC_TRUE;
  }
  A->offloadmask  = PETSC_OFFLOAD_GPU; /* valid data now lives only on the device */
  A->preallocated = PETSC_TRUE;
  A->assembled    = PETSC_TRUE;
  return 0;
}
/* Copy the matrix values from the device buffer to the host array when the
   device holds the only up-to-date copy; afterwards both copies are valid. */
PetscErrorCode MatSeqDenseHIPCopyFromGPU(Mat A)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    if (!cA->v) { /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
      MatSeqDenseSetPreallocation(A, NULL);
    }
    PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0);
    if (cA->lda > A->rmap->n) {
      /* columns are padded: strided 2D copy transfers only the m x n data */
      hipMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyDeviceToHost);
    } else {
      hipMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyDeviceToHost);
    }
    PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
/* Copy the matrix values from the host array to the device buffer when the
   host holds the only up-to-date copy; no-op if the matrix is bound to the CPU. */
PetscErrorCode MatSeqDenseHIPCopyToGPU(Mat A)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscBool        copy;

  if (A->boundtocpu) return 0; /* the user pinned the data to the host */
  copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
  PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
  if (copy) {
    if (!dA->d_v) { /* Allocate GPU memory if not present */
      MatSeqDenseHIPSetPreallocation(A, NULL);
    }
    PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0);
    if (cA->lda > A->rmap->n) {
      /* columns are padded: strided 2D copy transfers only the m x n data */
      hipMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyHostToDevice);
    } else {
      hipMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyHostToDevice);
    }
    PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
/* Device-to-device copy of A's values into B; falls back to MatCopy_Basic
   when the two matrices do not share the same copy implementation. */
static PetscErrorCode MatCopy_SeqDenseHIP(Mat A, Mat B, MatStructure str)
{
  const PetscScalar *va;
  PetscScalar       *vb;
  PetscInt           lda1, lda2, m = A->rmap->n, n = A->cmap->n;

  /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
  if (A->ops->copy != B->ops->copy) {
    MatCopy_Basic(A, B, str);
    return 0;
  }

  MatDenseHIPGetArrayRead(A, &va);
  MatDenseHIPGetArrayWrite(B, &vb);
  MatDenseGetLDA(A, &lda1);
  MatDenseGetLDA(B, &lda2);
  PetscLogGpuTimeBegin();
  if (lda1 > m || lda2 > m) {
    /* at least one side has padded columns: use a strided 2D copy */
    hipMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, hipMemcpyDeviceToDevice);
  } else {
    hipMemcpy(vb, va, m * (n * sizeof(PetscScalar)), hipMemcpyDeviceToDevice);
  }
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayWrite(B, &vb);
  MatDenseHIPRestoreArrayRead(A, &va);
  return 0;
}
/* Zero all entries of A's device array (2D memset when columns are padded). */
static PetscErrorCode MatZeroEntries_SeqDenseHIP(Mat A)
{
  PetscScalar *va;
  PetscInt     lda, m = A->rmap->n, n = A->cmap->n;

  MatDenseHIPGetArrayWrite(A, &va);
  MatDenseGetLDA(A, &lda);
  PetscLogGpuTimeBegin();
  if (lda > m) {
    hipMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n);
  } else {
    hipMemset(va, 0, m * (n * sizeof(PetscScalar)));
  }
  WaitForHIP(); /* make sure the memset has completed before the timer stops */
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayWrite(A, &va);
  return 0;
}
/* Temporarily substitute A's device array with a caller-provided one,
   stashing the original so MatDenseHIPResetArray_SeqDenseHIP() can restore it. */
static PetscErrorCode MatDenseHIPPlaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
{
  Mat_SeqDense    *aa = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (aa->v) MatSeqDenseHIPCopyToGPU(A); /* flush pending host values before swapping buffers */
  /* NOTE(review): no guard against a second Place without an intervening Reset;
     that would overwrite unplacedarray and lose the first stashed pointer — confirm callers */
  dA->unplacedarray       = dA->d_v;
  dA->unplaced_user_alloc = dA->user_alloc;
  dA->d_v                 = (PetscScalar *)a;
  dA->user_alloc          = PETSC_TRUE; /* placed arrays are always caller-owned */
  return 0;
}
/* Restore the device array stashed by MatDenseHIPPlaceArray_SeqDenseHIP(). */
static PetscErrorCode MatDenseHIPResetArray_SeqDenseHIP(Mat A)
{
  Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (a->v) MatSeqDenseHIPCopyToGPU(A); /* flush host values into the placed array first */
  dA->d_v           = dA->unplacedarray;
  dA->user_alloc    = dA->unplaced_user_alloc;
  dA->unplacedarray = NULL;
  return 0;
}
/* Permanently replace A's device array with a: the previous PETSc-owned
   buffer is freed, and ownership of a passes to PETSc (user_alloc is cleared,
   so the new array will be hipFree'd later by PETSc). */
static PetscErrorCode MatDenseHIPReplaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
{
  Mat_SeqDense    *aa = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (!dA->user_alloc) hipFree(dA->d_v); /* only free what PETSc allocated */
  dA->d_v        = (PetscScalar *)a;
  dA->user_alloc = PETSC_FALSE; /* PETSc now owns (and will free) the new array */
  return 0;
}
/* Return the device array for write-only access, allocating it on first use.
   Host values are NOT copied to the device (write semantics). */
static PetscErrorCode MatDenseHIPGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (!dA->d_v) MatSeqDenseHIPSetPreallocation(A, NULL);
  *a = dA->d_v;
  return 0;
}
290: static PetscErrorCode MatDenseHIPRestoreArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
291: {
292: if (a) *a = NULL;
293: return 0;
294: }
/* Return the device array for read-only access, copying host values to the
   device first if the host copy is the current one. */
static PetscErrorCode MatDenseHIPGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  MatSeqDenseHIPCopyToGPU(A);
  *a = dA->d_v;
  return 0;
}
305: static PetscErrorCode MatDenseHIPRestoreArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
306: {
307: if (a) *a = NULL;
308: return 0;
309: }
/* Return the device array for read-write access, copying host values to the
   device first if the host copy is the current one. */
static PetscErrorCode MatDenseHIPGetArray_SeqDenseHIP(Mat A, PetscScalar **a)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  MatSeqDenseHIPCopyToGPU(A);
  *a = dA->d_v;
  return 0;
}
320: static PetscErrorCode MatDenseHIPRestoreArray_SeqDenseHIP(Mat A, PetscScalar **a)
321: {
322: if (a) *a = NULL;
323: return 0;
324: }
/* Build the explicit inverse in place from a GPU Cholesky factorization
   (hipsolver potri). Only the SPD path (no pivots) is implemented;
   on return A is no longer in factored form. */
PETSC_EXTERN PetscErrorCode MatSeqDenseHIPInvertFactors_Private(Mat A)
{
  Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar      *da;
  hipsolverHandle_t handle;
  PetscHipBLASInt   n, lda;
#if defined(PETSC_USE_DEBUG)
  PetscHipBLASInt info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* empty matrix: nothing to invert */
  PetscHIPSOLVERGetHandle(&handle);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHipBLASIntCast(a->lda, &lda);

  if (A->factortype == MAT_FACTOR_CHOLESKY) {
    if (!dA->d_fact_ipiv) { /* spd */
      PetscHipBLASInt il;

      MatDenseHIPGetArray(A, &da);
      hipsolverDnXpotri_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &il);
      if (il > dA->fact_lwork) { /* grow the device workspace if potri needs more */
        dA->fact_lwork = il;

        hipFree(dA->d_fact_work);
        hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
      }
      PetscLogGpuTimeBegin();
      hipsolverDnXpotri(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
      PetscLogGpuTimeEnd();
      MatDenseHIPRestoreArray(A, &da);
      /* TODO (write hip kernel) */
      MatSeqDenseSymmetrize_Private(A, PETSC_TRUE); /* mirror the computed lower triangle */
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytri not implemented");
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
#if defined(PETSC_USE_DEBUG)
  /* NOTE(review): info is fetched from the device but never checked here — confirm intended */
  hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
  PetscLogGpuFlops(1.0 * n * n * n / 3.0);
  /* A now holds the explicit inverse: clear the factored-matrix state */
  A->ops->solve          = NULL;
  A->ops->solvetranspose = NULL;
  A->ops->matsolve       = NULL;
  A->factortype          = MAT_FACTOR_NONE;

  PetscFree(A->solvertype);
  return 0;
}
377: static PetscErrorCode MatSolve_SeqDenseHIP_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
378: {
379: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
380: PetscScalar *y;
381: PetscHipBLASInt m = 0, k = 0;
382: PetscBool xiship, yiship, aiship;
385: PetscHipBLASIntCast(A->rmap->n, &m);
386: PetscHipBLASIntCast(A->cmap->n, &k);
387: PetscObjectTypeCompare((PetscObject)xx, VECSEQHIP, &xiship);
388: PetscObjectTypeCompare((PetscObject)yy, VECSEQHIP, &yiship);
389: {
390: const PetscScalar *x;
391: PetscBool xishost = PETSC_TRUE;
393: /* The logic here is to try to minimize the amount of memory copying:
394: if we call VecHIPGetArrayRead(X,&x) every time xiship and the
395: data is not offloaded to the GPU yet, then the data is copied to the
396: GPU. But we are only trying to get the data in order to copy it into the y
397: array. So the array x will be wherever the data already is so that
398: only one memcpy is performed */
399: if (xiship && xx->offloadmask & PETSC_OFFLOAD_GPU) {
400: VecHIPGetArrayRead(xx, &x);
401: xishost = PETSC_FALSE;
402: } else VecGetArrayRead(xx, &x);
403: if (k < m || !yiship) {
404: if (!dA->workvec) VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
405: VecHIPGetArrayWrite(dA->workvec, &y);
406: } else VecHIPGetArrayWrite(yy, &y);
407: hipMemcpy(y, x, m * sizeof(PetscScalar), xishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice);
408: }
409: PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship);
410: if (!aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
411: (*matsolve)(A, y, m, m, 1, k, transpose);
412: if (!aiship) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
413: if (k < m || !yiship) {
414: PetscScalar *yv;
416: /* The logic here is that the data is not yet in either yy's GPU array or its
417: CPU array. There is nothing in the interface to say where the user would like
418: it to end up. So we choose the GPU, because it is the faster option */
419: if (yiship) VecHIPGetArrayWrite(yy, &yv);
420: else VecGetArray(yy, &yv);
421: hipMemcpy(yv, y, k * sizeof(PetscScalar), yiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost);
422: if (yiship) VecHIPRestoreArrayWrite(yy, &yv);
423: else VecRestoreArray(yy, &yv);
424: VecHIPRestoreArrayWrite(dA->workvec, &y);
425: } else VecHIPRestoreArrayWrite(yy, &y);
426: return 0;
427: }
429: static PetscErrorCode MatMatSolve_SeqDenseHIP_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
430: {
431: PetscScalar *y;
432: PetscInt n, _ldb, _ldx;
433: PetscBool biship, xiship, aiship;
434: PetscHipBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;
437: PetscHipBLASIntCast(A->rmap->n, &m);
438: PetscHipBLASIntCast(A->cmap->n, &k);
439: MatGetSize(B, NULL, &n);
440: PetscHipBLASIntCast(n, &nrhs);
441: MatDenseGetLDA(B, &_ldb);
442: PetscHipBLASIntCast(_ldb, &ldb);
443: MatDenseGetLDA(X, &_ldx);
444: PetscHipBLASIntCast(_ldx, &ldx);
445: PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship);
446: PetscObjectTypeCompare((PetscObject)X, MATSEQDENSEHIP, &xiship);
447: /* The logic here is to try to minimize the amount of memory copying:
448: if we call MatDenseHIPGetArrayRead(B,&b) every time biship and the
449: data is not offloaded to the GPU yet, then the data is copied to the
450: GPU. But we are only trying to get the data in order to copy it into the y
451: array. So the array b will be wherever the data already is so that
452: only one memcpy is performed */
453: const PetscScalar *b;
454: /* some copying from B will be involved */
455: PetscBool bishost = PETSC_TRUE;
456: if (biship && B->offloadmask & PETSC_OFFLOAD_GPU) {
457: MatDenseHIPGetArrayRead(B, &b);
458: bishost = PETSC_FALSE;
459: } else MatDenseGetArrayRead(B, &b);
460: if (ldx < m || !xiship) {
461: /* X's array cannot serve as the array (too small or not on device), B's
462: * array cannot serve as the array (const), so allocate a new array */
463: ldy = m;
464: hipMalloc((void **)&y, nrhs * m * sizeof(PetscScalar));
465: } else {
466: /* X's array should serve as the array */
467: ldy = ldx;
468: MatDenseHIPGetArrayWrite(X, &y);
469: }
470: hipMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice);
471: if (bishost) MatDenseRestoreArrayRead(B, &b);
472: else MatDenseHIPRestoreArrayRead(B, &b);
474: PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship);
475: if (!aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
476: (*matsolve)(A, y, ldy, m, nrhs, k, transpose);
477: if (!aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
478: if (ldx < m || !xiship) {
479: PetscScalar *x;
481: /* The logic here is that the data is not yet in either X's GPU array or its
482: CPU array. There is nothing in the interface to say where the user would like
483: it to end up. So we choose the GPU, because it is the faster option */
484: if (xiship) MatDenseHIPGetArrayWrite(X, &x);
485: else MatDenseGetArray(X, &x);
486: hipMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost);
487: if (xiship) MatDenseHIPRestoreArrayWrite(X, &x);
488: else MatDenseRestoreArray(X, &x);
489: hipFree(y);
490: } else MatDenseHIPRestoreArrayWrite(X, &y);
491: return 0;
492: }
/* Backend kernel for LU solves: the device array x (ldx x nrhs) is
   overwritten with the solution via hipsolver getrs, using the pivots and
   workspace prepared by MatLUFactor_SeqDenseHIP(). */
static PetscErrorCode MatSolve_SeqDenseHIP_Internal_LU(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
{
  Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
  const PetscScalar *da;
  PetscHipBLASInt    lda;
  hipsolverHandle_t  handle;
  int                info;

  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  PetscHIPSOLVERGetHandle(&handle);
  PetscLogGpuTimeBegin();
  PetscInfo(A, "LU solve %d x %d on backend\n", m, k);
  hipsolverDnXgetrs(handle, T ? HIPSOLVER_OP_T : HIPSOLVER_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayRead(A, &da);
  if (PetscDefined(USE_DEBUG)) {
    /* NOTE(review): info is fetched from the device but never checked — confirm intended */
    hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
  }
  PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
  return 0;
}
/* Backend kernel for Cholesky solves: the device array x (ldx x nrhs) is
   overwritten with the solution via hipsolver potrs. Only the SPD path
   (no pivots) is implemented. */
static PetscErrorCode MatSolve_SeqDenseHIP_Internal_Cholesky(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
{
  Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
  const PetscScalar *da;
  PetscHipBLASInt    lda;
  hipsolverHandle_t  handle;
  int                info;

  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  PetscHIPSOLVERGetHandle(&handle);
  PetscLogGpuTimeBegin();
  PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k);
  if (!dA->d_fact_ipiv) { /* spd */
    /* ========= Program hit hipErrorNotReady (error 34) due to "device not ready" on HIP API call to hipEventQuery. */
    hipsolverDnXpotrs(handle, HIPSOLVER_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytrs not implemented");
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayRead(A, &da);
  if (PetscDefined(USE_DEBUG)) {
    /* NOTE(review): info is fetched from the device but never checked — confirm intended */
    hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
  }
  PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
  return 0;
}
/* Backend kernel for QR (least-squares) solves on the device: applies the
   orthogonal factor (via ormqr/unmqr) and a triangular solve with R (trsm)
   to the nrhs columns of x, restricted to the numerical rank. */
static PetscErrorCode MatSolve_SeqDenseHIP_Internal_QR(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
{
  Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
  const PetscScalar *da;
  PetscHipBLASInt    lda, rank;
  hipsolverHandle_t  handle;
  hipblasHandle_t    bhandle;
  int                info;
  hipsolverOperation_t trans;
  PetscScalar        one = 1.;

  PetscHipBLASIntCast(mat->rank, &rank);
  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  PetscHIPSOLVERGetHandle(&handle);
  PetscHIPBLASGetHandle(&bhandle);
  PetscLogGpuTimeBegin();
  PetscInfo(A, "QR solve %d x %d on backend\n", m, k);
  if (!T) {
    /* x <- R^{-1} (Q^H x): apply (conjugate) transpose of Q, then solve with R */
    if (PetscDefined(USE_COMPLEX)) trans = HIPSOLVER_OP_C;
    else trans = HIPSOLVER_OP_T;
    hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
    if (PetscDefined(USE_DEBUG)) {
      /* NOTE(review): info is fetched from the device but never checked — confirm intended */
      hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
    }
    hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
  } else {
    /* transpose solve: triangular solve with R^T first, then apply Q.
       NOTE(review): HIPBLAS_OP_T is used even under PETSC_USE_COMPLEX —
       confirm whether conjugate-transpose semantics are required here */
    hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
    hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, HIPSOLVER_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
    if (PetscDefined(USE_DEBUG)) {
      /* NOTE(review): info is fetched from the device but never checked — confirm intended */
      hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
    }
  }
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayRead(A, &da);
  PetscLogFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank)));
  return 0;
}
/* MatSolve for LU-factored matrices: forward to the shared single-RHS driver */
static PetscErrorCode MatSolve_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}
/* MatSolveTranspose for LU-factored matrices: forward to the shared driver */
static PetscErrorCode MatSolveTranspose_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}
/* MatSolve for Cholesky-factored matrices: forward to the shared driver */
static PetscErrorCode MatSolve_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}
/* MatSolveTranspose for Cholesky-factored matrices: forward to the shared driver */
static PetscErrorCode MatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}
/* MatSolve for QR-factored matrices: forward to the shared driver */
static PetscErrorCode MatSolve_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}
/* MatSolveTranspose for QR-factored matrices: forward to the shared driver */
static PetscErrorCode MatSolveTranspose_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}
/* MatMatSolve for LU-factored matrices: forward to the shared multi-RHS driver */
static PetscErrorCode MatMatSolve_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}
/* MatMatSolveTranspose for LU-factored matrices: forward to the shared driver */
static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}
/* MatMatSolve for Cholesky-factored matrices: forward to the shared driver */
static PetscErrorCode MatMatSolve_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}
/* MatMatSolveTranspose for Cholesky-factored matrices: forward to the shared driver */
static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}
/* MatMatSolve for QR-factored matrices: forward to the shared driver */
static PetscErrorCode MatMatSolve_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}
/* MatMatSolveTranspose for QR-factored matrices: forward to the shared driver */
static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}
/* In-place LU factorization on the GPU via hipsolver getrf. Allocates the
   device pivot/info/workspace buffers on first use, installs the HIP
   solve/matsolve kernels, and records MATSOLVERHIP as the solver type.
   The permutations rperm/cperm are not used by the dense GPU path. */
static PetscErrorCode MatLUFactor_SeqDenseHIP(Mat A, IS rperm, IS cperm, const MatFactorInfo *factinfo)
{
  Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar      *da;
  PetscHipBLASInt   m, n, lda;
  hipsolverHandle_t handle;
#if defined(PETSC_USE_DEBUG)
  int info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* empty matrix: nothing to factor */
  PetscHIPSOLVERGetHandle(&handle);
  MatDenseHIPGetArray(A, &da);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHipBLASIntCast(A->rmap->n, &m);
  PetscHipBLASIntCast(a->lda, &lda);
  PetscInfo(A, "LU factor %d x %d on backend\n", m, n);
  if (!dA->d_fact_ipiv) hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
  if (!dA->fact_lwork) { /* first factorization: size and allocate the workspace */
    hipsolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
    hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
  }
  if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
  PetscLogGpuTimeBegin();
  hipsolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_ipiv, dA->d_fact_info);
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArray(A, &da);
#if defined(PETSC_USE_DEBUG)
  /* NOTE(review): info is fetched from the device but never checked — confirm intended */
  hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
  A->factortype = MAT_FACTOR_LU;
  PetscLogGpuFlops(2.0 * n * n * m / 3.0);

  A->ops->solve             = MatSolve_SeqDenseHIP_LU;
  A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_LU;
  A->ops->matsolve          = MatMatSolve_SeqDenseHIP_LU;
  A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_LU;

  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &A->solvertype);
  return 0;
}
/*
  In-place Cholesky factorization of A on the device via hipsolver Xpotrf
  (lower triangle). Only symmetric positive-definite matrices are supported:
  for the indefinite (sytrf/Bunch-Kaufman) case the function raises
  PETSC_ERR_SUP because the corresponding device triangular solve is not
  available; the sytrf code following the SETERRQ is unreachable and kept only
  for the day hipsolver provides sytrs.
*/
static PetscErrorCode MatCholeskyFactor_SeqDenseHIP(Mat A, IS perm, const MatFactorInfo *factinfo)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar *da;
  PetscHipBLASInt n, lda;
  hipsolverHandle_t handle;
#if defined(PETSC_USE_DEBUG)
  int info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* empty matrix: nothing to factor */
  PetscHIPSOLVERGetHandle(&handle);
  PetscHipBLASIntCast(A->rmap->n, &n);
  PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n);
  if (A->spd == PETSC_BOOL3_TRUE) {
    MatDenseHIPGetArray(A, &da);
    PetscHipBLASIntCast(a->lda, &lda);
    /* lazily allocate persistent workspace (freed in MatReset_SeqDenseHIP) */
    if (!dA->fact_lwork) {
      hipsolverDnXpotrf_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork);
      hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
    }
    if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
    PetscLogGpuTimeBegin();
    hipsolverDnXpotrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
    PetscLogGpuTimeEnd();

    MatDenseHIPRestoreArray(A, &da);
#if defined(PETSC_USE_DEBUG)
    /* NOTE(review): info is copied back but its check is not visible in this excerpt -- confirm */
    hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
    A->factortype = MAT_FACTOR_CHOLESKY;
    PetscLogGpuFlops(1.0 * n * n * n / 3.0);
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipsolverDnsytrs unavailable. Use MAT_FACTOR_LU");

  /* at the time of writing hipsolverDn has *sytrs and *hetr* routines implemented and the
     code below should work -- currently DEAD CODE: the SETERRQ above always returns first */
  if (!dA->d_fact_ipiv) hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
  if (!dA->fact_lwork) {
    hipsolverDnXsytrf_bufferSize(handle, n, da, lda, &dA->fact_lwork);
    hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
  }
  if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
  PetscLogGpuTimeBegin();
  hipsolverDnXsytrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_ipiv, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  PetscLogGpuTimeEnd();

  A->ops->solve = MatSolve_SeqDenseHIP_Cholesky;
  A->ops->solvetranspose = MatSolveTranspose_SeqDenseHIP_Cholesky;
  A->ops->matsolve = MatMatSolve_SeqDenseHIP_Cholesky;
  A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_Cholesky;
  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &A->solvertype);
  return 0;
}
/*
  In-place QR factorization of A on the device via hipsolver Xgeqrf
  (Householder reflectors stored in the lower trapezoid, scalar factors in
  dA->d_fact_tau). The column permutation `col` and factinfo are unused.
  Workspace and the work vector used by the QR solve are allocated lazily and
  reused; on success the solve/matsolve function pointers are switched to the
  QR-based device implementations.
*/
static PetscErrorCode MatQRFactor_SeqDenseHIP(Mat A, IS col, const MatFactorInfo *factinfo)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar *da;
  PetscHipBLASInt m, min, max, n, lda;
  hipsolverHandle_t handle;
#if defined(PETSC_USE_DEBUG)
  int info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* empty matrix: nothing to factor */
  PetscHIPSOLVERGetHandle(&handle);
  MatDenseHIPGetArray(A, &da);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHipBLASIntCast(A->rmap->n, &m);
  PetscHipBLASIntCast(a->lda, &lda);
  PetscInfo(A, "QR factor %d x %d on backend\n", m, n);
  max = PetscMax(m, n);
  min = PetscMin(m, n);
  /* lazily allocate persistent workspace (freed in MatReset_SeqDenseHIP) */
  if (!dA->d_fact_tau) hipMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau));
  if (!dA->d_fact_ipiv) hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
  if (!dA->fact_lwork) {
    hipsolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
    hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
  }
  if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
  if (!dA->workvec) VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
  PetscLogGpuTimeBegin();
  hipsolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArray(A, &da);
#if defined(PETSC_USE_DEBUG)
  /* NOTE(review): info is copied back but its check is not visible in this excerpt -- confirm */
  hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
  A->factortype = MAT_FACTOR_QR;
  a->rank = min; /* full rank assumed; geqrf does not rank-reveal */
  PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0));

  A->ops->solve = MatSolve_SeqDenseHIP_QR;
  A->ops->solvetranspose = MatSolveTranspose_SeqDenseHIP_QR;
  A->ops->matsolve = MatMatSolve_SeqDenseHIP_QR;
  A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_QR;

  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &A->solvertype);
  return 0;
}
/* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
{
  const PetscScalar *da, *db;
  PetscScalar *dc;
  PetscScalar one = 1.0, zero = 0.0;
  PetscHipBLASInt m, n, k;
  PetscInt alda, blda, clda;
  hipblasHandle_t hipblasv2handle;
  PetscBool Aiship, Biship;

  /* we may end up with SEQDENSE as one of the arguments: convert to the HIP
     type in place and convert back before returning */
  PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &Aiship);
  PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &Biship);
  if (!Aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
  if (!Biship) MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B);
  PetscHipBLASIntCast(C->rmap->n, &m);
  PetscHipBLASIntCast(C->cmap->n, &n);
  if (tA) PetscHipBLASIntCast(A->rmap->n, &k);
  else PetscHipBLASIntCast(A->cmap->n, &k);
  /* NOTE(review): this early return skips the MatConvert back to MATSEQDENSE
     below when A or B was converted above -- presumably harmless for empty
     products, but verify against callers */
  if (!m || !n || !k) return 0;
  PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n);
  MatDenseHIPGetArrayRead(A, &da);
  MatDenseHIPGetArrayRead(B, &db);
  MatDenseHIPGetArrayWrite(C, &dc);
  MatDenseGetLDA(A, &alda);
  MatDenseGetLDA(B, &blda);
  MatDenseGetLDA(C, &clda);
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscLogGpuTimeBegin();
  hipblasXgemm(hipblasv2handle, tA ? HIPBLAS_OP_T : HIPBLAS_OP_N, tB ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)); /* multiplies + additions */
  MatDenseHIPRestoreArrayRead(A, &da);
  MatDenseHIPRestoreArrayRead(B, &db);
  MatDenseHIPRestoreArrayWrite(C, &dc);
  if (!Aiship) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
  if (!Biship) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
  return 0;
}
/* C = A^T * B on the device */
PetscErrorCode MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
{
  MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_TRUE, PETSC_FALSE);
  return 0;
}
/* C = A * B on the device */
PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
{
  MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_FALSE);
  return 0;
}
/* C = A * B^T on the device */
PetscErrorCode MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
{
  MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_TRUE);
  return 0;
}
/* Delegate product setup to the host SeqDense implementation; the numeric kernels are swapped in elsewhere */
PetscErrorCode MatProductSetFromOptions_SeqDenseHIP(Mat C)
{
  MatProductSetFromOptions_SeqDense(C);
  return 0;
}
/* zz = op(A)*xx + yy
   if yy == NULL, only MatMult */
static PetscErrorCode MatMultAdd_SeqDenseHIP_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
{
  Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
  const PetscScalar *xarray, *da;
  PetscScalar *zarray;
  PetscScalar one = 1.0, zero = 0.0;
  PetscHipBLASInt m, n, lda;
  hipblasHandle_t hipblasv2handle;

  if (yy && yy != zz) VecCopy_SeqHIP(yy, zz); /* mult add: gemv accumulates into zz with beta = 1 */
  if (!A->rmap->n || !A->cmap->n) {
    if (!yy) VecSet_SeqHIP(zz, 0.0); /* mult only: empty matrix yields the zero vector */
    return 0;
  }
  PetscInfo(A, "Matrix-vector product %d x %d on backend\n", A->rmap->n, A->cmap->n);
  PetscHipBLASIntCast(A->rmap->n, &m);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHIPBLASGetHandle(&hipblasv2handle);
  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  VecHIPGetArrayRead(xx, &xarray);
  VecHIPGetArray(zz, &zarray);
  PetscLogGpuTimeBegin();
  /* beta selects overwrite (0) vs accumulate (1) depending on whether yy was given */
  hipblasXgemv(hipblasv2handle, trans ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n));
  VecHIPRestoreArrayRead(xx, &xarray);
  VecHIPRestoreArray(zz, &zarray);
  MatDenseHIPRestoreArrayRead(A, &da);
  return 0;
}
/* zz = A*xx + yy */
PetscErrorCode MatMultAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_FALSE);
  return 0;
}
/* zz = A^T*xx + yy */
PetscErrorCode MatMultTransposeAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_TRUE);
  return 0;
}
/* yy = A*xx */
PetscErrorCode MatMult_SeqDenseHIP(Mat A, Vec xx, Vec yy)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_FALSE);
  return 0;
}
/* yy = A^T*xx */
PetscErrorCode MatMultTranspose_SeqDenseHIP(Mat A, Vec xx, Vec yy)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_TRUE);
  return 0;
}
/* Return a read-only host pointer to the matrix entries, syncing device->host first; offload mask unchanged */
static PetscErrorCode MatDenseGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **array)
{
  Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

  MatSeqDenseHIPCopyFromGPU(A);
  *array = mat->v;
  return 0;
}
/* Return a write-only host pointer; no device->host copy since the contents will be overwritten */
static PetscErrorCode MatDenseGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **array)
{
  Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

  if (!mat->v) MatSeqDenseSetPreallocation(A, NULL); /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
  *array = mat->v;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host copy becomes the valid one */
  return 0;
}
/* Return a read-write host pointer: sync device->host, then mark the host copy as the valid one */
static PetscErrorCode MatDenseGetArray_SeqDenseHIP(Mat A, PetscScalar **array)
{
  Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

  MatSeqDenseHIPCopyFromGPU(A);
  *array = mat->v;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  return 0;
}
/*
  Y = alpha * Y on the device. When the leading dimension exceeds the row
  count (lda > m) the storage has gaps between columns, so each column is
  scaled separately; otherwise the whole buffer is scaled with one scal call.
*/
PetscErrorCode MatScale_SeqDenseHIP(Mat Y, PetscScalar alpha)
{
  Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
  PetscScalar *dy;
  PetscHipBLASInt j, N, m, lday, one = 1;
  hipblasHandle_t hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  MatDenseHIPGetArray(Y, &dy);
  PetscHipBLASIntCast(Y->rmap->n * Y->cmap->n, &N);
  PetscHipBLASIntCast(Y->rmap->n, &m);
  PetscHipBLASIntCast(y->lda, &lday);
  PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (lday > m) { /* padded columns: scale column by column */
    for (j = 0; j < Y->cmap->n; j++) hipblasXscal(hipblasv2handle, m, &alpha, dy + lday * j, one);
  } else hipblasXscal(hipblasv2handle, N, &alpha, dy, one); /* contiguous: one call */
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(N);
  MatDenseHIPRestoreArray(Y, &dy);
  return 0;
}
/* Device functor adding a constant shift to each element; used by MatShift to update the diagonal */
struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift) { }
  __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
};
/*
  View of every stride-th element of [first, last) as a thrust iterator range,
  built from a counting iterator (index), a transform iterator (index ->
  index*stride) and a permutation iterator (gather). With stride = lda + 1
  this walks the diagonal of a column-major matrix. Follows the strided_range
  example shipped with Thrust.
*/
template <typename Iterator>
class strided_range {
public:
  typedef typename thrust::iterator_difference<Iterator>::type difference_type;
  /* maps logical index i to physical offset stride*i */
  struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
    difference_type stride;
    stride_functor(difference_type stride) : stride(stride) { }
    __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
  };
  typedef typename thrust::counting_iterator<difference_type> CountingIterator;
  typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
  typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
  typedef PermutationIterator iterator; // type of the strided_range iterator
  // construct strided_range for the range [first,last)
  strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
  iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
  /* end = begin + ceil((last-first)/stride) */
  iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }

protected:
  Iterator first;
  Iterator last;
  difference_type stride;
};
/*
  Add alpha to the diagonal entries of a (possibly distributed) column-major
  dense block stored on the device. da points to the local block, rstart/rend
  are the global row range of this block and cols the global column count; the
  local address of global diagonal entry i is (i - rstart) + i*lda, hence the
  begin/end offsets below and the stride of lda + 1.
*/
PetscErrorCode MatShift_DenseHIP_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
{
  PetscInt rend2 = PetscMin(rend, cols); /* clip to the last existing diagonal entry */

  if (rend2 > rstart) {
    PetscLogGpuTimeBegin();
    try {
      const auto dptr = thrust::device_pointer_cast(da);
      size_t begin = rstart * lda;
      size_t end = rend2 - rstart + rend2 * lda;
      strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
      thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(rend2 - rstart);
  }
  return 0;
}
/* A = A + alpha*I on the device (sequential case: row range is [0, m)) */
PetscErrorCode MatShift_SeqDenseHIP(Mat A, PetscScalar alpha)
{
  PetscScalar *da;
  PetscInt m = A->rmap->n, n = A->cmap->n, lda;

  MatDenseHIPGetArray(A, &da);
  MatDenseGetLDA(A, &lda);
  PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n);
  MatShift_DenseHIP_Private(da, alpha, lda, 0, m, n);
  MatDenseHIPRestoreArray(A, &da);
  return 0;
}
/*
  Y = Y + alpha * X on the device; str is ignored (dense matrices always have
  the same pattern). If either matrix has column padding (lda > m) the axpy is
  done column by column, otherwise in one call on the contiguous buffer. For
  alpha == 0 the Y array is fetched write-only to skip a host->device sync.
*/
PetscErrorCode MatAXPY_SeqDenseHIP(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
{
  Mat_SeqDense *x = (Mat_SeqDense *)X->data;
  Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
  const PetscScalar *dx;
  PetscScalar *dy;
  PetscHipBLASInt j, N, m, ldax, lday, one = 1;
  hipblasHandle_t hipblasv2handle;

  if (!X->rmap->n || !X->cmap->n) return 0; /* empty: nothing to add */
  PetscHIPBLASGetHandle(&hipblasv2handle);
  MatDenseHIPGetArrayRead(X, &dx);
  if (alpha == 0.0) MatDenseHIPGetArrayWrite(Y, &dy);
  else MatDenseHIPGetArray(Y, &dy);
  PetscHipBLASIntCast(X->rmap->n * X->cmap->n, &N);
  PetscHipBLASIntCast(X->rmap->n, &m);
  PetscHipBLASIntCast(x->lda, &ldax);
  PetscHipBLASIntCast(y->lda, &lday);
  PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (ldax > m || lday > m) { /* padded columns: axpy column by column */
    for (j = 0; j < X->cmap->n; j++) hipblasXaxpy(hipblasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one);
  } else hipblasXaxpy(hipblasv2handle, N, &alpha, dx, one, dy, one);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(PetscMax(2. * N - 1, 0));
  MatDenseHIPRestoreArrayRead(X, &dx);
  if (alpha == 0.0) MatDenseHIPRestoreArrayWrite(Y, &dy);
  else MatDenseHIPRestoreArray(Y, &dy);
  return 0;
}
/* Free all device-side state (value array unless user-owned, factorization workspace, work vector) and the spptr itself */
static PetscErrorCode MatReset_SeqDenseHIP(Mat A)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (dA) {
    if (!dA->user_alloc) hipFree(dA->d_v); /* only free device values we allocated ourselves */
    hipFree(dA->d_fact_tau);
    hipFree(dA->d_fact_ipiv);
    hipFree(dA->d_fact_info);
    hipFree(dA->d_fact_work);
    VecDestroy(&dA->workvec);
  }
  PetscFree(A->spptr);
  return 0;
}
/* Destroy by first converting back to plain SeqDense (which tears down the HIP state), then destroying as SeqDense */
PetscErrorCode MatDestroy_SeqDenseHIP(Mat A)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  /* prevent to copy back data if we own the data pointer */
  if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
  MatConvert_SeqDenseHIP_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
  MatDestroy_SeqDense(A);
  return 0;
}
/*
  Duplicate A into *B. If values live only on the device, the host-side
  duplicate skips the copy (hcpvalues downgraded to MAT_DO_NOT_COPY_VALUES)
  and a device-to-device MatCopy is done instead, avoiding a host round trip.
*/
PetscErrorCode MatDuplicate_SeqDenseHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

  MatCreate(PetscObjectComm((PetscObject)A), B);
  MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
  MatSetType(*B, ((PetscObject)A)->type_name);
  MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues);
  if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) MatCopy_SeqDenseHIP(A, *B, SAME_NONZERO_PATTERN);
  if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
    Mat_SeqDenseHIP *dB = (Mat_SeqDenseHIP *)(*B)->spptr;
    if (!dB->d_v) MatSeqDenseHIPSetPreallocation(*B, NULL);
  }
  return 0;
}
1130: static PetscErrorCode MatGetColumnVector_SeqDenseHIP(Mat A, Vec v, PetscInt col)
1131: {
1132: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1133: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1134: PetscScalar *x;
1135: PetscBool viship;
1137: PetscObjectTypeCompareAny((PetscObject)v, &viship, VECSEQHIP, VECMPIHIP, VECHIP, "");
1138: if (viship && !v->boundtocpu) { /* update device data */
1139: VecHIPGetArrayWrite(v, &x);
1140: if (A->offloadmask & PETSC_OFFLOAD_GPU) hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToHost);
1141: else hipMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToDevice);
1142: VecHIPRestoreArrayWrite(v, &x);
1143: } else { /* update host data */
1144: VecGetArrayWrite(v, &x);
1145: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) PetscArraycpy(x, a->v + col * a->lda, A->rmap->n);
1146: else if (A->offloadmask & PETSC_OFFLOAD_GPU) hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
1147: VecRestoreArrayWrite(v, &x);
1148: }
1149: return 0;
1150: }
/*
  Create an (unfactored) MATSEQDENSEHIP matrix to hold a factorization of A
  and wire up the symbolic-phase entry points for the requested factor type.
  The numeric-phase pointers are installed later, by the factor routines
  themselves (MatLUFactor_SeqDenseHIP etc.).
*/
PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_hip(Mat A, MatFactorType ftype, Mat *fact)
{
  MatCreate(PetscObjectComm((PetscObject)A), fact);
  MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
  MatSetType(*fact, MATSEQDENSEHIP);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
    (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
    (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
  } else if (ftype == MAT_FACTOR_QR) {
    /* QR has no ops slot; exposed through composed functions instead */
    PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense);
    PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense);
  }
  (*fact)->factortype = ftype;
  PetscFree((*fact)->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &(*fact)->solvertype);
  /* dense factorizations do their own (external) ordering */
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]);
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
  return 0;
}
/* Expose column `col` as a read-write HIP vector backed by A's device storage (no copy); must be paired with the Restore call */
static PetscErrorCode MatDenseGetColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  MatDenseHIPGetArray(A, (PetscScalar **)&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
    VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* non-zero marks the cached vector as checked out */
  VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
  *v = a->cvec;
  return 0;
}
/* Return the column vector obtained with MatDenseGetColumnVec_SeqDenseHIP and release A's device array */
static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  a->vecinuse = 0;
  VecHIPResetArray(a->cvec);
  MatDenseHIPRestoreArray(A, (PetscScalar **)&a->ptrinuse);
  if (v) *v = NULL;
  return 0;
}
/* Expose column `col` as a read-only HIP vector backed by A's device storage; locks the vector for reading */
static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  MatDenseHIPGetArrayRead(A, &a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
    VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* non-zero marks the cached vector as checked out */
  VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
  VecLockReadPush(a->cvec); /* forbid writes through the returned vector */
  *v = a->cvec;
  return 0;
}
/* Return the column vector obtained with MatDenseGetColumnVecRead_SeqDenseHIP: pop the read lock, detach, release */
static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  a->vecinuse = 0;
  VecLockReadPop(a->cvec);
  VecHIPResetArray(a->cvec);
  MatDenseHIPRestoreArrayRead(A, &a->ptrinuse);
  if (v) *v = NULL;
  return 0;
}
/* Expose column `col` as a write-only HIP vector backed by A's device storage (no device sync of prior contents) */
static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  MatDenseHIPGetArrayWrite(A, (PetscScalar **)&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
    VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* non-zero marks the cached vector as checked out */
  VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
  *v = a->cvec;
  return 0;
}
/* Return the column vector obtained with MatDenseGetColumnVecWrite_SeqDenseHIP and release A's device array */
static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  a->vecinuse = 0;
  VecHIPResetArray(a->cvec);
  MatDenseHIPRestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse);
  if (v) *v = NULL;
  return 0;
}
/*
  Expose the submatrix rows [rbegin, rend) x columns [cbegin, cend) of A as a
  dense HIP matrix sharing A's storage (no copy). A cached matrix a->cmat is
  reused across calls when the requested size matches; must be paired with
  MatDenseRestoreSubMatrix_SeqDenseHIP.
*/
static PetscErrorCode MatDenseGetSubMatrix_SeqDenseHIP(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  /* cached submatrix only reusable when the size matches */
  if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) MatDestroy(&a->cmat);
  MatSeqDenseHIPCopyToGPU(A);
  if (!a->cmat) {
    MatCreateDenseHIP(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cmat);
  } else MatDenseHIPPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda);
  MatDenseSetLDA(a->cmat, a->lda); /* submatrix keeps the parent's leading dimension */
  /* Place CPU array if present but not copy any data */
  a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
  if (a->v) { MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda); }
  a->cmat->offloadmask = A->offloadmask; /* submatrix validity mirrors the parent's */
  a->matinuse = cbegin + 1;              /* non-zero marks the cached matrix as checked out */
  *v = a->cmat;
  return 0;
}
/*
  Return the submatrix obtained with MatDenseGetSubMatrix_SeqDenseHIP and
  propagate its offload state back to A. If the caller modified the submatrix
  on the CPU while A had no host array, a host array is allocated and the
  device data copied back so nothing is lost.
*/
static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseHIP(Mat A, Mat *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;
  PetscBool copy = PETSC_FALSE, reset;
  PetscOffloadMask suboff;

  a->matinuse = 0;
  reset = a->v ? PETSC_TRUE : PETSC_FALSE; /* a host array was placed only if A has one */
  suboff = a->cmat->offloadmask;           /* calls to ResetArray may change it, so save it here */
  if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
    copy = PETSC_TRUE;
    MatSeqDenseSetPreallocation(A, NULL); /* allocate host storage to receive the data */
  }
  MatDenseHIPResetArray(a->cmat);
  if (reset) MatDenseResetArray(a->cmat);
  if (copy) {
    MatSeqDenseHIPCopyFromGPU(A);
  } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  if (v) *v = NULL;
  return 0;
}
/* Set the leading dimension of the dense storage */
static PetscErrorCode MatDenseSetLDA_SeqDenseHIP(Mat A, PetscInt lda)
{
  Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscBool data;

  /* NOTE(review): `data` (whether device values are already allocated) is computed
     but never used in this excerpt -- presumably a PetscCheck that forbids changing
     lda once data exists was stripped from the extraction; confirm against upstream */
  data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
  cA->lda = lda;
  return 0;
}
/* Finalize layouts and lazily preallocate device storage if the matrix has none yet */
static PetscErrorCode MatSetUp_SeqDenseHIP(Mat A)
{
  PetscLayoutSetUp(A->rmap);
  PetscLayoutSetUp(A->cmap);
  if (!A->preallocated) MatSeqDenseHIPSetPreallocation(A, NULL);
  return 0;
}
/*
  Switch the matrix between GPU (flg = PETSC_FALSE) and CPU (flg = PETSC_TRUE)
  execution by swapping both the composed MatDense* query functions and the
  ops table between the HIP and host SeqDense implementations. Binding to the
  CPU first syncs device data back to the host; unbinding drops cached
  column-vector/submatrix objects of the wrong type so they get rebuilt.
*/
static PetscErrorCode MatBindToCPU_SeqDenseHIP(Mat A, PetscBool flg)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  A->boundtocpu = flg;
  if (!flg) {
    PetscBool iship;

    /* cached helpers created while bound to CPU are host objects; discard them */
    PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQHIP, &iship);
    if (!iship) VecDestroy(&a->cvec);
    PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSEHIP, &iship);
    if (!iship) MatDestroy(&a->cmat);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseHIP);
    PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseHIP);

    A->ops->duplicate = MatDuplicate_SeqDenseHIP;
    A->ops->mult = MatMult_SeqDenseHIP;
    A->ops->multadd = MatMultAdd_SeqDenseHIP;
    A->ops->multtranspose = MatMultTranspose_SeqDenseHIP;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseHIP;
    A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
    A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP;
    A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
    A->ops->axpy = MatAXPY_SeqDenseHIP;
    A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseHIP;
    A->ops->lufactor = MatLUFactor_SeqDenseHIP;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseHIP;
    A->ops->getcolumnvector = MatGetColumnVector_SeqDenseHIP;
    A->ops->scale = MatScale_SeqDenseHIP;
    A->ops->shift = MatShift_SeqDenseHIP;
    A->ops->copy = MatCopy_SeqDenseHIP;
    A->ops->zeroentries = MatZeroEntries_SeqDenseHIP;
  } else {
    /* make sure we have an up-to-date copy on the CPU */
    MatSeqDenseHIPCopyFromGPU(A);
    PetscFree(A->defaultrandtype);
    PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype);
    /* host SeqDense has no distinct read/write array getters, hence the shared target */
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense);
    PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense);

    A->ops->duplicate = MatDuplicate_SeqDense;
    A->ops->mult = MatMult_SeqDense;
    A->ops->multadd = MatMultAdd_SeqDense;
    A->ops->multtranspose = MatMultTranspose_SeqDense;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
    A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
    A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
    A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
    A->ops->axpy = MatAXPY_SeqDense;
    A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
    A->ops->lufactor = MatLUFactor_SeqDense;
    /* NOTE(review): productsetfromoptions is assigned twice in this branch; redundant but harmless */
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
    A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
    A->ops->scale = MatScale_SeqDense;
    A->ops->shift = MatShift_SeqDense;
    A->ops->copy = MatCopy_SeqDense;
    A->ops->zeroentries = MatZeroEntries_SeqDense;
  }
  if (a->cmat) { MatBindToCPU(a->cmat, flg); } /* keep the cached submatrix in the same mode */
  return 0;
}
/*
  Convert a MATSEQDENSEHIP matrix back to a host-only MATSEQDENSE.

  Only MAT_INPLACE_MATRIX gets the specialized path below: the GPU-side state
  is torn down in place and the host function table is restored. The other
  reuse modes fall back to the generic (copy-based) conversion.

  Ordering matters: the matrix is bound to the CPU first (which migrates any
  up-to-date device values to the host), and only then is the device storage
  released.
*/
PetscErrorCode MatConvert_SeqDenseHIP_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
{
  Mat B;
  Mat_SeqDense *a;

  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M, type, reuse, newmat);
    return 0;
  }

  B = *newmat;
  /* install CPU kernels and sync values host-side before freeing GPU storage */
  MatBindToCPU_SeqDenseHIP(B, PETSC_TRUE);
  /* release the device-side data (B->spptr) */
  MatReset_SeqDenseHIP(B);
  /* vectors compatible with this matrix must now default to host vectors */
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECSTANDARD, &B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE);
  /* remove the HIP-specific composed methods; they no longer apply to this type */
  PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", NULL);
  PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", NULL);
  a = (Mat_SeqDense *)B->data;
  VecDestroy(&a->cvec); /* cvec might be VECSEQHIP. Destroy it and rebuild a VECSEQ when needed */
  B->ops->bindtocpu = NULL;
  B->ops->destroy = MatDestroy_SeqDense;
  B->offloadmask = PETSC_OFFLOAD_CPU;
  return 0;
}
/*
  Convert a host MATSEQDENSE matrix to MATSEQDENSEHIP.

  Only MAT_INPLACE_MATRIX gets the specialized path: HIP-specific state and
  composed methods are attached to the existing matrix. The other reuse modes
  fall back to the generic (copy-based) conversion.

  No device memory is allocated here; the offload mask is set to
  PETSC_OFFLOAD_UNALLOCATED and device storage is created lazily on first use.
*/
PetscErrorCode MatConvert_SeqDense_SeqDenseHIP(Mat M, MatType type, MatReuse reuse, Mat *newmat)
{
  Mat_SeqDenseHIP *dB;
  Mat_SeqDense *a;
  Mat B;

  /* make sure the HIP device exists before advertising GPU capabilities */
  PetscDeviceInitialize(PETSC_DEVICE_HIP);
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M, type, reuse, newmat);
    return 0;
  }

  B = *newmat;
  /* vectors compatible with this matrix should now default to HIP vectors */
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECHIP, &B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSEHIP);
  /* expose the HIP-specific methods via composed-function lookup */
  PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", MatConvert_SeqDenseHIP_SeqDense);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", MatDenseHIPGetArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", MatDenseHIPGetArrayRead_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", MatDenseHIPGetArrayWrite_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", MatDenseHIPRestoreArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", MatDenseHIPRestoreArrayRead_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", MatDenseHIPRestoreArrayWrite_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", MatDenseHIPPlaceArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", MatDenseHIPResetArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", MatDenseHIPReplaceArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", MatProductSetFromOptions_SeqAIJ_SeqDense);
  a = (Mat_SeqDense *)B->data;
  VecDestroy(&a->cvec); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQHIP when needed */
  /* attach the (empty) HIP-side data structure; device arrays allocated lazily */
  PetscNewLog(B, &dB);
  B->spptr = dB;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  /* install the GPU function table (flg = PETSC_FALSE means "not bound to CPU") */
  MatBindToCPU_SeqDenseHIP(B, PETSC_FALSE);
  B->ops->bindtocpu = MatBindToCPU_SeqDenseHIP;
  B->ops->destroy = MatDestroy_SeqDenseHIP;
  return 0;
}
1500: /*@C
1501: MatCreateSeqDenseHIP - Creates a sequential matrix in dense format using HIP.
1503: Collective
1505: Input Parameters:
1506: + comm - MPI communicator
1507: . m - number of rows
1508: . n - number of columns
1509: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1510: to control matrix memory allocation.
1512: Output Parameter:
1513: . A - the matrix
Notes:
If data is NULL, PETSc controls the allocation of the matrix storage; otherwise data is taken as the location of previously allocated GPU matrix data, which remains owned and managed by the caller.

Level: intermediate
1519: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1520: @*/
1521: PetscErrorCode MatCreateSeqDenseHIP(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1522: {
1523: PetscMPIInt size;
1525: MPI_Comm_size(comm, &size);
1527: MatCreate(comm, A);
1528: MatSetSizes(*A, m, n, m, n);
1529: MatSetType(*A, MATSEQDENSEHIP);
1530: MatSeqDenseHIPSetPreallocation(*A, data);
1531: return 0;
1532: }
1534: /*MC
1535: MATSEQDENSEHIP - MATSEQDENSEHIP = "seqdensehip" - A matrix type to be used for sequential dense matrices on GPUs.
1537: Options Database Keys:
1538: . -mat_type seqdensehip - sets the matrix type to `MATSEQDENSEHIP` during a call to `MatSetFromOptions()`
1540: Level: beginner
1542: .seealso: `MATSEQDENSE`
1543: M*/
/*
  Type constructor for MATSEQDENSEHIP: build a plain host seqdense matrix,
  then convert it in place to the HIP type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseHIP(Mat B)
{
  /* ensure the HIP device is initialized before any GPU state is attached */
  PetscDeviceInitialize(PETSC_DEVICE_HIP);
  MatCreate_SeqDense(B);
  MatConvert_SeqDense_SeqDenseHIP(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B);
  return 0;
}