Actual source code: densehip.hip.cpp

  1: /*
  2:      Defines the matrix operations for sequential dense with HIP
  3:      Portions of this code are under:
  4:      Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  5: */
  6: #include <petscpkg_version.h>
  7: #include <../src/mat/impls/dense/seq/dense.h>
  8: #include <petsc/private/hipvecimpl.h>
  9: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
 10:   #include <hipsolver/hipsolver.h>
 11: #else
 12:   #include <hipsolver.h>
 13: #endif
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/functional.h>
 16: #include <thrust/iterator/counting_iterator.h>
 17: #include <thrust/iterator/transform_iterator.h>
 18: #include <thrust/iterator/permutation_iterator.h>
 19: #include <thrust/transform.h>
 20: #include <thrust/device_vector.h>

 22: #if defined(PETSC_USE_COMPLEX)
 23:   #if defined(PETSC_USE_REAL_SINGLE)
 24:     #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h)                        hipsolverCpotrf((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (g), (h))
 25:     #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   hipsolverCpotrf_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
 26:     #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k)               hipsolverCpotrs((a), (b), (c), (d), (hipComplex *)(e), (f), (hipComplex *)(g), (h), (hipComplex *)(i), (j), (k))
 27:     #define hipsolverDnXpotri(a, b, c, d, e, f, g, h)                        hipsolverCpotri((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (g), (h))
 28:     #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f)                   hipsolverCpotri_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
 29:     #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     hipsolverCsytrf((a), (b), (c), (hipComplex *)(d), (e), (f), (hipComplex *)(g), (h), (i))
 30:     #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e)                      hipsolverCsytrf_bufferSize((a), (b), (hipComplex *)(c), (d), (e))
 31:     #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i)                     hipsolverCgetrf((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (g), (h), (i))
 32:     #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   hipsolverCgetrf_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
 33:     #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l)            hipsolverCgetrs((a), (b), (c), (d), (hipComplex *)(e), (f), (g), (hipComplex *)(h), (i), (hipComplex *)(j), (k), (l))
 34:     #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   hipsolverCgeqrf_bufferSize((a), (b), (c), (hipComplex *)(d), (e), (f))
 35:     #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     hipsolverCgeqrf((a), (b), (c), (hipComplex *)(d), (e), (hipComplex *)(f), (hipComplex *)(g), (h), (i))
 36:     #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverCunmqr_bufferSize((a), (b), (c), (d), (e), (f), (hipComplex *)(g), (h), (hipComplex *)(i), (hipComplex *)(j), (k), (l))
 37:     #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      hipsolverCunmqr((a), (b), (c), (d), (e), (f), (hipComplex *)(g), (h), (hipComplex *)(i), (hipComplex *)(j), (k), (hipComplex *)(l), (m), (n))
 38:     #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 hipblasCtrsm((a), (b), (c), (d), (e), (f), (g), (hipComplex *)(h), (hipComplex *)(i), (j), (hipComplex *)(k), (l))
 39:   #else /* complex double */
 40:     #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h)                        hipsolverZpotrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (g), (h))
 41:     #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   hipsolverZpotrf_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
 42:     #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k)               hipsolverZpotrs((a), (b), (c), (d), (hipDoubleComplex *)(e), (f), (hipDoubleComplex *)(g), (h), (hipDoubleComplex *)(i), (j), (k))
 43:     #define hipsolverDnXpotri(a, b, c, d, e, f, g, h)                        hipsolverZpotri((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (g), (h))
 44:     #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f)                   hipsolverZpotri_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
 45:     #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     hipsolverZsytrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (f), (hipDoubleComplex *)(g), (h), (i))
 46:     #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e)                      hipsolverZsytrf_bufferSize((a), (b), (hipDoubleComplex *)(c), (d), (e))
 47:     #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i)                     hipsolverZgetrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (g), (h), (i))
 48:     #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   hipsolverZgetrf_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
 49:     #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l)            hipsolverZgetrs((a), (b), (c), (d), (hipDoubleComplex *)(e), (f), (g), (hipDoubleComplex *)(h), (i), (hipDoubleComplex *)(j), (k), (l))
 50:     #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   hipsolverZgeqrf_bufferSize((a), (b), (c), (hipDoubleComplex *)(d), (e), (f))
 51:     #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     hipsolverZgeqrf((a), (b), (c), (hipDoubleComplex *)(d), (e), (hipDoubleComplex *)(f), (hipDoubleComplex *)(g), (h), (i))
 52:     #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverZunmqr_bufferSize((a), (b), (c), (d), (e), (f), (hipDoubleComplex *)(g), (h), (hipDoubleComplex *)(i), (hipDoubleComplex *)(j), (k), (l))
 53:     #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      hipsolverZunmqr((a), (b), (c), (d), (e), (f), (hipDoubleComplex *)(g), (h), (hipDoubleComplex *)(i), (hipDoubleComplex *)(j), (k), (hipDoubleComplex *)(l), (m), (n))
 54:     #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 hipblasZtrsm((a), (b), (c), (d), (e), (f), (g), (hipDoubleComplex *)(h), (hipDoubleComplex *)(i), (j), (hipDoubleComplex *)(k), (l))
 55:   #endif
 56: #else /* real single */
 57:   #if defined(PETSC_USE_REAL_SINGLE)
 58:     #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h)          hipsolverSpotrf((a), (b), (c), (float *)(d), (e), (float *)(f), (g), (h))
 59:     #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f)     hipsolverSpotrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
 60:     #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k) hipsolverSpotrs((a), (b), (c), (d), (float *)(e), (f), (float *)(g), (h), (float *)(i), (j), (k))
 61:     #define hipsolverDnXpotri(a, b, c, d, e, f, g, h)          hipsolverSpotri((a), (b), (c), (float *)(d), (e), (float *)(f), (g), (h))
 62:     #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f)     hipsolverSpotri_bufferSize((a), (b), (c), (float *)(d), (e), (f))
 63:     #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i)       hipsolverSsytrf((a), (b), (c), (float *)(d), (e), (f), (float *)(g), (h), (i))
 64:     #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e)        hipsolverSsytrf_bufferSize((a), (b), (float *)(c), (d), (e))
 65:     #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i)       hipsolverSgetrf((a), (b), (c), (float *)(d), (e), (float *)(f), (g), (h), (i))
 66:     #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f)     hipsolverSgetrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
 67:     #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l)            hipsolverSgetrs((a),(b),(c),(d),(float*)(e),(f),(g),(float*)(h),(i),(float*)(j),(k),(l)))
 68:     #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   hipsolverSgeqrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
 69:     #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     hipsolverSgeqrf((a), (b), (c), (float *)(d), (e), (float *)(f), (float *)(g), (h), (i))
 70:     #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverSormqr_bufferSize((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (l))
 71:     #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      hipsolverSormqr((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (float *)(l), (m), (n))
 72:     #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 hipblasStrsm((a), (b), (c), (d), (e), (f), (g), (float *)(h), (float *)(i), (j), (float *)(k), (l))
 73:   #else /* real double */
 74:     #define hipsolverDnXpotrf(a, b, c, d, e, f, g, h)                        hipsolverDpotrf((a), (b), (c), (double *)(d), (e), (double *)(f), (g), (h))
 75:     #define hipsolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   hipsolverDpotrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
 76:     #define hipsolverDnXpotrs(a, b, c, d, e, f, g, h, i, j, k)               hipsolverDpotrs((a), (b), (c), (d), (double *)(e), (f), (double *)(g), (h), (double *)(i), (j), (k))
 77:     #define hipsolverDnXpotri(a, b, c, d, e, f, g, h)                        hipsolverDpotri((a), (b), (c), (double *)(d), (e), (double *)(f), (g), (h))
 78:     #define hipsolverDnXpotri_bufferSize(a, b, c, d, e, f)                   hipsolverDpotri_bufferSize((a), (b), (c), (double *)(d), (e), (f))
 79:     #define hipsolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     hipsolverDsytrf((a), (b), (c), (double *)(d), (e), (f), (double *)(g), (h), (i))
 80:     #define hipsolverDnXsytrf_bufferSize(a, b, c, d, e)                      hipsolverDsytrf_bufferSize((a), (b), (double *)(c), (d), (e))
 81:     #define hipsolverDnXgetrf(a, b, c, d, e, f, g, h, i)                     hipsolverDgetrf((a), (b), (c), (double *)(d), (e), (double *)(f), (g), (h), (i))
 82:     #define hipsolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   hipsolverDgetrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
 83:     #define hipsolverDnXgetrs(a, b, c, d, e, f, g, h, i, j, k, l)            hipsolverDgetrs((a), (b), (c), (d), (double *)(e), (f), (g), (double *)(h), (i), (double *)(j), (k), (l))
 84:     #define hipsolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   hipsolverDgeqrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
 85:     #define hipsolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     hipsolverDgeqrf((a), (b), (c), (double *)(d), (e), (double *)(f), (double *)(g), (h), (i))
 86:     #define hipsolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) hipsolverDormqr_bufferSize((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (l))
 87:     #define hipsolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      hipsolverDormqr((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (double *)(l), (m), (n))
 88:     #define hipblasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 hipblasDtrsm((a), (b), (c), (d), (e), (f), (g), (double *)(h), (double *)(i), (j), (double *)(k), (l))
 89:   #endif
 90: #endif

/* Device-side data attached to a MATSEQDENSEHIP matrix (stored in A->spptr) */
typedef struct {
  PetscScalar *d_v; /* pointer to the matrix on the GPU */
  PetscBool    user_alloc; /* PETSC_TRUE when d_v is owned by the user, so PETSc must not hipFree it */
  PetscScalar *unplacedarray; /* if one called MatHIPDensePlaceArray(), this is where it stashed the original */
  PetscBool    unplaced_user_alloc; /* ownership flag stashed alongside unplacedarray */
  /* factorization support */
  PetscHipBLASInt *d_fact_ipiv; /* device pivots */
  PetscScalar     *d_fact_tau;  /* device QR tau vector */
  PetscScalar     *d_fact_work; /* device workspace */
  PetscHipBLASInt  fact_lwork;  /* size (in scalars) of d_fact_work */
  PetscHipBLASInt *d_fact_info; /* device info */
  /* workspace */
  Vec workvec; /* scratch vector used by MatSolve when the output vector cannot hold the intermediate */
} Mat_SeqDenseHIP;

/* Allocates or attaches device storage for a MATSEQDENSEHIP matrix.
   If d_data is NULL, a zeroed lda*n buffer is hipMalloc'ed and owned by PETSc;
   otherwise d_data is used directly and ownership stays with the caller.
   On return the GPU copy is marked as the valid one. */
PetscErrorCode MatSeqDenseHIPSetPreallocation(Mat A, PetscScalar *d_data)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscBool        iship;

  PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &iship);
  if (!iship) return 0; /* silently ignore other matrix types */
  /* it may happen CPU preallocation has not been performed */
  PetscLayoutSetUp(A->rmap);
  PetscLayoutSetUp(A->cmap);
  if (cA->lda <= 0) cA->lda = A->rmap->n;
  if (!dA->user_alloc) hipFree(dA->d_v); /* release any previously PETSc-owned device buffer */
  if (!d_data) { /* petsc-allocated storage */
    size_t sz;
    PetscIntMultError(cA->lda, A->cmap->n, NULL); /* overflow check only; product recomputed below in size_t */
    sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
    hipMalloc((void **)&dA->d_v, sz);
    hipMemset(dA->d_v, 0, sz);
    dA->user_alloc = PETSC_FALSE;
  } else { /* user-allocated storage */
    dA->d_v        = d_data;
    dA->user_alloc = PETSC_TRUE;
  }
  A->offloadmask  = PETSC_OFFLOAD_GPU;
  A->preallocated = PETSC_TRUE;
  A->assembled    = PETSC_TRUE;
  return 0;
}

/* Copies the matrix values from the device to the host when the GPU holds the
   only up-to-date copy; afterwards both copies are valid (PETSC_OFFLOAD_BOTH).
   No-op when the host copy is already current. */
PetscErrorCode MatSeqDenseHIPCopyFromGPU(Mat A)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    if (!cA->v) { /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
      MatSeqDenseSetPreallocation(A, NULL);
    }
    PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0);
    if (cA->lda > A->rmap->n) {
      /* columns are padded: strided 2D copy of the m x n submatrix */
      hipMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyDeviceToHost);
    } else {
      /* contiguous layout: one flat copy */
      hipMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyDeviceToHost);
    }
    PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}

/* Copies the matrix values from the host to the device when the host holds the
   only up-to-date copy (or nothing is allocated yet); afterwards both copies are
   valid. No-op when the matrix is bound to the CPU or the GPU copy is current. */
PetscErrorCode MatSeqDenseHIPCopyToGPU(Mat A)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscBool        copy;

  if (A->boundtocpu) return 0; /* user requested host-only operation */
  copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
  PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
  if (copy) {
    if (!dA->d_v) { /* Allocate GPU memory if not present */
      MatSeqDenseHIPSetPreallocation(A, NULL);
    }
    PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0);
    if (cA->lda > A->rmap->n) {
      /* columns are padded: strided 2D copy of the m x n submatrix */
      hipMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyHostToDevice);
    } else {
      /* contiguous layout: one flat copy */
      hipMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyHostToDevice);
    }
    PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}

190: static PetscErrorCode MatCopy_SeqDenseHIP(Mat A, Mat B, MatStructure str)
191: {
192:   const PetscScalar *va;
193:   PetscScalar       *vb;
194:   PetscInt           lda1, lda2, m = A->rmap->n, n = A->cmap->n;

196:   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
197:   if (A->ops->copy != B->ops->copy) {
198:     MatCopy_Basic(A, B, str);
199:     return 0;
200:   }
202:   MatDenseHIPGetArrayRead(A, &va);
203:   MatDenseHIPGetArrayWrite(B, &vb);
204:   MatDenseGetLDA(A, &lda1);
205:   MatDenseGetLDA(B, &lda2);
206:   PetscLogGpuTimeBegin();
207:   if (lda1 > m || lda2 > m) {
208:     hipMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, hipMemcpyDeviceToDevice);
209:   } else {
210:     hipMemcpy(vb, va, m * (n * sizeof(PetscScalar)), hipMemcpyDeviceToDevice);
211:   }
212:   PetscLogGpuTimeEnd();
213:   MatDenseHIPRestoreArrayWrite(B, &vb);
214:   MatDenseHIPRestoreArrayRead(A, &va);
215:   return 0;
216: }

218: static PetscErrorCode MatZeroEntries_SeqDenseHIP(Mat A)
219: {
220:   PetscScalar *va;
221:   PetscInt     lda, m = A->rmap->n, n = A->cmap->n;

223:   MatDenseHIPGetArrayWrite(A, &va);
224:   MatDenseGetLDA(A, &lda);
225:   PetscLogGpuTimeBegin();
226:   if (lda > m) {
227:     hipMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n);
228:   } else {
229:     hipMemset(va, 0, m * (n * sizeof(PetscScalar)));
230:   }
231:   WaitForHIP();
232:   PetscLogGpuTimeEnd();
233:   MatDenseHIPRestoreArrayWrite(A, &va);
234:   return 0;
235: }

/* Temporarily swaps in a caller-provided device array; the current array and
   its ownership flag are stashed until MatDenseHIPResetArray() is called. */
static PetscErrorCode MatDenseHIPPlaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
{
  Mat_SeqDense    *aa = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (aa->v) MatSeqDenseHIPCopyToGPU(A); /* ensure the array we stash holds the current values */
  dA->unplacedarray       = dA->d_v;
  dA->unplaced_user_alloc = dA->user_alloc;
  dA->d_v                 = (PetscScalar *)a;
  dA->user_alloc          = PETSC_TRUE; /* PETSc must never free the placed array */
  return 0;
}

/* Undoes MatDenseHIPPlaceArray(): restores the stashed device array and its
   ownership flag, dropping the reference to the placed array. */
static PetscErrorCode MatDenseHIPResetArray_SeqDenseHIP(Mat A)
{
  Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (a->v) MatSeqDenseHIPCopyToGPU(A); /* flush any host-side changes into the placed array first */
  dA->d_v           = dA->unplacedarray;
  dA->user_alloc    = dA->unplaced_user_alloc;
  dA->unplacedarray = NULL;
  return 0;
}

267: static PetscErrorCode MatDenseHIPReplaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
268: {
269:   Mat_SeqDense    *aa = (Mat_SeqDense *)A->data;
270:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

275:   if (!dA->user_alloc) hipFree(dA->d_v);
276:   dA->d_v        = (PetscScalar *)a;
277:   dA->user_alloc = PETSC_FALSE;
278:   return 0;
279: }

/* Returns the device array for write-only access; lazily allocates it on first
   use. No host-to-device sync is performed since the caller overwrites it. */
static PetscErrorCode MatDenseHIPGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (!dA->d_v) MatSeqDenseHIPSetPreallocation(A, NULL);
  *a = dA->d_v;
  return 0;
}

290: static PetscErrorCode MatDenseHIPRestoreArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
291: {
292:   if (a) *a = NULL;
293:   return 0;
294: }

/* Returns the device array for read-only access, syncing host values to the
   device first so the pointer reflects the current matrix contents. */
static PetscErrorCode MatDenseHIPGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  MatSeqDenseHIPCopyToGPU(A);
  *a = dA->d_v;
  return 0;
}

305: static PetscErrorCode MatDenseHIPRestoreArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
306: {
307:   if (a) *a = NULL;
308:   return 0;
309: }

/* Returns the device array for read-write access, syncing host values to the
   device first so the pointer starts from the current matrix contents. */
static PetscErrorCode MatDenseHIPGetArray_SeqDenseHIP(Mat A, PetscScalar **a)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  MatSeqDenseHIPCopyToGPU(A);
  *a = dA->d_v;
  return 0;
}

320: static PetscErrorCode MatDenseHIPRestoreArray_SeqDenseHIP(Mat A, PetscScalar **a)
321: {
322:   if (a) *a = NULL;
323:   return 0;
324: }

/* Computes the explicit inverse of a Cholesky-factored matrix in place using
   hipsolver potri, then clears the factorization state so A behaves as an
   ordinary (non-factored) matrix holding its inverse.
   Only the SPD Cholesky case (no pivots) is implemented. */
PETSC_EXTERN PetscErrorCode MatSeqDenseHIPInvertFactors_Private(Mat A)
{
  Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar      *da;
  hipsolverHandle_t handle;
  PetscHipBLASInt   n, lda;
#if defined(PETSC_USE_DEBUG)
  PetscHipBLASInt info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* nothing to invert for an empty matrix */
  PetscHIPSOLVERGetHandle(&handle);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHipBLASIntCast(a->lda, &lda);
  if (A->factortype == MAT_FACTOR_CHOLESKY) {
    if (!dA->d_fact_ipiv) { /* spd */
      PetscHipBLASInt il;

      MatDenseHIPGetArray(A, &da);
      /* query workspace size and grow the cached device workspace if needed */
      hipsolverDnXpotri_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &il);
      if (il > dA->fact_lwork) {
        dA->fact_lwork = il;

        hipFree(dA->d_fact_work);
        hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
      }
      PetscLogGpuTimeBegin();
      hipsolverDnXpotri(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
      PetscLogGpuTimeEnd();
      MatDenseHIPRestoreArray(A, &da);
      /* TODO (write hip kernel) */
      /* potri fills only the lower triangle; mirror it to the upper triangle on the host */
      MatSeqDenseSymmetrize_Private(A, PETSC_TRUE);
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytri not implemented");
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
#if defined(PETSC_USE_DEBUG)
  /* NOTE(review): info is fetched from the device but never checked here — confirm intended */
  hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
  PetscLogGpuFlops(1.0 * n * n * n / 3.0);
  /* the matrix now holds its inverse: drop the factored-solve callbacks and factor type */
  A->ops->solve          = NULL;
  A->ops->solvetranspose = NULL;
  A->ops->matsolve       = NULL;
  A->factortype          = MAT_FACTOR_NONE;

  PetscFree(A->solvertype);
  return 0;
}

377: static PetscErrorCode MatSolve_SeqDenseHIP_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
378: {
379:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
380:   PetscScalar     *y;
381:   PetscHipBLASInt  m = 0, k = 0;
382:   PetscBool        xiship, yiship, aiship;

385:   PetscHipBLASIntCast(A->rmap->n, &m);
386:   PetscHipBLASIntCast(A->cmap->n, &k);
387:   PetscObjectTypeCompare((PetscObject)xx, VECSEQHIP, &xiship);
388:   PetscObjectTypeCompare((PetscObject)yy, VECSEQHIP, &yiship);
389:   {
390:     const PetscScalar *x;
391:     PetscBool          xishost = PETSC_TRUE;

393:     /* The logic here is to try to minimize the amount of memory copying:
394:        if we call VecHIPGetArrayRead(X,&x) every time xiship and the
395:        data is not offloaded to the GPU yet, then the data is copied to the
396:        GPU.  But we are only trying to get the data in order to copy it into the y
397:        array.  So the array x will be wherever the data already is so that
398:        only one memcpy is performed */
399:     if (xiship && xx->offloadmask & PETSC_OFFLOAD_GPU) {
400:       VecHIPGetArrayRead(xx, &x);
401:       xishost = PETSC_FALSE;
402:     } else VecGetArrayRead(xx, &x);
403:     if (k < m || !yiship) {
404:       if (!dA->workvec) VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
405:       VecHIPGetArrayWrite(dA->workvec, &y);
406:     } else VecHIPGetArrayWrite(yy, &y);
407:     hipMemcpy(y, x, m * sizeof(PetscScalar), xishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice);
408:   }
409:   PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship);
410:   if (!aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
411:   (*matsolve)(A, y, m, m, 1, k, transpose);
412:   if (!aiship) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
413:   if (k < m || !yiship) {
414:     PetscScalar *yv;

416:     /* The logic here is that the data is not yet in either yy's GPU array or its
417:        CPU array.  There is nothing in the interface to say where the user would like
418:        it to end up.  So we choose the GPU, because it is the faster option */
419:     if (yiship) VecHIPGetArrayWrite(yy, &yv);
420:     else VecGetArray(yy, &yv);
421:     hipMemcpy(yv, y, k * sizeof(PetscScalar), yiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost);
422:     if (yiship) VecHIPRestoreArrayWrite(yy, &yv);
423:     else VecRestoreArray(yy, &yv);
424:     VecHIPRestoreArrayWrite(dA->workvec, &y);
425:   } else VecHIPRestoreArrayWrite(yy, &y);
426:   return 0;
427: }

429: static PetscErrorCode MatMatSolve_SeqDenseHIP_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
430: {
431:   PetscScalar    *y;
432:   PetscInt        n, _ldb, _ldx;
433:   PetscBool       biship, xiship, aiship;
434:   PetscHipBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;

437:   PetscHipBLASIntCast(A->rmap->n, &m);
438:   PetscHipBLASIntCast(A->cmap->n, &k);
439:   MatGetSize(B, NULL, &n);
440:   PetscHipBLASIntCast(n, &nrhs);
441:   MatDenseGetLDA(B, &_ldb);
442:   PetscHipBLASIntCast(_ldb, &ldb);
443:   MatDenseGetLDA(X, &_ldx);
444:   PetscHipBLASIntCast(_ldx, &ldx);
445:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship);
446:   PetscObjectTypeCompare((PetscObject)X, MATSEQDENSEHIP, &xiship);
447:   /* The logic here is to try to minimize the amount of memory copying:
448:      if we call MatDenseHIPGetArrayRead(B,&b) every time biship and the
449:      data is not offloaded to the GPU yet, then the data is copied to the
450:      GPU.  But we are only trying to get the data in order to copy it into the y
451:      array.  So the array b will be wherever the data already is so that
452:      only one memcpy is performed */
453:   const PetscScalar *b;
454:   /* some copying from B will be involved */
455:   PetscBool bishost = PETSC_TRUE;
456:   if (biship && B->offloadmask & PETSC_OFFLOAD_GPU) {
457:     MatDenseHIPGetArrayRead(B, &b);
458:     bishost = PETSC_FALSE;
459:   } else MatDenseGetArrayRead(B, &b);
460:   if (ldx < m || !xiship) {
461:     /* X's array cannot serve as the array (too small or not on device), B's
462:      * array cannot serve as the array (const), so allocate a new array  */
463:     ldy = m;
464:     hipMalloc((void **)&y, nrhs * m * sizeof(PetscScalar));
465:   } else {
466:     /* X's array should serve as the array */
467:     ldy = ldx;
468:     MatDenseHIPGetArrayWrite(X, &y);
469:   }
470:   hipMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice);
471:   if (bishost) MatDenseRestoreArrayRead(B, &b);
472:   else MatDenseHIPRestoreArrayRead(B, &b);

474:   PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship);
475:   if (!aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
476:   (*matsolve)(A, y, ldy, m, nrhs, k, transpose);
477:   if (!aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
478:   if (ldx < m || !xiship) {
479:     PetscScalar *x;

481:     /* The logic here is that the data is not yet in either X's GPU array or its
482:        CPU array.  There is nothing in the interface to say where the user would like
483:        it to end up.  So we choose the GPU, because it is the faster option */
484:     if (xiship) MatDenseHIPGetArrayWrite(X, &x);
485:     else MatDenseGetArray(X, &x);
486:     hipMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost);
487:     if (xiship) MatDenseHIPRestoreArrayWrite(X, &x);
488:     else MatDenseRestoreArray(X, &x);
489:     hipFree(y);
490:   } else MatDenseHIPRestoreArrayWrite(X, &y);
491:   return 0;
492: }

/* Backend triangular solves with the LU factors (getrf) stored in A.
   x: device array (leading dimension ldx >= m) holding nrhs staged right-hand
   sides on entry and the solutions on exit; T selects the transposed solve. */
static PetscErrorCode MatSolve_SeqDenseHIP_Internal_LU(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
{
  Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
  const PetscScalar *da;
  PetscHipBLASInt    lda;
  hipsolverHandle_t  handle;
  int                info;

  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  PetscHIPSOLVERGetHandle(&handle);
  PetscLogGpuTimeBegin();
  PetscInfo(A, "LU solve %d x %d on backend\n", m, k);
  hipsolverDnXgetrs(handle, T ? HIPSOLVER_OP_T : HIPSOLVER_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayRead(A, &da);
  if (PetscDefined(USE_DEBUG)) {
    /* NOTE(review): info is fetched from the device but never checked here — confirm intended */
    hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
  }
  PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
  return 0;
}

/* Backend solves with the Cholesky factor (potrf) stored in A. Only the SPD
   case (no pivots) is implemented; the symmetric factorization is its own
   transpose, so T is not consulted.
   x: device array (leading dimension ldx >= m) holding nrhs staged right-hand
   sides on entry and the solutions on exit. */
static PetscErrorCode MatSolve_SeqDenseHIP_Internal_Cholesky(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
{
  Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
  const PetscScalar *da;
  PetscHipBLASInt    lda;
  hipsolverHandle_t  handle;
  int                info;

  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  PetscHIPSOLVERGetHandle(&handle);
  PetscLogGpuTimeBegin();
  PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k);
  if (!dA->d_fact_ipiv) { /* spd */
    /* ========= Program hit hipErrorNotReady (error 34) due to "device not ready" on HIP API call to hipEventQuery. */
    hipsolverDnXpotrs(handle, HIPSOLVER_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytrs not implemented");
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayRead(A, &da);
  if (PetscDefined(USE_DEBUG)) {
    /* NOTE(review): info is fetched from the device but never checked here — confirm intended */
    hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
  }
  PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
  return 0;
}

/* Backend least-squares solves with the QR factors (geqrf) stored in A.
   Forward solve: x <- R^{-1} (Q^H x) via ormqr/unmqr followed by a triangular
   solve; transposed solve applies the two steps in the reverse order.
   Only the first mat->rank columns of the factorization are used. */
static PetscErrorCode MatSolve_SeqDenseHIP_Internal_QR(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
{
  Mat_SeqDense        *mat = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP     *dA  = (Mat_SeqDenseHIP *)A->spptr;
  const PetscScalar   *da;
  PetscHipBLASInt      lda, rank;
  hipsolverHandle_t    handle;
  hipblasHandle_t      bhandle;
  int                  info;
  hipsolverOperation_t trans;
  PetscScalar          one = 1.;

  PetscHipBLASIntCast(mat->rank, &rank);
  MatDenseHIPGetArrayRead(A, &da);
  PetscHipBLASIntCast(mat->lda, &lda);
  PetscHIPSOLVERGetHandle(&handle);
  PetscHIPBLASGetHandle(&bhandle);
  PetscLogGpuTimeBegin();
  PetscInfo(A, "QR solve %d x %d on backend\n", m, k);
  if (!T) {
    /* apply Q^H (conjugate transpose in complex builds) then solve with R */
    if (PetscDefined(USE_COMPLEX)) trans = HIPSOLVER_OP_C;
    else trans = HIPSOLVER_OP_T;
    hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
    if (PetscDefined(USE_DEBUG)) {
      /* NOTE(review): info is fetched from the device but never checked here — confirm intended */
      hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
    }
    hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
  } else {
    /* NOTE(review): this path uses HIPBLAS_OP_T even in complex builds — confirm intended */
    hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
    hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, HIPSOLVER_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
    if (PetscDefined(USE_DEBUG)) {
      /* NOTE(review): info is fetched from the device but never checked here — confirm intended */
      hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
    }
  }
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArrayRead(A, &da);
  PetscLogFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank)));
  return 0;
}

/* MatSolve (LU): dispatch to the common single-vector solve driver with the LU kernel */
static PetscErrorCode MatSolve_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}

/* MatSolveTranspose (LU): same driver as MatSolve, with the transpose flag set */
static PetscErrorCode MatSolveTranspose_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}

/* MatSolve (Cholesky): dispatch to the common single-vector solve driver with the Cholesky kernel */
static PetscErrorCode MatSolve_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}

/* MatSolveTranspose (Cholesky): same driver as MatSolve, with the transpose flag set */
static PetscErrorCode MatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}

/* MatSolve (QR): dispatch to the common single-vector solve driver with the QR kernel */
static PetscErrorCode MatSolve_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}

/* MatSolveTranspose (QR): same driver as MatSolve, with the transpose flag set */
static PetscErrorCode MatSolveTranspose_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
{
  MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}

/* MatMatSolve (LU): dispatch to the common multi-RHS solve driver with the LU kernel */
static PetscErrorCode MatMatSolve_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}

/* MatMatSolveTranspose (LU): same driver as MatMatSolve, with the transpose flag set */
static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU);
  return 0;
}

/* MatMatSolve (Cholesky): dispatch to the common multi-RHS solve driver with the Cholesky kernel */
static PetscErrorCode MatMatSolve_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}

/* MatMatSolveTranspose (Cholesky): same driver as MatMatSolve, with the transpose flag set */
static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky);
  return 0;
}

/* MatMatSolve (QR): dispatch to the common multi-RHS solve driver with the QR kernel */
static PetscErrorCode MatMatSolve_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}

/* MatMatSolveTranspose (QR): same driver as MatMatSolve, with the transpose flag set */
static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
{
  MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR);
  return 0;
}

/* In-place LU factorization of A on the GPU via hipsolverDnXgetrf.
   On success, installs the HIP-based (transpose) solve and matsolve kernels and
   tags A as MAT_FACTOR_LU. rperm/cperm/factinfo are accepted for interface
   compatibility but not used by the hipsolver path. */
static PetscErrorCode MatLUFactor_SeqDenseHIP(Mat A, IS rperm, IS cperm, const MatFactorInfo *factinfo)
{
  Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar      *da;
  PetscHipBLASInt   m, n, lda;
  hipsolverHandle_t handle;
#if defined(PETSC_USE_DEBUG)
  int info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* empty matrix: nothing to factor */
  PetscHIPSOLVERGetHandle(&handle);
  MatDenseHIPGetArray(A, &da);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHipBLASIntCast(A->rmap->n, &m);
  PetscHipBLASIntCast(a->lda, &lda);
  PetscInfo(A, "LU factor %d x %d on backend\n", m, n);
  /* lazily allocate pivots, workspace, and the device-side status flag; all are reused
     across refactorizations of the same matrix */
  if (!dA->d_fact_ipiv) hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
  if (!dA->fact_lwork) {
    hipsolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
    hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
  }
  if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
  PetscLogGpuTimeBegin();
  hipsolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_ipiv, dA->d_fact_info);
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArray(A, &da);
#if defined(PETSC_USE_DEBUG)
  /* copy the factorization status back to the host for checking */
  hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
  A->factortype = MAT_FACTOR_LU;
  PetscLogGpuFlops(2.0 * n * n * m / 3.0);

  A->ops->solve             = MatSolve_SeqDenseHIP_LU;
  A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_LU;
  A->ops->matsolve          = MatMatSolve_SeqDenseHIP_LU;
  A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_LU;

  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &A->solvertype);
  return 0;
}

709: static PetscErrorCode MatCholeskyFactor_SeqDenseHIP(Mat A, IS perm, const MatFactorInfo *factinfo)
710: {
711:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
712:   Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
713:   PetscScalar      *da;
714:   PetscHipBLASInt   n, lda;
715:   hipsolverHandle_t handle;
716: #if defined(PETSC_USE_DEBUG)
717:   int info;
718: #endif

720:   if (!A->rmap->n || !A->cmap->n) return 0;
721:   PetscHIPSOLVERGetHandle(&handle);
722:   PetscHipBLASIntCast(A->rmap->n, &n);
723:   PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n);
724:   if (A->spd == PETSC_BOOL3_TRUE) {
725:     MatDenseHIPGetArray(A, &da);
726:     PetscHipBLASIntCast(a->lda, &lda);
727:     if (!dA->fact_lwork) {
728:       hipsolverDnXpotrf_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork);
729:       hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
730:     }
731:     if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
732:     PetscLogGpuTimeBegin();
733:     hipsolverDnXpotrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
734:     PetscLogGpuTimeEnd();

736:     MatDenseHIPRestoreArray(A, &da);
737: #if defined(PETSC_USE_DEBUG)
738:     hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
741: #endif
742:     A->factortype = MAT_FACTOR_CHOLESKY;
743:     PetscLogGpuFlops(1.0 * n * n * n / 3.0);
744:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipsolverDnsytrs unavailable. Use MAT_FACTOR_LU");

746:   /* at the time of writing hipsolverDn has *sytrs and *hetr* routines implemented and the
747:        code below should work */
748:   if (!dA->d_fact_ipiv) hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
749:   if (!dA->fact_lwork) {
750:     hipsolverDnXsytrf_bufferSize(handle, n, da, lda, &dA->fact_lwork);
751:     hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
752:   }
753:   if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
754:   PetscLogGpuTimeBegin();
755:   hipsolverDnXsytrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_ipiv, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
756:   PetscLogGpuTimeEnd();

758:   A->ops->solve             = MatSolve_SeqDenseHIP_Cholesky;
759:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_Cholesky;
760:   A->ops->matsolve          = MatMatSolve_SeqDenseHIP_Cholesky;
761:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_Cholesky;
762:   PetscFree(A->solvertype);
763:   PetscStrallocpy(MATSOLVERHIP, &A->solvertype);
764:   return 0;
765: }

/* In-place QR factorization of A on the GPU via hipsolverDnXgeqrf (Householder form:
   R in the upper triangle, reflectors below it, scalar factors in d_fact_tau).
   Installs the HIP-based QR solve kernels and tags A as MAT_FACTOR_QR.
   `col`/`factinfo` are accepted for interface compatibility but not used here. */
static PetscErrorCode MatQRFactor_SeqDenseHIP(Mat A, IS col, const MatFactorInfo *factinfo)
{
  Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscScalar      *da;
  PetscHipBLASInt   m, min, max, n, lda;
  hipsolverHandle_t handle;
#if defined(PETSC_USE_DEBUG)
  int info;
#endif

  if (!A->rmap->n || !A->cmap->n) return 0; /* empty matrix: nothing to factor */
  PetscHIPSOLVERGetHandle(&handle);
  MatDenseHIPGetArray(A, &da);
  PetscHipBLASIntCast(A->cmap->n, &n);
  PetscHipBLASIntCast(A->rmap->n, &m);
  PetscHipBLASIntCast(a->lda, &lda);
  PetscInfo(A, "QR factor %d x %d on backend\n", m, n);
  max = PetscMax(m, n);
  min = PetscMin(m, n); /* number of Householder reflectors / numerical rank bound */
  /* lazily allocate tau, pivots, workspace, status flag, and a work vector for the solves */
  if (!dA->d_fact_tau) hipMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau));
  if (!dA->d_fact_ipiv) hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
  if (!dA->fact_lwork) {
    hipsolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
    hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
  }
  if (!dA->d_fact_info) hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
  if (!dA->workvec) VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
  PetscLogGpuTimeBegin();
  hipsolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
  PetscLogGpuTimeEnd();
  MatDenseHIPRestoreArray(A, &da);
#if defined(PETSC_USE_DEBUG)
  /* copy the factorization status back to the host for checking */
  hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost);
#endif
  A->factortype = MAT_FACTOR_QR;
  a->rank       = min;
  PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0));

  A->ops->solve             = MatSolve_SeqDenseHIP_QR;
  A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_QR;
  A->ops->matsolve          = MatMatSolve_SeqDenseHIP_QR;
  A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_QR;

  PetscFree(A->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &A->solvertype);
  return 0;
}

/* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition.
   A or B may arrive as host-side SEQDENSE; they are converted to the HIP type in place
   for the product and converted back before returning. */
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
{
  const PetscScalar *da, *db;
  PetscScalar       *dc;
  PetscScalar        one = 1.0, zero = 0.0;
  PetscHipBLASInt    m, n, k;
  PetscInt           alda, blda, clda;
  hipblasHandle_t    hipblasv2handle;
  PetscBool          Aiship, Biship;

  /* we may end up with SEQDENSE as one of the arguments */
  PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &Aiship);
  PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &Biship);
  if (!Aiship) MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A);
  if (!Biship) MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B);
  PetscHipBLASIntCast(C->rmap->n, &m);
  PetscHipBLASIntCast(C->cmap->n, &n);
  /* the contracted dimension k comes from A's rows when A is transposed, its columns otherwise */
  if (tA) PetscHipBLASIntCast(A->rmap->n, &k);
  else PetscHipBLASIntCast(A->cmap->n, &k);
  if (!m || !n || !k) return 0;
  PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n);
  MatDenseHIPGetArrayRead(A, &da);
  MatDenseHIPGetArrayRead(B, &db);
  MatDenseHIPGetArrayWrite(C, &dc);
  MatDenseGetLDA(A, &alda);
  MatDenseGetLDA(B, &blda);
  MatDenseGetLDA(C, &clda);
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscLogGpuTimeBegin();
  hipblasXgemm(hipblasv2handle, tA ? HIPBLAS_OP_T : HIPBLAS_OP_N, tB ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)); /* k multiplies + (k-1) adds per entry */
  MatDenseHIPRestoreArrayRead(A, &da);
  MatDenseHIPRestoreArrayRead(B, &db);
  MatDenseHIPRestoreArrayWrite(C, &dc);
  /* restore the original (host) type of any converted operand */
  if (!Aiship) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
  if (!Biship) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
  return 0;
}

/* C = A^T * B: delegate to the shared GEMM driver with tA set */
PetscErrorCode MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
{
  MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_TRUE, PETSC_FALSE);
  return 0;
}

/* C = A * B: delegate to the shared GEMM driver with no transposes */
PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
{
  MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_FALSE);
  return 0;
}

/* C = A * B^T: delegate to the shared GEMM driver with tB set */
PetscErrorCode MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
{
  MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_TRUE);
  return 0;
}

/* Product setup is identical to the host SeqDense implementation; reuse it */
PetscErrorCode MatProductSetFromOptions_SeqDenseHIP(Mat C)
{
  MatProductSetFromOptions_SeqDense(C);
  return 0;
}

882: /* zz = op(A)*xx + yy
883:    if yy == NULL, only MatMult */
884: static PetscErrorCode MatMultAdd_SeqDenseHIP_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
885: {
886:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
887:   const PetscScalar *xarray, *da;
888:   PetscScalar       *zarray;
889:   PetscScalar        one = 1.0, zero = 0.0;
890:   PetscHipBLASInt    m, n, lda;
891:   hipblasHandle_t    hipblasv2handle;

893:   if (yy && yy != zz) VecCopy_SeqHIP(yy, zz); /* mult add */
894:   if (!A->rmap->n || !A->cmap->n) {
895:     if (!yy) VecSet_SeqHIP(zz, 0.0); /* mult only */
896:     return 0;
897:   }
898:   PetscInfo(A, "Matrix-vector product %d x %d on backend\n", A->rmap->n, A->cmap->n);
899:   PetscHipBLASIntCast(A->rmap->n, &m);
900:   PetscHipBLASIntCast(A->cmap->n, &n);
901:   PetscHIPBLASGetHandle(&hipblasv2handle);
902:   MatDenseHIPGetArrayRead(A, &da);
903:   PetscHipBLASIntCast(mat->lda, &lda);
904:   VecHIPGetArrayRead(xx, &xarray);
905:   VecHIPGetArray(zz, &zarray);
906:   PetscLogGpuTimeBegin();
907:   hipblasXgemv(hipblasv2handle, trans ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1);
908:   PetscLogGpuTimeEnd();
909:   PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n));
910:   VecHIPRestoreArrayRead(xx, &xarray);
911:   VecHIPRestoreArray(zz, &zarray);
912:   MatDenseHIPRestoreArrayRead(A, &da);
913:   return 0;
914: }

/* zz = A*xx + yy */
PetscErrorCode MatMultAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_FALSE);
  return 0;
}

/* zz = A^T*xx + yy */
PetscErrorCode MatMultTransposeAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_TRUE);
  return 0;
}

/* yy = A*xx (NULL yy selects the mult-only path in the driver) */
PetscErrorCode MatMult_SeqDenseHIP(Mat A, Vec xx, Vec yy)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_FALSE);
  return 0;
}

/* yy = A^T*xx (NULL yy selects the mult-only path in the driver) */
PetscErrorCode MatMultTranspose_SeqDenseHIP(Mat A, Vec xx, Vec yy)
{
  MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_TRUE);
  return 0;
}

940: static PetscErrorCode MatDenseGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **array)
941: {
942:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

944:   MatSeqDenseHIPCopyFromGPU(A);
945:   *array = mat->v;
946:   return 0;
947: }

949: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **array)
950: {
951:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

953:   if (!mat->v) MatSeqDenseSetPreallocation(A, NULL); /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
954:   *array         = mat->v;
955:   A->offloadmask = PETSC_OFFLOAD_CPU;
956:   return 0;
957: }

959: static PetscErrorCode MatDenseGetArray_SeqDenseHIP(Mat A, PetscScalar **array)
960: {
961:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

963:   MatSeqDenseHIPCopyFromGPU(A);
964:   *array         = mat->v;
965:   A->offloadmask = PETSC_OFFLOAD_CPU;
966:   return 0;
967: }

/* Y = alpha * Y on the GPU. When the leading dimension exceeds the row count the
   storage has gaps between columns, so each column is scaled separately; otherwise
   the whole buffer is scaled with a single hipblasXscal call. */
PetscErrorCode MatScale_SeqDenseHIP(Mat Y, PetscScalar alpha)
{
  Mat_SeqDense   *y = (Mat_SeqDense *)Y->data;
  PetscScalar    *dy;
  PetscHipBLASInt j, N, m, lday, one = 1;
  hipblasHandle_t hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  MatDenseHIPGetArray(Y, &dy);
  PetscHipBLASIntCast(Y->rmap->n * Y->cmap->n, &N);
  PetscHipBLASIntCast(Y->rmap->n, &m);
  PetscHipBLASIntCast(y->lda, &lday);
  PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (lday > m) {
    /* padded columns: scale column by column to skip the gap entries */
    for (j = 0; j < Y->cmap->n; j++) hipblasXscal(hipblasv2handle, m, &alpha, dy + lday * j, one);
  } else hipblasXscal(hipblasv2handle, N, &alpha, dy, one);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(N);
  MatDenseHIPRestoreArray(Y, &dy);
  return 0;
}

/* Thrust device functor: adds a fixed scalar shift to each element it visits */
struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift) { }
  __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
};

/* Iterator adaptor exposing every stride-th element of [first, last) as a contiguous
   range; used below to walk the diagonal of a column-major matrix (stride = lda + 1). */
template <typename Iterator>
class strided_range {
public:
  typedef typename thrust::iterator_difference<Iterator>::type difference_type;
  /* maps a logical index i to the physical offset stride * i */
  struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
    difference_type stride;
    stride_functor(difference_type stride) : stride(stride) { }
    __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
  };
  typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
  typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
  typedef typename thrust::permutation_iterator<Iterator, TransformIterator>    PermutationIterator;
  typedef PermutationIterator                                                   iterator; // type of the strided_range iterator
  // construct strided_range for the range [first,last)
  strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
  iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
  /* ceil((last - first) / stride) elements in the strided view */
  iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }

protected:
  Iterator        first;
  Iterator        last;
  difference_type stride;
};

/* Add alpha to the diagonal entries of a column-major device array.
   Rows [rstart, rend) are local rows of a (possibly distributed) matrix with `cols`
   global columns; diagonal entry for global row i lives at (i - rstart) + i * lda,
   so the walk starts at offset rstart * lda and advances by lda + 1 each step. */
PetscErrorCode MatShift_DenseHIP_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
{
  PetscInt rend2 = PetscMin(rend, cols); /* no diagonal entries past the last column */
  if (rend2 > rstart) {
    PetscLogGpuTimeBegin();
    try {
      const auto                                                  dptr  = thrust::device_pointer_cast(da);
      size_t                                                      begin = rstart * lda;
      /* any value past the last diagonal element works here: the strided_range end()
         rounds (end - begin) / (lda + 1) up, yielding exactly rend2 - rstart elements */
      size_t                                                      end   = rend2 - rstart + rend2 * lda;
      strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
      thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(rend2 - rstart);
  }
  return 0;
}

1042: PetscErrorCode MatShift_SeqDenseHIP(Mat A, PetscScalar alpha)
1043: {
1044:   PetscScalar *da;
1045:   PetscInt     m = A->rmap->n, n = A->cmap->n, lda;

1047:   MatDenseHIPGetArray(A, &da);
1048:   MatDenseGetLDA(A, &lda);
1049:   PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n);
1050:   MatShift_DenseHIP_Private(da, alpha, lda, 0, m, n);
1051:   MatDenseHIPRestoreArray(A, &da);
1052:   return 0;
1053: }

/* Y = alpha * X + Y on the GPU (str is ignored: dense matrices share the same pattern).
   As in MatScale, padded leading dimensions force a column-by-column axpy. */
PetscErrorCode MatAXPY_SeqDenseHIP(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
{
  Mat_SeqDense      *x = (Mat_SeqDense *)X->data;
  Mat_SeqDense      *y = (Mat_SeqDense *)Y->data;
  const PetscScalar *dx;
  PetscScalar       *dy;
  PetscHipBLASInt    j, N, m, ldax, lday, one = 1;
  hipblasHandle_t    hipblasv2handle;

  if (!X->rmap->n || !X->cmap->n) return 0; /* empty matrices: nothing to add */
  PetscHIPBLASGetHandle(&hipblasv2handle);
  MatDenseHIPGetArrayRead(X, &dx);
  /* alpha == 0 leaves Y = Y, so write-only access avoids a needless host sync */
  if (alpha == 0.0) MatDenseHIPGetArrayWrite(Y, &dy);
  else MatDenseHIPGetArray(Y, &dy);
  PetscHipBLASIntCast(X->rmap->n * X->cmap->n, &N);
  PetscHipBLASIntCast(X->rmap->n, &m);
  PetscHipBLASIntCast(x->lda, &ldax);
  PetscHipBLASIntCast(y->lda, &lday);
  PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (ldax > m || lday > m) {
    /* padded columns in either operand: process column by column */
    for (j = 0; j < X->cmap->n; j++) hipblasXaxpy(hipblasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one);
  } else hipblasXaxpy(hipblasv2handle, N, &alpha, dx, one, dy, one);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(PetscMax(2. * N - 1, 0));
  MatDenseHIPRestoreArrayRead(X, &dx);
  if (alpha == 0.0) MatDenseHIPRestoreArrayWrite(Y, &dy);
  else MatDenseHIPRestoreArray(Y, &dy);
  return 0;
}

/* Release all device-side resources held in A->spptr (values, factorization buffers,
   work vector) and free the spptr container itself. */
static PetscErrorCode MatReset_SeqDenseHIP(Mat A)
{
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  if (dA) {
    /* user-supplied device arrays are owned by the caller, do not free them */
    if (!dA->user_alloc) hipFree(dA->d_v);
    hipFree(dA->d_fact_tau);
    hipFree(dA->d_fact_ipiv);
    hipFree(dA->d_fact_info);
    hipFree(dA->d_fact_work);
    VecDestroy(&dA->workvec);
  }
  PetscFree(A->spptr);
  return 0;
}

/* Destroy the HIP dense matrix by converting back to the host type (which tears down
   the device state) and then running the host destructor. */
PetscErrorCode MatDestroy_SeqDenseHIP(Mat A)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  /* prevent to copy back data if we own the data pointer */
  if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
  MatConvert_SeqDenseHIP_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
  MatDestroy_SeqDense(A);
  return 0;
}

/* Duplicate A as a new SeqDenseHIP matrix. When the valid copy lives on the GPU,
   the host-side duplicate skips value copying and the data is copied device-side
   afterwards with MatCopy_SeqDenseHIP. */
PetscErrorCode MatDuplicate_SeqDenseHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  /* downgrade to no-copy for the host duplicate if the host data is stale */
  MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

  MatCreate(PetscObjectComm((PetscObject)A), B);
  MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
  MatSetType(*B, ((PetscObject)A)->type_name);
  MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues);
  if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) MatCopy_SeqDenseHIP(A, *B, SAME_NONZERO_PATTERN);
  if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
    Mat_SeqDenseHIP *dB = (Mat_SeqDenseHIP *)(*B)->spptr;
    if (!dB->d_v) MatSeqDenseHIPSetPreallocation(*B, NULL);
  }
  return 0;
}

1130: static PetscErrorCode MatGetColumnVector_SeqDenseHIP(Mat A, Vec v, PetscInt col)
1131: {
1132:   Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
1133:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1134:   PetscScalar     *x;
1135:   PetscBool        viship;

1137:   PetscObjectTypeCompareAny((PetscObject)v, &viship, VECSEQHIP, VECMPIHIP, VECHIP, "");
1138:   if (viship && !v->boundtocpu) { /* update device data */
1139:     VecHIPGetArrayWrite(v, &x);
1140:     if (A->offloadmask & PETSC_OFFLOAD_GPU) hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToHost);
1141:     else hipMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToDevice);
1142:     VecHIPRestoreArrayWrite(v, &x);
1143:   } else { /* update host data */
1144:     VecGetArrayWrite(v, &x);
1145:     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) PetscArraycpy(x, a->v + col * a->lda, A->rmap->n);
1146:     else if (A->offloadmask & PETSC_OFFLOAD_GPU) hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
1147:     VecRestoreArrayWrite(v, &x);
1148:   }
1149:   return 0;
1150: }

/* Factory for the MATSOLVERHIP dense factorization: create a SeqDenseHIP matrix of the
   same size as A and install the symbolic-phase hooks for the requested factor type
   (the numeric phase is installed later by the factor routines themselves). */
PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_hip(Mat A, MatFactorType ftype, Mat *fact)
{
  MatCreate(PetscObjectComm((PetscObject)A), fact);
  MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
  MatSetType(*fact, MATSEQDENSEHIP);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
    (*fact)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
    (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
  } else if (ftype == MAT_FACTOR_QR) {
    /* QR has no ops slot; it is dispatched through composed functions */
    PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense);
    PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense);
  }
  (*fact)->factortype = ftype;
  PetscFree((*fact)->solvertype);
  PetscStrallocpy(MATSOLVERHIP, &(*fact)->solvertype);
  /* dense factorizations do not benefit from reordering: advertise external ordering */
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]);
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
  PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
  return 0;
}

/* Expose column `col` of A as the cached read-write vector a->cvec backed by the
   device array (no data copy; must be paired with MatDenseRestoreColumnVec). */
static PetscErrorCode MatDenseGetColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  MatDenseHIPGetArray(A, (PetscScalar **)&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
    VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* 1-based so that 0 means "not in use" */
  VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
  *v = a->cvec;
  return 0;
}

/* Release the column vector handed out by MatDenseGetColumnVec_SeqDenseHIP */
static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  a->vecinuse = 0;
  VecHIPResetArray(a->cvec);
  MatDenseHIPRestoreArray(A, (PetscScalar **)&a->ptrinuse);
  if (v) *v = NULL;
  return 0;
}

/* Expose column `col` of A as the cached vector a->cvec in read-only mode
   (the vector is read-locked; must be paired with MatDenseRestoreColumnVecRead). */
static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  MatDenseHIPGetArrayRead(A, &a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
    VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* 1-based so that 0 means "not in use" */
  VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
  VecLockReadPush(a->cvec);
  *v = a->cvec;
  return 0;
}

/* Release the read-only column vector handed out by MatDenseGetColumnVecRead_SeqDenseHIP */
static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  a->vecinuse = 0;
  VecLockReadPop(a->cvec); /* undo the lock taken in the Get call */
  VecHIPResetArray(a->cvec);
  MatDenseHIPRestoreArrayRead(A, &a->ptrinuse);
  if (v) *v = NULL;
  return 0;
}

/* Expose column `col` of A as the cached vector a->cvec in write-only mode
   (must be paired with MatDenseRestoreColumnVecWrite). */
static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  MatDenseHIPGetArrayWrite(A, (PetscScalar **)&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
    VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* 1-based so that 0 means "not in use" */
  VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
  *v = a->cvec;
  return 0;
}

/* Release the write-only column vector handed out by MatDenseGetColumnVecWrite_SeqDenseHIP */
static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
{
  Mat_SeqDense *a = (Mat_SeqDense *)A->data;

  a->vecinuse = 0;
  VecHIPResetArray(a->cvec);
  MatDenseHIPRestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse);
  if (v) *v = NULL;
  return 0;
}

/* Expose the contiguous block rows [rbegin, rend) x columns [cbegin, cend) of A as the
   cached submatrix a->cmat sharing A's device (and, if present, host) storage.
   Must be paired with MatDenseRestoreSubMatrix. */
static PetscErrorCode MatDenseGetSubMatrix_SeqDenseHIP(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
{
  Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

  /* the cached submatrix is only reusable if the requested block has the same shape */
  if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) MatDestroy(&a->cmat);
  MatSeqDenseHIPCopyToGPU(A);
  if (!a->cmat) {
    MatCreateDenseHIP(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat);
    PetscLogObjectParent((PetscObject)A, (PetscObject)a->cmat);
  } else MatDenseHIPPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda);
  MatDenseSetLDA(a->cmat, a->lda); /* the submatrix strides through A's full columns */
  /* Place CPU array if present but not copy any data */
  a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
  if (a->v) { MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda); }
  a->cmat->offloadmask = A->offloadmask; /* the submatrix inherits A's valid-copy state */
  a->matinuse          = cbegin + 1;     /* 1-based so that 0 means "not in use" */
  *v                   = a->cmat;
  return 0;
}

/* Release the submatrix handed out by MatDenseGetSubMatrix_SeqDenseHIP and propagate
   its offload state back to A. If the submatrix data was modified on the CPU while A
   had no host storage, host memory is allocated and the data synced down. */
static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseHIP(Mat A, Mat *v)
{
  Mat_SeqDense    *a    = (Mat_SeqDense *)A->data;
  PetscBool        copy = PETSC_FALSE, reset;
  PetscOffloadMask suboff;

  a->matinuse = 0;
  reset       = a->v ? PETSC_TRUE : PETSC_FALSE; /* a host array was placed in the Get call */
  suboff      = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
  if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
    copy = PETSC_TRUE;
    MatSeqDenseSetPreallocation(A, NULL);
  }
  MatDenseHIPResetArray(a->cmat);
  if (reset) MatDenseResetArray(a->cmat);
  if (copy) {
    MatSeqDenseHIPCopyFromGPU(A);
  } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  if (v) *v = NULL;
  return 0;
}

/* Record a new leading dimension for A's storage.
   NOTE(review): `data` is computed but unused in the visible code — presumably a
   validation (new lda vs. an existing device allocation) belongs here; confirm
   against the upstream source. */
static PetscErrorCode MatDenseSetLDA_SeqDenseHIP(Mat A, PetscInt lda)
{
  Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
  Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
  PetscBool        data;

  /* whether device data already exists for a nonempty matrix */
  data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
  cA->lda = lda;
  return 0;
}

/* Finalize the row/column layouts and allocate device storage if not yet preallocated */
static PetscErrorCode MatSetUp_SeqDenseHIP(Mat A)
{
  PetscLayoutSetUp(A->rmap);
  PetscLayoutSetUp(A->cmap);
  if (!A->preallocated) MatSeqDenseHIPSetPreallocation(A, NULL);
  return 0;
}

1338: static PetscErrorCode MatBindToCPU_SeqDenseHIP(Mat A, PetscBool flg)
1339: {
1340:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1344:   A->boundtocpu = flg;
1345:   if (!flg) {
1346:     PetscBool iship;

1348:     PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQHIP, &iship);
1349:     if (!iship) VecDestroy(&a->cvec);
1350:     PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSEHIP, &iship);
1351:     if (!iship) MatDestroy(&a->cmat);
1352:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseHIP);
1353:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseHIP);
1354:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseHIP);
1355:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseHIP);
1356:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseHIP);
1357:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseHIP);
1358:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseHIP);
1359:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseHIP);
1360:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseHIP);
1361:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseHIP);
1362:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseHIP);
1363:     PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseHIP);
1364:     PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseHIP);

1366:     A->ops->duplicate               = MatDuplicate_SeqDenseHIP;
1367:     A->ops->mult                    = MatMult_SeqDenseHIP;
1368:     A->ops->multadd                 = MatMultAdd_SeqDenseHIP;
1369:     A->ops->multtranspose           = MatMultTranspose_SeqDenseHIP;
1370:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseHIP;
1371:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
1372:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP;
1373:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
1374:     A->ops->axpy                    = MatAXPY_SeqDenseHIP;
1375:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseHIP;
1376:     A->ops->lufactor                = MatLUFactor_SeqDenseHIP;
1377:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseHIP;
1378:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseHIP;
1379:     A->ops->scale                   = MatScale_SeqDenseHIP;
1380:     A->ops->shift                   = MatShift_SeqDenseHIP;
1381:     A->ops->copy                    = MatCopy_SeqDenseHIP;
1382:     A->ops->zeroentries             = MatZeroEntries_SeqDenseHIP;
1383:   } else {
1384:     /* make sure we have an up-to-date copy on the CPU */
1385:     MatSeqDenseHIPCopyFromGPU(A);
1386:     PetscFree(A->defaultrandtype);
1387:     PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype);
1388:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense);
1389:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense);
1390:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense);
1391:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense);
1392:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense);
1393:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense);
1394:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense);
1395:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense);
1396:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense);
1397:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense);
1398:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense);
1399:     PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense);
1400:     PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense);

1402:     A->ops->duplicate               = MatDuplicate_SeqDense;
1403:     A->ops->mult                    = MatMult_SeqDense;
1404:     A->ops->multadd                 = MatMultAdd_SeqDense;
1405:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1406:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1407:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1408:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1409:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1410:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1411:     A->ops->axpy                    = MatAXPY_SeqDense;
1412:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1413:     A->ops->lufactor                = MatLUFactor_SeqDense;
1414:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1415:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1416:     A->ops->scale                   = MatScale_SeqDense;
1417:     A->ops->shift                   = MatShift_SeqDense;
1418:     A->ops->copy                    = MatCopy_SeqDense;
1419:     A->ops->zeroentries             = MatZeroEntries_SeqDense;
1420:   }
1421:   if (a->cmat) { MatBindToCPU(a->cmat, flg); }
1422:   return 0;
1423: }

1425: PetscErrorCode MatConvert_SeqDenseHIP_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1426: {
1427:   Mat           B;
1428:   Mat_SeqDense *a;

1430:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1431:     /* TODO these cases should be optimized */
1432:     MatConvert_Basic(M, type, reuse, newmat);
1433:     return 0;
1434:   }

1436:   B = *newmat;
1437:   MatBindToCPU_SeqDenseHIP(B, PETSC_TRUE);
1438:   MatReset_SeqDenseHIP(B);
1439:   PetscFree(B->defaultvectype);
1440:   PetscStrallocpy(VECSTANDARD, &B->defaultvectype);
1441:   PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE);
1442:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", NULL);
1443:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", NULL);
1444:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", NULL);
1445:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", NULL);
1446:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", NULL);
1447:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", NULL);
1448:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", NULL);
1449:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", NULL);
1450:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", NULL);
1451:   PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", NULL);
1452:   PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", NULL);
1453:   a = (Mat_SeqDense *)B->data;
1454:   VecDestroy(&a->cvec); /* cvec might be VECSEQHIP. Destroy it and rebuild a VECSEQ when needed */
1455:   B->ops->bindtocpu = NULL;
1456:   B->ops->destroy   = MatDestroy_SeqDense;
1457:   B->offloadmask    = PETSC_OFFLOAD_CPU;
1458:   return 0;
1459: }

/* Convert a host MATSEQDENSE matrix (in place) to MATSEQDENSEHIP: retype it,
   compose the HIP-specific callbacks, and attach the device-side data structure.
   NOTE(review): statement order matters here — B->spptr must be populated before
   MatBindToCPU_SeqDenseHIP is called, since the GPU branch dereferences it. */
PetscErrorCode MatConvert_SeqDense_SeqDenseHIP(Mat M, MatType type, MatReuse reuse, Mat *newmat)
{
  Mat_SeqDenseHIP *dB;
  Mat_SeqDense    *a;
  Mat              B;

  /* make sure the HIP device is usable before any device state is created */
  PetscDeviceInitialize(PETSC_DEVICE_HIP);
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M, type, reuse, newmat);
    return 0;
  }

  B = *newmat;
  /* new default vector type so MatCreateVecs() produces HIP vectors */
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECHIP, &B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSEHIP);
  /* attach the HIP-specific query/restore/place callbacks and the back-conversion */
  PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", MatConvert_SeqDenseHIP_SeqDense);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", MatDenseHIPGetArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", MatDenseHIPGetArrayRead_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", MatDenseHIPGetArrayWrite_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", MatDenseHIPRestoreArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", MatDenseHIPRestoreArrayRead_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", MatDenseHIPRestoreArrayWrite_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", MatDenseHIPPlaceArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", MatDenseHIPResetArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", MatDenseHIPReplaceArray_SeqDenseHIP);
  PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", MatProductSetFromOptions_SeqAIJ_SeqDense);
  a = (Mat_SeqDense *)B->data;
  VecDestroy(&a->cvec); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQHIP when needed */
  /* allocate and install the HIP-side bookkeeping; no device memory yet */
  PetscNewLog(B, &dB);
  B->spptr       = dB;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  /* install the GPU ops table (requires B->spptr to be set, see note above) */
  MatBindToCPU_SeqDenseHIP(B, PETSC_FALSE);
  B->ops->bindtocpu = MatBindToCPU_SeqDenseHIP;
  B->ops->destroy   = MatDestroy_SeqDenseHIP;
  return 0;
}

1500: /*@C
1501:    MatCreateSeqDenseHIP - Creates a sequential matrix in dense format using HIP.

1503:    Collective

1505:    Input Parameters:
1506: +  comm - MPI communicator
1507: .  m - number of rows
1508: .  n - number of columns
1509: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1510:    to control matrix memory allocation.

1512:    Output Parameter:
1513: .  A - the matrix

   Notes:
   If data is provided, it is presumed to point to device (GPU) memory and is used
   directly as the matrix storage rather than copied.
1517:    Level: intermediate

1519: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1520: @*/
1521: PetscErrorCode MatCreateSeqDenseHIP(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1522: {
1523:   PetscMPIInt size;

1525:   MPI_Comm_size(comm, &size);
1527:   MatCreate(comm, A);
1528:   MatSetSizes(*A, m, n, m, n);
1529:   MatSetType(*A, MATSEQDENSEHIP);
1530:   MatSeqDenseHIPSetPreallocation(*A, data);
1531:   return 0;
1532: }

1534: /*MC
1535:    MATSEQDENSEHIP - MATSEQDENSEHIP = "seqdensehip" - A matrix type to be used for sequential dense matrices on GPUs.

1537:    Options Database Keys:
1538: . -mat_type seqdensehip - sets the matrix type to `MATSEQDENSEHIP` during a call to `MatSetFromOptions()`

1540:   Level: beginner

1542: .seealso: `MATSEQDENSE`
1543: M*/
/* Type constructor registered for MATSEQDENSEHIP: initialize the HIP device,
   build a host MATSEQDENSE, then convert it in place to the HIP type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseHIP(Mat B)
{
  PetscDeviceInitialize(PETSC_DEVICE_HIP);
  MatCreate_SeqDense(B);
  MatConvert_SeqDense_SeqDenseHIP(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B);
  return 0;
}