Actual source code: mpiaijhipsparse.hip.cpp
1: /* Portions of this code are under:
2: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
3: */
4: #define PETSC_SKIP_SPINLOCK
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
8: #include <../src/mat/impls/aij/mpi/mpihipsparse/mpihipsparsematimpl.h>
9: #include <thrust/advance.h>
10: #include <thrust/partition.h>
11: #include <thrust/sort.h>
12: #include <thrust/unique.h>
13: #include <petscsf.h>
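/* Functor for thrust::for_each(): copies the first tuple element into the second. Used in
MatSetValuesCOO_MPIAIJHIPSPARSE_Basic() with a permutation iterator to gather user-provided
COO values into the device work array coo_pw. */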
15: struct VecHIPEquals {
16: template <typename Tuple>
17: __host__ __device__ void operator()(Tuple t)
18: {
19: thrust::get<1>(t) = thrust::get<0>(t);
20: }
21: };
23: static PetscErrorCode MatResetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat)
24: {
25: auto *aij = static_cast<Mat_MPIAIJ *>(mat->data);
26: auto *hipsparseStruct = static_cast<Mat_MPIAIJHIPSPARSE *>(aij->spptr);
28: if (!hipsparseStruct) return 0;
29: if (hipsparseStruct->use_extended_coo) {
30: hipFree(hipsparseStruct->Ajmap1_d);
31: hipFree(hipsparseStruct->Aperm1_d);
32: hipFree(hipsparseStruct->Bjmap1_d);
33: hipFree(hipsparseStruct->Bperm1_d);
34: hipFree(hipsparseStruct->Aimap2_d);
35: hipFree(hipsparseStruct->Ajmap2_d);
36: hipFree(hipsparseStruct->Aperm2_d);
37: hipFree(hipsparseStruct->Bimap2_d);
38: hipFree(hipsparseStruct->Bjmap2_d);
39: hipFree(hipsparseStruct->Bperm2_d);
40: hipFree(hipsparseStruct->Cperm1_d);
41: hipFree(hipsparseStruct->sendbuf_d);
42: hipFree(hipsparseStruct->recvbuf_d);
43: }
44: hipsparseStruct->use_extended_coo = PETSC_FALSE;
45: delete hipsparseStruct->coo_p;
46: delete hipsparseStruct->coo_pw;
47: hipsparseStruct->coo_p = nullptr;
48: hipsparseStruct->coo_pw = nullptr;
49: return 0;
50: }
52: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
53: {
54: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
55: Mat_MPIAIJHIPSPARSE *cusp = (Mat_MPIAIJHIPSPARSE *)a->spptr;
56: PetscInt n = cusp->coo_nd + cusp->coo_no;
58: if (cusp->coo_p && v) {
59: thrust::device_ptr<const PetscScalar> d_v;
60: THRUSTARRAY *w = NULL;
62: if (isHipMem(v)) {
63: d_v = thrust::device_pointer_cast(v);
64: } else {
65: w = new THRUSTARRAY(n);
66: w->assign(v, v + n);
67: PetscLogCpuToGpu(n * sizeof(PetscScalar));
68: d_v = w->data();
69: }
71: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
72: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
73: PetscLogGpuTimeBegin();
74: thrust::for_each(zibit, zieit, VecHIPEquals());
75: PetscLogGpuTimeEnd();
76: delete w;
77: MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode);
78: MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode);
79: } else {
80: MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->A, v, imode);
81: MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->B, v ? v + cusp->coo_nd : nullptr, imode);
82: }
83: return 0;
84: }
86: template <typename Tuple>
87: struct IsNotOffDiagT {
88: PetscInt _cstart, _cend;
90: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
91: __host__ __device__ bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
92: };
94: struct IsOffDiag {
95: PetscInt _cstart, _cend;
97: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
98: __host__ __device__ bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
99: };
101: struct GlobToLoc {
102: PetscInt _start;
104: GlobToLoc(PetscInt start) : _start(start) { }
105: __host__ __device__ PetscInt operator()(const PetscInt &c) { return c - _start; }
106: };
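/* Basic COO preallocation: every entry is locally owned, so entirely on the device we
(1) partition the (i,j) pairs into diagonal-block and off-diagonal-block groups (coo_nd/coo_no),
(2) convert row indices and diagonal-block column indices from global to local numbering,
(3) sort and unique the off-diagonal column indices to build garray, compacting the off-diagonal
column indices through an ISLocalToGlobalMapping, and
(4) hand each group to the sequential AIJHIPSPARSE COO preallocation of b->A and b->B. */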
108: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
109: {
110: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
111: Mat_MPIAIJHIPSPARSE *cusp = (Mat_MPIAIJHIPSPARSE *)b->spptr;
112: PetscInt N, *jj;
113: size_t noff = 0;
114: THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
115: THRUSTINTARRAY d_j(n);
116: ISLocalToGlobalMapping l2g;
118: MatDestroy(&b->A);
119: MatDestroy(&b->B);
121: PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
122: d_i.assign(coo_i, coo_i + n);
123: d_j.assign(coo_j, coo_j + n);
124: delete cusp->coo_p;
125: delete cusp->coo_pw;
126: cusp->coo_p = NULL;
127: cusp->coo_pw = NULL;
128: PetscLogGpuTimeBegin();
129: auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
130: auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
131: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
132: cusp->coo_p = new THRUSTINTARRAY(n);
133: cusp->coo_pw = new THRUSTARRAY(n);
134: thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
135: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
136: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
137: auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
138: firstoffd = mzipp.get_iterator_tuple().get<1>();
139: }
140: cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
141: cusp->coo_no = thrust::distance(firstoffd, d_j.end());
143: /* from global to local */
144: thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
145: thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
146: PetscLogGpuTimeEnd();
148: /* copy offdiag column indices to map on the CPU */
149: PetscMalloc1(cusp->coo_no, &jj); /* jj[] will store compacted col ids of the offdiag part */
150: hipMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), hipMemcpyDeviceToHost);
151: auto o_j = d_j.begin();
152: PetscLogGpuTimeBegin();
153: thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
154: thrust::sort(thrust::device, o_j, d_j.end());
155: auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
156: PetscLogGpuTimeEnd();
157: noff = thrust::distance(o_j, wit);
158: PetscMalloc1(noff, &b->garray);
159: hipMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), hipMemcpyDeviceToHost);
160: PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt));
161: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g);
162: ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH);
163: ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj);
165: ISLocalToGlobalMappingDestroy(&l2g);
166: MatCreate(PETSC_COMM_SELF, &b->A);
167: MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
168: MatSetType(b->A, MATSEQAIJHIPSPARSE);
169: MatCreate(PETSC_COMM_SELF, &b->B);
170: MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff);
171: MatSetType(b->B, MATSEQAIJHIPSPARSE);
173: /* GPU memory, hipsparse specific call handles it internally */
174: MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get());
175: MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj);
176: PetscFree(jj);
178: MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, cusp->diagGPUMatFormat);
179: MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, cusp->offdiagGPUMatFormat);
180: MatBindToCPU(b->A, B->boundtocpu);
181: MatBindToCPU(b->B, B->boundtocpu);
182: MatSetUpMultiply_MPIAIJ(B);
183: return 0;
184: }
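/* Dispatch between the two COO preallocation paths: if on every rank all entries are nonnegative
and fall within the local row range, the basic device-only path above is used. Otherwise we fall
back to the host MatSetPreallocationCOO_MPIAIJ(), which handles negative indices and off-process
entries, and mirror its Ajmap/Aperm/Bjmap/Bperm/Cperm maps and send/receive buffers on the device
for use by the extended-COO kernels below. */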
186: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
187: {
188: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
189: Mat_MPIAIJHIPSPARSE *mpidev;
190: PetscBool coo_basic = PETSC_TRUE;
191: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
192: PetscInt rstart, rend;
194: PetscFree(mpiaij->garray);
195: VecDestroy(&mpiaij->lvec);
196: #if defined(PETSC_USE_CTABLE)
197: PetscTableDestroy(&mpiaij->colmap);
198: #else
199: PetscFree(mpiaij->colmap);
200: #endif
201: VecScatterDestroy(&mpiaij->Mvctx);
202: mat->assembled = PETSC_FALSE;
203: mat->was_assembled = PETSC_FALSE;
204: MatResetPreallocationCOO_MPIAIJ(mat);
205: MatResetPreallocationCOO_MPIAIJHIPSPARSE(mat);
206: if (coo_i) {
207: PetscLayoutGetRange(mat->rmap, &rstart, &rend);
208: PetscGetMemType(coo_i, &mtype);
209: if (PetscMemTypeHost(mtype)) {
210: for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
211: if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
212: coo_basic = PETSC_FALSE;
213: break;
214: }
215: }
216: }
217: }
218: /* All ranks must agree on the value of coo_basic */
219: MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat));
220: if (coo_basic) {
221: MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j);
222: } else {
223: MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
224: mat->offloadmask = PETSC_OFFLOAD_CPU;
225: /* creates the GPU memory */
226: MatSeqAIJHIPSPARSECopyToGPU(mpiaij->A);
227: MatSeqAIJHIPSPARSECopyToGPU(mpiaij->B);
228: mpidev = static_cast<Mat_MPIAIJHIPSPARSE *>(mpiaij->spptr);
229: mpidev->use_extended_coo = PETSC_TRUE;
231: hipMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount));
232: hipMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount));
234: hipMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount));
235: hipMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount));
237: hipMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount));
238: hipMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount));
239: hipMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount));
241: hipMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount));
242: hipMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount));
243: hipMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount));
245: hipMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount));
246: hipMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar));
247: hipMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar));
249: hipMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
250: hipMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), hipMemcpyHostToDevice);
252: hipMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
253: hipMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), hipMemcpyHostToDevice);
255: hipMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), hipMemcpyHostToDevice);
256: hipMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
257: hipMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), hipMemcpyHostToDevice);
259: hipMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), hipMemcpyHostToDevice);
260: hipMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
261: hipMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), hipMemcpyHostToDevice);
263: hipMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), hipMemcpyHostToDevice);
264: }
265: return 0;
266: }
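/* The kernels below use grid-stride loops so that any launch configuration covers all entries.
MatPackCOOValues gathers values destined for other ranks into the send buffer via the Cperm1
permutation; MatAddLocalCOOValues accumulates locally provided values into the diagonal (A) and
off-diagonal (B) blocks via the jmap/perm arrays, honoring INSERT_VALUES vs ADD_VALUES;
MatAddRemoteCOOValues accumulates values received from other ranks via the imap2/jmap2/perm2
arrays built during the extended-COO preallocation. */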
268: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
269: {
270: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
271: const PetscCount grid_size = gridDim.x * blockDim.x;
272: for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
273: }
275: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
276: {
277: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
278: const PetscCount grid_size = gridDim.x * blockDim.x;
279: for (; i < Annz + Bnnz; i += grid_size) {
280: PetscScalar sum = 0.0;
281: if (i < Annz) {
282: for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
283: Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
284: } else {
285: i -= Annz;
286: for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
287: Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
288: }
289: }
290: }
292: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
293: {
294: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
295: const PetscCount grid_size = gridDim.x * blockDim.x;
296: for (; i < Annz2 + Bnnz2; i += grid_size) {
297: if (i < Annz2) {
298: for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
299: } else {
300: i -= Annz2;
301: for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
302: }
303: }
304: }
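/* Extended-COO value insertion: copy v[] to the device if it was provided on the host, pack the
entries owned by other ranks into sendbuf_d, start a PetscSF reduction to deliver them, add the
local entries to A and B while the communication is in flight, complete the reduction, and finally
add the received remote entries. The basic case (all entries local) is handled by
MatSetValuesCOO_MPIAIJHIPSPARSE_Basic() instead. */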
306: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
307: {
308: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
309: Mat_MPIAIJHIPSPARSE *mpidev = static_cast<Mat_MPIAIJHIPSPARSE *>(mpiaij->spptr);
310: Mat A = mpiaij->A, B = mpiaij->B;
311: PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
312: PetscScalar *Aa, *Ba = NULL;
313: PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
314: const PetscScalar *v1 = v;
315: const PetscCount *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
316: const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
317: const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
318: const PetscCount *Cperm1 = mpidev->Cperm1_d;
319: PetscMemType memtype;
321: if (mpidev->use_extended_coo) {
322: PetscMPIInt size;
324: MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
325: PetscGetMemType(v, &memtype);
326: if (PetscMemTypeHost(memtype)) { /* if the user provided v[] on the host, copy it to the device */
327: hipMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar));
328: hipMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice);
329: }
331: if (imode == INSERT_VALUES) {
332: MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa); /* write matrix values */
333: MatSeqAIJHIPSPARSEGetArrayWrite(B, &Ba);
334: } else {
335: MatSeqAIJHIPSPARSEGetArray(A, &Aa); /* read & write matrix values */
336: MatSeqAIJHIPSPARSEGetArray(B, &Ba);
337: }
339: /* Pack entries to be sent to remote */
340: if (mpiaij->sendlen) {
341: hipLaunchKernelGGL(HIP_KERNEL_NAME(MatPackCOOValues), dim3((mpiaij->sendlen + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, mpiaij->sendlen, Cperm1, vsend);
342: hipPeekAtLastError();
343: }
345: /* Send remote entries to their owner and overlap the communication with local computation */
346: PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_HIP, vsend, PETSC_MEMTYPE_HIP, v2, MPI_REPLACE);
347: /* Add local entries to A and B */
348: if (Annz + Bnnz > 0) {
349: hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddLocalCOOValues), dim3((Annz + Bnnz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
350: hipPeekAtLastError();
351: }
352: PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE);
354: /* Add received remote entries to A and B */
355: if (Annz2 + Bnnz2 > 0) {
356: hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddRemoteCOOValues), dim3((Annz2 + Bnnz2 + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
357: hipPeekAtLastError();
358: }
360: if (imode == INSERT_VALUES) {
361: MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa);
362: MatSeqAIJHIPSPARSERestoreArrayWrite(B, &Ba);
363: } else {
364: MatSeqAIJHIPSPARSERestoreArray(A, &Aa);
365: MatSeqAIJHIPSPARSERestoreArray(B, &Ba);
366: }
367: if (PetscMemTypeHost(memtype)) hipFree((void *)v1);
368: } else {
369: MatSetValuesCOO_MPIAIJHIPSPARSE_Basic(mat, v, imode);
370: }
371: mat->offloadmask = PETSC_OFFLOAD_GPU;
372: return 0;
373: }
375: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
376: {
377: Mat Ad, Ao;
378: const PetscInt *cmap;
380: MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap);
381: MatSeqAIJHIPSPARSEMergeMats(Ad, Ao, scall, A_loc);
382: if (glob) {
383: PetscInt cst, i, dn, on, *gidx;
385: MatGetLocalSize(Ad, NULL, &dn);
386: MatGetLocalSize(Ao, NULL, &on);
387: MatGetOwnershipRangeColumn(A, &cst, NULL);
388: PetscMalloc1(dn + on, &gidx);
389: for (i = 0; i < dn; i++) gidx[i] = cst + i;
390: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
391: ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
392: }
393: return 0;
394: }
396: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
397: {
398: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
399: Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)b->spptr;
400: PetscInt i;
402: PetscLayoutSetUp(B->rmap);
403: PetscLayoutSetUp(B->cmap);
404: if (PetscDefined(USE_DEBUG) && d_nnz) {
405: for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
406: }
407: if (PetscDefined(USE_DEBUG) && o_nnz) {
408: for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
409: }
410: #if defined(PETSC_USE_CTABLE)
411: PetscTableDestroy(&b->colmap);
412: #else
413: PetscFree(b->colmap);
414: #endif
415: PetscFree(b->garray);
416: VecDestroy(&b->lvec);
417: VecScatterDestroy(&b->Mvctx);
418: /* Because B may have been resized, we simply destroy it and create a new one each time */
419: MatDestroy(&b->B);
420: if (!b->A) {
421: MatCreate(PETSC_COMM_SELF, &b->A);
422: MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
423: }
424: if (!b->B) {
425: PetscMPIInt size;
426: MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
427: MatCreate(PETSC_COMM_SELF, &b->B);
428: MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0);
429: }
430: MatSetType(b->A, MATSEQAIJHIPSPARSE);
431: MatSetType(b->B, MATSEQAIJHIPSPARSE);
432: MatBindToCPU(b->A, B->boundtocpu);
433: MatBindToCPU(b->B, B->boundtocpu);
434: MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz);
435: MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz);
436: MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, hipsparseStruct->diagGPUMatFormat);
437: MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, hipsparseStruct->offdiagGPUMatFormat);
438: B->preallocated = PETSC_TRUE;
439: return 0;
440: }
442: PetscErrorCode MatMult_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
443: {
444: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
446: VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
447: (*a->A->ops->mult)(a->A, xx, yy);
448: VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
449: (*a->B->ops->multadd)(a->B, a->lvec, yy, yy);
450: return 0;
451: }
453: PetscErrorCode MatZeroEntries_MPIAIJHIPSPARSE(Mat A)
454: {
455: Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
457: MatZeroEntries(l->A);
458: MatZeroEntries(l->B);
459: return 0;
460: }
462: PetscErrorCode MatMultAdd_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
463: {
464: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
466: VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
467: (*a->A->ops->multadd)(a->A, xx, yy, zz);
468: VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
469: (*a->B->ops->multadd)(a->B, a->lvec, zz, zz);
470: return 0;
471: }
473: PetscErrorCode MatMultTranspose_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
474: {
475: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
477: (*a->B->ops->multtranspose)(a->B, xx, a->lvec);
478: (*a->A->ops->multtranspose)(a->A, xx, yy);
479: VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
480: VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
481: return 0;
482: }
484: PetscErrorCode MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
485: {
486: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
487: Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;
489: switch (op) {
490: case MAT_HIPSPARSE_MULT_DIAG:
491: hipsparseStruct->diagGPUMatFormat = format;
492: break;
493: case MAT_HIPSPARSE_MULT_OFFDIAG:
494: hipsparseStruct->offdiagGPUMatFormat = format;
495: break;
496: case MAT_HIPSPARSE_ALL:
497: hipsparseStruct->diagGPUMatFormat = format;
498: hipsparseStruct->offdiagGPUMatFormat = format;
499: break;
500: default:
501: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. Only MAT_HIPSPARSE_MULT_DIAG, MAT_HIPSPARSE_MULT_OFFDIAG, and MAT_HIPSPARSE_ALL are currently supported.", op);
502: }
503: return 0;
504: }
506: PetscErrorCode MatSetFromOptions_MPIAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
507: {
508: MatHIPSPARSEStorageFormat format;
509: PetscBool flg;
510: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
511: Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;
513: PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJHIPSPARSE options");
514: if (A->factortype == MAT_FACTOR_NONE) {
515: PetscOptionsEnum("-mat_hipsparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
516: if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_DIAG, format);
517: PetscOptionsEnum("-mat_hipsparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg);
518: if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_OFFDIAG, format);
519: PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
520: if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format);
521: }
522: PetscOptionsHeadEnd();
523: return 0;
524: }
526: PetscErrorCode MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A, MatAssemblyType mode)
527: {
528: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
529: Mat_MPIAIJHIPSPARSE *cusp = (Mat_MPIAIJHIPSPARSE *)mpiaij->spptr;
530: PetscObjectState onnz = A->nonzerostate;
532: MatAssemblyEnd_MPIAIJ(A, mode);
533: if (mpiaij->lvec) VecSetType(mpiaij->lvec, VECSEQHIP);
534: if (onnz != A->nonzerostate && cusp->deviceMat) {
535: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
537: PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
538: PetscNew(&h_mat);
539: hipMemcpy(h_mat, d_mat, sizeof(*d_mat), hipMemcpyDeviceToHost);
540: hipFree(h_mat->colmap);
541: if (h_mat->allocated_indices) {
542: hipFree(h_mat->diag.i);
543: hipFree(h_mat->diag.j);
544: if (h_mat->offdiag.j) {
545: hipFree(h_mat->offdiag.i);
546: hipFree(h_mat->offdiag.j);
547: }
548: }
549: hipFree(d_mat);
550: PetscFree(h_mat);
551: cusp->deviceMat = NULL;
552: }
553: return 0;
554: }
556: PetscErrorCode MatDestroy_MPIAIJHIPSPARSE(Mat A)
557: {
558: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
559: Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)aij->spptr;
562: if (hipsparseStruct->deviceMat) {
563: PetscSplitCSRDataStructure d_mat = hipsparseStruct->deviceMat, h_mat;
565: PetscInfo(A, "Have device matrix\n");
566: PetscNew(&h_mat);
567: hipMemcpy(h_mat, d_mat, sizeof(*d_mat), hipMemcpyDeviceToHost);
568: hipFree(h_mat->colmap);
569: if (h_mat->allocated_indices) {
570: hipFree(h_mat->diag.i);
571: hipFree(h_mat->diag.j);
572: if (h_mat->offdiag.j) {
573: hipFree(h_mat->offdiag.i);
574: hipFree(h_mat->offdiag.j);
575: }
576: }
577: hipFree(d_mat);
578: PetscFree(h_mat);
579: }
580: /* Free COO */
581: MatResetPreallocationCOO_MPIAIJHIPSPARSE(A);
582: delete hipsparseStruct;
583: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
584: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
585: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
586: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
587: PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL);
588: PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", NULL);
589: MatDestroy_MPIAIJ(A);
590: return 0;
591: }
593: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
594: {
595: Mat_MPIAIJ *a;
596: Mat A;
598: PetscDeviceInitialize(PETSC_DEVICE_HIP);
599: if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B, MAT_COPY_VALUES, newmat);
600: else if (reuse == MAT_REUSE_MATRIX) MatCopy(B, *newmat, SAME_NONZERO_PATTERN);
601: A = *newmat;
602: A->boundtocpu = PETSC_FALSE;
603: PetscFree(A->defaultvectype);
604: PetscStrallocpy(VECHIP, &A->defaultvectype);
606: a = (Mat_MPIAIJ *)A->data;
607: if (a->A) MatSetType(a->A, MATSEQAIJHIPSPARSE);
608: if (a->B) MatSetType(a->B, MATSEQAIJHIPSPARSE);
609: if (a->lvec) VecSetType(a->lvec, VECSEQHIP);
611: if (reuse != MAT_REUSE_MATRIX && !a->spptr) a->spptr = new Mat_MPIAIJHIPSPARSE;
613: A->ops->assemblyend = MatAssemblyEnd_MPIAIJHIPSPARSE;
614: A->ops->mult = MatMult_MPIAIJHIPSPARSE;
615: A->ops->multadd = MatMultAdd_MPIAIJHIPSPARSE;
616: A->ops->multtranspose = MatMultTranspose_MPIAIJHIPSPARSE;
617: A->ops->setfromoptions = MatSetFromOptions_MPIAIJHIPSPARSE;
618: A->ops->destroy = MatDestroy_MPIAIJHIPSPARSE;
619: A->ops->zeroentries = MatZeroEntries_MPIAIJHIPSPARSE;
620: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
622: PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJHIPSPARSE);
623: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE);
624: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE);
625: PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_MPIAIJHIPSPARSE);
626: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJHIPSPARSE);
627: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJHIPSPARSE);
628: #if defined(PETSC_HAVE_HYPRE)
629: PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE);
630: #endif
631: return 0;
632: }
634: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJHIPSPARSE(Mat A)
635: {
636: PetscDeviceInitialize(PETSC_DEVICE_HIP);
637: MatCreate_MPIAIJ(A);
638: MatConvert_MPIAIJ_MPIAIJHIPSPARSE(A, MATMPIAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A);
639: return 0;
640: }
642: /*@
643: MatCreateAIJHIPSPARSE - Creates a sparse matrix in AIJ (compressed row) format
644: (the default parallel PETSc format). This matrix will ultimately be pushed down
645: to AMD GPUs and use the HIPSPARSE library for calculations. For good matrix
646: assembly performance the user should preallocate the matrix storage by setting
647: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
648: performance during matrix assembly can be increased by more than a factor of 50.
650: Collective
652: Input Parameters:
653: + comm - MPI communicator
654: . m - number of local rows (or PETSC_DECIDE)
655: . n - number of local columns (or PETSC_DECIDE)
656: . M - number of global rows (or PETSC_DETERMINE)
657: . N - number of global columns (or PETSC_DETERMINE)
658: . d_nz, d_nnz - nonzero preallocation (scalar value and per-row array, or NULL) for the DIAGONAL portion of the local submatrix
659: - o_nz, o_nnz - nonzero preallocation (scalar value and per-row array, or NULL) for the OFF-DIAGONAL portion of the local submatrix
660: Output Parameter:
661: . A - the matrix
663: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
664: MatXXXXSetPreallocation() paradigm instead of this routine directly.
665: [MatXXXXSetPreallocation() is, for example, MatMPIAIJSetPreallocation()]
667: Notes:
668: If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
670: The AIJ format (also called the Yale sparse matrix format or
671: compressed row storage) is fully compatible with standard Fortran 77
672: storage. That is, the stored row and column indices can begin at
673: either one (as in Fortran) or zero. See the users' manual for details.
675: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
676: Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
677: allocation. For large problems you MUST preallocate memory or you
678: will get TERRIBLE performance, see the users' manual chapter on matrices.
680: By default, this format uses inodes (identical nodes) when possible, to
681: improve numerical efficiency of matrix-vector products and solves. We
682: search for consecutive rows with the same nonzero structure, thereby
683: reusing matrix information to achieve increased efficiency.
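Example usage (a minimal sketch; the local sizes and per-row nonzero estimates below are illustrative only):
.vb
Mat A;
// 100 local rows and columns, roughly 5 nonzeros per row in the diagonal block and 2 in the off-diagonal block
MatCreateAIJHIPSPARSE(PETSC_COMM_WORLD, 100, 100, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, &A);
// ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd(), use the matrix ...
MatDestroy(&A);
.ve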
685: Level: intermediate
687: .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJHIPSPARSE`, `MATAIJHIPSPARSE`
688: @*/
689: PetscErrorCode MatCreateAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
690: {
691: PetscMPIInt size;
693: MatCreate(comm, A);
694: MatSetSizes(*A, m, n, M, N);
695: MPI_Comm_size(comm, &size);
696: if (size > 1) {
697: MatSetType(*A, MATMPIAIJHIPSPARSE);
698: MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
699: } else {
700: MatSetType(*A, MATSEQAIJHIPSPARSE);
701: MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
702: }
703: return 0;
704: }
706: /*MC
707: MATAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJHIPSPARSE.
709: A matrix type whose data resides on AMD GPUs. These matrices can be stored in
710: CSR, ELL, or Hybrid format. All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.
712: This matrix type is identical to MATSEQAIJHIPSPARSE when constructed with a single process communicator,
713: and MATMPIAIJHIPSPARSE otherwise. As a result, for single process communicators,
714: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
715: for communicators controlling multiple processes. It is recommended that you call both of
716: the above preallocation routines for simplicity.
718: Options Database Keys:
719: + -mat_type mpiaijhipsparse - sets the matrix type to "mpiaijhipsparse" during a call to MatSetFromOptions()
720: . -mat_hipsparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
721: . -mat_hipsparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
722: - -mat_hipsparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
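Example (illustrative only; assumes an application executable ./app that calls MatSetFromOptions() on its matrices):
.vb
mpiexec -n 2 ./app -mat_type mpiaijhipsparse -mat_hipsparse_mult_diag_storage_format ell
.ve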
724: Level: beginner
726: .seealso: `MatCreateAIJHIPSPARSE()`, `MATSEQAIJHIPSPARSE`, `MATMPIAIJHIPSPARSE`, `MatCreateSeqAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
727: M*/
729: /*MC
730: MATMPIAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJHIPSPARSE.
732: Level: beginner
734: .seealso: `MATAIJHIPSPARSE`, `MATSEQAIJHIPSPARSE`
735: M*/
737: // Get GPU pointers to a stripped-down device Mat (PetscSplitCSRDataStructure), for both sequential and MPI Mat.
738: PetscErrorCode MatHIPSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
739: {
740: PetscSplitCSRDataStructure d_mat;
741: PetscMPIInt size;
742: int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
743: PetscScalar *aa = NULL, *ba = NULL;
744: Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
745: Mat_SeqAIJHIPSPARSE *hipsparsestructA = NULL;
746: CsrMatrix *matrixA = NULL, *matrixB = NULL;
749: if (A->factortype != MAT_FACTOR_NONE) {
750: *B = NULL;
751: return 0;
752: }
753: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
754: // get jaca
755: if (size == 1) {
756: PetscBool isseqaij;
758: PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij);
759: if (isseqaij) {
760: jaca = (Mat_SeqAIJ *)A->data;
762: hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)A->spptr;
763: d_mat = hipsparsestructA->deviceMat;
764: MatSeqAIJHIPSPARSECopyToGPU(A);
765: } else {
766: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
768: Mat_MPIAIJHIPSPARSE *spptr = (Mat_MPIAIJHIPSPARSE *)aij->spptr;
769: jaca = (Mat_SeqAIJ *)aij->A->data;
770: hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)aij->A->spptr;
771: d_mat = spptr->deviceMat;
772: MatSeqAIJHIPSPARSECopyToGPU(aij->A);
773: }
774: if (hipsparsestructA->format == MAT_HIPSPARSE_CSR) {
775: Mat_SeqAIJHIPSPARSEMultStruct *matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat;
777: matrixA = (CsrMatrix *)matstruct->mat;
778: bi = NULL;
779: bj = NULL;
780: ba = NULL;
781: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_HIPSPARSE_CSR");
782: } else {
783: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
785: jaca = (Mat_SeqAIJ *)aij->A->data;
786: jacb = (Mat_SeqAIJ *)aij->B->data;
787: Mat_MPIAIJHIPSPARSE *spptr = (Mat_MPIAIJHIPSPARSE *)aij->spptr;
790: hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)aij->A->spptr;
791: Mat_SeqAIJHIPSPARSE *hipsparsestructB = (Mat_SeqAIJHIPSPARSE *)aij->B->spptr;
793: if (hipsparsestructB->format == MAT_HIPSPARSE_CSR) {
794: MatSeqAIJHIPSPARSECopyToGPU(aij->A);
795: MatSeqAIJHIPSPARSECopyToGPU(aij->B);
796: Mat_SeqAIJHIPSPARSEMultStruct *matstructA = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat;
797: Mat_SeqAIJHIPSPARSEMultStruct *matstructB = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructB->mat;
800: matrixA = (CsrMatrix *)matstructA->mat;
801: matrixB = (CsrMatrix *)matstructB->mat;
802: if (jacb->compressedrow.use) {
803: if (!hipsparsestructB->rowoffsets_gpu) {
804: hipsparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
805: hipsparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
806: }
807: bi = thrust::raw_pointer_cast(hipsparsestructB->rowoffsets_gpu->data());
808: } else {
809: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
810: }
811: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
812: ba = thrust::raw_pointer_cast(matrixB->values->data());
813: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_HIPSPARSE_CSR");
814: d_mat = spptr->deviceMat;
815: }
816: if (jaca->compressedrow.use) {
817: if (!hipsparsestructA->rowoffsets_gpu) {
818: hipsparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
819: hipsparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
820: }
821: ai = thrust::raw_pointer_cast(hipsparsestructA->rowoffsets_gpu->data());
822: } else {
823: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
824: }
825: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
826: aa = thrust::raw_pointer_cast(matrixA->values->data());
828: if (!d_mat) {
829: PetscSplitCSRDataStructure h_mat;
831: // create and populate the struct on the host, then copy it to the device
832: PetscInfo(A, "Create device matrix\n");
833: PetscNew(&h_mat);
834: hipMalloc((void **)&d_mat, sizeof(*d_mat));
835: if (size > 1) { /* need the colmap array */
836: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
837: PetscInt *colmap;
838: PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N;
842: PetscCalloc1(N + 1, &colmap);
843: for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = ii + 1;
844: #if defined(PETSC_USE_64BIT_INDICES)
845: { // have to make a long version of these
846: int *h_bi32, *h_bj32;
847: PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
848: PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64);
849: hipMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), hipMemcpyDeviceToHost);
850: for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
851: hipMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), hipMemcpyDeviceToHost);
852: for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
854: hipMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64));
855: hipMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), hipMemcpyHostToDevice);
856: hipMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64));
857: hipMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), hipMemcpyHostToDevice);
859: h_mat->offdiag.i = d_bi64;
860: h_mat->offdiag.j = d_bj64;
861: h_mat->allocated_indices = PETSC_TRUE;
863: PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64);
864: }
865: #else
866: h_mat->offdiag.i = (PetscInt *)bi;
867: h_mat->offdiag.j = (PetscInt *)bj;
868: h_mat->allocated_indices = PETSC_FALSE;
869: #endif
870: h_mat->offdiag.a = ba;
871: h_mat->offdiag.n = A->rmap->n;
873: hipMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap));
874: hipMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), hipMemcpyHostToDevice);
875: PetscFree(colmap);
876: }
877: h_mat->rstart = A->rmap->rstart;
878: h_mat->rend = A->rmap->rend;
879: h_mat->cstart = A->cmap->rstart;
880: h_mat->cend = A->cmap->rend;
881: h_mat->M = A->cmap->N;
882: #if defined(PETSC_USE_64BIT_INDICES)
883: {
884: int *h_ai32, *h_aj32;
885: PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
886: PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64);
887: hipMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), hipMemcpyDeviceToHost);
888: for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
889: hipMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), hipMemcpyDeviceToHost);
890: for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
892: hipMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64));
893: hipMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), hipMemcpyHostToDevice);
894: hipMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64));
895: hipMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), hipMemcpyHostToDevice);
897: h_mat->diag.i = d_ai64;
898: h_mat->diag.j = d_aj64;
899: h_mat->allocated_indices = PETSC_TRUE;
901: PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64);
902: }
903: #else
904: h_mat->diag.i = (PetscInt *)ai;
905: h_mat->diag.j = (PetscInt *)aj;
906: h_mat->allocated_indices = PETSC_FALSE;
907: #endif
908: h_mat->diag.a = aa;
909: h_mat->diag.n = A->rmap->n;
910: h_mat->rank = PetscGlobalRank;
911: // copy pointers and metadata to device
912: hipMemcpy(d_mat, h_mat, sizeof(*d_mat), hipMemcpyHostToDevice);
913: PetscFree(h_mat);
914: } else {
915: PetscInfo(A, "Reusing device matrix\n");
916: }
917: *B = d_mat;
918: A->offloadmask = PETSC_OFFLOAD_GPU;
919: return 0;
920: }