Actual source code: mpiaijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscsf.h>
3: #include <petsc/private/sfimpl.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
6: #include <KokkosSparse_spadd.hpp>
8: PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A, MatAssemblyType mode)
9: {
10: Mat_SeqAIJKokkos *aijkok;
11: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
13: MatAssemblyEnd_MPIAIJ(A, mode);
14: /* E.g., MatCreateSubMatrix() calls MatCreateMPIAIJWithSeqAIJ(comm,A,B,..), which creates Bnew of SEQAIJ and destroys B of SEQAIJKOKKOS.
15: Thus we finalize A/B/lvec's type in MatAssemblyEnd() to handle various cases.
16: */
17: if (mode == MAT_FINAL_ASSEMBLY) {
18: MatSetType(mpiaij->A, MATSEQAIJKOKKOS);
19: MatSetType(mpiaij->B, MATSEQAIJKOKKOS);
20: VecSetType(mpiaij->lvec, VECSEQKOKKOS);
21: }
22: aijkok = static_cast<Mat_SeqAIJKokkos *>(((Mat_MPIAIJ *)A->data)->A->spptr); /* Access spptr after MatAssemblyEnd_MPIAIJ(), which might have deleted old spptr */
23: if (aijkok && aijkok->device_mat_d.data()) {
24: A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
25: }
27: return 0;
28: }
30: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
31: {
32: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
34: PetscLayoutSetUp(mat->rmap);
35: PetscLayoutSetUp(mat->cmap);
36: #if defined(PETSC_USE_DEBUG)
37: if (d_nnz) {
38: PetscInt i;
40: }
41: if (o_nnz) {
42: PetscInt i;
44: }
45: #endif
46: #if defined(PETSC_USE_CTABLE)
47: PetscTableDestroy(&mpiaij->colmap);
48: #else
49: PetscFree(mpiaij->colmap);
50: #endif
51: PetscFree(mpiaij->garray);
52: VecDestroy(&mpiaij->lvec);
53: VecScatterDestroy(&mpiaij->Mvctx);
54: /* Because the B will have been resized we simply destroy it and create a new one each time */
55: MatDestroy(&mpiaij->B);
57: if (!mpiaij->A) {
58: MatCreate(PETSC_COMM_SELF, &mpiaij->A);
59: MatSetSizes(mpiaij->A, mat->rmap->n, mat->cmap->n, mat->rmap->n, mat->cmap->n);
60: }
61: if (!mpiaij->B) {
62: PetscMPIInt size;
63: MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
64: MatCreate(PETSC_COMM_SELF, &mpiaij->B);
65: MatSetSizes(mpiaij->B, mat->rmap->n, size > 1 ? mat->cmap->N : 0, mat->rmap->n, size > 1 ? mat->cmap->N : 0);
66: }
67: MatSetType(mpiaij->A, MATSEQAIJKOKKOS);
68: MatSetType(mpiaij->B, MATSEQAIJKOKKOS);
69: MatSeqAIJSetPreallocation(mpiaij->A, d_nz, d_nnz);
70: MatSeqAIJSetPreallocation(mpiaij->B, o_nz, o_nnz);
71: mat->preallocated = PETSC_TRUE;
72: return 0;
73: }
75: PetscErrorCode MatMult_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
76: {
77: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
78: PetscInt nt;
80: VecGetLocalSize(xx, &nt);
82: VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
83: (*mpiaij->A->ops->mult)(mpiaij->A, xx, yy);
84: VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
85: (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy);
86: return 0;
87: }
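As an illustration (not part of this source file), here is a minimal user-level sketch of the path that reaches MatMult_MPIAIJKokkos above: a MATMPIAIJKOKKOS matrix is assembled and MatMult() dispatches to it, with the diag block applied to the local part of x and the scatter supplying the ghost values consumed by the offdiag block. Error checking is omitted to match the style of this listing; the sizes and preallocation numbers are arbitrary.

#include <petscmat.h>

int main(int argc, char **argv)
{
  Mat A;
  Vec x, y;

  PetscInitialize(&argc, &argv, NULL, NULL);
  MatCreate(PETSC_COMM_WORLD, &A);
  MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
  MatSetType(A, MATMPIAIJKOKKOS);                /* selects the implementations in this file */
  MatMPIAIJSetPreallocation(A, 3, NULL, 2, NULL);
  /* ... MatSetValues() on locally owned rows ... */
  MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);         /* finalizes A/B/lvec types, see MatAssemblyEnd_MPIAIJKokkos */
  MatCreateVecs(A, &x, &y);                      /* vectors with layouts compatible with A */
  VecSet(x, 1.0);
  MatMult(A, x, y);                              /* dispatches to MatMult_MPIAIJKokkos */
  VecDestroy(&x);
  VecDestroy(&y);
  MatDestroy(&A);
  PetscFinalize();
  return 0;
}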
89: PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat, Vec xx, Vec yy, Vec zz)
90: {
91: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
92: PetscInt nt;
94: VecGetLocalSize(xx, &nt);
96: VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
97: (*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz);
98: VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
99: (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz);
100: return 0;
101: }
103: PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
104: {
105: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
106: PetscInt nt;
108: VecGetLocalSize(xx, &nt);
110: (*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec);
111: (*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy);
112: VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
113: VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
114: return 0;
115: }
117: /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
118: A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
119: C still uses local column ids. Their corresponding global column ids are returned in glob.
120: */
121: PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat, MatReuse reuse, IS *glob, Mat *C)
122: {
123: Mat Ad, Ao;
124: const PetscInt *cmap;
126: MatMPIAIJGetSeqAIJ(mat, &Ad, &Ao, &cmap);
127: MatSeqAIJKokkosMergeMats(Ad, Ao, reuse, C);
128: if (glob) {
129: PetscInt cst, i, dn, on, *gidx;
130: MatGetLocalSize(Ad, NULL, &dn);
131: MatGetLocalSize(Ao, NULL, &on);
132: MatGetOwnershipRangeColumn(mat, &cst, NULL);
133: PetscMalloc1(dn + on, &gidx);
134: for (i = 0; i < dn; i++) gidx[i] = cst + i;
135: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
136: ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
137: }
138: return 0;
139: }
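A small worked example of the glob index set built above (hypothetical numbers): suppose this rank owns columns [5,8), so cst = 5 and dn = 3, and the offdiag block Ao references global columns cmap[] = {1, 9}, so on = 2. The merged matrix C then has dn + on = 5 local columns and glob = {5, 6, 7, 1, 9}: the first dn entries are the owned columns in order, followed by cmap[] for the offdiag columns.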
141: /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
142: struct MatMatStruct {
143: MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */
144: PetscSF sf; /* SF to send/recv matrix entries */
145: MatScalarKokkosView abuf; /* buf of mat values in send/recv */
146: Mat C1, C2, B_local;
147: KokkosCsrMatrix C1_global, C2_global, C_global;
148: KernelHandle kh;
149: MatMatStruct()
150: {
151: C1 = C2 = B_local = NULL;
152: sf = NULL;
153: }
155: ~MatMatStruct()
156: {
157: MatDestroy(&C1);
158: MatDestroy(&C2);
159: MatDestroy(&B_local);
160: PetscSFDestroy(&sf);
161: kh.destroy_spadd_handle();
162: }
163: };
165: struct MatMatStruct_AB : public MatMatStruct {
166: MatColIdxKokkosView rows;
167: MatRowMapKokkosView rowoffset;
168: Mat B_other, C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
170: MatMatStruct_AB() : B_other(NULL), C_petsc(NULL) { }
171: ~MatMatStruct_AB()
172: {
173: MatDestroy(&B_other);
174: MatDestroy(&C_petsc);
175: }
176: };
178: struct MatMatStruct_AtB : public MatMatStruct {
179: MatRowMapKokkosView srcrowoffset, dstrowoffset;
180: };
182: struct MatProductData_MPIAIJKokkos {
183: MatMatStruct_AB *mmAB;
184: MatMatStruct_AtB *mmAtB;
185: PetscBool reusesym;
187: MatProductData_MPIAIJKokkos() : mmAB(NULL), mmAtB(NULL), reusesym(PETSC_FALSE) { }
188: ~MatProductData_MPIAIJKokkos()
189: {
190: delete mmAB;
191: delete mmAtB;
192: }
193: };
195: static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
196: {
197: delete static_cast<MatProductData_MPIAIJKokkos *>(data);
198: return 0;
199: }
201: /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
203: Input Parameters:
204: + A - the MATSEQAIJKOKKOS matrix
205: . N - new column size for the returned Kokkos matrix
206: - l2g - a map that maps old col ids to new col ids
208: Output Parameters:
209: . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
210: */
211: static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A, PetscInt N, const ConstMatColIdxKokkosView &l2g, KokkosCsrMatrix &csrmat)
212: {
213: KokkosCsrMatrix &orig = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
214: MatColIdxKokkosView jg("jg", orig.nnz()); /* New j array for csrmat */
216: PetscCallCXX(Kokkos::parallel_for(
217: orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) { jg(i) = l2g(orig.graph.entries(i)); }));
218: csrmat = KokkosCsrMatrix("csrmat", orig.numRows(), N, orig.nnz(), orig.values, orig.graph.row_map, jg);
219: return 0;
220: }
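The parallel_for above is a plain gather through the l2g view. A self-contained Kokkos sketch of the same pattern (illustrative only; the view names jlocal, l2g and jglobal are made up and the fill values are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char **argv)
{
  Kokkos::initialize(argc, argv);
  {
    const int nnz = 4, nlocal = 3;
    Kokkos::View<int *> jlocal("jlocal", nnz);    /* local col ids, like orig.graph.entries */
    Kokkos::View<int *> l2g("l2g", nlocal);       /* local-to-global column map */
    Kokkos::View<int *> jglobal("jglobal", nnz);  /* remapped col ids, like jg above */

    Kokkos::parallel_for(nlocal, KOKKOS_LAMBDA(const int k) { l2g(k) = 10 * k; });
    Kokkos::parallel_for(nnz, KOKKOS_LAMBDA(const int i) { jlocal(i) = i % nlocal; });

    /* the gather: jglobal(i) = l2g(jlocal(i)), mirroring jg(i) = l2g(orig.graph.entries(i)) */
    Kokkos::parallel_for(nnz, KOKKOS_LAMBDA(const int i) { jglobal(i) = l2g(jlocal(i)); });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}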
222: /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
223: It is similar to MatCreateMPIAIJWithSplitArrays.
225: Input Parameters:
226: + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
227: . A - the diag matrix using local col ids
228: - B - the offdiag matrix using global col ids
230: Output Parameters:
231: . mat - the updated MATMPIAIJKOKKOS matrix
232: */
233: static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat, Mat A, Mat B)
234: {
235: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
236: PetscInt m, n, M, N, Am, An, Bm, Bn;
237: Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
239: MatGetSize(mat, &M, &N);
240: MatGetLocalSize(mat, &m, &n);
241: MatGetLocalSize(A, &Am, &An);
242: MatGetLocalSize(B, &Bm, &Bn);
248: mpiaij->A = A;
249: mpiaij->B = B;
251: mat->preallocated = PETSC_TRUE;
252: mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
254: MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE);
255: MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
256: /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
257: also gets mpiaij->B compacted, with its col ids and size reduced
258: */
259: MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
260: MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE);
261: MatSetOption(mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
263: /* Update bkok with new local col ids (stored on host) and size */
264: bkok->j_dual.modify_host();
265: bkok->j_dual.sync_device();
266: bkok->SetColSize(mpiaij->B->cmap->n);
267: return 0;
268: }
270: /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrix (B) to form a SEQAIJKOKKOS matrix (C).
272: It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
273: In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
274: If C's j-th row is connected to a root identified by PetscSFNode (k,i), then the i-th row of B on rank k will be bcast
275: to the j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
277: Collective on comm of ownerSF
279: Input Parameters:
280: + B - the SEQAIJKOKKOS matrix, using local col ids
281: . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
282: . N - global col ids are in range of [0,N). N must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
283: . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
284: . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
286: Input/Output Parameters (out when reuse = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
287: + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
288: . abuf - buffer for sending matrix values
289: . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
290: Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
291: . rowoffset - for each row in rows[], the offset in abuf[] at which that row's values are placed
292: - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
293: */
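Before the implementation, a minimal sketch of the plain buffer-to-buffer PetscSF pattern that bcastSF performs, with contiguous leaves (ilocal = NULL): each leaf pulls one root value, possibly from another rank. This is an editorial illustration, not code from this file; BufferToBufferBcast is a hypothetical helper and error checking is omitted to match the listing.

#include <petscsf.h>

static PetscErrorCode BufferToBufferBcast(MPI_Comm comm, PetscInt nroots, PetscInt nleaves, const PetscSFNode *iremote, const PetscScalar *rootbuf, PetscScalar *leafbuf)
{
  PetscSF sf;

  PetscSFCreate(comm, &sf);
  PetscSFSetGraph(sf, nroots, nleaves, NULL /*ilocal: leaves are contiguous*/, PETSC_OWN_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES);
  PetscSFBcastBegin(sf, MPIU_SCALAR, rootbuf, leafbuf, MPI_REPLACE);
  PetscSFBcastEnd(sf, MPIU_SCALAR, rootbuf, leafbuf, MPI_REPLACE);
  PetscSFDestroy(&sf);
  return 0;
}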
294: static PetscErrorCode MatSeqAIJKokkosBcast(Mat B, MatReuse reuse, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &bcastSF, MatScalarKokkosView &abuf, MatColIdxKokkosView &rows, MatRowMapKokkosView &rowoffset, Mat &C)
295: {
296: Mat_SeqAIJKokkos *bkok, *ckok;
298: MatSeqAIJKokkosSyncDevice(B); /* Make sure B->spptr is accessible */
299: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
301: if (reuse == MAT_REUSE_MATRIX) {
302: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
304: const auto &Ba = bkok->a_dual.view_device();
305: const auto &Bi = bkok->i_dual.view_device();
306: const auto &Ca = ckok->a_dual.view_device();
308: /* Copy Ba to abuf */
309: Kokkos::parallel_for(
310: Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
311: PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */
312: PetscInt r = rows(i);
313: PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
314: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) { abuf(base + k) = Ba(Bi(r) + k); });
315: });
317: /* Send abuf to Ca through bcastSF and then mark C is updated on device */
318: PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE); /* TODO: get memtype for abuf */
319: PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE);
320: ckok->a_dual.modify_device();
321: } else if (reuse == MAT_INITIAL_MATRIX) {
322: MPI_Comm comm;
323: PetscMPIInt tag;
324: PetscInt k, Cm, Cn, Cnnz, *Ci_h, nroots, nleaves;
326: PetscObjectGetComm((PetscObject)ownerSF, &comm);
327: PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
328: Cm = nleaves; /* row size of C */
329: Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */
331: /* Get row lens (nz) of B's rows for later fast query */
332: PetscInt *Browlens;
333: const PetscInt *tmp = bkok->i_host_data();
334: PetscMalloc1(nroots, &Browlens);
335: for (k = 0; k < nroots; k++) Browlens[k] = tmp[k + 1] - tmp[k];
337: /* By ownerSF, each proc gets lens of rows of C */
338: MatRowMapKokkosDualView Ci("i", Cm + 1); /* C's rowmap */
339: Ci_h = Ci.view_host().data();
340: Ci_h[0] = 0;
341: PetscSFBcastWithMemTypeBegin(ownerSF, MPIU_INT, PETSC_MEMTYPE_HOST, Browlens, PETSC_MEMTYPE_HOST, &Ci_h[1], MPI_REPLACE);
342: PetscSFBcastEnd(ownerSF, MPIU_INT, Browlens, &Ci_h[1], MPI_REPLACE);
343: for (k = 1; k < Cm + 1; k++) Ci_h[k] += Ci_h[k - 1]; /* Convert lens to CSR */
344: Cnnz = Ci_h[Cm];
345: Ci.modify_host();
346: Ci.sync_device();
348: /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
349: MatColIdxKokkosDualView Cj("j", Cnnz);
350: MatScalarKokkosDualView Ca("a", Cnnz);
352: /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
353: const PetscMPIInt *iranks, *ranks;
354: const PetscInt *ioffset, *irootloc, *roffset;
355: PetscInt i, j, niranks, nranks, *sdisp, *rdisp, *rowptr;
356: MPI_Request *reqs;
358: PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &irootloc); /* irootloc[] contains indices of rows I need to send to each receiver */
359: PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* recv info */
361: /* figure out offsets at the send buffer, to build the SF
362: sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
363: rowptr[] - stores offsets for data of each row in abuf
365: rdisp[] - to receive sdisp[]
366: */
367: PetscMalloc3(niranks + 1, &sdisp, nranks, &rdisp, niranks + nranks, &reqs);
368: MatRowMapKokkosViewHost rowptr_h("rowptr_h", ioffset[niranks] + 1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
369: rowptr = rowptr_h.data();
371: sdisp[0] = 0;
372: rowptr[0] = 0;
373: for (i = 0; i < niranks; i++) { /* for each receiver */
374: PetscInt len, nz = 0;
375: for (j = ioffset[i]; j < ioffset[i + 1]; j++) { /* for each row to this receiver */
376: len = Browlens[irootloc[j]];
377: rowptr[j + 1] = rowptr[j] + len;
378: nz += len;
379: }
380: sdisp[i + 1] = sdisp[i] + nz;
381: }
382: PetscCommGetNewTag(comm, &tag);
383: for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
384: for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
385: MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);
387: PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
388: PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
389: PetscSFNode *iremote;
390: PetscMalloc1(nleaves2, &iremote);
391: for (i = 0; i < nranks; i++) { /* for each sender */
392: k = 0;
393: for (j = Ci_h[roffset[i]]; j < Ci_h[roffset[i + 1]]; j++) {
394: iremote[j].rank = ranks[i];
395: iremote[j].index = rdisp[i] + k;
396: k++;
397: }
398: }
399: /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
400: PetscSFCreate(comm, &bcastSF);
401: PetscSFSetGraph(bcastSF, nroots2, nleaves2, NULL /*ilocal*/, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
403: /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
404: from local to global. Then use bcastSF to fill Ca, Cj.
405: */
406: ConstMatColIdxKokkosViewHost rows_h(irootloc, ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
407: MatColIdxKokkosView rows("rows", ioffset[niranks]);
408: Kokkos::deep_copy(rows, rows_h); /* Use deep copy since irootloc is managed by PetscSF and we want 'rows' to be standalone */
410: rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
412: MatColIdxKokkosView jbuf("jbuf", sdisp[niranks]); /* send buf for (global) col ids */
413: abuf = MatScalarKokkosView("abuf", sdisp[niranks]); /* send buf for mat values */
415: const auto &Ba = bkok->a_dual.view_device();
416: const auto &Bi = bkok->i_dual.view_device();
417: const auto &Bj = bkok->j_dual.view_device();
419: /* Copy Ba, Bj to abuf, jbuf, changing col ids from local to global */
420: Kokkos::parallel_for(
421: Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
422: PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */
423: PetscInt r = rows(i);
424: PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
425: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) {
426: abuf(base + k) = Ba(Bi(r) + k);
427: jbuf(base + k) = l2g(Bj(Bi(r) + k));
428: });
429: });
431: /* Send abuf & jbuf to fill Ca, Cj */
432: PetscSFBcastBegin(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
433: PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
434: PetscSFBcastEnd(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
435: PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
436: Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
437: Cj.sync_host();
438: Ca.modify_device();
440: /* Construct C with Ca, Ci, Cj */
441: auto ckok = new Mat_SeqAIJKokkos(Cm, Cn, Cnnz, Ci, Cj, Ca);
442: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, &C);
443: PetscFree3(sdisp, rdisp, reqs);
444: PetscFree(Browlens);
445: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
446: return 0;
447: }
449: /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
451: It is the reverse of MatSeqAIJKokkosBcast in some sense.
453: Think of each row of A as a leaf; the given ownerSF then specifies the roots of the leaves. Roots may connect to multiple leaves.
454: In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
455: contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.
457: Input Parameters:
458: + A - the SEQAIJKOKKOS matrix to be reduced
459: . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
460: . local - true if A uses local col ids; false if A is already in global col ids.
461: . N - if local, N is A's global col size
462: . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
463: - ownerSF - the SF specifies ownership (root) of rows in A
465: Output Parameters:
466: + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side
467: . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
468: . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
469: . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Rows i and i+1 in abuf[] may go to
470: unrelated places in Ca, so dstrowoffset is not in a CSR-like format the way srcrowoffset is.
471: - C - the matrix made up by rows sent to me from other ranks, using global col ids
473: TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide opportunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
474: */
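A small worked example of the two offset arrays (hypothetical numbers): suppose this rank receives nrows = 3 rows of lengths {2, 4, 1}, packed back to back in abuf, so srcrowoffset = {0, 2, 6, 7}. If received rows 0 and 2 both land in the same row r of C whose data starts at Ci[r] = 10, while received row 1 lands in a row starting at 20, then dstrowoffset = {10, 20, 12}: row 0 is copied to Ca[10..12), row 1 to Ca[20..24), and row 2 right after row 0's contribution, to Ca[12..13).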
475: static PetscErrorCode MatSeqAIJKokkosReduce(Mat A, MatReuse reuse, PetscBool local, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &reduceSF, MatScalarKokkosView &abuf, MatRowMapKokkosView &srcrowoffset, MatRowMapKokkosView &dstrowoffset, KokkosCsrMatrix &C)
476: {
477: PetscInt i, r, Am, An, Annz, Cnnz, nrows;
478: const PetscInt *Ai;
479: Mat_SeqAIJKokkos *akok;
481: MatSeqAIJKokkosSyncDevice(A); /* So that A's latest data is on device */
482: MatGetSize(A, &Am, &An);
483: Ai = static_cast<Mat_SeqAIJ *>(A->data)->i;
484: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
485: Annz = Ai[Am];
487: if (reuse == MAT_REUSE_MATRIX) {
488: /* Send Aa to abuf */
489: PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
490: PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
492: /* Copy abuf to Ca */
493: const MatScalarKokkosView &Ca = C.values;
494: nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
495: Kokkos::parallel_for(
496: Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
497: PetscInt i = t.league_rank();
498: PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
499: PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
500: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) { Ca(dst + k) = abuf(src + k); });
501: });
502: } else if (reuse == MAT_INITIAL_MATRIX) {
503: MPI_Comm comm;
504: MPI_Request *reqs;
505: PetscMPIInt tag;
506: PetscInt Cm;
508: PetscObjectGetComm((PetscObject)ownerSF, &comm);
509: PetscCommGetNewTag(comm, &tag);
511: PetscInt niranks, nranks, nroots, nleaves;
512: const PetscMPIInt *iranks, *ranks;
513: const PetscInt *ioffset, *rows, *roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
514: PetscSFSetUp(ownerSF);
515: PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &rows); /* recv info: iranks[] will send rows to me */
516: PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* send info */
517: PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
519: Cm = nroots;
520: nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
522: /* Tell owners the lengths of the rows I will send */
523: PetscInt *srowlens; /* send buf of row lens */
524: MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h", nrows + 1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
525: PetscInt *rrowlens = rrowlens_h.data();
527: PetscMalloc2(Am, &srowlens, niranks + nranks, &reqs);
528: for (i = 0; i < Am; i++) srowlens[i] = Ai[i + 1] - Ai[i];
529: rrowlens[0] = 0;
530: rrowlens++; /* shift the pointer to make the following expression more readable */
531: for (i = 0; i < niranks; i++) MPI_Irecv(&rrowlens[ioffset[i]], ioffset[i + 1] - ioffset[i], MPIU_INT, iranks[i], tag, comm, &reqs[i]);
532: for (i = 0; i < nranks; i++) MPI_Isend(&srowlens[roffset[i]], roffset[i + 1] - roffset[i], MPIU_INT, ranks[i], tag, comm, &reqs[niranks + i]);
533: MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);
535: /* Owner builds Ci on host by histogramming rrowlens[] */
536: MatRowMapKokkosViewHost Ci_h("i", Cm + 1);
537: Kokkos::deep_copy(Ci_h, 0); /* Zero Ci */
538: MatRowMapType *Ci_ptr = Ci_h.data();
540: for (i = 0; i < nrows; i++) {
541: r = rows[i]; /* local row id of i-th received row */
542: #if defined(PETSC_USE_DEBUG)
544: #endif
545: Ci_ptr[r + 1] += rrowlens[i]; /* add to length of row r in C */
546: }
547: for (i = 0; i < Cm; i++) Ci_ptr[i + 1] += Ci_ptr[i]; /* to CSR format */
548: Cnnz = Ci_ptr[Cm];
550: /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
551: MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h", nrows);
552: PetscInt *dstrowoffset_hptr = dstrowoffset_h.data();
553: PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
555: PetscCalloc1(Cm, &currowlens); /* Init with zero, to be added to */
556: for (i = 0; i < nrows; i++) { /* for each row I receive */
557: r = rows[i]; /* row id in C */
558: dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
559: currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */
560: }
561: PetscFree(currowlens);
563: rrowlens--;
564: for (i = 0; i < nrows; i++) rrowlens[i + 1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
565: dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), dstrowoffset_h);
566: srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
568: /* Build the reduceSF, which performs buffer to buffer send/recv */
569: PetscInt *sdisp, *rdisp; /* buffer to send offsets of roots, and buffer to recv them */
570: PetscMalloc2(niranks, &sdisp, nranks, &rdisp);
571: for (i = 0; i < niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
572: for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
573: for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
574: MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);
576: /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
577: PetscInt nroots2 = Cnnz, nleaves2 = Annz;
578: PetscSFNode *iremote;
579: PetscMalloc1(nleaves2, &iremote); /* no free, since memory will be given to reduceSF */
580: for (i = 0; i < nranks; i++) {
581: PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
582: PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
583: PetscInt nz = Ai[roffset[i + 1]] - leafbase; /* I will send nz nonzeros to this root rank */
584: for (PetscInt k = 0; k < nz; k++) {
585: iremote[leafbase + k].rank = ranks[i];
586: iremote[leafbase + k].index = rootbase + k;
587: }
588: }
589: PetscSFCreate(comm, &reduceSF);
590: PetscSFSetGraph(reduceSF, nroots2, nleaves2, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
591: PetscFree2(sdisp, rdisp);
593: /* Reduce Aa, Ajg to abuf and jbuf */
595: /* If A uses local col ids, convert them to global ones before sending */
596: MatColIdxKokkosView Ajg;
597: if (local) {
598: Ajg = MatColIdxKokkosView("j", Annz);
599: const MatColIdxKokkosView &Aj = akok->j_dual.view_device();
600: Kokkos::parallel_for(
601: Annz, KOKKOS_LAMBDA(const PetscInt i) { Ajg(i) = l2g(Aj(i)); });
602: } else {
603: Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
604: }
606: MatColIdxKokkosView jbuf("jbuf", Cnnz);
607: abuf = MatScalarKokkosView("abuf", Cnnz);
608: PetscSFReduceBegin(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
609: PetscSFReduceEnd(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
610: PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
611: PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
613: /* Copy data from abuf, jbuf to Ca, Cj */
614: MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ci_h); /* Ci is an alias of Ci_h if no device */
615: MatColIdxKokkosView Cj("j", Cnnz);
616: MatScalarKokkosView Ca("a", Cnnz);
618: Kokkos::parallel_for(
619: Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
620: PetscInt i = t.league_rank();
621: PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
622: PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
623: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) {
624: Ca(dst + k) = abuf(src + k);
625: Cj(dst + k) = jbuf(src + k);
626: });
627: });
629: /* Build C with Ca, Ci, Cj */
630: C = KokkosCsrMatrix("csrmat", Cm, N, Cnnz, Ca, Ci, Cj);
631: PetscFree2(srowlens, reqs);
632: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
633: return 0;
634: }
636: /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a `MATMPIAIJKOKKOS` matrix by splitting a KokkosCsrMatrix
638: Input Parameters:
639: + C - the `MATMPIAIJKOKKOS` matrix, of size m,n,M,N
640: . reuse - indicate whether the matrix has called this function before
641: . csrmat - the KokkosCsrMatrix, of size m,N
642: - Cdstart - when reuse == `MAT_REUSE_MATRIX`, it is an input parameter. For each row in csrmat, it stores the index of the first
643: entry of the diag block of C in csrmat's j array. E.g., if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
644: entry is 5, then Cdstart[i] = 3.
646: Output Parameters:
647: + C - the updated `MATMPIAIJKOKKOS` matrix
648: - Cdstart - when reuse == `MAT_INITIAL_MATRIX`, it is an output parameter
650: Note:
651: Between calls with `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`, csrmat must have the same nonzero pattern
653: .seealso: `MATMPIAIJKOKKOS`
654: */
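Continuing the example above with hypothetical numbers: take the same row with col ids {0, 3, 4, 5, 7, 9} and an owned column range [cstart,cend) = [5,8). The row of length len = 6 is cut as [0,cdstart) = {0, 3, 4} going to the offdiag part, [cdstart,cdend) = {5, 7} going to the diag part (stored with local col ids {0, 2}), and [cdend,len) = {9} going to the offdiag part again, i.e. cdstart = 3 and cdend = 5.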
655: static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C, MatReuse reuse, const KokkosCsrMatrix &csrmat, MatRowMapKokkosView &Cdstart)
656: {
657: const MatScalarKokkosView &Ca = csrmat.values;
658: const ConstMatRowMapKokkosView &Ci = csrmat.graph.row_map;
659: PetscInt m, n, N;
661: MatGetLocalSize(C, &m, &n);
662: MatGetSize(C, NULL, &N);
664: if (reuse == MAT_REUSE_MATRIX) {
665: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(C->data);
666: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(mpiaij->A->spptr);
667: Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos *>(mpiaij->B->spptr);
668: const MatScalarKokkosView &Cda = akok->a_dual.view_device(), Coa = bkok->a_dual.view_device();
669: const MatRowMapKokkosView &Cdi = akok->i_dual.view_device(), Coi = bkok->i_dual.view_device();
671: /* Fill 'a' of Cd and Co on device */
672: Kokkos::parallel_for(
673: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
674: PetscInt i = t.league_rank(); /* row i */
675: PetscInt clen = Ci(i + 1) - Ci(i); /* len of row i of C */
676: PetscInt cdlen = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
677: PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
678: PetscInt cdend = cdstart + cdlen;
679: /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
680: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
681: if (k < cdstart) { /* k in [0, cdstart) */
682: Coa(Coi(i) + k) = Ca(Ci(i) + k);
683: } else if (k < cdend) { /* k in [cdstart, cdend) */
684: Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
685: } else { /* k in [cdend, clen) */
686: Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
687: }
688: });
689: });
691: akok->a_dual.modify_device();
692: bkok->a_dual.modify_device();
693: } else if (reuse == MAT_INITIAL_MATRIX) {
694: Mat Cd, Co;
695: const MatColIdxKokkosView &Cj = csrmat.graph.entries;
696: MatRowMapKokkosDualView Cdi_dual("i", m + 1), Coi_dual("i", m + 1);
697: MatRowMapKokkosView Cdi = Cdi_dual.view_device(), Coi = Coi_dual.view_device();
698: PetscInt cstart, cend;
700: /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
701: left of the diag block, the diag block, and right of the diag block. The diag block has col ids in [cstart,cend).
702: Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
703: such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalently
704: stores values of cdstart and cdend-cdstart (aka Cdi[]) instead.
705: */
706: Cdstart = MatRowMapKokkosView("Cdstart", m);
707: PetscLayoutGetRange(C->cmap, &cstart, &cend); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
709: /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
710: CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
711: */
712: Kokkos::parallel_for(
713: Kokkos::TeamPolicy<>(m, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
714: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
715: PetscInt i = t.league_rank(); /* row i */
716: PetscInt j, first, count, step;
718: if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
719: Cdi(0) = 0;
720: Coi(0) = 0;
721: }
723: /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
724: in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
725: */
726: count = Ci(i + 1) - Ci(i);
727: first = Ci(i);
728: while (count > 0) {
729: j = first;
730: step = count / 2;
731: j += step;
732: if (Cj(j) < cstart) {
733: first = ++j;
734: count -= step + 1;
735: } else count = step;
736: }
737: Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
739: /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
740: count = Ci(i + 1) - first;
741: while (count > 0) {
742: j = first;
743: step = count / 2;
744: j += step;
745: if (Cj(j) < cend) {
746: first = ++j;
747: count -= step + 1;
748: } else count = step;
749: }
750: Cdi(i + 1) = first - (Ci(i) + Cdstart(i)); /* 'first' is the while-loop's output */
751: Coi(i + 1) = (Ci(i + 1) - Ci(i)) - Cdi(i + 1); /* Co's row len = C's row len - Cd's row len */
752: });
753: });
755: /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
756: Kokkos::parallel_scan(
757: m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
758: update += Cdi(i);
759: if (final) Cdi(i) = update;
760: });
761: Kokkos::parallel_scan(
762: m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
763: update += Coi(i);
764: if (final) Coi(i) = update;
765: });
767: /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
768: MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
769: */
770: Cdi_dual.modify_device();
771: Coi_dual.modify_device();
772: Cdi_dual.sync_host();
773: Coi_dual.sync_host();
774: PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
775: PetscInt Co_nnz = Coi_dual.view_host().data()[m];
777: /* With nnz, allocate a, j for Cd and Co */
778: MatColIdxKokkosDualView Cdj_dual("j", Cd_nnz), Coj_dual("j", Co_nnz);
779: MatScalarKokkosDualView Cda_dual("a", Cd_nnz), Coa_dual("a", Co_nnz);
781: /* Fill a, j of Cd and Co on device */
782: MatColIdxKokkosView Cdj = Cdj_dual.view_device(), Coj = Coj_dual.view_device();
783: MatScalarKokkosView Cda = Cda_dual.view_device(), Coa = Coa_dual.view_device();
785: Kokkos::parallel_for(
786: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
787: PetscInt i = t.league_rank(); /* row i */
788: PetscInt clen = Ci(i + 1) - Ci(i); /* len of row i of C */
789: PetscInt cdlen = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
790: PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
791: PetscInt cdend = cdstart + cdlen;
792: /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
793: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
794: if (k < cdstart) { /* k in [0, cdstart) */
795: Coa(Coi(i) + k) = Ca(Ci(i) + k);
796: Coj(Coi(i) + k) = Cj(Ci(i) + k);
797: } else if (k < cdend) { /* k in [cdstart, cdend) */
798: Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
799: Cdj(Cdi(i) + (k - cdstart)) = Cj(Ci(i) + k) - cstart; /* Use local col ids in Cdj */
800: } else { /* k in [cdend, clen) */
801: Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
802: Coj(Coi(i) + k - cdlen) = Cj(Ci(i) + k);
803: }
804: });
805: });
807: Cdj_dual.modify_device();
808: Cda_dual.modify_device();
809: Coj_dual.modify_device();
810: Coa_dual.modify_device();
811: /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
812: auto cdkok = new Mat_SeqAIJKokkos(m, n, Cd_nnz, Cdi_dual, Cdj_dual, Cda_dual);
813: auto cokok = new Mat_SeqAIJKokkos(m, N, Co_nnz, Coi_dual, Coj_dual, Coa_dual);
814: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cdkok, &Cd);
815: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cokok, &Co);
816: MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C, Cd, Co); /* Coj will be converted to local ids within */
817: }
818: return 0;
819: }
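The two parallel_scan calls above turn per-row lengths into CSR row offsets in place. A standalone sketch of that inclusive-scan idiom (illustrative only; sizes and lengths are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char **argv)
{
  Kokkos::initialize(argc, argv);
  {
    const int m = 4;
    /* rowlen(0) = 0 and rowlen(i) = length of row i-1, as with Cdi/Coi above */
    Kokkos::View<int *> rowlen("rowlen", m + 1);
    Kokkos::parallel_for(m + 1, KOKKOS_LAMBDA(const int i) { rowlen(i) = i; }); /* values 0,1,2,3,4: rows 0..3 have lengths 1,2,3,4 */

    /* inclusive scan in place: [0,1,2,3,4] -> [0,1,3,6,10], i.e. CSR row offsets */
    Kokkos::parallel_scan(
      m + 1, KOKKOS_LAMBDA(const int i, int &update, const bool final) {
        update += rowlen(i);
        if (final) rowlen(i) = update;
      });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}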
821: /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKOS matrix's global col ids.
823: It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.
825: Input Parameter:
826: . C - the SEQAIJKOKKOS matrix, currently using global col ids
832: Output Parameters:
833: + C - the updated matrix, with its col ids compacted to local ones and its col size reduced
834: - l2g - a Kokkos view mapping C's new local col ids back to the original global ones
836: Note:
837: The input matrix's col ids and col size will be changed.
838: */
839: static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C, MatColIdxKokkosView &l2g)
840: {
841: Mat_SeqAIJKokkos *ckok;
842: ISLocalToGlobalMapping l2gmap;
843: const PetscInt *garray;
844: PetscInt sz;
846: /* Compact C's global col ids and col size. We do this because we guess that, with local ids, KK might be more memory scalable */
847: MatSeqAIJCompactOutExtraColumns_SeqAIJ(C, &l2gmap);
848: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
849: ckok->j_dual.modify_host(); /* C's j was modified on host by the compaction; sync it to device */
850: ckok->j_dual.sync_device();
851: ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
853: /* Build l2g -- the local to global mapping of C's cols */
854: ISLocalToGlobalMappingGetIndices(l2gmap, &garray);
855: ISLocalToGlobalMappingGetSize(l2gmap, &sz);
858: ConstMatColIdxKokkosViewHost tmp(garray, sz);
859: l2g = MatColIdxKokkosView("l2g", sz);
860: Kokkos::deep_copy(l2g, tmp);
862: ISLocalToGlobalMappingRestoreIndices(l2gmap, &garray);
863: ISLocalToGlobalMappingDestroy(&l2gmap);
864: return 0;
865: }
867: /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
869: Input Parameters:
870: + product - Mat_Product which carried out the computation. Passed in to access info about this mat product.
871: . A - an MPIAIJKOKKOS matrix
872: . B - an MPIAIJKOKKOS matrix
873: - mm - a struct used to stash intermediate data when computing AB. It persists from the symbolic to the numeric operation.
875: Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
876: */
877: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product, Mat A, Mat B, MatMatStruct_AB *mm)
878: {
879: Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ *>(A->data);
880: Mat Ad = a->A, Ao = a->B; /* diag and offdiag of A */
881: IS glob = NULL;
882: const PetscInt *garray;
883: PetscInt N = B->cmap->N, sz;
884: ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */
885: MatColIdxKokkosView l2g2;
886: Mat C1, C2; /* intermediate matrices */
888: /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
889: MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &mm->B_local);
890: MatProductCreate(Ad, mm->B_local, NULL, &C1);
891: MatProductSetType(C1, MATPRODUCT_AB);
892: MatProductSetFill(C1, product->fill);
893: C1->product->api_user = product->api_user;
894: MatProductSetFromOptions(C1);
895: PetscUseTypeMethod(C1, productsymbolic);
897: ISGetIndices(glob, &garray);
898: ISGetSize(glob, &sz);
899: const auto &tmp = ConstMatColIdxKokkosViewHost(garray, sz); /* wrap garray as a view */
900: l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
901: MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g1, mm->C1_global);
903: /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
904: MatSeqAIJKokkosBcast(mm->B_local, MAT_INITIAL_MATRIX, N, l2g1, a->Mvctx, mm->sf, mm->abuf, mm->rows, mm->rowoffset, mm->B_other);
906: /* Compact B_other to use local ids, as we guess KK spgemm is more memory scalable with that; we could skip the compaction to simplify the code */
907: MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other, l2g2);
908: MatProductCreate(Ao, mm->B_other, NULL, &C2);
909: MatProductSetType(C2, MATPRODUCT_AB);
910: MatProductSetFill(C2, product->fill);
911: C2->product->api_user = product->api_user;
912: MatProductSetFromOptions(C2);
913: PetscUseTypeMethod(C2, productsymbolic);
914: MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2, N, l2g2, mm->C2_global);
916: /* C = C1 + C2. We actually use their global col ids versions in adding */
917: mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
918: KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
919: /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
920: KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);
922: mm->C1 = C1;
923: mm->C2 = C2;
924: ISRestoreIndices(glob, &garray);
925: ISDestroy(&glob);
926: return 0;
927: }
929: /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
931: Input Parameters:
932: + product - Mat_Product which carried out the computation. Passed in to access info about this mat product.
933: . A - an MPIAIJKOKKOS matrix
934: . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
935: . localB - Does B use local col ids? If false, then B is already in global col ids.
936: . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
937: . l2g - If localB, then l2g maps B's local col ids to global ones.
938: - mm - a struct used to stash intermediate data in AtB
940: Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
941: */
942: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product, Mat A, Mat B, PetscBool localB, PetscInt N, const ConstMatColIdxKokkosView &l2g, MatMatStruct_AtB *mm)
943: {
944: Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ *>(A->data);
945: Mat Ad = a->A, Ao = a->B; /* diag and offdiag of A */
946: Mat C1, C2; /* intermediate matrices */
948: /* C1 = Ad^t * B */
949: MatProductCreate(Ad, B, NULL, &C1);
950: MatProductSetType(C1, MATPRODUCT_AtB);
951: MatProductSetFill(C1, product->fill);
952: C1->product->api_user = product->api_user;
953: MatProductSetFromOptions(C1);
954: PetscUseTypeMethod(C1, productsymbolic);
956: if (localB) MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g, mm->C1_global);
957: else mm->C1_global = static_cast<Mat_SeqAIJKokkos *>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
959: /* C2 = Ao^t * B */
960: MatProductCreate(Ao, B, NULL, &C2);
961: MatProductSetType(C2, MATPRODUCT_AtB);
962: MatProductSetFill(C2, product->fill);
963: C2->product->api_user = product->api_user;
964: MatProductSetFromOptions(C2);
965: PetscUseTypeMethod(C2, productsymbolic);
967: MatSeqAIJKokkosReduce(C2, MAT_INITIAL_MATRIX, localB, N, l2g, a->Mvctx, mm->sf, mm->abuf, mm->srcrowoffset, mm->dstrowoffset, mm->C2_global);
969: mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may not be */
970: KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
971: /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
972: KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);
973: mm->C1 = C1;
974: mm->C2 = C2;
975: return 0;
976: }
978: PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
979: {
980: Mat_Product *product = C->product;
981: MatProductType ptype;
982: MatProductData_MPIAIJKokkos *mmdata;
983: MatMatStruct *mm = NULL;
984: MatMatStruct_AB *ab;
985: MatMatStruct_AtB *atb;
986: Mat A, B, Ad, Ao, Bd, Bo;
987: const MatScalarType one = 1.0; /* Do not use the literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */
989: MatCheckProduct(C, 1);
990: mmdata = static_cast<MatProductData_MPIAIJKokkos *>(product->data);
991: ptype = product->type;
992: A = product->A;
993: B = product->B;
994: Ad = static_cast<Mat_MPIAIJ *>(A->data)->A;
995: Ao = static_cast<Mat_MPIAIJ *>(A->data)->B;
996: Bd = static_cast<Mat_MPIAIJ *>(B->data)->A;
997: Bo = static_cast<Mat_MPIAIJ *>(B->data)->B;
999: if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
1000: mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */
1001: ab = mmdata->mmAB;
1002: atb = mmdata->mmAtB;
1003: if (ab) {
1004: static_cast<MatProductData_SeqAIJKokkos *>(ab->C1->product->data)->reusesym = PETSC_FALSE;
1005: static_cast<MatProductData_SeqAIJKokkos *>(ab->C2->product->data)->reusesym = PETSC_FALSE;
1006: }
1007: if (atb) {
1008: static_cast<MatProductData_SeqAIJKokkos *>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1009: static_cast<MatProductData_SeqAIJKokkos *>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1010: }
1011: return 0;
1012: }
1014: if (ptype == MATPRODUCT_AB) {
1015: ab = mmdata->mmAB;
1016: /* C1 = Ad * B_local */
1018: MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);
1020: if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1021: (*ab->C1->ops->productnumeric)(ab->C1);
1022: MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1023: /* C2 = Ao * B_other */
1025: if (ab->C1->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1026: (*ab->C2->ops->productnumeric)(ab->C2);
1027: /* C = C1_global + C2_global */
1028: KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);
1029: mm = static_cast<MatMatStruct *>(ab);
1030: } else if (ptype == MATPRODUCT_AtB) {
1031: atb = mmdata->mmAtB;
1033: /* C1 = Ad^t * B_local */
1034: MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &atb->B_local);
1036: if (atb->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, atb->C1);
1037: (*atb->C1->ops->productnumeric)(atb->C1);
1039: /* C2 = Ao^t * B_local */
1041: if (atb->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, atb->C2);
1042: (*atb->C2->ops->productnumeric)(atb->C2);
1043: /* Form C2_global */
1044: MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_TRUE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1045: /* C = C1_global + C2_global */
1046: KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1047: mm = static_cast<MatMatStruct *>(atb);
1048: } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1049: ab = mmdata->mmAB;
1050: MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);
1052: /* ab->C1 = Ad * B_local */
1054: if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1055: (*ab->C1->ops->productnumeric)(ab->C1);
1056: MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1057: /* ab->C2 = Ao * B_other */
1058: if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1059: (*ab->C2->ops->productnumeric)(ab->C2); /* C2 = Ao * B_other */
1060: KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);
1062: /* atb->C1 = Bd^t * ab->C_petsc */
1063: atb = mmdata->mmAtB;
1065: if (atb->C1->product->A != Bd) MatProductReplaceMats(Bd, NULL, NULL, atb->C1);
1066: (*atb->C1->ops->productnumeric)(atb->C1);
1067: /* atb->C2 = Bo^t * ab->C_petsc */
1068: if (atb->C2->product->A != Bo) MatProductReplaceMats(Bo, NULL, NULL, atb->C2);
1069: (*atb->C2->ops->productnumeric)(atb->C2);
1070: MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_FALSE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /* ownerSF */, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1071: KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1072: mm = static_cast<MatMatStruct *>(atb);
1073: }
1074: /* Split C_global to form C */
1075: MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_REUSE_MATRIX, mm->C_global, mm->Cdstart);
1076: return 0;
1077: }
1079: PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1080: {
1081: Mat A, B;
1082: Mat_Product *product = C->product;
1083: MatProductType ptype;
1084: MatProductData_MPIAIJKokkos *mmdata;
1085: MatMatStruct *mm = NULL;
1086: IS glob = NULL;
1087: const PetscInt *garray;
1088: PetscInt m, n, M, N, sz;
1089: ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */
1091: MatCheckProduct(C, 1);
1093: ptype = product->type;
1094: A = product->A;
1095: B = product->B;
1097: switch (ptype) {
1098: case MATPRODUCT_AB:
1099: m = A->rmap->n;
1100: n = B->cmap->n;
1101: M = A->rmap->N;
1102: N = B->cmap->N;
1103: break;
1104: case MATPRODUCT_AtB:
1105: m = A->cmap->n;
1106: n = B->cmap->n;
1107: M = A->cmap->N;
1108: N = B->cmap->N;
1109: break;
1110: case MATPRODUCT_PtAP:
1111: m = B->cmap->n;
1112: n = B->cmap->n;
1113: M = B->cmap->N;
1114: N = B->cmap->N;
1115: break; /* BtAB */
1116: default:
1117: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
1118: }
1120: MatSetSizes(C, m, n, M, N);
1121: PetscLayoutSetUp(C->rmap);
1122: PetscLayoutSetUp(C->cmap);
1123: MatSetType(C, ((PetscObject)A)->type_name);
1125: mmdata = new MatProductData_MPIAIJKokkos();
1126: mmdata->reusesym = product->api_user;
1128: if (ptype == MATPRODUCT_AB) {
1129: mmdata->mmAB = new MatMatStruct_AB();
1130: MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, mmdata->mmAB);
1131: mm = static_cast<MatMatStruct *>(mmdata->mmAB);
1132: } else if (ptype == MATPRODUCT_AtB) {
1133: mmdata->mmAtB = new MatMatStruct_AtB();
1134: auto atb = mmdata->mmAtB;
1135: MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &atb->B_local);
1136: ISGetIndices(glob, &garray);
1137: ISGetSize(glob, &sz);
1138: l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatColIdxKokkosViewHost(garray, sz));
1139: MatProductSymbolic_MPIAIJKokkos_AtB(product, A, atb->B_local, PETSC_TRUE, N, l2g, atb);
1140: ISRestoreIndices(glob, &garray);
1141: ISDestroy(&glob);
1142: mm = static_cast<MatMatStruct *>(atb);
1143: } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1144: mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */
1145: mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1146: auto ab = mmdata->mmAB;
1147: auto atb = mmdata->mmAtB;
1148: MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, ab);
1149: auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1150: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, tmp, &ab->C_petsc);
1151: MatProductSymbolic_MPIAIJKokkos_AtB(product, B, ab->C_petsc, PETSC_FALSE, N, l2g /*not used*/, atb);
1152: mm = static_cast<MatMatStruct *>(atb);
1153: }
1154: /* Split the C_global into petsc A, B format */
1155: MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_INITIAL_MATRIX, mm->C_global, mm->Cdstart);
1156: C->product->data = mmdata;
1157: C->product->destroy = MatProductDataDestroy_MPIAIJKokkos;
1158: C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos;
1159: return 0;
1160: }
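These symbolic/numeric routines are reached through the generic MatProduct machinery. A minimal user-level sketch (not from this file; ProductExamples is a hypothetical helper, A and B are assumed to be assembled MATMPIAIJKOKKOS matrices of compatible sizes, and error checking is omitted to match the listing):

#include <petscmat.h>

static PetscErrorCode ProductExamples(Mat A, Mat B)
{
  Mat C, D;

  /* C = A * B: the first call does symbolic + numeric (reusesym); the second reuses the stashed data */
  MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C);
  MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C);

  /* D = B^t * A * B, handled by the MATPRODUCT_PtAP (BtAB) branch above */
  MatPtAP(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &D);

  MatDestroy(&C);
  MatDestroy(&D);
  return 0;
}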
1162: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1163: {
1164: Mat_Product *product = mat->product;
1165: PetscBool match = PETSC_FALSE;
1166: PetscBool usecpu = PETSC_FALSE;
1168: MatCheckProduct(mat, 1);
1169: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match);
1170: if (match) { /* we can always fall back to the CPU if requested */
1171: switch (product->type) {
1172: case MATPRODUCT_AB:
1173: if (product->api_user) {
1174: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
1175: PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1176: PetscOptionsEnd();
1177: } else {
1178: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
1179: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1180: PetscOptionsEnd();
1181: }
1182: break;
1183: case MATPRODUCT_AtB:
1184: if (product->api_user) {
1185: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
1186: PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1187: PetscOptionsEnd();
1188: } else {
1189: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
1190: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1191: PetscOptionsEnd();
1192: }
1193: break;
1194: case MATPRODUCT_PtAP:
1195: if (product->api_user) {
1196: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
1197: PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1198: PetscOptionsEnd();
1199: } else {
1200: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
1201: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1202: PetscOptionsEnd();
1203: }
1204: break;
1205: default:
1206: break;
1207: }
1208: match = (PetscBool)!usecpu;
1209: }
1210: if (match) {
1211: switch (product->type) {
1212: case MATPRODUCT_AB:
1213: case MATPRODUCT_AtB:
1214: case MATPRODUCT_PtAP:
1215: mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1216: break;
1217: default:
1218: break;
1219: }
1220: }
1221: /* fallback to MPIAIJ ops */
1222: if (!mat->ops->productsymbolic) MatProductSetFromOptions_MPIAIJ(mat);
1223: return 0;
1224: }
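For reference, the CPU-fallback options queried above can also be set programmatically before the product is created. A small sketch (UseCpuProducts is a hypothetical helper; if the matrix has an options prefix, it must be prepended to the option names):

#include <petscmat.h>

static PetscErrorCode UseCpuProducts(void)
{
  /* equivalent to -matmatmult_backend_cpu on the command line (MatMatMult, i.e. api_user, path) */
  PetscOptionsSetValue(NULL, "-matmatmult_backend_cpu", "1");
  /* equivalent option for the MatProduct API path */
  PetscOptionsSetValue(NULL, "-mat_product_algorithm_backend_cpu", "1");
  return 0;
}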
1226: static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1227: {
1228: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
1229: Mat_MPIAIJKokkos *mpikok;
1231: MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j); /* mpiaij->A,B's type is set to seqaijkokkos */
1232: mat->preallocated = PETSC_TRUE;
1233: MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
1234: MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
1235: MatZeroEntries(mat);
1236: mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1237: delete mpikok;
1238: mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
1239: return 0;
1240: }
1242: static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat, const PetscScalar v[], InsertMode imode)
1243: {
1244: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
1245: Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1246: Mat A = mpiaij->A, B = mpiaij->B;
1247: PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
1248: MatScalarKokkosView Aa, Ba;
1249: MatScalarKokkosView v1;
1250: MatScalarKokkosView &vsend = mpikok->sendbuf_d;
1251: const MatScalarKokkosView &v2 = mpikok->recvbuf_d;
1252: const PetscCountKokkosView &Ajmap1 = mpikok->Ajmap1_d, Ajmap2 = mpikok->Ajmap2_d, Aimap2 = mpikok->Aimap2_d;
1253: const PetscCountKokkosView &Bjmap1 = mpikok->Bjmap1_d, Bjmap2 = mpikok->Bjmap2_d, Bimap2 = mpikok->Bimap2_d;
1254: const PetscCountKokkosView &Aperm1 = mpikok->Aperm1_d, Aperm2 = mpikok->Aperm2_d, Bperm1 = mpikok->Bperm1_d, Bperm2 = mpikok->Bperm2_d;
1255: const PetscCountKokkosView &Cperm1 = mpikok->Cperm1_d;
1256: PetscMemType memtype;
1258: PetscGetMemType(v, &memtype); /* Returns PETSC_MEMTYPE_HOST when v is NULL */
1259: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
1260: v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), MatScalarKokkosViewHost((PetscScalar *)v, mpiaij->coo_n));
1261: } else {
1262: v1 = MatScalarKokkosView((PetscScalar *)v, mpiaij->coo_n); /* Directly use v[]'s memory */
1263: }
1265: if (imode == INSERT_VALUES) {
1266: MatSeqAIJGetKokkosViewWrite(A, &Aa); /* write matrix values */
1267: MatSeqAIJGetKokkosViewWrite(B, &Ba);
1268: } else {
1269: MatSeqAIJGetKokkosView(A, &Aa); /* read & write matrix values */
1270: MatSeqAIJGetKokkosView(B, &Ba);
1271: }
1273: /* Pack entries destined for remote processes */
1274: Kokkos::parallel_for(
1275: vsend.extent(0), KOKKOS_LAMBDA(const PetscCount i) { vsend(i) = v1(Cperm1(i)); });
1277: /* Send remote entries to their owner and overlap the communication with local computation */
1278: PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_KOKKOS, vsend.data(), PETSC_MEMTYPE_KOKKOS, v2.data(), MPI_REPLACE);
1279: /* Add local entries to A and B in one kernel */
1280: Kokkos::parallel_for(
1281: Annz + Bnnz, KOKKOS_LAMBDA(PetscCount i) {
1282: PetscScalar sum = 0.0;
1283: if (i < Annz) {
1284: for (PetscCount k = Ajmap1(i); k < Ajmap1(i + 1); k++) sum += v1(Aperm1(k));
1285: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1286: } else {
1287: i -= Annz;
1288: for (PetscCount k = Bjmap1(i); k < Bjmap1(i + 1); k++) sum += v1(Bperm1(k));
1289: Ba(i) = (imode == INSERT_VALUES ? 0.0 : Ba(i)) + sum;
1290: }
1291: });
1292: PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend.data(), v2.data(), MPI_REPLACE);
1294: /* Add received remote entries to A and B in one kernel */
1295: Kokkos::parallel_for(
1296: Annz2 + Bnnz2, KOKKOS_LAMBDA(PetscCount i) {
1297: if (i < Annz2) {
1298: for (PetscCount k = Ajmap2(i); k < Ajmap2(i + 1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));
1299: } else {
1300: i -= Annz2;
1301: for (PetscCount k = Bjmap2(i); k < Bjmap2(i + 1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));
1302: }
1303: });
1305: if (imode == INSERT_VALUES) {
1306: MatSeqAIJRestoreKokkosViewWrite(A, &Aa); /* Increase A & B's state etc. */
1307: MatSeqAIJRestoreKokkosViewWrite(B, &Ba);
1308: } else {
1309: MatSeqAIJRestoreKokkosView(A, &Aa);
1310: MatSeqAIJRestoreKokkosView(B, &Ba);
1311: }
1312: return 0;
1313: }
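/* Illustrative sketch (not part of the PETSc source): the user-facing COO assembly pattern that
   exercises MatSetPreallocationCOO_MPIAIJKokkos() and MatSetValuesCOO_MPIAIJKokkos() above. The
   index arrays coo_i[]/coo_j[] hold global row/column indices; the value array v[] may live in
   host or device (Kokkos) memory, as detected by PetscGetMemType() above. Error checking omitted. */
#if 0
static PetscErrorCode ExampleCOOAssembly(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[], const PetscScalar v[])
{
  MatSetPreallocationCOO(A, n, coo_i, coo_j); /* analyze the pattern once and build the communication maps */
  MatSetValuesCOO(A, v, ADD_VALUES);          /* can be called repeatedly; duplicate (i,j) entries are summed */
  return 0;
}
#endif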
1315: PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1316: {
1317: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
1319: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
1320: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
1321: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
1322: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
1323: delete (Mat_MPIAIJKokkos *)mpiaij->spptr;
1324: MatDestroy_MPIAIJ(A);
1325: return 0;
1326: }
1328: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
1329: {
1330: Mat B;
1331: Mat_MPIAIJ *a;
1333: if (reuse == MAT_INITIAL_MATRIX) {
1334: MatDuplicate(A, MAT_COPY_VALUES, newmat);
1335: } else if (reuse == MAT_REUSE_MATRIX) {
1336: MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
1337: }
1338: B = *newmat;
1340: B->boundtocpu = PETSC_FALSE;
1341: PetscFree(B->defaultvectype);
1342: PetscStrallocpy(VECKOKKOS, &B->defaultvectype);
1343: PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJKOKKOS);
1345: a = static_cast<Mat_MPIAIJ *>(A->data);
1346: if (a->A) MatSetType(a->A, MATSEQAIJKOKKOS);
1347: if (a->B) MatSetType(a->B, MATSEQAIJKOKKOS);
1348: if (a->lvec) VecSetType(a->lvec, VECSEQKOKKOS);
1350: B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos;
1351: B->ops->mult = MatMult_MPIAIJKokkos;
1352: B->ops->multadd = MatMultAdd_MPIAIJKokkos;
1353: B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos;
1354: B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1355: B->ops->destroy = MatDestroy_MPIAIJKokkos;
1357: PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJKokkos);
1358: PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);
1359: PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);
1360: PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);
1361: return 0;
1362: }
1363: /*MC
1364: MATAIJKOKKOS - "aijkokkos" ("mpiaijkokkos" for the parallel variant MATMPIAIJKOKKOS), a matrix type to be used for CSR sparse matrices with Kokkos
1366: A matrix type that uses the Kokkos-Kernels CrsMatrix format for portability across different device types
1368: Options Database Keys:
1369: . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
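Example Usage (a minimal sketch, assuming PETSc was configured with Kokkos; m/n/M/N are the usual local/global sizes):
.vb
MatCreate(comm, &A);
MatSetSizes(A, m, n, M, N);
MatSetType(A, MATAIJKOKKOS); /* or -mat_type aijkokkos together with MatSetFromOptions() */
MatSetUp(A);
.ve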
1371: Level: beginner
1373: .seealso: `MatCreateAIJKokkos()`, `MATSEQAIJKOKKOS`, `MATSEQAIJ`, `MATMPIAIJ`
1374: M*/
1375: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1376: {
1377: PetscKokkosInitializeCheck();
1378: MatCreate_MPIAIJ(A);
1379: MatConvert_MPIAIJ_MPIAIJKokkos(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A);
1380: return 0;
1381: }
1383: /*@C
1384: MatCreateAIJKokkos - Creates a sparse matrix in `MATAIJKOKKOS` (compressed row) format.
1385: The matrix operations are ultimately pushed down to Kokkos for
1386: execution on the device. For good matrix
1387: assembly performance the user should preallocate the matrix storage by setting
1388: the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
1389: performance during matrix assembly can be increased by more than a factor of 50.
1391: Collective
1393: Input Parameters:
1394: + comm - MPI communicator
1395: . m, n - number of local rows and columns (or `PETSC_DECIDE` to have them computed from M and N)
1396: . M, N - number of global rows and columns (or `PETSC_DETERMINE` to have them computed from m and n)
1397: . d_nz, d_nnz - preallocation for the diagonal portion of the local submatrix: number of nonzeros per row (same for all rows),
1398: or an array with one entry per row (possibly different for each row), or NULL
1399: - o_nz, o_nnz - the same, for the off-diagonal portion of the local submatrix
1401: Output Parameter:
1402: . A - the matrix
1404: It is recommended that one use `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`
1405: followed by MatXXXXSetPreallocation() (for example, `MatMPIAIJSetPreallocation()`)
1406: instead of calling this routine directly.
1408: Notes:
1409: If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
1411: The AIJ format (also called compressed row storage) is fully compatible with standard Fortran 77
1412: storage. That is, the stored row and column indices can begin at
1413: either one (as in Fortran) or zero. See the users' manual for details.
1415: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
1416: Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
1417: allocation. For large problems you MUST preallocate memory or you
1418: will get TERRIBLE performance; see the users' manual chapter on matrices.
1420: By default, this format uses inodes (identical nodes) when possible, to
1421: improve numerical efficiency of matrix-vector products and solves. We
1422: search for consecutive rows with the same nonzero structure, thereby
1423: reusing matrix information to achieve increased efficiency.
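Example Usage (a minimal sketch; error checking is omitted and nd/no are placeholder per-row nonzero estimates):
.vb
Mat A;
MatCreateAIJKokkos(PETSC_COMM_WORLD, m, n, PETSC_DETERMINE, PETSC_DETERMINE, nd, NULL, no, NULL, &A);
/* ... set values and assemble A ... */
MatDestroy(&A);
.ve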
1425: Level: intermediate
1427: .seealso: `MATAIJKOKKOS`, `MATSEQAIJKOKKOS`, `MATMPIAIJKOKKOS`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1428: @*/
1429: PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
1430: {
1431: PetscMPIInt size;
1433: MatCreate(comm, A);
1434: MatSetSizes(*A, m, n, M, N);
1435: MPI_Comm_size(comm, &size);
1436: if (size > 1) {
1437: MatSetType(*A, MATMPIAIJKOKKOS);
1438: MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
1439: } else {
1440: MatSetType(*A, MATSEQAIJKOKKOS);
1441: MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
1442: }
1443: return 0;
1444: }
1446: // Get a device (GPU) pointer to a stripped-down CSR view of the matrix; works for both Seq and MPI Mat.
1447: PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1448: {
1449: PetscMPIInt size, rank;
1450: MPI_Comm comm;
1451: PetscSplitCSRDataStructure d_mat = NULL;
1453: PetscObjectGetComm((PetscObject)A, &comm);
1454: MPI_Comm_size(comm, &size);
1455: MPI_Comm_rank(comm, &rank);
1456: if (size == 1) {
1457: MatSeqAIJKokkosGetDeviceMat(A, &d_mat);
1458: MatSeqAIJKokkosModifyDevice(A); /* Since we are going to modify matrix values on device */
1459: } else {
1460: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
1461: MatSeqAIJKokkosGetDeviceMat(aij->A, &d_mat);
1462: MatSeqAIJKokkosModifyDevice(aij->A);
1463: MatSeqAIJKokkosModifyDevice(aij->B);
1465: }
1466: // Mimic the bookkeeping of MatSetValues(), since it will not be called on the host
1467: if (A->assembled) {
1468: if (A->was_assembled) PetscInfo(A, "Matrix has already been assembled more than once\n");
1469: A->was_assembled = PETSC_TRUE; // normally set lazily in MatAssemblyEnd(); set it here because host assembly is bypassed
1470: } else {
1471: PetscInfo(A, "Warning !assemble ??? assembled=%" PetscInt_FMT "\n", A->assembled);
1472: }
1473: if (!d_mat) {
1474: struct _n_SplitCSRMat h_mat; /* host container */
1475: Mat_SeqAIJKokkos *aijkokA;
1476: Mat_SeqAIJ *jaca;
1477: PetscInt n = A->rmap->n, nnz;
1478: Mat Amat;
1479: PetscInt *colmap;
1481: /* create and copy h_mat */
1482: h_mat.M = A->cmap->N; // global number of columns; used in debug builds
1483: PetscInfo(A, "Create device matrix in Kokkos\n");
1484: if (size == 1) {
1485: Amat = A;
1486: jaca = (Mat_SeqAIJ *)A->data;
1487: h_mat.rstart = 0;
1488: h_mat.rend = A->rmap->n;
1489: h_mat.cstart = 0;
1490: h_mat.cend = A->cmap->n;
1491: h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1492: h_mat.offdiag.a = NULL;
1493: aijkokA = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1494: } else {
1495: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
1496: Mat_SeqAIJ *jacb = (Mat_SeqAIJ *)aij->B->data;
1497: PetscInt ii;
1498: Mat_SeqAIJKokkos *aijkokB;
1500: Amat = aij->A;
1501: aijkokA = static_cast<Mat_SeqAIJKokkos *>(aij->A->spptr);
1502: aijkokB = static_cast<Mat_SeqAIJKokkos *>(aij->B->spptr);
1503: jaca = (Mat_SeqAIJ *)aij->A->data;
1506: aij->donotstash = PETSC_TRUE;
1507: aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1508: jaca->nonew = jacb->nonew = PETSC_TRUE; // freeze the nonzero structure; no more disassembly
1509: PetscCalloc1(A->cmap->N, &colmap);
1510: for (ii = 0; ii < aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii + 1;
1511: // set up the off-diagonal (B) block of the host container: ownership ranges, device row/column/value pointers, and the column map
1512: h_mat.rstart = A->rmap->rstart;
1513: h_mat.rend = A->rmap->rend;
1514: h_mat.cstart = A->cmap->rstart;
1515: h_mat.cend = A->cmap->rend;
1516: nnz = jacb->i[n];
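/* Device code needs a full row-pointer array of length n+1 for the off-diagonal block. If B is
   stored in compressed-row form, an uncompressed copy of the host row pointer jacb->i is uploaded
   to the device; otherwise the existing device row pointer is reused directly. */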
1517: if (jacb->compressedrow.use) {
1518: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_i_k(jacb->i, n + 1);
1519: aijkokB->i_uncompressed_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_i_k));
1520: Kokkos::deep_copy(aijkokB->i_uncompressed_d, h_i_k);
1521: h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1522: } else {
1523: h_mat.offdiag.i = aijkokB->i_device_data();
1524: }
1525: h_mat.offdiag.j = aijkokB->j_device_data();
1526: h_mat.offdiag.a = aijkokB->a_device_data();
1527: {
1528: Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_colmap_k(colmap, A->cmap->N);
1529: aijkokB->colmap_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_colmap_k));
1530: Kokkos::deep_copy(aijkokB->colmap_d, h_colmap_k);
1531: h_mat.colmap = aijkokB->colmap_d.data();
1532: PetscFree(colmap);
1533: }
1534: h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1535: h_mat.offdiag.n = n;
1536: }
1537: // set up the diagonal (A) block of the host container: device row/column/value pointers
1538: nnz = jaca->i[n];
1539: h_mat.diag.n = n;
1540: h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1541: MPI_Comm_rank(comm, &h_mat.rank);
1543: h_mat.diag.i = aijkokA->i_device_data();
1544: h_mat.diag.j = aijkokA->j_device_data();
1545: h_mat.diag.a = aijkokA->a_device_data();
1546: // copy pointers and metadata to device
1547: MatSeqAIJKokkosSetDeviceMat(Amat, &h_mat);
1548: MatSeqAIJKokkosGetDeviceMat(Amat, &d_mat);
1549: PetscInfo(A, "Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n", h_mat.diag.n, nnz);
1550: }
1551: *B = d_mat; // return the device matrix; it was also cached in the Mat by MatSeqAIJKokkosSetDeviceMat() above
1552: A->assembled = PETSC_FALSE; // mark as unassembled so that device-side writes are allowed; normally done lazily inside MatSetValues()
1553: return 0;
1554: }
1556: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1557: {
1558: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1560: if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1561: else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1562: else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1563: else *mask = "PETSC_OFFLOAD_BOTH";
1564: return 0;
1565: }
1567: PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1568: {
1569: PetscMPIInt size;
1570: Mat Ad, Ao;
1571: const char *amask, *bmask;
1573: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1575: if (size == 1) {
1576: MatSeqAIJKokkosGetOffloadMask(A, &amask);
1577: PetscPrintf(PETSC_COMM_SELF, "%s\n", amask);
1578: } else {
1579: Ad = ((Mat_MPIAIJ *)A->data)->A;
1580: Ao = ((Mat_MPIAIJ *)A->data)->B;
1581: MatSeqAIJKokkosGetOffloadMask(Ad, &amask);
1582: MatSeqAIJKokkosGetOffloadMask(Ao, &bmask);
1583: PetscPrintf(PETSC_COMM_SELF, "Diag : Off-diag = %s : %s\n", amask, bmask);
1584: }
1585: return 0;
1586: }