Actual source code: mpiaijkok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <petscsf.h>
  3: #include <petsc/private/sfimpl.h>
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
  6: #include <KokkosSparse_spadd.hpp>

  8: PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A, MatAssemblyType mode)
  9: {
 10:   Mat_SeqAIJKokkos *aijkok;
 11:   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ *)A->data;

 13:   MatAssemblyEnd_MPIAIJ(A, mode);
 14:   /* E.g., MatCreateSubMatrix() calls MatCreateMPIAIJWithSeqAIJ(comm,A,B,..), which creates Bnew of SEQAIJ and destroys B of SEQAIJKOKKOS.
 15:      Thus we finalize A/B/lvec's type in MatAssemblyEnd() to handle various cases.
 16:    */
 17:   if (mode == MAT_FINAL_ASSEMBLY) {
 18:     MatSetType(mpiaij->A, MATSEQAIJKOKKOS);
 19:     MatSetType(mpiaij->B, MATSEQAIJKOKKOS);
 20:     VecSetType(mpiaij->lvec, VECSEQKOKKOS);
 21:   }
 22:   aijkok = static_cast<Mat_SeqAIJKokkos *>(((Mat_MPIAIJ *)A->data)->A->spptr); /* Access spptr after MatAssemblyEnd_MPIAIJ(), which might have deleted old spptr */
 23:   if (aijkok && aijkok->device_mat_d.data()) {
 24:     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
 25:   }

 27:   return 0;
 28: }

 30: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
 31: {
 32:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;

 34:   PetscLayoutSetUp(mat->rmap);
 35:   PetscLayoutSetUp(mat->cmap);
 36: #if defined(PETSC_USE_DEBUG)
 37:   if (d_nnz) {
 38:     PetscInt i;
 40:   }
 41:   if (o_nnz) {
 42:     PetscInt i;
 44:   }
 45: #endif
 46: #if defined(PETSC_USE_CTABLE)
 47:   PetscTableDestroy(&mpiaij->colmap);
 48: #else
 49:   PetscFree(mpiaij->colmap);
 50: #endif
 51:   PetscFree(mpiaij->garray);
 52:   VecDestroy(&mpiaij->lvec);
 53:   VecScatterDestroy(&mpiaij->Mvctx);
 54:   /* Because B will have been resized, we simply destroy it and create a new one each time */
 55:   MatDestroy(&mpiaij->B);

 57:   if (!mpiaij->A) {
 58:     MatCreate(PETSC_COMM_SELF, &mpiaij->A);
 59:     MatSetSizes(mpiaij->A, mat->rmap->n, mat->cmap->n, mat->rmap->n, mat->cmap->n);
 60:   }
 61:   if (!mpiaij->B) {
 62:     PetscMPIInt size;
 63:     MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
 64:     MatCreate(PETSC_COMM_SELF, &mpiaij->B);
 65:     MatSetSizes(mpiaij->B, mat->rmap->n, size > 1 ? mat->cmap->N : 0, mat->rmap->n, size > 1 ? mat->cmap->N : 0);
 66:   }
 67:   MatSetType(mpiaij->A, MATSEQAIJKOKKOS);
 68:   MatSetType(mpiaij->B, MATSEQAIJKOKKOS);
 69:   MatSeqAIJSetPreallocation(mpiaij->A, d_nz, d_nnz);
 70:   MatSeqAIJSetPreallocation(mpiaij->B, o_nz, o_nnz);
 71:   mat->preallocated = PETSC_TRUE;
 72:   return 0;
 73: }
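
/* Usage sketch (illustrative, not compiled here): how the preallocation routine above is reached through
   the public API. It assumes PetscInitialize() has been called and <petscmat.h> is included; the global
   size 100 and the per-row nonzero estimates (5 diagonal, 2 off-diagonal) are placeholders.

     Mat A;
     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
     MatSetType(A, MATMPIAIJKOKKOS);
     MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); // dispatches to MatMPIAIJSetPreallocation_MPIAIJKokkos()
     // ... insert values with MatSetValues() ...
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);          // dispatches to MatAssemblyEnd_MPIAIJKokkos()
     MatDestroy(&A);
*/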

 75: PetscErrorCode MatMult_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
 76: {
 77:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
 78:   PetscInt    nt;

 80:   VecGetLocalSize(xx, &nt);
 82:   VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 83:   (*mpiaij->A->ops->mult)(mpiaij->A, xx, yy);
 84:   VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 85:   (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy);
 86:   return 0;
 87: }

 89: PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat, Vec xx, Vec yy, Vec zz)
 90: {
 91:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
 92:   PetscInt    nt;

 94:   VecGetLocalSize(xx, &nt);
 96:   VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 97:   (*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz);
 98:   VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 99:   (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz);
100:   return 0;
101: }

103: PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
104: {
105:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
106:   PetscInt    nt;

108:   VecGetLocalSize(xx, &nt);
110:   (*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec);
111:   (*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy);
112:   VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
113:   VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
114:   return 0;
115: }
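
/* Note on the three multiply routines above: they rely on the standard MPIAIJ splitting of the local rows
   of mat into the diag block Ad (locally owned columns) and the offdiag block Ao (off-process columns):

     MatMult:          y = Ad*x + Ao*xg,           where xg = ghost values of x gathered into lvec
     MatMultAdd:       z = Ad*x + y + Ao*xg
     MatMultTranspose: y = Ad^t*x, plus Ao^t*x scatter-added (ADD_VALUES, SCATTER_REVERSE) to the owners of y

   In the two forward products the VecScatterBegin/End pair overlaps the gather of ghost values with the
   diag-block multiply. */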

117: /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
118:    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
119:    C still uses local column ids. Their corresponding global column ids are returned in glob.
120: */
121: PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat, MatReuse reuse, IS *glob, Mat *C)
122: {
123:   Mat             Ad, Ao;
124:   const PetscInt *cmap;

126:   MatMPIAIJGetSeqAIJ(mat, &Ad, &Ao, &cmap);
127:   MatSeqAIJKokkosMergeMats(Ad, Ao, reuse, C);
128:   if (glob) {
129:     PetscInt cst, i, dn, on, *gidx;
130:     MatGetLocalSize(Ad, NULL, &dn);
131:     MatGetLocalSize(Ao, NULL, &on);
132:     MatGetOwnershipRangeColumn(mat, &cst, NULL);
133:     PetscMalloc1(dn + on, &gidx);
134:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
135:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
136:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
137:   }
138:   return 0;
139: }
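
/* Worked example (illustrative values) of the glob index set built above: suppose this rank owns global
   columns [cst, cst+dn) = [100, 103), so dn = 3, and the offdiag block references the global columns
   cmap[] = {7, 250}, so on = 2. Then C has dn+on = 5 local columns and

     gidx[] = {100, 101, 102, 7, 250}

   i.e., the diag-block columns come first (in global numbering), followed by the offdiag columns in the
   order given by cmap[]. */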

141: /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
142: struct MatMatStruct {
143:   MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */
144:   PetscSF             sf;      /* SF to send/recv matrix entries */
145:   MatScalarKokkosView abuf;    /* buf of mat values in send/recv */
146:   Mat                 C1, C2, B_local;
147:   KokkosCsrMatrix     C1_global, C2_global, C_global;
148:   KernelHandle        kh;
149:   MatMatStruct()
150:   {
151:     C1 = C2 = B_local = NULL;
152:     sf                = NULL;
153:   }

155:   ~MatMatStruct()
156:   {
157:     MatDestroy(&C1);
158:     MatDestroy(&C2);
159:     MatDestroy(&B_local);
160:     PetscSFDestroy(&sf);
161:     kh.destroy_spadd_handle();
162:   }
163: };

165: struct MatMatStruct_AB : public MatMatStruct {
166:   MatColIdxKokkosView rows;
167:   MatRowMapKokkosView rowoffset;
168:   Mat                 B_other, C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */

170:   MatMatStruct_AB() : B_other(NULL), C_petsc(NULL) { }
171:   ~MatMatStruct_AB()
172:   {
173:     MatDestroy(&B_other);
174:     MatDestroy(&C_petsc);
175:   }
176: };

178: struct MatMatStruct_AtB : public MatMatStruct {
179:   MatRowMapKokkosView srcrowoffset, dstrowoffset;
180: };

182: struct MatProductData_MPIAIJKokkos {
183:   MatMatStruct_AB  *mmAB;
184:   MatMatStruct_AtB *mmAtB;
185:   PetscBool         reusesym;

187:   MatProductData_MPIAIJKokkos() : mmAB(NULL), mmAtB(NULL), reusesym(PETSC_FALSE) { }
188:   ~MatProductData_MPIAIJKokkos()
189:   {
190:     delete mmAB;
191:     delete mmAtB;
192:   }
193: };

195: static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
196: {
197:   delete static_cast<MatProductData_MPIAIJKokkos *>(data);
198:   return 0;
199: }

201: /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix

203:    Input Parameters:
204: +  A       - the MATSEQAIJKOKKOS matrix
205: .  N       - new column size for the returned Kokkos matrix
206: -  l2g     - a map that maps old col ids to new col ids

208:    Output Parameters:
209: .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
210:  */
211: static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A, PetscInt N, const ConstMatColIdxKokkosView &l2g, KokkosCsrMatrix &csrmat)
212: {
213:   KokkosCsrMatrix    &orig = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
214:   MatColIdxKokkosView jg("jg", orig.nnz()); /* New j array for csrmat */

216:   PetscCallCXX(Kokkos::parallel_for(
217:     orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) { jg(i) = l2g(orig.graph.entries(i)); }));
218:   csrmat = KokkosCsrMatrix("csrmat", orig.numRows(), N, orig.nnz(), orig.values, orig.graph.row_map, jg);
219:   return 0;
220: }
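
/* Worked example (illustrative values) of the column remap above: if A's local j array is {0, 2, 1, 2}
   and l2g = {10, 40, 70}, the returned csrmat keeps A's row_map and values but uses the new entries
   jg = {10, 70, 40, 70}, with its column count set to N. */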

222: /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
223:    It is similar to MatCreateMPIAIJWithSplitArrays.

225:   Input Parameters:
226: +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
227: .  A     - the diag matrix using local col ids
228: -  B     - the offdiag matrix using global col ids

230:   Output Parameters:
231: .  mat   - the updated MATMPIAIJKOKKOS matrix
232: */
233: static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat, Mat A, Mat B)
234: {
235:   Mat_MPIAIJ       *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
236:   PetscInt          m, n, M, N, Am, An, Bm, Bn;
237:   Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);

239:   MatGetSize(mat, &M, &N);
240:   MatGetLocalSize(mat, &m, &n);
241:   MatGetLocalSize(A, &Am, &An);
242:   MatGetLocalSize(B, &Bm, &Bn);

248:   mpiaij->A = A;
249:   mpiaij->B = B;

251:   mat->preallocated     = PETSC_TRUE;
252:   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */

254:   MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE);
255:   MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
256:   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
257:     also gets mpiaij->B compacted, with its col ids and size reduced
258:   */
259:   MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
260:   MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE);
261:   MatSetOption(mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);

263:   /* Update bkok with new local col ids (stored on host) and size */
264:   bkok->j_dual.modify_host();
265:   bkok->j_dual.sync_device();
266:   bkok->SetColSize(mpiaij->B->cmap->n);
267:   return 0;
268: }

270: /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrix (B) to form a SEQAIJKOKKOS matrix (C).

272:    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
273:    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
274:    If C's j-th row is connected to a root identified by PetscSFNode (k,i), then we will bcast the i-th row of B on rank k
275:    to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).

277:    Collective on comm of ownerSF

279:    Input Parameters:
280: +   B       - the SEQAIJKOKKOS matrix, using local col ids
281: .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
282: .   N       - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
283: .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
284: .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)

286:    Input/Output Parameters (out when reuse = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
287: +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
288: .   abuf      - buffer for sending matrix values
289: .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
290:                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
291: .   rowoffset - For each row in rows[], the offset in abuf[] at which that row's values are placed
292: -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
293: */
294: static PetscErrorCode MatSeqAIJKokkosBcast(Mat B, MatReuse reuse, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &bcastSF, MatScalarKokkosView &abuf, MatColIdxKokkosView &rows, MatRowMapKokkosView &rowoffset, Mat &C)
295: {
296:   Mat_SeqAIJKokkos *bkok, *ckok;

298:   MatSeqAIJKokkosSyncDevice(B); /* Make sure B->spptr is accessible */
299:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);

301:   if (reuse == MAT_REUSE_MATRIX) {
302:     ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

304:     const auto &Ba = bkok->a_dual.view_device();
305:     const auto &Bi = bkok->i_dual.view_device();
306:     const auto &Ca = ckok->a_dual.view_device();

308:     /* Copy Ba to abuf */
309:     Kokkos::parallel_for(
310:       Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
311:         PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
312:         PetscInt r    = rows(i);
313:         PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
314:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) { abuf(base + k) = Ba(Bi(r) + k); });
315:       });

317:     /* Send abuf to Ca through bcastSF and then mark C is updated on device */
318:     PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE); /* TODO: get memtype for abuf */
319:     PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE);
320:     ckok->a_dual.modify_device();
321:   } else if (reuse == MAT_INITIAL_MATRIX) {
322:     MPI_Comm    comm;
323:     PetscMPIInt tag;
324:     PetscInt    k, Cm, Cn, Cnnz, *Ci_h, nroots, nleaves;

326:     PetscObjectGetComm((PetscObject)ownerSF, &comm);
327:     PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
328:     Cm = nleaves; /* row size of C */
329:     Cn = N;       /* col size of C, which initially uses global ids, so we can safely set its col size as N */

331:     /* Get row lens (nz) of B's rows for later fast query */
332:     PetscInt       *Browlens;
333:     const PetscInt *tmp = bkok->i_host_data();
334:     PetscMalloc1(nroots, &Browlens);
335:     for (k = 0; k < nroots; k++) Browlens[k] = tmp[k + 1] - tmp[k];

337:     /* By ownerSF, each proc gets lens of rows of C */
338:     MatRowMapKokkosDualView Ci("i", Cm + 1); /* C's rowmap */
339:     Ci_h    = Ci.view_host().data();
340:     Ci_h[0] = 0;
341:     PetscSFBcastWithMemTypeBegin(ownerSF, MPIU_INT, PETSC_MEMTYPE_HOST, Browlens, PETSC_MEMTYPE_HOST, &Ci_h[1], MPI_REPLACE);
342:     PetscSFBcastEnd(ownerSF, MPIU_INT, Browlens, &Ci_h[1], MPI_REPLACE);
343:     for (k = 1; k < Cm + 1; k++) Ci_h[k] += Ci_h[k - 1]; /* Convert lens to CSR */
344:     Cnnz = Ci_h[Cm];
345:     Ci.modify_host();
346:     Ci.sync_device();

348:     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
349:     MatColIdxKokkosDualView Cj("j", Cnnz);
350:     MatScalarKokkosDualView Ca("a", Cnnz);

352:     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
353:     const PetscMPIInt *iranks, *ranks;
354:     const PetscInt    *ioffset, *irootloc, *roffset;
355:     PetscInt           i, j, niranks, nranks, *sdisp, *rdisp, *rowptr;
356:     MPI_Request       *reqs;

358:     PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &irootloc);                      /* irootloc[] contains indices of rows I need to send to each receiver */
359:     PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* recv info */

361:     /* figure out offsets at the send buffer, to build the SF
362:       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
363:       rowptr[] - stores offsets for data of each row in abuf

365:       rdisp[]  - to receive sdisp[]
366:     */
367:     PetscMalloc3(niranks + 1, &sdisp, nranks, &rdisp, niranks + nranks, &reqs);
368:     MatRowMapKokkosViewHost rowptr_h("rowptr_h", ioffset[niranks] + 1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
369:     rowptr = rowptr_h.data();

371:     sdisp[0]  = 0;
372:     rowptr[0] = 0;
373:     for (i = 0; i < niranks; i++) { /* for each receiver */
374:       PetscInt len, nz = 0;
375:       for (j = ioffset[i]; j < ioffset[i + 1]; j++) { /* for each row to this receiver */
376:         len           = Browlens[irootloc[j]];
377:         rowptr[j + 1] = rowptr[j] + len;
378:         nz += len;
379:       }
380:       sdisp[i + 1] = sdisp[i] + nz;
381:     }
382:     PetscCommGetNewTag(comm, &tag);
383:     for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
384:     for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
385:     MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);

387:     PetscInt     nleaves2 = Cnnz;           /* leaves are the nonzeros I will receive */
388:     PetscInt     nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
389:     PetscSFNode *iremote;
390:     PetscMalloc1(nleaves2, &iremote);
391:     for (i = 0; i < nranks; i++) { /* for each sender */
392:       k = 0;
393:       for (j = Ci_h[roffset[i]]; j < Ci_h[roffset[i + 1]]; j++) {
394:         iremote[j].rank  = ranks[i];
395:         iremote[j].index = rdisp[i] + k;
396:         k++;
397:       }
398:     }
399:     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
400:     PetscSFCreate(comm, &bcastSF);
401:     PetscSFSetGraph(bcastSF, nroots2, nleaves2, NULL /*ilocal*/, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);

403:     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
404:       from local to global. Then use bcastSF to fill Ca, Cj.
405:     */
406:     ConstMatColIdxKokkosViewHost rows_h(irootloc, ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
407:     MatColIdxKokkosView          rows("rows", ioffset[niranks]);
408:     Kokkos::deep_copy(rows, rows_h); /* Use deep copy since irootloc is managed by PetscSF and we want 'rows' to be standalone */

410:     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */

412:     MatColIdxKokkosView jbuf("jbuf", sdisp[niranks]);   /* send buf for (global) col ids */
413:     abuf = MatScalarKokkosView("abuf", sdisp[niranks]); /* send buf for mat values */

415:     const auto &Ba = bkok->a_dual.view_device();
416:     const auto &Bi = bkok->i_dual.view_device();
417:     const auto &Bj = bkok->j_dual.view_device();

419:     /* Copy Ba, Bj to abuf, jbuf, changing col ids from local to global */
420:     Kokkos::parallel_for(
421:       Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
422:         PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
423:         PetscInt r    = rows(i);
424:         PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
425:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) {
426:           abuf(base + k) = Ba(Bi(r) + k);
427:           jbuf(base + k) = l2g(Bj(Bi(r) + k));
428:         });
429:       });

431:     /* Send abuf & jbuf to fill Ca, Cj */
432:     PetscSFBcastBegin(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
433:     PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
434:     PetscSFBcastEnd(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
435:     PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
436:     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
437:     Cj.sync_host();
438:     Ca.modify_device();

440:     /* Construct C with Ca, Ci, Cj */
441:     auto ckok = new Mat_SeqAIJKokkos(Cm, Cn, Cnnz, Ci, Cj, Ca);
442:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, &C);
443:     PetscFree3(sdisp, rdisp, reqs);
444:     PetscFree(Browlens);
445:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
446:   return 0;
447: }
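
/* Worked example (illustrative values) of the send-side layout built in the MAT_INITIAL_MATRIX branch
   above: suppose irootloc[] says this rank must send its local rows {2, 5, 2} (row 2 goes to two different
   receivers) and those rows of B have lengths {3, 4, 3}. Then

     rows[]      = {2, 5, 2}
     rowoffset[] = {0, 3, 7, 10}   // CSR-style offsets of each sent row inside abuf/jbuf
     abuf, jbuf  hold 10 entries   // sdisp[niranks] = 10

   and bcastSF moves those 10 buffer entries directly into the matching slots of Ca/Cj on the receivers. */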

449: /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)

451:   It is the reverse of MatSeqAIJKokkosBcast in some sense.

453:   Think of each row of A as a leaf; the given ownerSF then specifies the roots of the leaves. Roots may connect to multiple leaves.
454:   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
455:   contain repeated col ids, which does not matter since the duplicates will be summed up by other routines. C's row size will be nroots of ownerSF.

457:   Input Parameters:
458: +  A        - the SEQAIJKOKKOS matrix to be reduced
459: .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
460: .  local    - true if A uses local col ids; false if A is already in global col ids.
461: .  N        - if local, N is A's global col size
462: .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
463: -  ownerSF  - the SF specifies ownership (root) of rows in A

465:   Output Parameters:
466: +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
467: .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
468: .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
469: .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Rows i and i+1 in abuf[] may go to
470:                   unrelated places in Ca, so dstrowoffset is not in a CSR-like format as srcrowoffset is.
471: -  C            - the matrix made up by rows sent to me from other ranks, using global col ids

473:    TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide opportunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
474:  */
475: static PetscErrorCode MatSeqAIJKokkosReduce(Mat A, MatReuse reuse, PetscBool local, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &reduceSF, MatScalarKokkosView &abuf, MatRowMapKokkosView &srcrowoffset, MatRowMapKokkosView &dstrowoffset, KokkosCsrMatrix &C)
476: {
477:   PetscInt          i, r, Am, An, Annz, Cnnz, nrows;
478:   const PetscInt   *Ai;
479:   Mat_SeqAIJKokkos *akok;

481:   MatSeqAIJKokkosSyncDevice(A); /* So that A's latest data is on device */
482:   MatGetSize(A, &Am, &An);
483:   Ai   = static_cast<Mat_SeqAIJ *>(A->data)->i;
484:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
485:   Annz = Ai[Am];

487:   if (reuse == MAT_REUSE_MATRIX) {
488:     /* Send Aa to abuf */
489:     PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
490:     PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);

492:     /* Copy abuf to Ca */
493:     const MatScalarKokkosView &Ca = C.values;
494:     nrows                         = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
495:     Kokkos::parallel_for(
496:       Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
497:         PetscInt i   = t.league_rank();
498:         PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
499:         PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
500:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) { Ca(dst + k) = abuf(src + k); });
501:       });
502:   } else if (reuse == MAT_INITIAL_MATRIX) {
503:     MPI_Comm     comm;
504:     MPI_Request *reqs;
505:     PetscMPIInt  tag;
506:     PetscInt     Cm;

508:     PetscObjectGetComm((PetscObject)ownerSF, &comm);
509:     PetscCommGetNewTag(comm, &tag);

511:     PetscInt           niranks, nranks, nroots, nleaves;
512:     const PetscMPIInt *iranks, *ranks;
513:     const PetscInt    *ioffset, *rows, *roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
514:     PetscSFSetUp(ownerSF);
515:     PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &rows);                          /* recv info: iranks[] will send rows to me */
516:     PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* send info */
517:     PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
519:     Cm    = nroots;
520:     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */

522:     /* Tell owners the length of each row I will send */
523:     PetscInt               *srowlens;                              /* send buf of row lens */
524:     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h", nrows + 1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
525:     PetscInt               *rrowlens = rrowlens_h.data();

527:     PetscMalloc2(Am, &srowlens, niranks + nranks, &reqs);
528:     for (i = 0; i < Am; i++) srowlens[i] = Ai[i + 1] - Ai[i];
529:     rrowlens[0] = 0;
530:     rrowlens++; /* shift the pointer to make the following expression more readable */
531:     for (i = 0; i < niranks; i++) MPI_Irecv(&rrowlens[ioffset[i]], ioffset[i + 1] - ioffset[i], MPIU_INT, iranks[i], tag, comm, &reqs[i]);
532:     for (i = 0; i < nranks; i++) MPI_Isend(&srowlens[roffset[i]], roffset[i + 1] - roffset[i], MPIU_INT, ranks[i], tag, comm, &reqs[niranks + i]);
533:     MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);

535:     /* Owner builds Ci on host by histogramming rrowlens[] */
536:     MatRowMapKokkosViewHost Ci_h("i", Cm + 1);
537:     Kokkos::deep_copy(Ci_h, 0); /* Zero Ci */
538:     MatRowMapType *Ci_ptr = Ci_h.data();

540:     for (i = 0; i < nrows; i++) {
541:       r = rows[i]; /* local row id of i-th received row */
542: #if defined(PETSC_USE_DEBUG)
544: #endif
545:       Ci_ptr[r + 1] += rrowlens[i]; /* add to length of row r in C */
546:     }
547:     for (i = 0; i < Cm; i++) Ci_ptr[i + 1] += Ci_ptr[i]; /* to CSR format */
548:     Cnnz = Ci_ptr[Cm];

550:     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
551:     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h", nrows);
552:     PetscInt               *dstrowoffset_hptr = dstrowoffset_h.data();
553:     PetscInt               *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */

555:     PetscCalloc1(Cm, &currowlens);           /* Init with zero, to be added to */
556:     for (i = 0; i < nrows; i++) {                       /* for each row I receive */
557:       r                    = rows[i];                   /* row id in C */
558:       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
559:       currowlens[r] += rrowlens[i];                     /* accumulate to length of row r in C */
560:     }
561:     PetscFree(currowlens);

563:     rrowlens--;
564:     for (i = 0; i < nrows; i++) rrowlens[i + 1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
565:     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), dstrowoffset_h);
566:     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */

568:     /* Build the reduceSF, which performs buffer to buffer send/recv */
569:     PetscInt *sdisp, *rdisp; /* buffer to send offsets of roots, and buffer to recv them */
570:     PetscMalloc2(niranks, &sdisp, nranks, &rdisp);
571:     for (i = 0; i < niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
572:     for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
573:     for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
574:     MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);

576:     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
577:     PetscInt     nroots2 = Cnnz, nleaves2 = Annz;
578:     PetscSFNode *iremote;
579:     PetscMalloc1(nleaves2, &iremote); /* no free, since memory will be given to reduceSF */
580:     for (i = 0; i < nranks; i++) {
581:       PetscInt rootbase = rdisp[i];                      /* root offset at this root rank */
582:       PetscInt leafbase = Ai[roffset[i]];                /* leaf base */
583:       PetscInt nz       = Ai[roffset[i + 1]] - leafbase; /* I will send nz nonzeros to this root rank */
584:       for (PetscInt k = 0; k < nz; k++) {
585:         iremote[leafbase + k].rank  = ranks[i];
586:         iremote[leafbase + k].index = rootbase + k;
587:       }
588:     }
589:     PetscSFCreate(comm, &reduceSF);
590:     PetscSFSetGraph(reduceSF, nroots2, nleaves2, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
591:     PetscFree2(sdisp, rdisp);

593:     /* Reduce Aa, Ajg to abuf and jbuf */

595:     /* If A uses local col ids, convert them to global ones before sending */
596:     MatColIdxKokkosView Ajg;
597:     if (local) {
598:       Ajg                           = MatColIdxKokkosView("j", Annz);
599:       const MatColIdxKokkosView &Aj = akok->j_dual.view_device();
600:       Kokkos::parallel_for(
601:         Annz, KOKKOS_LAMBDA(const PetscInt i) { Ajg(i) = l2g(Aj(i)); });
602:     } else {
603:       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
604:     }

606:     MatColIdxKokkosView jbuf("jbuf", Cnnz);
607:     abuf = MatScalarKokkosView("abuf", Cnnz);
608:     PetscSFReduceBegin(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
609:     PetscSFReduceEnd(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
610:     PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
611:     PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);

613:     /* Copy data from abuf, jbuf to Ca, Cj */
614:     MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ci_h); /* Ci is an alias of Ci_h if no device */
615:     MatColIdxKokkosView Cj("j", Cnnz);
616:     MatScalarKokkosView Ca("a", Cnnz);

618:     Kokkos::parallel_for(
619:       Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
620:         PetscInt i   = t.league_rank();
621:         PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
622:         PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
623:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) {
624:           Ca(dst + k) = abuf(src + k);
625:           Cj(dst + k) = jbuf(src + k);
626:         });
627:       });

629:     /* Build C with Ca, Ci, Cj */
630:     C = KokkosCsrMatrix("csrmat", Cm, N, Cnnz, Ca, Ci, Cj);
631:     PetscFree2(srowlens, reqs);
632:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
633:   return 0;
634: }
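
/* Worked example (illustrative values) of the receive-side offsets built in the MAT_INITIAL_MATRIX branch
   above: suppose this rank owns Cm = 2 rows and receives nrows = 3 partial rows with local row ids
   rows[] = {1, 0, 1} and lengths rrowlens[] = {2, 3, 2}. Then

     srcrowoffset[] = {0, 2, 5, 7}   // CSR of the received lengths; offsets into abuf/jbuf
     Ci[]           = {0, 3, 7}      // row 0 gets 3 entries, row 1 gets 2+2 = 4
     dstrowoffset[] = {3, 0, 5}      // where each received partial row lands in Ca/Cj

   so the two partial copies of row 1 land at offsets 3 and 5 of Ca/Cj; any repeated col ids they carry are
   summed later (e.g., by spadd). */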

636: /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a `MATMPIAIJKOKKOS` matrix by splitting a KokkosCsrMatrix

638:   Input Parameters:
639: +  C        - the `MATMPIAIJKOKKOS` matrix, of size m,n,M,N
640: .  reuse    - indicate whether the matrix has called this function before
641: .  csrmat   - the KokkosCsrMatrix, of size m,N
642: -  Cdstart  - when reuse == `MAT_REUSE_MATRIX`, it is an input parameter. For each row in csrmat, it stores the start of the first
643:               entry of the diag block of C in csrmat's j array. E.g., if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
644:               entry is 5, then Cdstart[i] = 3.

646:   Output Parameters:
647: +  C        - the updated `MATMPIAIJKOKKOS` matrix
648: -  Cdstart - when reuse == `MAT_INITIAL_MATRIX`, it is an output parameter

650:   Note:
651:    Between calls with `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`, csrmat must have the same nonzero pattern

653: .seealso: `MATMPIAIJKOKKOS`
654:  */
655: static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C, MatReuse reuse, const KokkosCsrMatrix &csrmat, MatRowMapKokkosView &Cdstart)
656: {
657:   const MatScalarKokkosView      &Ca = csrmat.values;
658:   const ConstMatRowMapKokkosView &Ci = csrmat.graph.row_map;
659:   PetscInt                        m, n, N;

661:   MatGetLocalSize(C, &m, &n);
662:   MatGetSize(C, NULL, &N);

664:   if (reuse == MAT_REUSE_MATRIX) {
665:     Mat_MPIAIJ                *mpiaij = static_cast<Mat_MPIAIJ *>(C->data);
666:     Mat_SeqAIJKokkos          *akok   = static_cast<Mat_SeqAIJKokkos *>(mpiaij->A->spptr);
667:     Mat_SeqAIJKokkos          *bkok   = static_cast<Mat_SeqAIJKokkos *>(mpiaij->B->spptr);
668:     const MatScalarKokkosView &Cda = akok->a_dual.view_device(), Coa = bkok->a_dual.view_device();
669:     const MatRowMapKokkosView &Cdi = akok->i_dual.view_device(), Coi = bkok->i_dual.view_device();

671:     /* Fill 'a' of Cd and Co on device */
672:     Kokkos::parallel_for(
673:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
674:         PetscInt i       = t.league_rank();     /* row i */
675:         PetscInt clen    = Ci(i + 1) - Ci(i);   /* len of row i of C */
676:         PetscInt cdlen   = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
677:         PetscInt cdstart = Cdstart(i);          /* [start, end) of row i of Cd in C */
678:         PetscInt cdend   = cdstart + cdlen;
679:         /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
680:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
681:           if (k < cdstart) { /* k in [0, cdstart) */
682:             Coa(Coi(i) + k) = Ca(Ci(i) + k);
683:           } else if (k < cdend) { /* k in [cdstart, cdend) */
684:             Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
685:           } else { /* k in [cdend, clen) */
686:             Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
687:           }
688:         });
689:       });

691:     akok->a_dual.modify_device();
692:     bkok->a_dual.modify_device();
693:   } else if (reuse == MAT_INITIAL_MATRIX) {
694:     Mat                        Cd, Co;
695:     const MatColIdxKokkosView &Cj = csrmat.graph.entries;
696:     MatRowMapKokkosDualView    Cdi_dual("i", m + 1), Coi_dual("i", m + 1);
697:     MatRowMapKokkosView        Cdi = Cdi_dual.view_device(), Coi = Coi_dual.view_device();
698:     PetscInt                   cstart, cend;

700:     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
701:        left of the diag block, the diag block, and right of the diag block. The diag block has col ids in [cstart,cend).
702:        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
703:        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalently
704:        stores the values of cdstart and cdend-cdstart (aka Cdi[]) instead.
705:      */
706:     Cdstart = MatRowMapKokkosView("Cdstart", m);
707:     PetscLayoutGetRange(C->cmap, &cstart, &cend); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */

709:     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
710:       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
711:      */
712:     Kokkos::parallel_for(
713:       Kokkos::TeamPolicy<>(m, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
714:         Kokkos::single(Kokkos::PerTeam(t), [=]() {                               /* Only one thread works in a team */
715:                                                    PetscInt i = t.league_rank(); /* row i */
716:                                                    PetscInt j, first, count, step;

718:                                                    if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
719:                                                      Cdi(0) = 0;
720:                                                      Coi(0) = 0;
721:                                                    }

723:                                                    /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
724:           in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
725:         */
726:                                                    count = Ci(i + 1) - Ci(i);
727:                                                    first = Ci(i);
728:                                                    while (count > 0) {
729:                                                      j    = first;
730:                                                      step = count / 2;
731:                                                      j += step;
732:                                                      if (Cj(j) < cstart) {
733:                                                        first = ++j;
734:                                                        count -= step + 1;
735:                                                      } else count = step;
736:                                                    }
737:                                                    Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */

739:                                                    /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
740:                                                    count = Ci(i + 1) - first;
741:                                                    while (count > 0) {
742:                                                      j    = first;
743:                                                      step = count / 2;
744:                                                      j += step;
745:                                                      if (Cj(j) < cend) {
746:                                                        first = ++j;
747:                                                        count -= step + 1;
748:                                                      } else count = step;
749:                                                    }
750:                                                    Cdi(i + 1) = first - (Ci(i) + Cdstart(i));     /* 'first' is the while-loop's output */
751:                                                    Coi(i + 1) = (Ci(i + 1) - Ci(i)) - Cdi(i + 1); /* Co's row len = C's row len - Cd's row len */
752:         });
753:       });

755:     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
756:     Kokkos::parallel_scan(
757:       m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
758:         update += Cdi(i);
759:         if (final) Cdi(i) = update;
760:       });
761:     Kokkos::parallel_scan(
762:       m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
763:         update += Coi(i);
764:         if (final) Coi(i) = update;
765:       });

767:     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
768:        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
769:     */
770:     Cdi_dual.modify_device();
771:     Coi_dual.modify_device();
772:     Cdi_dual.sync_host();
773:     Coi_dual.sync_host();
774:     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
775:     PetscInt Co_nnz = Coi_dual.view_host().data()[m];

777:     /* With nnz, allocate a, j for Cd and Co */
778:     MatColIdxKokkosDualView Cdj_dual("j", Cd_nnz), Coj_dual("j", Co_nnz);
779:     MatScalarKokkosDualView Cda_dual("a", Cd_nnz), Coa_dual("a", Co_nnz);

781:     /* Fill a, j of Cd and Co on device */
782:     MatColIdxKokkosView Cdj = Cdj_dual.view_device(), Coj = Coj_dual.view_device();
783:     MatScalarKokkosView Cda = Cda_dual.view_device(), Coa = Coa_dual.view_device();

785:     Kokkos::parallel_for(
786:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
787:         PetscInt i       = t.league_rank();     /* row i */
788:         PetscInt clen    = Ci(i + 1) - Ci(i);   /* len of row i of C */
789:         PetscInt cdlen   = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
790:         PetscInt cdstart = Cdstart(i);          /* [start, end) of row i of Cd in C */
791:         PetscInt cdend   = cdstart + cdlen;
792:         /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
793:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
794:           if (k < cdstart) { /* k in [0, cdstart) */
795:             Coa(Coi(i) + k) = Ca(Ci(i) + k);
796:             Coj(Coi(i) + k) = Cj(Ci(i) + k);
797:           } else if (k < cdend) { /* k in [cdstart, cdend) */
798:             Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
799:             Cdj(Cdi(i) + (k - cdstart)) = Cj(Ci(i) + k) - cstart; /* Use local col ids in Cdj */
800:           } else {                                                /* k in [cdend, clen) */
801:             Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
802:             Coj(Coi(i) + k - cdlen) = Cj(Ci(i) + k);
803:           }
804:         });
805:       });

807:     Cdj_dual.modify_device();
808:     Cda_dual.modify_device();
809:     Coj_dual.modify_device();
810:     Coa_dual.modify_device();
811:     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
812:     auto cdkok = new Mat_SeqAIJKokkos(m, n, Cd_nnz, Cdi_dual, Cdj_dual, Cda_dual);
813:     auto cokok = new Mat_SeqAIJKokkos(m, N, Co_nnz, Coi_dual, Coj_dual, Coa_dual);
814:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cdkok, &Cd);
815:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cokok, &Co);
816:     MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C, Cd, Co); /* Coj will be converted to local ids within */
817:   }
818:   return 0;
819: }
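
/* Worked example continuing the one in the header comment above: if row i of csrmat has (global) col ids
   {0, 3, 4, 5, 7, 9} and this rank owns columns [cstart, cend) = [5, 8), the two device lower_bound
   searches give

     Cdstart(i) = 3                      // the first entry >= cstart is the 5 at position 3
     Cd row i   = {5, 7} -> local ids {0, 2}
     Co row i   = {0, 3, 4, 9}           // kept in global ids here; compacted in MatAssemblyEnd later
*/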

821: /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKOS matrix's global col ids.

823:   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.

825:   Input Parameter:
826: .  C   - the SEQAIJKOKKOS matrix whose (global) col ids are to be compacted

832:   Output Parameters:
833: +  C   - the updated SEQAIJKOKKOS matrix, now with compacted col ids and a reduced col size
834: -  l2g - a Kokkos view mapping C's new local col ids to its original global col ids

836:   Note:
837:   The input matrix's col ids and col size will be changed.
838: */
839: static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C, MatColIdxKokkosView &l2g)
840: {
841:   Mat_SeqAIJKokkos      *ckok;
842:   ISLocalToGlobalMapping l2gmap;
843:   const PetscInt        *garray;
844:   PetscInt               sz;

846:   /* Compact P_other's global col ids and col size. We do this since we guess that, with local col ids, KK might be more memory scalable */
847:   MatSeqAIJCompactOutExtraColumns_SeqAIJ(C, &l2gmap);
848:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
849:   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
850:   ckok->j_dual.sync_device();
851:   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */

853:   /* Build l2g -- the local to global mapping of C's cols */
854:   ISLocalToGlobalMappingGetIndices(l2gmap, &garray);
855:   ISLocalToGlobalMappingGetSize(l2gmap, &sz);

858:   ConstMatColIdxKokkosViewHost tmp(garray, sz);
859:   l2g = MatColIdxKokkosView("l2g", sz);
860:   Kokkos::deep_copy(l2g, tmp);

862:   ISLocalToGlobalMappingRestoreIndices(l2gmap, &garray);
863:   ISLocalToGlobalMappingDestroy(&l2gmap);
864:   return 0;
865: }
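
/* Worked example (illustrative values) of the compaction above: if C has global col size 100 but its rows
   only reference global columns {7, 42, 99}, then after this routine C's j array uses local ids {0, 1, 2},
   C->cmap->n becomes 3, and the returned map is l2g = {7, 42, 99}. */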

867: /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos

869:   Input Parameters:
870: +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
871: .  A        - an MPIAIJKOKKOS matrix
872: .  B        - an MPIAIJKOKKOS matrix
873: -  mm       - a struct used to stash intermediate data when computing AB. It persists from the symbolic to the numeric operation.

875:   Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
876: */
877: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product, Mat A, Mat B, MatMatStruct_AB *mm)
878: {
879:   Mat_MPIAIJ              *a  = static_cast<Mat_MPIAIJ *>(A->data);
880:   Mat                      Ad = a->A, Ao = a->B; /* diag and offdiag of A */
881:   IS                       glob = NULL;
882:   const PetscInt          *garray;
883:   PetscInt                 N = B->cmap->N, sz;
884:   ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */
885:   MatColIdxKokkosView      l2g2;
886:   Mat                      C1, C2; /* intermediate matrices */

888:   /* C1 = Ad * B_local. B_local is a matrix obtained by merging Bd and Bo, and it uses local col ids */
889:   MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &mm->B_local);
890:   MatProductCreate(Ad, mm->B_local, NULL, &C1);
891:   MatProductSetType(C1, MATPRODUCT_AB);
892:   MatProductSetFill(C1, product->fill);
893:   C1->product->api_user = product->api_user;
894:   MatProductSetFromOptions(C1);
895:   PetscUseTypeMethod(C1, productsymbolic);

897:   ISGetIndices(glob, &garray);
898:   ISGetSize(glob, &sz);
899:   const auto &tmp = ConstMatColIdxKokkosViewHost(garray, sz);                       /* wrap garray as a view */
900:   l2g1            = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
901:   MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g1, mm->C1_global);

903:   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
904:   MatSeqAIJKokkosBcast(mm->B_local, MAT_INITIAL_MATRIX, N, l2g1, a->Mvctx, mm->sf, mm->abuf, mm->rows, mm->rowoffset, mm->B_other);

906:   /* Compact B_other to use local ids as we guess KK spgemm is more memory scalable with that; We could skip the compaction to simplify code */
907:   MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other, l2g2);
908:   MatProductCreate(Ao, mm->B_other, NULL, &C2);
909:   MatProductSetType(C2, MATPRODUCT_AB);
910:   MatProductSetFill(C2, product->fill);
911:   C2->product->api_user = product->api_user;
912:   MatProductSetFromOptions(C2);
913:   PetscUseTypeMethod(C2, productsymbolic);
914:   MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2, N, l2g2, mm->C2_global);

916:   /* C = C1 + C2.  We actually use their global col ids versions in adding */
917:   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
918:   KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
919:   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
920:   KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);

922:   mm->C1 = C1;
923:   mm->C2 = C2;
924:   ISRestoreIndices(glob, &garray);
925:   ISDestroy(&glob);
926:   return 0;
927: }
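
/* The decomposition used above, written out: with A split row-wise as [Ad  Ao] and B_local = [Bd  Bo]
   (the merged local rows of B), the locally owned rows of C = A*B are

     C_local = Ad*B_local + Ao*B_other

   where B_other holds the rows of B (fetched via MatSeqAIJKokkosBcast) corresponding to A's off-process
   columns. C1 and C2 are exactly those two products; they are added with KokkosSparse::spadd once both
   are expressed with global col ids. */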

929: /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos

931:   Input Parameters:
932: +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
933: .  A        - an MPIAIJKOKKOS matrix
934: .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
935: .  localB   - Does B use local col ids? If false, then B is already in global col ids.
936: .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
937: .  l2g      - If localB, then l2g maps B's local col ids to global ones.
938: -  mm       - a struct used to stash intermediate data in AtB

940:   Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
941: */
942: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product, Mat A, Mat B, PetscBool localB, PetscInt N, const ConstMatColIdxKokkosView &l2g, MatMatStruct_AtB *mm)
943: {
944:   Mat_MPIAIJ *a  = static_cast<Mat_MPIAIJ *>(A->data);
945:   Mat         Ad = a->A, Ao = a->B; /* diag and offdiag of A */
946:   Mat         C1, C2;               /* intermediate matrices */

948:   /* C1 = Ad^t * B */
949:   MatProductCreate(Ad, B, NULL, &C1);
950:   MatProductSetType(C1, MATPRODUCT_AtB);
951:   MatProductSetFill(C1, product->fill);
952:   C1->product->api_user = product->api_user;
953:   MatProductSetFromOptions(C1);
954:   PetscUseTypeMethod(C1, productsymbolic);

956:   if (localB) MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g, mm->C1_global);
957:   else mm->C1_global = static_cast<Mat_SeqAIJKokkos *>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */

959:   /* C2 = Ao^t * B */
960:   MatProductCreate(Ao, B, NULL, &C2);
961:   MatProductSetType(C2, MATPRODUCT_AtB);
962:   MatProductSetFill(C2, product->fill);
963:   C2->product->api_user = product->api_user;
964:   MatProductSetFromOptions(C2);
965:   PetscUseTypeMethod(C2, productsymbolic);

967:   MatSeqAIJKokkosReduce(C2, MAT_INITIAL_MATRIX, localB, N, l2g, a->Mvctx, mm->sf, mm->abuf, mm->srcrowoffset, mm->dstrowoffset, mm->C2_global);

969:   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
970:   KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
971:   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
972:   KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);
973:   mm->C1 = C1;
974:   mm->C2 = C2;
975:   return 0;
976: }
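
/* The decomposition used above, written out: with A split row-wise as [Ad  Ao], this rank's contribution
   to C = A^t*B is

     Ad^t*B + Ao^t*B

   where the rows of Ao^t*B belong to the ranks owning A's off-process columns, so C2 = Ao^t*B is reduced
   (concatenated) onto those owners by MatSeqAIJKokkosReduce before the spadd with C1 = Ad^t*B. */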

978: PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
979: {
980:   Mat_Product                 *product = C->product;
981:   MatProductType               ptype;
982:   MatProductData_MPIAIJKokkos *mmdata;
983:   MatMatStruct                *mm = NULL;
984:   MatMatStruct_AB             *ab;
985:   MatMatStruct_AtB            *atb;
986:   Mat                          A, B, Ad, Ao, Bd, Bo;
987:   const MatScalarType          one = 1.0; /* Do not use the literal 1.0 directly, to avoid a wrong template instantiation in KokkosSparse::spadd_numeric */

989:   MatCheckProduct(C, 1);
990:   mmdata = static_cast<MatProductData_MPIAIJKokkos *>(product->data);
991:   ptype  = product->type;
992:   A      = product->A;
993:   B      = product->B;
994:   Ad     = static_cast<Mat_MPIAIJ *>(A->data)->A;
995:   Ao     = static_cast<Mat_MPIAIJ *>(A->data)->B;
996:   Bd     = static_cast<Mat_MPIAIJ *>(B->data)->A;
997:   Bo     = static_cast<Mat_MPIAIJ *>(B->data)->B;

999:   if (mmdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
1000:     mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
1001:     ab               = mmdata->mmAB;
1002:     atb              = mmdata->mmAtB;
1003:     if (ab) {
1004:       static_cast<MatProductData_SeqAIJKokkos *>(ab->C1->product->data)->reusesym = PETSC_FALSE;
1005:       static_cast<MatProductData_SeqAIJKokkos *>(ab->C2->product->data)->reusesym = PETSC_FALSE;
1006:     }
1007:     if (atb) {
1008:       static_cast<MatProductData_SeqAIJKokkos *>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1009:       static_cast<MatProductData_SeqAIJKokkos *>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1010:     }
1011:     return 0;
1012:   }

1014:   if (ptype == MATPRODUCT_AB) {
1015:     ab = mmdata->mmAB;
1016:     /* C1 = Ad * B_local */
1018:     MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);
1020:     if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1021:     (*ab->C1->ops->productnumeric)(ab->C1);
1022:     MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1023:     /* C2 = Ao * B_other */
1025:     if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1026:     (*ab->C2->ops->productnumeric)(ab->C2);
1027:     /* C = C1_global + C2_global */
1028:     KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);
1029:     mm = static_cast<MatMatStruct *>(ab);
1030:   } else if (ptype == MATPRODUCT_AtB) {
1031:     atb = mmdata->mmAtB;
1033:     /* C1 = Ad^t * B_local */
1034:     MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &atb->B_local);
1036:     if (atb->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, atb->C1);
1037:     (*atb->C1->ops->productnumeric)(atb->C1);

1039:     /* C2 = Ao^t * B_local */
1041:     if (atb->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, atb->C2);
1042:     (*atb->C2->ops->productnumeric)(atb->C2);
1043:     /* Form C2_global */
1044:     MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_TRUE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1045:     /* C = C1_global + C2_global */
1046:     KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1047:     mm = static_cast<MatMatStruct *>(atb);
1048:   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1049:     ab = mmdata->mmAB;
1050:     MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);

1052:     /* ab->C1 = Ad * B_local */
1054:     if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1055:     (*ab->C1->ops->productnumeric)(ab->C1);
1056:     MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1057:     /* ab->C2 = Ao * B_other */
1058:     if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1059:     (*ab->C2->ops->productnumeric)(ab->C2); /* C2 = Ao * B_other */
1060:     KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);

1062:     /* atb->C1 = Bd^t * ab->C_petsc */
1063:     atb = mmdata->mmAtB;
1065:     if (atb->C1->product->A != Bd) MatProductReplaceMats(Bd, NULL, NULL, atb->C1);
1066:     (*atb->C1->ops->productnumeric)(atb->C1);
1067:     /* atb->C2 = Bo^t * ab->C_petsc */
1068:     if (atb->C2->product->A != Bo) MatProductReplaceMats(Bo, NULL, NULL, atb->C2);
1069:     (*atb->C2->ops->productnumeric)(atb->C2);
1070:     MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_FALSE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /* ownerSF */, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1071:     KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1072:     mm = static_cast<MatMatStruct *>(atb);
1073:   }
1074:   /* Split C_global to form C */
1075:   MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_REUSE_MATRIX, mm->C_global, mm->Cdstart);
1076:   return 0;
1077: }

1079: PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1080: {
1081:   Mat                          A, B;
1082:   Mat_Product                 *product = C->product;
1083:   MatProductType               ptype;
1084:   MatProductData_MPIAIJKokkos *mmdata;
1085:   MatMatStruct                *mm   = NULL;
1086:   IS                           glob = NULL;
1087:   const PetscInt              *garray;
1088:   PetscInt                     m, n, M, N, sz;
1089:   ConstMatColIdxKokkosView     l2g; /* map local col ids to global ones */

1091:   MatCheckProduct(C, 1);
1093:   ptype = product->type;
1094:   A     = product->A;
1095:   B     = product->B;

1097:   switch (ptype) {
1098:   case MATPRODUCT_AB:
1099:     m = A->rmap->n;
1100:     n = B->cmap->n;
1101:     M = A->rmap->N;
1102:     N = B->cmap->N;
1103:     break;
1104:   case MATPRODUCT_AtB:
1105:     m = A->cmap->n;
1106:     n = B->cmap->n;
1107:     M = A->cmap->N;
1108:     N = B->cmap->N;
1109:     break;
1110:   case MATPRODUCT_PtAP:
1111:     m = B->cmap->n;
1112:     n = B->cmap->n;
1113:     M = B->cmap->N;
1114:     N = B->cmap->N;
1115:     break; /* BtAB */
1116:   default:
1117:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
1118:   }

1120:   MatSetSizes(C, m, n, M, N);
1121:   PetscLayoutSetUp(C->rmap);
1122:   PetscLayoutSetUp(C->cmap);
1123:   MatSetType(C, ((PetscObject)A)->type_name);

1125:   mmdata           = new MatProductData_MPIAIJKokkos();
1126:   mmdata->reusesym = product->api_user;

1128:   if (ptype == MATPRODUCT_AB) {
1129:     mmdata->mmAB = new MatMatStruct_AB();
1130:     MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, mmdata->mmAB);
1131:     mm = static_cast<MatMatStruct *>(mmdata->mmAB);
1132:   } else if (ptype == MATPRODUCT_AtB) {
1133:     mmdata->mmAtB = new MatMatStruct_AtB();
1134:     auto atb      = mmdata->mmAtB;
1135:     MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &atb->B_local);
1136:     ISGetIndices(glob, &garray);
1137:     ISGetSize(glob, &sz);
1138:     l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatColIdxKokkosViewHost(garray, sz));
1139:     MatProductSymbolic_MPIAIJKokkos_AtB(product, A, atb->B_local, PETSC_TRUE, N, l2g, atb);
1140:     ISRestoreIndices(glob, &garray);
1141:     ISDestroy(&glob);
1142:     mm = static_cast<MatMatStruct *>(atb);
1143:   } else if (ptype == MATPRODUCT_PtAP) {    /* BtAB */
1144:     mmdata->mmAB  = new MatMatStruct_AB();  /* tmp=A*B */
1145:     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1146:     auto ab       = mmdata->mmAB;
1147:     auto atb      = mmdata->mmAtB;
1148:     MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, ab);
1149:     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1150:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, tmp, &ab->C_petsc);
1151:     MatProductSymbolic_MPIAIJKokkos_AtB(product, B, ab->C_petsc, PETSC_FALSE, N, l2g /*not used*/, atb);
1152:     mm = static_cast<MatMatStruct *>(atb);
1153:   }
1154:   /* Split the C_global into petsc A, B format */
1155:   MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_INITIAL_MATRIX, mm->C_global, mm->Cdstart);
1156:   C->product->data       = mmdata;
1157:   C->product->destroy    = MatProductDataDestroy_MPIAIJKokkos;
1158:   C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos;
1159:   return 0;
1160: }
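
/*
   Editor's note (an illustrative sketch, not part of the PETSc source): user code reaches the symbolic/numeric
   pair above through the generic MatProduct interface, e.g.

     Mat C;
     MatProductCreate(A, B, NULL, &C);     // A and B are MATMPIAIJKOKKOS matrices
     MatProductSetType(C, MATPRODUCT_AB);  // or MATPRODUCT_AtB, MATPRODUCT_PtAP
     MatProductSetFromOptions(C);          // selects MatProductSymbolic_MPIAIJKokkos (see below)
     MatProductSymbolic(C);                // builds the Kokkos structures once
     MatProductNumeric(C);                 // may be called repeatedly as the values of A and B change

   Error checking is omitted for brevity.
*/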

1162: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1163: {
1164:   Mat_Product *product = mat->product;
1165:   PetscBool    match   = PETSC_FALSE;
1166:   PetscBool    usecpu  = PETSC_FALSE;

1168:   MatCheckProduct(mat, 1);
1169:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match);
1170:   if (match) { /* we can always fall back to the CPU if requested */
1171:     switch (product->type) {
1172:     case MATPRODUCT_AB:
1173:       if (product->api_user) {
1174:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
1175:         PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1176:         PetscOptionsEnd();
1177:       } else {
1178:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
1179:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1180:         PetscOptionsEnd();
1181:       }
1182:       break;
1183:     case MATPRODUCT_AtB:
1184:       if (product->api_user) {
1185:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
1186:         PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1187:         PetscOptionsEnd();
1188:       } else {
1189:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
1190:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1191:         PetscOptionsEnd();
1192:       }
1193:       break;
1194:     case MATPRODUCT_PtAP:
1195:       if (product->api_user) {
1196:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
1197:         PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1198:         PetscOptionsEnd();
1199:       } else {
1200:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
1201:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1202:         PetscOptionsEnd();
1203:       }
1204:       break;
1205:     default:
1206:       break;
1207:     }
1208:     match = (PetscBool)!usecpu;
1209:   }
1210:   if (match) {
1211:     switch (product->type) {
1212:     case MATPRODUCT_AB:
1213:     case MATPRODUCT_AtB:
1214:     case MATPRODUCT_PtAP:
1215:       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1216:       break;
1217:     default:
1218:       break;
1219:     }
1220:   }
1221:   /* fallback to MPIAIJ ops */
1222:   if (!mat->ops->productsymbolic) MatProductSetFromOptions_MPIAIJ(mat);
1223:   return 0;
1224: }
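
/*
   Editor's note (illustrative): the options parsed above let a run fall back to the host MPIAIJ kernels per
   product type, e.g. on the command line

     -matmatmult_backend_cpu             (MatMatMult() callers)
     -mattransposematmult_backend_cpu    (MatTransposeMatMult() callers)
     -matptap_backend_cpu                (MatPtAP() callers)
     -mat_product_algorithm_backend_cpu  (MatProduct API callers, for AB/AtB/PtAP)

   When one of these is set, or when the types of A and B do not match, productsymbolic is left unset here and
   MatProductSetFromOptions_MPIAIJ() supplies the host implementation.
*/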

1226: static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1227: {
1228:   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ *)mat->data;
1229:   Mat_MPIAIJKokkos *mpikok;

1231:   MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j); /* mpiaij->A,B's type is set to seqaijkokkos */
1232:   mat->preallocated = PETSC_TRUE;
1233:   MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
1234:   MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
1235:   MatZeroEntries(mat);
1236:   mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1237:   delete mpikok;
1238:   mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
1239:   return 0;
1240: }
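
/*
   Editor's note (an illustrative sketch): a COO assembly on this matrix type first declares the nonzero
   pattern once, with global indices; repeated (i, j) pairs are allowed and their values are summed, e.g.

     PetscInt i[] = {0, 0, 1};  // global row indices
     PetscInt j[] = {0, 2, 1};  // global column indices
     MatSetPreallocationCOO(A, 3, i, j);

   After this call the pattern is fixed and values are supplied only through MatSetValuesCOO() (below).
*/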

1242: static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat, const PetscScalar v[], InsertMode imode)
1243: {
1244:   Mat_MPIAIJ                 *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
1245:   Mat_MPIAIJKokkos           *mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1246:   Mat                         A = mpiaij->A, B = mpiaij->B;
1247:   PetscCount                  Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
1248:   MatScalarKokkosView         Aa, Ba;
1249:   MatScalarKokkosView         v1;
1250:   MatScalarKokkosView        &vsend  = mpikok->sendbuf_d;
1251:   const MatScalarKokkosView  &v2     = mpikok->recvbuf_d;
1252:   const PetscCountKokkosView &Ajmap1 = mpikok->Ajmap1_d, Ajmap2 = mpikok->Ajmap2_d, Aimap2 = mpikok->Aimap2_d;
1253:   const PetscCountKokkosView &Bjmap1 = mpikok->Bjmap1_d, Bjmap2 = mpikok->Bjmap2_d, Bimap2 = mpikok->Bimap2_d;
1254:   const PetscCountKokkosView &Aperm1 = mpikok->Aperm1_d, Aperm2 = mpikok->Aperm2_d, Bperm1 = mpikok->Bperm1_d, Bperm2 = mpikok->Bperm2_d;
1255:   const PetscCountKokkosView &Cperm1 = mpikok->Cperm1_d;
1256:   PetscMemType                memtype;

1258:   PetscGetMemType(v, &memtype); /* Return PETSC_MEMTYPE_HOST when v is NULL */
1259:   if (PetscMemTypeHost(memtype)) {         /* If the user provided v[] on the host, copy it to the device */
1260:     v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), MatScalarKokkosViewHost((PetscScalar *)v, mpiaij->coo_n));
1261:   } else {
1262:     v1 = MatScalarKokkosView((PetscScalar *)v, mpiaij->coo_n); /* Directly use v[]'s memory */
1263:   }

1265:   if (imode == INSERT_VALUES) {
1266:     MatSeqAIJGetKokkosViewWrite(A, &Aa); /* write matrix values */
1267:     MatSeqAIJGetKokkosViewWrite(B, &Ba);
1268:   } else {
1269:     MatSeqAIJGetKokkosView(A, &Aa); /* read & write matrix values */
1270:     MatSeqAIJGetKokkosView(B, &Ba);
1271:   }

1273:   /* Pack entries to be sent to remote */
1274:   Kokkos::parallel_for(
1275:     vsend.extent(0), KOKKOS_LAMBDA(const PetscCount i) { vsend(i) = v1(Cperm1(i)); });

1277:   /* Send remote entries to their owner and overlap the communication with local computation */
1278:   PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_KOKKOS, vsend.data(), PETSC_MEMTYPE_KOKKOS, v2.data(), MPI_REPLACE);
1279:   /* Add local entries to A and B in one kernel */
1280:   Kokkos::parallel_for(
1281:     Annz + Bnnz, KOKKOS_LAMBDA(PetscCount i) {
1282:       PetscScalar sum = 0.0;
1283:       if (i < Annz) {
1284:         for (PetscCount k = Ajmap1(i); k < Ajmap1(i + 1); k++) sum += v1(Aperm1(k));
1285:         Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1286:       } else {
1287:         i -= Annz;
1288:         for (PetscCount k = Bjmap1(i); k < Bjmap1(i + 1); k++) sum += v1(Bperm1(k));
1289:         Ba(i) = (imode == INSERT_VALUES ? 0.0 : Ba(i)) + sum;
1290:       }
1291:     });
1292:   PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend.data(), v2.data(), MPI_REPLACE);

1294:   /* Add received remote entries to A and B in one kernel */
1295:   Kokkos::parallel_for(
1296:     Annz2 + Bnnz2, KOKKOS_LAMBDA(PetscCount i) {
1297:       if (i < Annz2) {
1298:         for (PetscCount k = Ajmap2(i); k < Ajmap2(i + 1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));
1299:       } else {
1300:         i -= Annz2;
1301:         for (PetscCount k = Bjmap2(i); k < Bjmap2(i + 1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));
1302:       }
1303:     });

1305:   if (imode == INSERT_VALUES) {
1306:     MatSeqAIJRestoreKokkosViewWrite(A, &Aa); /* Increase A & B's state etc. */
1307:     MatSeqAIJRestoreKokkosViewWrite(B, &Ba);
1308:   } else {
1309:     MatSeqAIJRestoreKokkosView(A, &Aa);
1310:     MatSeqAIJRestoreKokkosView(B, &Ba);
1311:   }
1312:   return 0;
1313: }
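
/*
   Editor's note (an illustrative sketch): matching the preallocation above, the values may live in host or
   device memory; the PetscGetMemType() check above mirrors host data to the device as needed, so both of the
   following are valid (v_dev is a hypothetical device allocation of the same length as the COO index arrays):

     PetscScalar v_host[] = {1.0, 2.0, 3.0};
     MatSetValuesCOO(A, v_host, ADD_VALUES);     // host array, copied to the device internally
     MatSetValuesCOO(A, v_dev, INSERT_VALUES);   // device array, used in place
*/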

1315: PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1316: {
1317:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

1319:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
1320:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
1321:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
1322:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
1323:   delete (Mat_MPIAIJKokkos *)mpiaij->spptr;
1324:   MatDestroy_MPIAIJ(A);
1325:   return 0;
1326: }

1328: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
1329: {
1330:   Mat         B;
1331:   Mat_MPIAIJ *a;

1333:   if (reuse == MAT_INITIAL_MATRIX) {
1334:     MatDuplicate(A, MAT_COPY_VALUES, newmat);
1335:   } else if (reuse == MAT_REUSE_MATRIX) {
1336:     MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
1337:   }
1338:   B = *newmat;

1340:   B->boundtocpu = PETSC_FALSE;
1341:   PetscFree(B->defaultvectype);
1342:   PetscStrallocpy(VECKOKKOS, &B->defaultvectype);
1343:   PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJKOKKOS);

1345:   a = static_cast<Mat_MPIAIJ *>(A->data);
1346:   if (a->A) MatSetType(a->A, MATSEQAIJKOKKOS);
1347:   if (a->B) MatSetType(a->B, MATSEQAIJKOKKOS);
1348:   if (a->lvec) VecSetType(a->lvec, VECSEQKOKKOS);

1350:   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1351:   B->ops->mult                  = MatMult_MPIAIJKokkos;
1352:   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1353:   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1354:   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1355:   B->ops->destroy               = MatDestroy_MPIAIJKokkos;

1357:   PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJKokkos);
1358:   PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);
1359:   PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);
1360:   PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);
1361:   return 0;
1362: }
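
/*
   Editor's note (illustrative): besides being called from MatCreate_MPIAIJKokkos() below with
   MAT_INPLACE_MATRIX, this conversion is what a user reaches through MatConvert(), e.g.

     Mat Akok;
     MatConvert(A, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, &Akok);  // A is an assembled MPIAIJ matrix

   which duplicates A and then retypes its diagonal/off-diagonal blocks and lvec to their Kokkos variants.
*/
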
1363: /*MC
1364:    MATAIJKOKKOS - "mpiaijkokkos", a matrix type to be used for CSR sparse matrices with Kokkos

1366:    A matrix type using the Kokkos-Kernels CrsMatrix for portability across different device types

1368:    Options Database Keys:
1369: .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()

1371:   Level: beginner

1373: .seealso: `MatCreateAIJKokkos()`, `MATSEQAIJKOKKOS`, `MATSEQAIJ`, `MATMPIAIJ`
1374: M*/
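
/*
   Editor's note (an illustrative sketch): the option key above is typically combined with runtime type
   selection, e.g.

     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, m, n, M, N);
     MatSetFromOptions(A);          // picks up -mat_type aijkokkos from the options database
     // ... preallocate, fill, and assemble A as usual ...

   and the program is then run with "-mat_type aijkokkos".
*/
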
1375: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1376: {
1377:   PetscKokkosInitializeCheck();
1378:   MatCreate_MPIAIJ(A);
1379:   MatConvert_MPIAIJ_MPIAIJKokkos(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A);
1380:   return 0;
1381: }

1383: /*@C
1384:    MatCreateAIJKokkos - Creates a sparse matrix in `MATAIJKOKKOS` (compressed row) format
1385:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
1386:    to Kokkos for calculations. For good matrix
1387:    assembly performance the user should preallocate the matrix storage by setting
1388:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
1389:    performance during matrix assembly can be increased by more than a factor of 50.

1391:    Collective

1393:    Input Parameters:
1394: +  comm - MPI communicator
1395: .  m,n - number of local rows and columns (or `PETSC_DECIDE`)
1396: .  M,N - number of global rows and columns (or `PETSC_DETERMINE`)
1397: .  d_nz,o_nz - number of nonzeros per row in the DIAGONAL and OFF-DIAGONAL portions of the local submatrix (same for all local rows)
1398: -  d_nnz,o_nnz - arrays containing the number of nonzeros in the various rows of the DIAGONAL and OFF-DIAGONAL portions of the local submatrix
1399:          (possibly different for each row) or NULL

1401:    Output Parameter:
1402: .  A - the matrix

1404:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1405:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1406:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1408:    Notes:
1409:    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

1411:    The AIJ format (also called compressed row storage) is fully compatible with standard Fortran 77
1412:    storage.  That is, the stored row and column indices can begin at
1413:    either one (as in Fortran) or zero.  See the users' manual for details.

1415:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
1416:    Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
1417:    allocation.  For large problems you MUST preallocate memory or you
1418:    will get TERRIBLE performance; see the users' manual chapter on matrices.

1420:    By default, this format uses inodes (identical nodes) when possible, to
1421:    improve numerical efficiency of matrix-vector products and solves. We
1422:    search for consecutive rows with the same nonzero structure, thereby
1423:    reusing matrix information to achieve increased efficiency.

1425:    Level: intermediate

1427: .seealso: `MATAIJKOKKOS`, `MATSEQAIJKOKKOS`, `MATMPIAIJKOKKOS`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1428: @*/
1429: PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
1430: {
1431:   PetscMPIInt size;

1433:   MatCreate(comm, A);
1434:   MatSetSizes(*A, m, n, M, N);
1435:   MPI_Comm_size(comm, &size);
1436:   if (size > 1) {
1437:     MatSetType(*A, MATMPIAIJKOKKOS);
1438:     MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
1439:   } else {
1440:     MatSetType(*A, MATSEQAIJKOKKOS);
1441:     MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
1442:   }
1443:   return 0;
1444: }
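
/*
   Editor's note (an illustrative sketch of the routine above): creating a parallel Kokkos AIJ matrix with a
   uniform estimate of 5 diagonal and 2 off-diagonal nonzeros per row:

     Mat A;
     MatCreateAIJKokkos(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, M, N, 5, NULL, 2, NULL, &A);
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual ...
     MatDestroy(&A);
*/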

1446: // Get a GPU pointer to a stripped-down Mat; works for both Seq and MPI Mat
1447: PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1448: {
1449:   PetscMPIInt                size, rank;
1450:   MPI_Comm                   comm;
1451:   PetscSplitCSRDataStructure d_mat = NULL;

1453:   PetscObjectGetComm((PetscObject)A, &comm);
1454:   MPI_Comm_size(comm, &size);
1455:   MPI_Comm_rank(comm, &rank);
1456:   if (size == 1) {
1457:     MatSeqAIJKokkosGetDeviceMat(A, &d_mat);
1458:     MatSeqAIJKokkosModifyDevice(A); /* Since we are going to modify matrix values on device */
1459:   } else {
1460:     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
1461:     MatSeqAIJKokkosGetDeviceMat(aij->A, &d_mat);
1462:     MatSeqAIJKokkosModifyDevice(aij->A);
1463:     MatSeqAIJKokkosModifyDevice(aij->B);
1465:   }
1466:   // act like MatSetValues because this routine is not called on the host
1467:   if (A->assembled) {
1468:     if (A->was_assembled) PetscInfo(A, "Assembled more than once already\n");
1469:     A->was_assembled = PETSC_TRUE; // done lazily in MatAssemble, which we no longer call; also done in AIJ AssemblyEnd - is it still needed here?
1470:   } else {
1471:     PetscInfo(A, "Warning: matrix is not assembled (assembled=%d)\n", (int)A->assembled);
1472:   }
1473:   if (!d_mat) {
1474:     struct _n_SplitCSRMat h_mat; /* host container */
1475:     Mat_SeqAIJKokkos     *aijkokA;
1476:     Mat_SeqAIJ           *jaca;
1477:     PetscInt              n = A->rmap->n, nnz;
1478:     Mat                   Amat;
1479:     PetscInt             *colmap;

1481:     /* create and copy h_mat */
1482:     h_mat.M = A->cmap->N; // use for debug build
1483:     PetscInfo(A, "Create device matrix in Kokkos\n");
1484:     if (size == 1) {
1485:       Amat            = A;
1486:       jaca            = (Mat_SeqAIJ *)A->data;
1487:       h_mat.rstart    = 0;
1488:       h_mat.rend      = A->rmap->n;
1489:       h_mat.cstart    = 0;
1490:       h_mat.cend      = A->cmap->n;
1491:       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1492:       h_mat.offdiag.a                   = NULL;
1493:       aijkokA                           = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1494:     } else {
1495:       Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)A->data;
1496:       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ *)aij->B->data;
1497:       PetscInt          ii;
1498:       Mat_SeqAIJKokkos *aijkokB;

1500:       Amat    = aij->A;
1501:       aijkokA = static_cast<Mat_SeqAIJKokkos *>(aij->A->spptr);
1502:       aijkokB = static_cast<Mat_SeqAIJKokkos *>(aij->B->spptr);
1503:       jaca    = (Mat_SeqAIJ *)aij->A->data;
1506:       aij->donotstash          = PETSC_TRUE;
1507:       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1508:       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1509:       PetscCalloc1(A->cmap->N, &colmap);
1510:       for (ii = 0; ii < aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii + 1;
1511:       // allocate B copy data
1512:       h_mat.rstart = A->rmap->rstart;
1513:       h_mat.rend   = A->rmap->rend;
1514:       h_mat.cstart = A->cmap->rstart;
1515:       h_mat.cend   = A->cmap->rend;
1516:       nnz          = jacb->i[n];
1517:       if (jacb->compressedrow.use) {
1518:         const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_i_k(jacb->i, n + 1);
1519:         aijkokB->i_uncompressed_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_i_k));
1520:         Kokkos::deep_copy(aijkokB->i_uncompressed_d, h_i_k);
1521:         h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1522:       } else {
1523:         h_mat.offdiag.i = aijkokB->i_device_data();
1524:       }
1525:       h_mat.offdiag.j = aijkokB->j_device_data();
1526:       h_mat.offdiag.a = aijkokB->a_device_data();
1527:       {
1528:         Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_colmap_k(colmap, A->cmap->N);
1529:         aijkokB->colmap_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_colmap_k));
1530:         Kokkos::deep_copy(aijkokB->colmap_d, h_colmap_k);
1531:         h_mat.colmap = aijkokB->colmap_d.data();
1532:         PetscFree(colmap);
1533:       }
1534:       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1535:       h_mat.offdiag.n                 = n;
1536:     }
1537:     // allocate A copy data
1538:     nnz                          = jaca->i[n];
1539:     h_mat.diag.n                 = n;
1540:     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1541:     MPI_Comm_rank(comm, &h_mat.rank);
1543:     h_mat.diag.i = aijkokA->i_device_data();
1544:     h_mat.diag.j = aijkokA->j_device_data();
1545:     h_mat.diag.a = aijkokA->a_device_data();
1546:     // copy pointers and metadata to device
1547:     MatSeqAIJKokkosSetDeviceMat(Amat, &h_mat);
1548:     MatSeqAIJKokkosGetDeviceMat(Amat, &d_mat);
1549:     PetscInfo(A, "Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n", h_mat.diag.n, nnz);
1550:   }
1551:   *B           = d_mat;       // return it; it has also been set in the Mat by MatSeqAIJKokkosSetDeviceMat()
1552:   A->assembled = PETSC_FALSE; // ready to write with MatSetValues(); normally this is done lazily inside MatSetValues()
1553:   return 0;
1554: }
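
/*
   Editor's note (a sketch under assumptions, not part of the PETSc source): the pointer returned above lives
   in device memory and is intended for kernels that modify matrix values directly on the device:

     PetscSplitCSRDataStructure d_mat;
     MatKokkosGetDeviceMatWrite(A, &d_mat);
     // d_mat->diag.{i,j,a} and d_mat->offdiag.{i,j,a} are device CSR arrays (see the host-side setup above),
     // d_mat->rstart/rend give the owned row range, and d_mat->colmap maps a global column index of the
     // off-diagonal block to its local index plus one (0 meaning the column is not present on this rank).

   The kernel-side insertion helpers that consume this structure live outside this file and are not shown.
*/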

1556: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1557: {
1558:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1560:   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1561:   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1562:   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1563:   else *mask = "PETSC_OFFLOAD_BOTH";
1564:   return 0;
1565: }

1567: PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1568: {
1569:   PetscMPIInt size;
1570:   Mat         Ad, Ao;
1571:   const char *amask, *bmask;

1573:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);

1575:   if (size == 1) {
1576:     MatSeqAIJKokkosGetOffloadMask(A, &amask);
1577:     PetscPrintf(PETSC_COMM_SELF, "%s\n", amask);
1578:   } else {
1579:     Ad = ((Mat_MPIAIJ *)A->data)->A;
1580:     Ao = ((Mat_MPIAIJ *)A->data)->B;
1581:     MatSeqAIJKokkosGetOffloadMask(Ad, &amask);
1582:     MatSeqAIJKokkosGetOffloadMask(Ao, &bmask);
1583:     PetscPrintf(PETSC_COMM_SELF, "Diag : Off-diag = %s : %s\n", amask, bmask);
1584:   }
1585:   return 0;
1586: }
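
/*
   Editor's note (illustrative): the two helpers above are debugging aids; a typical use is to print where the
   matrix values currently live while tracking down unexpected host/device copies:

     MatAIJKokkosPrintOffloadMask(A);  // e.g. prints "Diag : Off-diag = PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_BOTH"

   PETSC_OFFLOAD_GPU means the device copy is newer, PETSC_OFFLOAD_CPU means the host copy is newer, and
   PETSC_OFFLOAD_BOTH means the two copies are in sync.
*/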