Actual source code: vpbjacobi_kok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
3: #include <petscdevice.h>
4: #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
6: /* A class that manages helper arrays assisting parallel PCApply() with Kokkos */
7: struct PC_VPBJacobi_Kokkos {
8: /* Cache the old sizes to check if we need realloc */
9: PetscInt n; /* number of rows of the local matrix */
10: PetscInt nblocks; /* number of point blocks */
11: PetscInt nsize; /* sum of sizes of the point blocks */
13: /* Helper arrays that are pre-computed on host and then copied to device.
14: bs: [nblocks+1], "csr" version of bsizes[]
15: bs2: [nblocks+1], "csr" version of squares of bsizes[]
16: matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
17: */
18: PetscIntKokkosDualView bs_dual, bs2_dual, matIdx_dual;
19: PetscScalarKokkosDualView diag_dual;
21: PC_VPBJacobi_Kokkos(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_ptr_h) :
22: n(n), nblocks(nblocks), nsize(nsize), bs_dual("bs_dual", nblocks + 1), bs2_dual("bs2_dual", nblocks + 1), matIdx_dual("matIdx_dual", n)
23: {
24: PetscScalarKokkosViewHost diag_h(diag_ptr_h, nsize);
26: auto diag_d = Kokkos::create_mirror_view(DefaultMemorySpace(), diag_h);
27: diag_dual = PetscScalarKokkosDualView(diag_d, diag_h);
28: UpdateOffsetsOnDevice(bsizes, diag_ptr_h);
29: }
31: PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_ptr_h)
32: {
34: ComputeOffsetsOnHost(bsizes);
36: bs_dual.modify_host();
37: bs2_dual.modify_host();
38: matIdx_dual.modify_host();
39: diag_dual.modify_host();
41: bs_dual.sync_device();
42: bs2_dual.sync_device();
43: matIdx_dual.sync_device();
44: diag_dual.sync_device();
45: PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize);
46: return 0;
47: }
49: private:
50: PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
51: {
52: PetscInt *bs_h = bs_dual.view_host().data();
53: PetscInt *bs2_h = bs2_dual.view_host().data();
54: PetscInt *matIdx_h = matIdx_dual.view_host().data();
56: bs_h[0] = bs2_h[0] = 0;
57: for (PetscInt i = 0; i < nblocks; i++) {
58: bs_h[i + 1] = bs_h[i] + bsizes[i];
59: bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
60: for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
61: }
62: return 0;
63: }
64: };
66: static PetscErrorCode PCApply_VPBJacobi_Kokkos(PC pc, Vec x, Vec y)
67: {
68: PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
69: PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
70: ConstPetscScalarKokkosView xv;
71: PetscScalarKokkosView yv;
72: PetscScalarKokkosView diag = pckok->diag_dual.view_device();
73: PetscIntKokkosView bs = pckok->bs_dual.view_device();
74: PetscIntKokkosView bs2 = pckok->bs2_dual.view_device();
75: PetscIntKokkosView matIdx = pckok->matIdx_dual.view_device();
77: PetscLogGpuTimeBegin();
78: VecErrorIfNotKokkos(x);
79: VecErrorIfNotKokkos(y);
80: VecGetKokkosView(x, &xv);
81: VecGetKokkosViewWrite(y, &yv);
82: PetscCallCXX(Kokkos::parallel_for(
83: "PCApply_VPBJacobi_Kokkos", pckok->n, KOKKOS_LAMBDA(PetscInt row) {
84: const PetscScalar *Ap, *xp;
85: PetscScalar *yp;
86: PetscInt i, j, k, m;
88: k = matIdx(row); /* k-th block/matrix */
89: m = bs(k + 1) - bs(k); /* block size of the k-th block */
90: i = row - bs(k); /* i-th row of the block */
91: Ap = &diag(bs2(k) + i); /* Ap points to the first entry of i-th row */
92: xp = &xv(bs(k));
93: yp = &yv(bs(k));
95: yp[i] = 0.0;
96: for (j = 0; j < m; j++) {
97: yp[i] += Ap[0] * xp[j];
98: Ap += m;
99: }
100: }));
101: VecRestoreKokkosView(x, &xv);
102: VecRestoreKokkosViewWrite(y, &yv);
103: PetscLogGpuFlops(pckok->nsize * 2); /* FMA on entries in all blocks */
104: PetscLogGpuTimeEnd();
105: return 0;
106: }
108: static PetscErrorCode PCDestroy_VPBJacobi_Kokkos(PC pc)
109: {
110: PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
112: delete static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
113: PCDestroy_VPBJacobi(pc);
114: return 0;
115: }
117: PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_Kokkos(PC pc)
118: {
119: PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
120: PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
121: PetscInt i, n, nblocks, nsize = 0;
122: const PetscInt *bsizes;
124: PCSetUp_VPBJacobi_Host(pc); /* Compute the inverse on host now. Might worth doing it on device directly */
125: MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes);
126: for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
127: MatGetLocalSize(pc->pmat, &n, NULL);
129: /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
130: if (pckok && (pckok->n != n || pckok->nblocks != nblocks || pckok->nsize != nsize)) {
131: delete pckok;
132: pckok = nullptr;
133: }
135: if (!pckok) { /* allocate the struct along with the helper arrays from the scratch */
136: jac->spptr = new PC_VPBJacobi_Kokkos(n, nblocks, nsize, bsizes, jac->diag);
137: } else { /* update the value only */
138: pckok->UpdateOffsetsOnDevice(bsizes, jac->diag);
139: }
141: pc->ops->apply = PCApply_VPBJacobi_Kokkos;
142: pc->ops->destroy = PCDestroy_VPBJacobi_Kokkos;
143: return 0;
144: }