Actual source code: vpbjacobi_kok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
  3: #include <petscdevice.h>
  4: #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>

  6: /* A class that manages helper arrays assisting parallel PCApply() with Kokkos */
  7: struct PC_VPBJacobi_Kokkos {
  8:   /* Cache the old sizes to check if we need realloc */
  9:   PetscInt n;       /* number of rows of the local matrix */
 10:   PetscInt nblocks; /* number of point blocks */
 11:   PetscInt nsize;   /* sum of sizes of the point blocks */

 13:   /* Helper arrays that are pre-computed on host and then copied to device.
 14:     bs:     [nblocks+1], "csr" version of bsizes[]
 15:     bs2:    [nblocks+1], "csr" version of squares of bsizes[]
 16:     matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
 17:   */
 18:   PetscIntKokkosDualView    bs_dual, bs2_dual, matIdx_dual;
 19:   PetscScalarKokkosDualView diag_dual;

 21:   PC_VPBJacobi_Kokkos(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_ptr_h) :
 22:     n(n), nblocks(nblocks), nsize(nsize), bs_dual("bs_dual", nblocks + 1), bs2_dual("bs2_dual", nblocks + 1), matIdx_dual("matIdx_dual", n)
 23:   {
 24:     PetscScalarKokkosViewHost diag_h(diag_ptr_h, nsize);

 26:     auto diag_d = Kokkos::create_mirror_view(DefaultMemorySpace(), diag_h);
 27:     diag_dual   = PetscScalarKokkosDualView(diag_d, diag_h);
 28:     UpdateOffsetsOnDevice(bsizes, diag_ptr_h);
 29:   }

 31:   PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_ptr_h)
 32:   {
 34:     ComputeOffsetsOnHost(bsizes);

 36:     bs_dual.modify_host();
 37:     bs2_dual.modify_host();
 38:     matIdx_dual.modify_host();
 39:     diag_dual.modify_host();

 41:     bs_dual.sync_device();
 42:     bs2_dual.sync_device();
 43:     matIdx_dual.sync_device();
 44:     diag_dual.sync_device();
 45:     PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize);
 46:     return 0;
 47:   }

 49: private:
 50:   PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
 51:   {
 52:     PetscInt *bs_h     = bs_dual.view_host().data();
 53:     PetscInt *bs2_h    = bs2_dual.view_host().data();
 54:     PetscInt *matIdx_h = matIdx_dual.view_host().data();

 56:     bs_h[0] = bs2_h[0] = 0;
 57:     for (PetscInt i = 0; i < nblocks; i++) {
 58:       bs_h[i + 1]  = bs_h[i] + bsizes[i];
 59:       bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
 60:       for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
 61:     }
 62:     return 0;
 63:   }
 64: };

 66: static PetscErrorCode PCApply_VPBJacobi_Kokkos(PC pc, Vec x, Vec y)
 67: {
 68:   PC_VPBJacobi              *jac   = (PC_VPBJacobi *)pc->data;
 69:   PC_VPBJacobi_Kokkos       *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
 70:   ConstPetscScalarKokkosView xv;
 71:   PetscScalarKokkosView      yv;
 72:   PetscScalarKokkosView      diag   = pckok->diag_dual.view_device();
 73:   PetscIntKokkosView         bs     = pckok->bs_dual.view_device();
 74:   PetscIntKokkosView         bs2    = pckok->bs2_dual.view_device();
 75:   PetscIntKokkosView         matIdx = pckok->matIdx_dual.view_device();

 77:   PetscLogGpuTimeBegin();
 78:   VecErrorIfNotKokkos(x);
 79:   VecErrorIfNotKokkos(y);
 80:   VecGetKokkosView(x, &xv);
 81:   VecGetKokkosViewWrite(y, &yv);
 82:   PetscCallCXX(Kokkos::parallel_for(
 83:     "PCApply_VPBJacobi_Kokkos", pckok->n, KOKKOS_LAMBDA(PetscInt row) {
 84:       const PetscScalar *Ap, *xp;
 85:       PetscScalar       *yp;
 86:       PetscInt           i, j, k, m;

 88:       k  = matIdx(row);       /* k-th block/matrix */
 89:       m  = bs(k + 1) - bs(k); /* block size of the k-th block */
 90:       i  = row - bs(k);       /* i-th row of the block */
 91:       Ap = &diag(bs2(k) + i); /* Ap points to the first entry of i-th row */
 92:       xp = &xv(bs(k));
 93:       yp = &yv(bs(k));

 95:       yp[i] = 0.0;
 96:       for (j = 0; j < m; j++) {
 97:         yp[i] += Ap[0] * xp[j];
 98:         Ap += m;
 99:       }
100:     }));
101:   VecRestoreKokkosView(x, &xv);
102:   VecRestoreKokkosViewWrite(y, &yv);
103:   PetscLogGpuFlops(pckok->nsize * 2); /* FMA on entries in all blocks */
104:   PetscLogGpuTimeEnd();
105:   return 0;
106: }

108: static PetscErrorCode PCDestroy_VPBJacobi_Kokkos(PC pc)
109: {
110:   PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;

112:   delete static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
113:   PCDestroy_VPBJacobi(pc);
114:   return 0;
115: }

117: PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_Kokkos(PC pc)
118: {
119:   PC_VPBJacobi        *jac   = (PC_VPBJacobi *)pc->data;
120:   PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
121:   PetscInt             i, n, nblocks, nsize = 0;
122:   const PetscInt      *bsizes;

124:   PCSetUp_VPBJacobi_Host(pc); /* Compute the inverse on host now. Might worth doing it on device directly */
125:   MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes);
126:   for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
127:   MatGetLocalSize(pc->pmat, &n, NULL);

129:   /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
130:   if (pckok && (pckok->n != n || pckok->nblocks != nblocks || pckok->nsize != nsize)) {
131:     delete pckok;
132:     pckok = nullptr;
133:   }

135:   if (!pckok) { /* allocate the struct along with the helper arrays from the scratch */
136:     jac->spptr = new PC_VPBJacobi_Kokkos(n, nblocks, nsize, bsizes, jac->diag);
137:   } else { /* update the value only */
138:     pckok->UpdateOffsetsOnDevice(bsizes, jac->diag);
139:   }

141:   pc->ops->apply   = PCApply_VPBJacobi_Kokkos;
142:   pc->ops->destroy = PCDestroy_VPBJacobi_Kokkos;
143:   return 0;
144: }