Actual source code: ex2k.kokkos.cxx

  1: static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";

  3: /*
  4:   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
  5:            Time (sec.), on Mar. 1, 2022
  6:   ------------------------------------------
  7:   n     PETSc          C          Kokkos
  8:   ------------------------------------------
  9:   32    4.6464E-05  4.7451E-05  1.6880E-04
 10:   64    2.5654E-04  2.5164E-04  5.6780E-04
 11:   128   1.9383E-03  1.8878E-03  4.7938E-03
 12:   256   1.4450E-02  1.3619E-02  3.7778E-02
 13:   512   1.1580E-01  1.1551E-01  2.8428E-01
 14:   1024  1.4179E+00  1.3772E+00  2.2873E+00

 16:   Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
 17: */

 19: #include <petscdmda_kokkos.hpp>
 20: #include <petscdm.h>
 21: #include <petscdmda.h>

 23: using Kokkos::Iterate;
 24: using Kokkos::MDRangePolicy;
 25: using Kokkos::Rank;
 26: using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
 27: using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;

 29: /* PETSc multi-dimensional array access */
 30: static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
 31: {
 32:   PetscInt       it, i, j, k;
 33:   PetscLogDouble tstart = 0.0, tend;
 34:   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

 36:   DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm);
 37:   DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm);
 38:   for (it = 0; it < nwarm + nloop; it++) {
 39:     if (it == nwarm) PetscTime(&tstart);
 40:     for (k = zs; k < zs + zm; k++) {
 41:       for (j = ys; j < ys + ym; j++) {
 42:         for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
 43:       }
 44:     }
 45:   }
 46:   PetscTime(&tend);
 47:   *avgTime = (tend - tstart) / nloop;
 48:   return 0;
 49: }

 51: /* C multi-dimensional array access */
 52: static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
 53: {
 54:   PetscInt       it, i, j, k;
 55:   PetscLogDouble tstart = 0.0, tend;
 56:   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

 58:   DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm);
 59:   DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm);
 60: #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
 61: #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
 62:   for (it = 0; it < nwarm + nloop; it++) {
 63:     if (it == nwarm) PetscTime(&tstart);
 64:     for (k = zs; k < zs + zm; k++) {
 65:       for (j = ys; j < ys + ym; j++) {
 66:         for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
 67:       }
 68:     }
 69:   }
 70:   PetscTime(&tend);
 71:   *avgTime = (tend - tstart) / nloop;
 72: #undef X2
 73: #undef Y2
 74:   return 0;
 75: }

 77: int main(int argc, char **argv)
 78: {
 79:   DM                                 da;
 80:   PetscInt                           xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
 81:   PetscInt                           dof = 1, sw = 1;
 82:   DMBoundaryType                     bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
 83:   DMDAStencilType                    st = DMDA_STENCIL_STAR;
 84:   Vec                                x, y; /* local/global vectors of the da */
 85:   PetscRandom                        rctx;
 86:   const PetscScalar               ***x1;
 87:   PetscScalar                     ***y1; /* arrays of g, ll */
 88:   const PetscScalar                 *x2;
 89:   PetscScalar                       *y2;
 90:   ConstPetscScalarKokkosOffsetView3D x3;
 91:   PetscScalarKokkosOffsetView3D      y3;
 92:   PetscLogDouble                     tstart = 0.0, tend = 0.0, avgTime = 0.0;
 93:   PetscInt                           nwarm = 2, nloop = 10;
 94:   PetscInt                           min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */

 97:   PetscInitialize(&argc, &argv, (char *)0, help);
 98:   PetscRandomCreate(PETSC_COMM_WORLD, &rctx);
 99:   PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL);
100:   PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL);

102:   for (PetscInt len = min; len <= max; len = len * 2) {
103:     DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da);
104:     DMSetFromOptions(da);
105:     DMSetUp(da);

107:     DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm);
108:     DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm);
109:     DMCreateLocalVector(da, &x); /* Create local x and global y */
110:     DMCreateGlobalVector(da, &y);

112:     /* Access with petsc multi-dimensional arrays */
113:     VecSetRandom(x, rctx);
114:     VecSet(y, 0.0);
115:     DMDAVecGetArrayRead(da, x, &x1);
116:     DMDAVecGetArray(da, y, &y1);
117:     Update1(da, x1, y1, nwarm, nloop, &avgTime);
118:     DMDAVecRestoreArrayRead(da, x, &x1);
119:     DMDAVecRestoreArray(da, y, &y1);
120:     PetscTime(&tend);
121:     PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time  = %e\n", len, avgTime);

123:     /* Access with C multi-dimensional arrays */
124:     VecSetRandom(x, rctx);
125:     VecSet(y, 0.0);
126:     VecGetArrayRead(x, &x2);
127:     VecGetArray(y, &y2);
128:     Update2(da, x2, y2, nwarm, nloop, &avgTime);
129:     VecRestoreArrayRead(x, &x2);
130:     VecRestoreArray(y, &y2);
131:     PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time      = %e\n", len, avgTime);

133:     /* Access with Kokkos multi-dimensional OffsetViews */
134:     VecSet(y, 0.0);
135:     VecSetRandom(x, rctx);
136:     DMDAVecGetKokkosOffsetView(da, x, &x3);
137:     DMDAVecGetKokkosOffsetView(da, y, &y3);

139:     for (PetscInt it = 0; it < nwarm + nloop; it++) {
140:       if (it == nwarm) PetscTime(&tstart);
141:       Kokkos::parallel_for(
142:         "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
143:         KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
144:     }
145:     PetscTime(&tend);
146:     DMDAVecRestoreKokkosOffsetView(da, x, &x3);
147:     DMDAVecRestoreKokkosOffsetView(da, y, &y3);
148:     avgTime = (tend - tstart) / nloop;
149:     PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", len, avgTime);

151:     VecDestroy(&x);
152:     VecDestroy(&y);
153:     DMDestroy(&da);
154:   }
155:   PetscRandomDestroy(&rctx);
156:   PetscFinalize();
157:   return 0;
158: }

160: /*TEST
161:   build:
162:     requires: kokkos_kernels

164:   test:
165:     suffix: 1
166:     requires: kokkos_kernels
167:     args: -min 32 -max 32 -dm_vec_type kokkos
168:     filter: grep -v "time"

170: TEST*/