Actual source code: ex2k.kokkos.cxx
1: static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";
3: /*
4: On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
5: Time (sec.), on Mar. 1, 2022
6: ------------------------------------------
7: n PETSc C Kokkos
8: ------------------------------------------
9: 32 4.6464E-05 4.7451E-05 1.6880E-04
10: 64 2.5654E-04 2.5164E-04 5.6780E-04
11: 128 1.9383E-03 1.8878E-03 4.7938E-03
12: 256 1.4450E-02 1.3619E-02 3.7778E-02
13: 512 1.1580E-01 1.1551E-01 2.8428E-01
14: 1024 1.4179E+00 1.3772E+00 2.2873E+00
16: Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
17: */
19: #include <petscdmda_kokkos.hpp>
20: #include <petscdm.h>
21: #include <petscdmda.h>
23: using Kokkos::Iterate;
24: using Kokkos::MDRangePolicy;
25: using Kokkos::Rank;
26: using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
27: using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
29: /* PETSc multi-dimensional array access */
30: static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
31: {
32: PetscInt it, i, j, k;
33: PetscLogDouble tstart = 0.0, tend;
34: PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
36: DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm);
37: DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm);
38: for (it = 0; it < nwarm + nloop; it++) {
39: if (it == nwarm) PetscTime(&tstart);
40: for (k = zs; k < zs + zm; k++) {
41: for (j = ys; j < ys + ym; j++) {
42: for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
43: }
44: }
45: }
46: PetscTime(&tend);
47: *avgTime = (tend - tstart) / nloop;
48: return 0;
49: }
51: /* C multi-dimensional array access */
52: static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
53: {
54: PetscInt it, i, j, k;
55: PetscLogDouble tstart = 0.0, tend;
56: PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
58: DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm);
59: DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm);
60: #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
61: #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
62: for (it = 0; it < nwarm + nloop; it++) {
63: if (it == nwarm) PetscTime(&tstart);
64: for (k = zs; k < zs + zm; k++) {
65: for (j = ys; j < ys + ym; j++) {
66: for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
67: }
68: }
69: }
70: PetscTime(&tend);
71: *avgTime = (tend - tstart) / nloop;
72: #undef X2
73: #undef Y2
74: return 0;
75: }
77: int main(int argc, char **argv)
78: {
79: DM da;
80: PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
81: PetscInt dof = 1, sw = 1;
82: DMBoundaryType bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
83: DMDAStencilType st = DMDA_STENCIL_STAR;
84: Vec x, y; /* local/global vectors of the da */
85: PetscRandom rctx;
86: const PetscScalar ***x1;
87: PetscScalar ***y1; /* arrays of g, ll */
88: const PetscScalar *x2;
89: PetscScalar *y2;
90: ConstPetscScalarKokkosOffsetView3D x3;
91: PetscScalarKokkosOffsetView3D y3;
92: PetscLogDouble tstart = 0.0, tend = 0.0, avgTime = 0.0;
93: PetscInt nwarm = 2, nloop = 10;
94: PetscInt min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */
97: PetscInitialize(&argc, &argv, (char *)0, help);
98: PetscRandomCreate(PETSC_COMM_WORLD, &rctx);
99: PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL);
100: PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL);
102: for (PetscInt len = min; len <= max; len = len * 2) {
103: DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da);
104: DMSetFromOptions(da);
105: DMSetUp(da);
107: DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm);
108: DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm);
109: DMCreateLocalVector(da, &x); /* Create local x and global y */
110: DMCreateGlobalVector(da, &y);
112: /* Access with petsc multi-dimensional arrays */
113: VecSetRandom(x, rctx);
114: VecSet(y, 0.0);
115: DMDAVecGetArrayRead(da, x, &x1);
116: DMDAVecGetArray(da, y, &y1);
117: Update1(da, x1, y1, nwarm, nloop, &avgTime);
118: DMDAVecRestoreArrayRead(da, x, &x1);
119: DMDAVecRestoreArray(da, y, &y1);
120: PetscTime(&tend);
121: PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time = %e\n", len, avgTime);
123: /* Access with C multi-dimensional arrays */
124: VecSetRandom(x, rctx);
125: VecSet(y, 0.0);
126: VecGetArrayRead(x, &x2);
127: VecGetArray(y, &y2);
128: Update2(da, x2, y2, nwarm, nloop, &avgTime);
129: VecRestoreArrayRead(x, &x2);
130: VecRestoreArray(y, &y2);
131: PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time = %e\n", len, avgTime);
133: /* Access with Kokkos multi-dimensional OffsetViews */
134: VecSet(y, 0.0);
135: VecSetRandom(x, rctx);
136: DMDAVecGetKokkosOffsetView(da, x, &x3);
137: DMDAVecGetKokkosOffsetView(da, y, &y3);
139: for (PetscInt it = 0; it < nwarm + nloop; it++) {
140: if (it == nwarm) PetscTime(&tstart);
141: Kokkos::parallel_for(
142: "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
143: KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
144: }
145: PetscTime(&tend);
146: DMDAVecRestoreKokkosOffsetView(da, x, &x3);
147: DMDAVecRestoreKokkosOffsetView(da, y, &y3);
148: avgTime = (tend - tstart) / nloop;
149: PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", len, avgTime);
151: VecDestroy(&x);
152: VecDestroy(&y);
153: DMDestroy(&da);
154: }
155: PetscRandomDestroy(&rctx);
156: PetscFinalize();
157: return 0;
158: }
160: /*TEST
161: build:
162: requires: kokkos_kernels
164: test:
165: suffix: 1
166: requires: kokkos_kernels
167: args: -min 32 -max 32 -dm_vec_type kokkos
168: filter: grep -v "time"
170: TEST*/