Actual source code: ex1k.kokkos.cxx
static char help[] = "Benchmarking memory bandwidth with VecAXPY() on parallel vectors\n";

/*
  Usage:
    mpirun -n <np> ./ex1k -vec_type <device vector type>
      -n <n>  # number of data points of vector sizes from 128, 256, 512 and up. Maximum and default is 23.
      -m <m>  # run each VecAXPY() m times to get the average time, default is 1000.

  Example:

  Running on Crusher at OLCF:
  # run with 1 MPI rank (-n1), 32 CPUs (-c32), and map the process to CPU 0 and GPU 0
  $ srun -n1 -c32 --cpu-bind=map_cpu:0 --gpus-per-node=8 --gpu-bind=map_gpu:0 ./ex1k -vec_type kokkos
*/

#include <petscvec.h>
#include <petscdevice.h>

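/* SyncDevice() blocks the host until all work queued on the device has finished, so the timers below measure completed kernels rather than asynchronous launches */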
#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice() 0 /* host-only build: synchronization is a no-op */
#endif

int main(int argc, char **argv)
{
  PetscInt       i, k, N, n, m = 1000, nsamples;
  PetscLogDouble tstart, tend, time;
  Vec            x, y;
  PetscScalar    alpha = 3.14;
  PetscLogDouble bandwidth;
  PetscMPIInt    size;
  PetscInt       Ns[] = {/* Use explicit sizes so that one can add sizes very close to 2^31 */
                         128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912};

  PetscFunctionBeginUser;
  n = nsamples = sizeof(Ns) / sizeof(Ns[0]);
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL)); /* Up to vectors of local size 2^{n+6} */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-m", &m, NULL)); /* Run each VecAXPY() m times */

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
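  /* Each rank owns N vector entries, so one VecAXPY() moves 3*N*size*sizeof(PetscScalar) bytes across all ranks */
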
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Vector size (N)   Time (us)   Bandwidth (GB/s)\n"));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "----------------------------------------------\n"));

  nsamples = PetscMin(nsamples, n); /* cap the sweep at the number of preset sizes */
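
  /* For each sample size: create device vectors, warm up, then time m VecAXPY() calls */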
  for (k = 0; k < nsamples; k++) {
    N = Ns[k];
    PetscCall(VecCreate(PETSC_COMM_WORLD, &x));
    PetscCall(VecSetFromOptions(x));
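    /* The vector type (e.g., kokkos, cuda, hip) is picked up from -vec_type at runtime */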
    PetscCall(VecSetSizes(x, N, PETSC_DECIDE)); /* N is the local size per rank */
    PetscCall(VecSetUp(x));
    PetscCall(VecDuplicate(x, &y));
    PetscCall(VecSet(x, 2.5));
    PetscCall(VecSet(y, 4.0));

    /* Warm up, so one-time costs such as kernel launch and memory setup are excluded from the timings */
    for (i = 0; i < 4; i++) PetscCall(VecAXPY(x, alpha, y));
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));

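    /* Time m back-to-back VecAXPY() calls; SyncDevice() and the barrier ensure all ranks and devices have finished before the clock is read */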
    PetscCall(PetscTime(&tstart));
    for (i = 0; i < m; i++) PetscCall(VecAXPY(x, alpha, y));
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    PetscCall(PetscTime(&tend));
    time      = (tend - tstart) * 1e6 / m;                          /* average time per VecAXPY() in microseconds */
    bandwidth = 3.0 * N * size * sizeof(PetscScalar) / time * 1e-3; /* x += alpha*y reads x and y and writes x */
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12" PetscInt_FMT ", %12.4f, %8.2f\n", N, time, bandwidth));
    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
  }

  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

  build:
    requires: kokkos_kernels

  test:
    args: -n 2 -m 2 -vec_type kokkos
    output_file: output/empty.out
    filter: grep "DOES_NOT_EXIST"

TEST*/