Actual source code: ex1k.kokkos.cxx

static char help[] = "Benchmarking memory bandwidth with VecAXPY() on parallel vectors\n";
/*
  Usage:
   mpirun -n <np> ./ex1k -vec_type <device vector type>
     -n  <n>  # number of data points, with vector sizes running 128, 256, 512 and up. Maximum and default is 23.
     -m  <m>  # run each VecAXPY() m times to get the average time, default is 1000.

  Example:

  Running on Crusher at OLCF:
  # run with 1 MPI rank (-n1), 32 CPUs (-c32), and map the process to CPU 0 and GPU 0
  $ srun -n1 -c32 --cpu-bind=map_cpu:0 --gpus-per-node=8 --gpu-bind=map_gpu:0 ./ex1k -vec_type kokkos
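
  On other machines, analogous runs might look like the following (assuming PETSc was
  configured with the matching backend; the launcher and rank count are placeholders):
  $ mpirun -n 1 ./ex1k -vec_type cuda   # NVIDIA GPUs
  $ mpirun -n 1 ./ex1k -vec_type hip    # AMD GPUs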
*/

#include <petscvec.h>
#include <petscdevice.h>

#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice() ((void)0) /* no device backend, so synchronization is a no-op */
#endif
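
/* Device kernels launched by VecAXPY() are asynchronous with respect to the host, so each
   timed region below ends with SyncDevice() before reading the host clock; otherwise
   PetscTime() would largely measure kernel-launch overhead rather than the memory traffic. */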

int main(int argc, char **argv)
{
  PetscInt       i, k, N, n, m = 1000, nsamples;
  PetscLogDouble tstart, tend, time;
  Vec            x, y;
  PetscScalar    alpha = 3.14;
  PetscLogDouble bandwidth;
  PetscMPIInt    size;
  PetscInt       Ns[] = {/* Use explicit sizes so that one can add sizes very close to 2^31 */
                   128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912};
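  /* Note: with the default 32-bit PetscInt, a local size must stay below 2^31 - 1; the largest
     entry above is 2^29 = 536870912. Configuring PETSc --with-64-bit-indices lifts this limit. */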

  PetscFunctionBeginUser;
  n = nsamples = sizeof(Ns) / sizeof(Ns[0]);

  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL)); /* Up to vectors of local size 2^{n+6} */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-m", &m, NULL)); /* Run each VecAXPY() m times */

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
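  /* size (the number of MPI ranks) is used below to convert the per-rank local length N
     into the aggregate data volume moved across all ranks. */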

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Vector size (N)   Time (us)   Bandwidth (GB/s)\n"));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "----------------------------------------------\n"));

  nsamples = PetscMin(nsamples, n);
  for (k = 0; k < nsamples; k++) {
    N = Ns[k];
    PetscCall(VecCreate(PETSC_COMM_WORLD, &x));
    PetscCall(VecSetFromOptions(x));
    PetscCall(VecSetSizes(x, N, PETSC_DECIDE));
    PetscCall(VecSetUp(x));
    PetscCall(VecDuplicate(x, &y));
    PetscCall(VecSet(x, 2.5));
    PetscCall(VecSet(y, 4.0));
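    /* -vec_type from the command line (e.g. kokkos, cuda, hip) determines whether x and y
       live in device memory; the initial values are arbitrary since only bandwidth matters. */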

    /* Warm-up */
    for (i = 0; i < 4; i++) PetscCall(VecAXPY(x, alpha, y));
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
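
    /* The warm-up runs absorb one-time costs such as lazy device initialization and
       first-touch memory faults, and the barrier lines the ranks up so that the timed
       region starts at (nearly) the same instant on every process. */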

    PetscCall(PetscTime(&tstart));
    for (i = 0; i < m; i++) PetscCall(VecAXPY(x, alpha, y));
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    PetscCall(PetscTime(&tend));
    time      = (tend - tstart) * 1e6 / m;
    bandwidth = 3.0 * N * size * sizeof(PetscScalar) / time * 1e-3; /* read x, y and write x */
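    /* Illustrative arithmetic with assumed numbers: one rank, N = 1048576, 8-byte PetscScalar.
       Each VecAXPY() then moves 3 * 1048576 * 8 = 25165824 bytes; if the measured average time
       is 100 us, the reported bandwidth is 25165824 / 100 * 1e-3, roughly 251.7 GB/s. */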
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12" PetscInt_FMT ", %12.4f, %8.2f\n", N, time, bandwidth));
    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
  }

  PetscCall(PetscFinalize());
  return 0;
}

/*TEST
  build:
    requires: kokkos_kernels

  test:
    args: -n 2 -m 2 -vec_type kokkos
    output_file: output/empty.out
    filter: grep "DOES_NOT_EXIST"

TEST*/