Actual source code: ex1cu.cu

  /* One-line description of this example; main() hands it to PetscInitialize() as the help string. */
  1: static char help[] = "Benchmarking CUDA kernel launch time\n";
  2: /*
  3:   Running example on Summit at OLCF:
  4:   # run with a total of 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
  5:   $ jsrun -n1 -a1 -c7 -g1 -r1  ./ex1cu
  6:   Average asynchronous CUDA kernel launch time = 4.86 microseconds
  7:   Average synchronous  CUDA kernel launch time = 12.83 microseconds
  8: */
  9: #include <petscsys.h>
 10: #include <petscdevice_cuda.h>

 /*
   NullKernel - Device kernel that takes no arguments and has an empty body.
   main() launches it with a <<<1, 1>>> configuration purely to time the launch itself.
 */
 __global__ void NullKernel(void)
 {
 }

 /*
   main - Times n launches of the empty NullKernel on the default (NULL) stream, first
   back-to-back without waiting for completion and then with a stream synchronization
   after every launch, and prints the average time per launch of each mode in microseconds.

   Options:
     -n <count> : number of kernel launches per mode (default 100000); must be positive

   Returns 0 on success; any failing PETSc or CUDA call is reported through PETSc's
   error handler and its nonzero error code is returned instead.
 */
 int main(int argc, char **argv)
 {
   PetscInt       i, n = 100000;
   PetscLogDouble tstart, tend, avg_us;

   PetscFunctionBeginUser;
   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
   /* n is the divisor of both averages below, so reject non-positive counts up front */
   PetscCheck(n > 0, PETSC_COMM_WORLD, PETSC_ERR_ARG_OUTOFRANGE, "Number of kernel launches (-n) must be positive, got %" PetscInt_FMT, n);
   PetscCallCUDA(cudaStreamSynchronize(NULL)); /* Initialize CUDA runtime to get more accurate timing below */

   /* Launch a sequence of kernels asynchronously. Previously launched kernels do not need to be completed before launching a new one */
   PetscCall(PetscTime(&tstart));
   for (i = 0; i < n; i++) NullKernel<<<1, 1, 0, NULL>>>();
   PetscCall(PetscTime(&tend));
   PetscCallCUDA(cudaGetLastError());          /* Launch errors are only visible here; checked after tend so the timing is not perturbed */
   PetscCallCUDA(cudaStreamSynchronize(NULL)); /* Sync after tend since we don't want to count kernel execution time */
   avg_us = (tend - tstart) * 1e6 / n;
   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous CUDA kernel launch time = %.2f microseconds\n", avg_us));

   /* Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed */
   PetscCall(PetscTime(&tstart));
   for (i = 0; i < n; i++) {
     NullKernel<<<1, 1, 0, NULL>>>();
     PetscCallCUDA(cudaStreamSynchronize(NULL));
   }
   PetscCall(PetscTime(&tend));
   PetscCallCUDA(cudaGetLastError()); /* Catch any launch failure that would make the measurement meaningless */
   avg_us = (tend - tstart) * 1e6 / n;
   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous  CUDA kernel launch time = %.2f microseconds\n", avg_us));

   PetscCall(PetscFinalize());
   return 0;
 }

 46: /*TEST
 47:   build:
 48:     requires: cuda

 50:   test:
 51:     requires: cuda
 52:     args: -n 2
 53:     output_file: output/empty.out
 54:     filter: grep "DOES_NOT_EXIST"

 56: TEST*/