/* Actual source code: ex2.c */

  1: static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
  2: /*
  3:   SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  4:   operations in the default stream and does not sync these operations since it assumes routines consume
  5:   the destination data are also on the default stream. However, when destination data in on CPU,
  6:   SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
  7:  */

  9: #include <petscvec.h>
 10: int main(int argc, char **argv)
 11: {
 12:   PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
 13:   PetscScalar       *val;
 14:   const PetscScalar *yval;
 15:   Vec                x, y;
 16:   PetscMPIInt        size;
 17:   IS                 ix, iy;
 18:   VecScatter         vscat;

 21:   PetscInitialize(&argc, &argv, (char *)0, help);
 22:   MPI_Comm_size(PETSC_COMM_WORLD, &size);

 25:   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
 26:      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
 27:      cudaMemcpyDeviceToHost.
 28:    */
 29:   VecCreateSeq(PETSC_COMM_WORLD, n, &x);
 30:   VecSetFromOptions(x);
 31:   VecCreateSeq(PETSC_COMM_WORLD, n, &y);
 32:   VecSetFromOptions(y);

 34:   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
 35:   VecGetArray(x, &val);
 36:   for (i = 0; i < n; i++) val[i] = i / 2.0;
 37:   VecRestoreArray(x, &val);
 38:   VecScale(x, 2.0);
 39:   VecSet(y, 314);

 41:   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
 42:   VecGetArray(y, &val);
 43:   VecRestoreArray(y, &val);

 45:   /* The vscat is simply a vector copy */
 46:   ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix);
 47:   ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy);
 48:   VecScatterCreate(x, ix, y, iy, &vscat);

 50:   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
 51:      cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed.
 52:    */
 53:   VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
 54:   VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
 55:   VecGetArrayRead(y, &yval);
 56:   /* Display the first and the last entries of y to see if it is valid on host */
 57:   PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1]));
 58:   VecRestoreArrayRead(y, &yval);

 60:   VecDestroy(&x);
 61:   VecDestroy(&y);
 62:   ISDestroy(&ix);
 63:   ISDestroy(&iy);
 64:   VecScatterDestroy(&vscat);
 65:   PetscFinalize();
 66:   return 0;
 67: }

/*TEST

   test:
    requires: cuda
    diff_args: -j
    # make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    # make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/