Actual source code: ex2.c
static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, SF issues these
  operations in the default stream and does not synchronize them, since it assumes the routines that
  consume the destination data also run on the default stream. However, when the destination data is on
  the CPU, SF must guarantee the data is ready to use on the CPU after PetscSFXxxEnd(). (A standalone
  PetscSF sketch of this guarantee follows the listing below.)
*/

#include <petscvec.h>
int main(int argc, char **argv)
{
  PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec                x, y;
  PetscMPIInt        size;
  IS                 ix, iy;
  VecScatter         vscat;

  PetscInitialize(&argc, &argv, (char *)0, help);
  MPI_Comm_size(PETSC_COMM_WORLD, &size);

  /* Create two CUDA vectors x, y. Though we only care about y's memory on the host, we make y a CUDA
     vector as well, since we want y's host memory to be pinned (i.e., non-pageable), to really trigger
     an asynchronous cudaMemcpyDeviceToHost. (A pinned-memory sketch also follows the listing below.)
  */
  VecCreateSeq(PETSC_COMM_WORLD, n, &x);
  VecSetFromOptions(x);
  VecCreateSeq(PETSC_COMM_WORLD, n, &y);
  VecSetFromOptions(y);

  /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  VecGetArray(x, &val);
  for (i = 0; i < n; i++) val[i] = i / 2.0;
  VecRestoreArray(x, &val);
  VecScale(x, 2.0);
  VecSet(y, 314);

  /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  VecGetArray(y, &val);
  VecRestoreArray(y, &val);

  /* The vscat is simply a vector copy */
  ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix);
  ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy);
  VecScatterCreate(x, ix, y, iy, &vscat);

  /* Do a device-to-host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on the host once
     VecScatterEnd() returns. Otherwise, wrong data will be displayed.
  */
  VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
  VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
  VecGetArrayRead(y, &yval);
  /* Display the first and the last entries of y to see if it is valid on host */
  PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1]));
  VecRestoreArrayRead(y, &yval);

  VecDestroy(&x);
  VecDestroy(&y);
  ISDestroy(&ix);
  ISDestroy(&iy);
  VecScatterDestroy(&vscat);
  PetscFinalize();
  return 0;
}

/*TEST

   test:
     requires: cuda
     diff_args: -j
     # make sure the host memory is pinned
     # sf_backend cuda is not needed if compiling only with cuda
     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
     suffix: hip
     requires: hip
     diff_args: -j
     output_file: output/ex2_1.out
     # make sure the host memory is pinned
     # sf_backend hip is not needed if compiling only with hip
     args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/
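
The header comment's guarantee, that destination data on the CPU must be ready to use as soon as PetscSFXxxEnd() returns, can also be exercised through the PetscSF interface directly instead of VecScatter. The following is a minimal, self-contained sketch and is not part of ex2.c: the identity graph, the tiny problem size, and the buffer names rootdata/leafdata are illustrative choices. In ex2.c the source data lives on the GPU; here plain host arrays keep the sketch dependency-free, but the contract on PetscSFBcastEnd() is the same, namely that leafdata is safe to read immediately after it returns.

#include <petscsf.h>
int main(int argc, char **argv)
{
  PetscSF      sf;
  PetscInt     i, n = 8;
  PetscSFNode *iremote;
  PetscScalar  rootdata[8], leafdata[8];

  PetscInitialize(&argc, &argv, (char *)0, NULL);

  /* Identity graph on a single rank: leaf i receives from root i */
  PetscMalloc1(n, &iremote);
  for (i = 0; i < n; i++) {
    iremote[i].rank  = 0;
    iremote[i].index = i;
    rootdata[i]      = 2.0 * i;
    leafdata[i]      = -1.0;
  }
  PetscSFCreate(PETSC_COMM_SELF, &sf);
  PetscSFSetGraph(sf, n, n, NULL, PETSC_COPY_VALUES, iremote, PETSC_OWN_POINTER);
  PetscSFSetFromOptions(sf); /* lets command-line options (e.g. -sf_backend) configure the SF */

  /* Broadcast root values to the leaves. Because leafdata is host memory, the SF
     contract says it must be valid as soon as PetscSFBcastEnd() returns. */
  PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE);
  PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE);
  PetscPrintf(PETSC_COMM_SELF, "leafdata[0]=%g, leafdata[%" PetscInt_FMT "]=%g\n", (double)PetscRealPart(leafdata[0]), n - 1, (double)PetscRealPart(leafdata[n - 1]));

  PetscSFDestroy(&sf);
  PetscFinalize();
  return 0;
}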
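
The pinned-memory requirement mentioned above is what makes the test meaningful: an asynchronous device-to-host copy into pageable host memory typically degrades to a synchronous transfer, while a copy into pinned (page-locked) memory can return before the data has arrived, so the host buffer must not be read until the stream has been synchronized. The CUDA-runtime sketch below is independent of PETSc, omits error checking, and uses illustrative names; the cudaStreamSynchronize() call is the kind of synchronization SF has to perform internally before VecScatterEnd()/PetscSFXxxEnd() return when the destination is on the CPU. The -vec_pinned_memory_min 0 option in the TEST block serves the purpose noted in its comment: it makes sure the CUDA/HIP vector's host buffer is pinned.

#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
  const int    n     = 100000;
  const size_t bytes = n * sizeof(double);
  double      *hbuf  = NULL, *dbuf = NULL;

  /* Pinned (page-locked) host allocation; needed for cudaMemcpyAsync to be
     truly asynchronous with respect to the host. */
  cudaMallocHost((void **)&hbuf, bytes);
  cudaMalloc((void **)&dbuf, bytes);
  cudaMemset(dbuf, 0, bytes);

  /* Asynchronous device-to-host copy on the default stream. When this call
     returns, the copy may still be in flight, so hbuf is not yet safe to read. */
  cudaMemcpyAsync(hbuf, dbuf, bytes, cudaMemcpyDeviceToHost, 0);

  /* Synchronize the stream before touching hbuf on the host. */
  cudaStreamSynchronize(0);

  printf("hbuf[0]=%g, hbuf[%d]=%g\n", hbuf[0], n - 1, hbuf[n - 1]);

  cudaFreeHost(hbuf);
  cudaFree(dbuf);
  return 0;
}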