Actual source code: ex1k.kokkos.cxx
1: static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";
3: /*
4: This is a simple test to measure the latency of MPI communication.
5: The test is run with two processes. The first process sends a message
6: to the second process, and after having received the message, the second
7: process sends a message back to the first process once. The is repeated
8: a number of times. The latency is defined as half time of the round-trip.
10: It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).
12: Usage: mpirun -n 2 ./ex1k -mtype <type>
13: Other arguments have a default value that is also used in osu_latency.
15: Examples:
17: On Summit at OLCF:
18: jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k -mtype cuda
20: On Crusher at OLCF:
21: srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype hip
22: */
24: #include <petscsf.h>
25: #include <petscdevice.h>
26: #if defined(PETSC_HAVE_UNISTD_H)
27: #include <unistd.h>
28: #endif
30: #if defined(PETSC_HAVE_CUDA)
31: #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
32: #elif defined(PETSC_HAVE_HIP)
33: #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
34: #elif defined(PETSC_HAVE_KOKKOS)
35: #include <Kokkos_Core.hpp>
36: #define SyncDevice() Kokkos::fence()
37: #else
38: #define SyncDevice()
39: #endif
41: /* Same values as OSU microbenchmarks */
42: #define LAT_LOOP_SMALL 10000
43: #define LAT_SKIP_SMALL 100
44: #define LAT_LOOP_LARGE 1000
45: #define LAT_SKIP_LARGE 10
46: #define LARGE_MESSAGE_SIZE 8192
48: static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype, size_t size, void **ptr)
49: {
50: if (PetscMemTypeHost(mtype)) {
51: #if defined(PETSC_HAVE_GETPAGESIZE)
52: posix_memalign(ptr, getpagesize(), size);
53: #else
54: PetscMalloc(size, ptr);
55: #endif
56: }
57: #if defined(PETSC_HAVE_CUDA)
58: else if (PetscMemTypeCUDA(mtype))
59: cudaMalloc(ptr, size);
60: #elif defined(PETSC_HAVE_HIP)
61: else if (PetscMemTypeHIP(mtype))
62: hipMalloc(ptr, size);
63: #elif defined(PETSC_HAVE_SYCL)
64: else if (PetscMemTypeSYCL(mtype))
65: *ptr = Kokkos::kokkos_malloc(size);
66: #endif
67: return 0;
68: }
70: static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype, void *ptr)
71: {
72: if (PetscMemTypeHost(mtype)) {
73: free(ptr);
74: }
75: #if defined(PETSC_HAVE_CUDA)
76: else if (PetscMemTypeCUDA(mtype))
77: cudaFree(ptr);
78: #elif defined(PETSC_HAVE_HIP)
79: else if (PetscMemTypeHIP(mtype))
80: hipFree(ptr);
81: #elif defined(PETSC_HAVE_SYCL)
82: else if (PetscMemTypeSYCL(mtype))
83: Kokkos::kokkos_free(ptr);
84: #endif
85: return 0;
86: }
88: /* Free memory and set ptr to NULL when succeeded */
89: #define PetscFreeWithMemType(t, p) ((p) && (PetscFreeWithMemType_Private((t), (p)) || ((p) = NULL, 0)))
91: static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype, void *dst, const void *src, size_t n)
92: {
93: if (PetscMemTypeHost(mtype)) PetscMemcpy(dst, src, n);
94: #if defined(PETSC_HAVE_CUDA)
95: else if (PetscMemTypeCUDA(mtype)) cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice);
96: #elif defined(PETSC_HAVE_HIP)
97: else if (PetscMemTypeHIP(mtype)) hipMemcpy(dst, src, n, hipMemcpyHostToDevice);
98: #elif defined(PETSC_HAVE_SYCL)
99: else if (PetscMemTypeSYCL(mtype)) {
100: Kokkos::View<char *> dstView((char *)dst, n);
101: Kokkos::View<const char *, Kokkos::HostSpace> srcView((const char *)src, n);
102: Kokkos::deep_copy(dstView, srcView);
103: }
104: #endif
105: return 0;
106: }
108: int main(int argc, char **argv)
109: {
110: PetscSF sf[64];
111: PetscLogDouble t_start = 0, t_end = 0, time[64];
112: PetscInt i, j, n, nroots, nleaves, niter = 100, nskip = 10;
113: PetscInt maxn = 512 * 1024; /* max 4M bytes messages */
114: PetscSFNode *iremote;
115: PetscMPIInt rank, size;
116: PetscScalar *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
117: size_t msgsize;
118: PetscMemType mtype = PETSC_MEMTYPE_HOST;
119: char mstring[16] = {0};
120: PetscBool isCuda, isHip, isHost, isKokkos, set;
121: PetscInt skipSmall = -1, loopSmall = -1;
122: MPI_Op op = MPI_REPLACE;
125: PetscInitialize(&argc, &argv, NULL, help);
126: #if defined(PETSC_HAVE_CUDA)
127: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
128: #elif defined(PETSC_HAVE_HIP)
129: PetscDeviceInitialize(PETSC_DEVICE_HIP);
130: #endif
131: MPI_Comm_size(PETSC_COMM_WORLD, &size);
132: MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
135: PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL); /* maxn PetscScalars */
136: PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL);
137: PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL);
139: PetscMalloc1(maxn, &iremote);
140: PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set);
141: if (set) {
142: PetscStrcasecmp(mstring, "cuda", &isCuda);
143: PetscStrcasecmp(mstring, "hip", &isHip);
144: PetscStrcasecmp(mstring, "host", &isHost);
145: PetscStrcasecmp(mstring, "kokkos", &isKokkos);
147: if (isHost) mtype = PETSC_MEMTYPE_HOST;
148: else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
149: else if (isHip) mtype = PETSC_MEMTYPE_HIP;
150: else if (isKokkos) {
151: mtype = PETSC_MEMTYPE_KOKKOS;
152: PetscKokkosInitializeCheck();
153: } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
154: }
156: PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&rootdata);
157: PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&leafdata);
159: PetscMalloc2(maxn, &pbuf, maxn, &ebuf);
160: for (i = 0; i < maxn; i++) {
161: pbuf[i] = 123.0;
162: ebuf[i] = 456.0;
163: }
165: for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
166: PetscSFCreate(PETSC_COMM_WORLD, &sf[i]);
167: PetscSFSetFromOptions(sf[i]);
168: if (rank == 0) {
169: nroots = n;
170: nleaves = 0;
171: } else {
172: nroots = 0;
173: nleaves = n;
174: for (j = 0; j < nleaves; j++) {
175: iremote[j].rank = 0;
176: iremote[j].index = j;
177: }
178: }
179: PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES);
180: PetscSFSetUp(sf[i]);
181: }
183: if (loopSmall > 0) {
184: nskip = skipSmall;
185: niter = loopSmall;
186: } else {
187: nskip = LAT_SKIP_SMALL;
188: niter = LAT_LOOP_SMALL;
189: }
191: for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
192: msgsize = sizeof(PetscScalar) * n;
193: PetscMemcpyFromHostWithMemType(mtype, rootdata, pbuf, msgsize);
194: PetscMemcpyFromHostWithMemType(mtype, leafdata, ebuf, msgsize);
196: if (msgsize > LARGE_MESSAGE_SIZE) {
197: nskip = LAT_SKIP_LARGE;
198: niter = LAT_LOOP_LARGE;
199: }
200: MPI_Barrier(MPI_COMM_WORLD);
202: for (i = 0; i < niter + nskip; i++) {
203: if (i == nskip) {
204: SyncDevice();
205: MPI_Barrier(PETSC_COMM_WORLD);
206: t_start = MPI_Wtime();
207: }
208: PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op);
209: PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op);
210: PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op);
211: PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op);
212: }
213: SyncDevice();
214: MPI_Barrier(PETSC_COMM_WORLD);
215: t_end = MPI_Wtime();
216: time[j] = (t_end - t_start) * 1e6 / (niter * 2);
217: }
219: PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device");
220: for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
221: PetscSFDestroy(&sf[j]);
222: PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]);
223: }
225: PetscFree2(pbuf, ebuf);
226: PetscFreeWithMemType(mtype, rootdata);
227: PetscFreeWithMemType(mtype, leafdata);
228: PetscFree(iremote);
229: PetscFinalize();
230: return 0;
231: }
233: /**TEST
234: testset:
235: # use small numbers to make the test cheap
236: args: -maxn 4 -skipSmall 1 -loopSmall 1
237: filter: grep "DOES_NOT_EXIST"
238: output_file: output/empty.out
239: nsize: 2
241: test:
242: args: -mtype host
244: test:
245: requires: cuda
246: args: -mtype cuda
248: test:
249: requires: hip
250: args: -mtype hip
252: test:
253: requires: kokkos
254: args: -mtype kokkos
255: TEST**/