Actual source code: ex1k.kokkos.cxx

  1: static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

  3: /*
  4:   This is a simple test to measure the latency of MPI communication.
  5:   The test is run with two processes.  The first process sends a message
  6:   to the second process, and after having received the message, the second
  7:   process sends a message back to the first process once.  The is repeated
  8:   a number of times.  The latency is defined as half time of the round-trip.

 10:   It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

 12:   Usage: mpirun -n 2 ./ex1k -mtype <type>
 13:   Other arguments have a default value that is also used in osu_latency.

 15:   Examples:

 17:   On Summit at OLCF:
 18:     jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k  -mtype cuda

 20:   On Crusher at OLCF:
 21:     srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype hip
 22: */

 24: #include <petscsf.h>
 25: #include <petscdevice.h>
 26: #if defined(PETSC_HAVE_UNISTD_H)
 27:   #include <unistd.h>
 28: #endif

 30: #if defined(PETSC_HAVE_CUDA)
 31:   #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
 32: #elif defined(PETSC_HAVE_HIP)
 33:   #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
 34: #elif defined(PETSC_HAVE_KOKKOS)
 35:   #include <Kokkos_Core.hpp>
 36:   #define SyncDevice() Kokkos::fence()
 37: #else
 38:   #define SyncDevice()
 39: #endif

 41: /* Same values as OSU microbenchmarks */
 42: #define LAT_LOOP_SMALL     10000
 43: #define LAT_SKIP_SMALL     100
 44: #define LAT_LOOP_LARGE     1000
 45: #define LAT_SKIP_LARGE     10
 46: #define LARGE_MESSAGE_SIZE 8192

 48: static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype, size_t size, void **ptr)
 49: {
 50:   if (PetscMemTypeHost(mtype)) {
 51: #if defined(PETSC_HAVE_GETPAGESIZE)
 52:     posix_memalign(ptr, getpagesize(), size);
 53: #else
 54:     PetscMalloc(size, ptr);
 55: #endif
 56:   }
 57: #if defined(PETSC_HAVE_CUDA)
 58:   else if (PetscMemTypeCUDA(mtype))
 59:     cudaMalloc(ptr, size);
 60: #elif defined(PETSC_HAVE_HIP)
 61:   else if (PetscMemTypeHIP(mtype))
 62:     hipMalloc(ptr, size);
 63: #elif defined(PETSC_HAVE_SYCL)
 64:   else if (PetscMemTypeSYCL(mtype))
 65:     *ptr = Kokkos::kokkos_malloc(size);
 66: #endif
 67:   return 0;
 68: }

 70: static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype, void *ptr)
 71: {
 72:   if (PetscMemTypeHost(mtype)) {
 73:     free(ptr);
 74:   }
 75: #if defined(PETSC_HAVE_CUDA)
 76:   else if (PetscMemTypeCUDA(mtype))
 77:     cudaFree(ptr);
 78: #elif defined(PETSC_HAVE_HIP)
 79:   else if (PetscMemTypeHIP(mtype))
 80:     hipFree(ptr);
 81: #elif defined(PETSC_HAVE_SYCL)
 82:   else if (PetscMemTypeSYCL(mtype))
 83:     Kokkos::kokkos_free(ptr);
 84: #endif
 85:   return 0;
 86: }

 88: /* Free memory and set ptr to NULL when succeeded */
 89: #define PetscFreeWithMemType(t, p) ((p) && (PetscFreeWithMemType_Private((t), (p)) || ((p) = NULL, 0)))

 91: static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype, void *dst, const void *src, size_t n)
 92: {
 93:   if (PetscMemTypeHost(mtype)) PetscMemcpy(dst, src, n);
 94: #if defined(PETSC_HAVE_CUDA)
 95:   else if (PetscMemTypeCUDA(mtype)) cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice);
 96: #elif defined(PETSC_HAVE_HIP)
 97:   else if (PetscMemTypeHIP(mtype)) hipMemcpy(dst, src, n, hipMemcpyHostToDevice);
 98: #elif defined(PETSC_HAVE_SYCL)
 99:   else if (PetscMemTypeSYCL(mtype)) {
100:     Kokkos::View<char *>                          dstView((char *)dst, n);
101:     Kokkos::View<const char *, Kokkos::HostSpace> srcView((const char *)src, n);
102:     Kokkos::deep_copy(dstView, srcView);
103:   }
104: #endif
105:   return 0;
106: }

108: int main(int argc, char **argv)
109: {
110:   PetscSF        sf[64];
111:   PetscLogDouble t_start = 0, t_end = 0, time[64];
112:   PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
113:   PetscInt       maxn = 512 * 1024; /* max 4M bytes messages */
114:   PetscSFNode   *iremote;
115:   PetscMPIInt    rank, size;
116:   PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
117:   size_t         msgsize;
118:   PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
119:   char           mstring[16] = {0};
120:   PetscBool      isCuda, isHip, isHost, isKokkos, set;
121:   PetscInt       skipSmall = -1, loopSmall = -1;
122:   MPI_Op         op = MPI_REPLACE;

125:   PetscInitialize(&argc, &argv, NULL, help);
126: #if defined(PETSC_HAVE_CUDA)
127:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
128: #elif defined(PETSC_HAVE_HIP)
129:   PetscDeviceInitialize(PETSC_DEVICE_HIP);
130: #endif
131:   MPI_Comm_size(PETSC_COMM_WORLD, &size);
132:   MPI_Comm_rank(PETSC_COMM_WORLD, &rank);

135:   PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL); /* maxn PetscScalars */
136:   PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL);
137:   PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL);

139:   PetscMalloc1(maxn, &iremote);
140:   PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set);
141:   if (set) {
142:     PetscStrcasecmp(mstring, "cuda", &isCuda);
143:     PetscStrcasecmp(mstring, "hip", &isHip);
144:     PetscStrcasecmp(mstring, "host", &isHost);
145:     PetscStrcasecmp(mstring, "kokkos", &isKokkos);

147:     if (isHost) mtype = PETSC_MEMTYPE_HOST;
148:     else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
149:     else if (isHip) mtype = PETSC_MEMTYPE_HIP;
150:     else if (isKokkos) {
151:       mtype = PETSC_MEMTYPE_KOKKOS;
152:       PetscKokkosInitializeCheck();
153:     } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
154:   }

156:   PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&rootdata);
157:   PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&leafdata);

159:   PetscMalloc2(maxn, &pbuf, maxn, &ebuf);
160:   for (i = 0; i < maxn; i++) {
161:     pbuf[i] = 123.0;
162:     ebuf[i] = 456.0;
163:   }

165:   for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
166:     PetscSFCreate(PETSC_COMM_WORLD, &sf[i]);
167:     PetscSFSetFromOptions(sf[i]);
168:     if (rank == 0) {
169:       nroots  = n;
170:       nleaves = 0;
171:     } else {
172:       nroots  = 0;
173:       nleaves = n;
174:       for (j = 0; j < nleaves; j++) {
175:         iremote[j].rank  = 0;
176:         iremote[j].index = j;
177:       }
178:     }
179:     PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES);
180:     PetscSFSetUp(sf[i]);
181:   }

183:   if (loopSmall > 0) {
184:     nskip = skipSmall;
185:     niter = loopSmall;
186:   } else {
187:     nskip = LAT_SKIP_SMALL;
188:     niter = LAT_LOOP_SMALL;
189:   }

191:   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
192:     msgsize = sizeof(PetscScalar) * n;
193:     PetscMemcpyFromHostWithMemType(mtype, rootdata, pbuf, msgsize);
194:     PetscMemcpyFromHostWithMemType(mtype, leafdata, ebuf, msgsize);

196:     if (msgsize > LARGE_MESSAGE_SIZE) {
197:       nskip = LAT_SKIP_LARGE;
198:       niter = LAT_LOOP_LARGE;
199:     }
200:     MPI_Barrier(MPI_COMM_WORLD);

202:     for (i = 0; i < niter + nskip; i++) {
203:       if (i == nskip) {
204:         SyncDevice();
205:         MPI_Barrier(PETSC_COMM_WORLD);
206:         t_start = MPI_Wtime();
207:       }
208:       PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op);
209:       PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op);
210:       PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op);
211:       PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op);
212:     }
213:     SyncDevice();
214:     MPI_Barrier(PETSC_COMM_WORLD);
215:     t_end   = MPI_Wtime();
216:     time[j] = (t_end - t_start) * 1e6 / (niter * 2);
217:   }

219:   PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device");
220:   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
221:     PetscSFDestroy(&sf[j]);
222:     PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]);
223:   }

225:   PetscFree2(pbuf, ebuf);
226:   PetscFreeWithMemType(mtype, rootdata);
227:   PetscFreeWithMemType(mtype, leafdata);
228:   PetscFree(iremote);
229:   PetscFinalize();
230:   return 0;
231: }

233: /**TEST
234:   testset:
235:     # use small numbers to make the test cheap
236:     args: -maxn 4 -skipSmall 1 -loopSmall 1
237:     filter: grep "DOES_NOT_EXIST"
238:     output_file: output/empty.out
239:     nsize: 2

241:     test:
242:       args: -mtype host

244:     test:
245:       requires: cuda
246:       args: -mtype cuda

248:     test:
249:       requires: hip
250:       args: -mtype hip

252:     test:
253:       requires: kokkos
254:       args: -mtype kokkos
255: TEST**/