Actual source code: ex1k.kokkos.cxx
static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n";

/*
  Usage:
    mpirun -n <np> ./ex1k
      -f <file>        : input petsc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c
      -mat_type <type> : aij or one of its subclasses (e.g., aijcusparse, aijkokkos). Default is aij.
      -n <num>         : run MatMult() this many times and report the average time. Default is 500.

  Notes:
    A CPU timer (PetscTime()) is used to measure the time.

  Examples:
    On OLCF Summit (with GPU-aware MPI):
      # 6 MPI ranks:
      # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), 1 GPU per RS (-g 1), and 6 RSs per node (-r 6)
      jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex1k -f 1138_bus.aij -mat_type aijcusparse

      # 1 MPI rank:
      jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex1k -f 1138_bus.aij -mat_type aijcusparse

    On OLCF Crusher:
      # 1 MPI rank:
      # run with 1 node (-N1), 1 MPI rank (-n1), and 2 hardware threads per rank (-c2)
      srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex1k -f HV15R.aij -mat_type aijkokkos

      # 8 MPI ranks:
      srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex1k -f HV15R.aij -mat_type aijkokkos
*/
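
/*
  For reference, converting a MatrixMarket file to the PETSc binary format read by -f
  might look like the following (file names hypothetical; see mat/tests/ex72.c for its
  exact options):

    ./ex72 -fin HV15R.mtx -fout HV15R.aij
*/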

#include <petscmat.h>
#include <petscdevice.h>
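
/*
  SyncDevice() blocks the host until all previously launched device work has completed.
  Without it, the CPU timers below could stop before the asynchronously launched
  MatMult() kernels actually finish, under-reporting the time.
*/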
#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice()
#endif

int main(int argc, char **args)
{
  Mat            A, A2;
  Vec            x, y, x2, y2;
  PetscViewer    fd;
  char           matfile[PETSC_MAX_PATH_LEN];
  char           mattype[64];
  PetscBool      flg;
  PetscLogStage  stage;
  PetscInt       i, n = 500, nskip = 5, M, N;
  MatInfo        info;
  PetscLogDouble tstart = 0, tend = 0, avgTime;
  PetscRandom    rctx;
  PetscReal      norm;
  PetscMPIInt    size;

  PetscCall(PetscInitialize(&argc, &args, (char *)0, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

  /* Read option -n */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));

  /* Read the file name of the input matrix; fail early if it was not given */
  PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg));
  PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must provide a petsc matrix binary file with the -f option");

  PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg));
  if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype)));

  /* Read the matrix file into A2, which keeps the reference MATAIJ type */
  PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd));
  PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
  PetscCall(MatSetType(A2, MATAIJ));
  PetscCall(MatLoad(A2, fd));
  PetscCall(MatCreateVecs(A2, &x2, &y2));
  PetscCall(PetscViewerDestroy(&fd));

  PetscCall(MatGetSize(A2, &M, &N));
  PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

  /* Copy A2 to A and convert A to the specified type */
  PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
  PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A));
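  /* MAT_INPLACE_MATRIX replaces A itself with the converted matrix, so A ends up with
     the type given by -mat_type while A2 keeps the MATAIJ type used for validation */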
  PetscCall(MatCreateVecs(A, &x, &y));

  /* Init x, x2 with the same random values so that y and y2 are comparable */
  PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
  PetscCall(VecSetRandom(x2, rctx));
  PetscCall(PetscRandomDestroy(&rctx));
  PetscCall(VecCopy(x2, x));

  /* Compute the reference y2 = A2 x2 */
  PetscCall(MatMult(A2, x2, y2));

  /* Measure y = A x */
  PetscCall(PetscLogStageRegister("MatMult", &stage));
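  /* The first nskip iterations warm up the device (e.g., copying the matrix to GPU
     memory and allocating buffers) and are excluded from the timing */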
  for (i = 0; i < n + nskip; i++) {
    if (i == nskip) { /* warm-up done; start the clock on all ranks together */
      SyncDevice();
      PetscCall(PetscLogStagePush(stage));
      PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
      PetscCall(PetscTime(&tstart));
    }
    PetscCall(MatMult(A, x, y));
  }
  SyncDevice();
  PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
  PetscCall(PetscTime(&tend));
  avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
  PetscCall(PetscLogStagePop());

  /* Validate y against the reference y2: y2 = y - y2 should be (nearly) zero */
  PetscCall(VecAYPX(y2, -1, y));
  PetscCall(VecNorm(y2, NORM_2, &norm));
  PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() validation failed with error norm %g", (double)norm); /* loose tolerance for double precision */

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(MatDestroy(&A2));
  PetscCall(VecDestroy(&x2));
  PetscCall(VecDestroy(&y2));
  PetscCall(PetscFinalize());
  return 0;
}
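
/*
  A generic run on a machine with NVIDIA GPUs (file name hypothetical) might be:

    mpirun -n 2 ./ex1k -f small.aij -mat_type aijcusparse -n 1000
*/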

/*TEST

  testset:
    args: -n 2 -f ${DATAFILESPATH}/matrices/small
    nsize: 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    requires: !complex double !single kokkos_kernels

    test:
      suffix: 1
      requires: cuda
      args: -mat_type aijcusparse

    test:
      suffix: 2
      args: -mat_type aijkokkos

    test:
      suffix: 3
      requires: hip
      args: -mat_type aijhipsparse

TEST*/