Actual source code: ex2k.kokkos.cxx
/* One-line description of this program; main() passes it to PetscInitialize() as the help string. */
static char help[] =
  "Benchmarking MatProduct with AIJ and its subclass matrix types\n";
3: /*
4: Usage:
5: mpirun -n <np> ./ex2k
6: -A <filename> : input petsc binary file for matrix A; one can convert a file from MatrixMarket using mat/tests/ex72.c
7: -P <filename> : input petsc binary file for matrix P; optional, if not given, P = A
8: -mat_type <str> : aij or its subclass. Default is aij.
9: -prod_type <str> : AP, AtP, APt, PtAP or PAPt. Default is AP.
10: -n <num> : run MatProductNumeric() this many times and report average time. Default is 100.
12: Notes:
13: It uses a CPU timer (PetscTime) to measure elapsed time; the device is synchronized (SyncDevice) before each timer reading.
15: Examples:
16: On OLCF Summit (with GPU-aware MPI)
17: # 6 MPI ranks:
18: # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), and 1 GPU per RS (-g 1), 6 RSs per node (-r 6)
19: jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex2k -A cage12.aij -mat_type aijcusparse
21: # 1 MPI rank
22: jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex2k -A cage12.aij -mat_type aijcusparse
24: On OLCF Crusher:
25: # 1 MPI rank
26: # run with 1 node (-N1), 1 mpi rank (-n1), 2 hardware threads per rank (-c2)
27: srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos
29: # 8 MPI ranks
30: srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos
31: */
32: #include <petscmat.h>
33: #include <petscdevice.h>
/*
  SyncDevice() - statement placed around the timed regions in main().

  The first backend that PETSc was configured with is selected, in this order:
    CUDA   -> cudaDeviceSynchronize(), error-checked with PetscCallCUDA()
    HIP    -> hipDeviceSynchronize(), error-checked with PetscCallHIP()
    Kokkos -> Kokkos::fence()
  When none of them is available the macro expands to nothing.
*/
#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  /* No device backend: nothing to synchronize */
  #define SyncDevice()
#endif
48: int main(int argc, char **args)
49: {
50: Mat A, P, C;
51: Mat A2, P2, C2; /* Shadow matrices (of MATAIJ) of A,P,C for initialization and validation */
52: char matTypeStr[64], prodTypeStr[32];
53: char fileA[PETSC_MAX_PATH_LEN], fileP[PETSC_MAX_PATH_LEN];
54: PetscViewer fdA, fdP;
55: PetscBool flg, flgA, flgP, equal = PETSC_FALSE;
56: PetscLogStage stage;
57: PetscInt i, n = 100, nskip = 2, M, N;
58: MatInfo info;
59: PetscLogDouble tstart = 0, tend = 0, avgTime;
60: PetscMPIInt size;
61: MatProductType prodType;
62: PetscBool isAP, isAtP, isAPt, isPtAP, isPAPt;
64: PetscInitialize(&argc, &args, (char *)0, help);
65: MPI_Comm_size(PETSC_COMM_WORLD, &size);
67: /* Read options -n */
68: PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL);
70: /* Load the matrix from a binary file */
71: PetscOptionsGetString(NULL, NULL, "-A", fileA, PETSC_MAX_PATH_LEN, &flgA);
72: PetscOptionsGetString(NULL, NULL, "-P", fileP, PETSC_MAX_PATH_LEN, &flgP);
75: PetscOptionsGetString(NULL, NULL, "-mat_type", matTypeStr, sizeof(matTypeStr), &flg);
76: if (!flg) PetscStrncpy(matTypeStr, MATAIJ, sizeof(matTypeStr)); /* Inject the default if not provided */
78: PetscOptionsGetString(NULL, NULL, "-prod_type", prodTypeStr, sizeof(prodTypeStr), &flg);
79: if (!flg) PetscStrncpy(prodTypeStr, "AP", sizeof(prodTypeStr)); /* Inject the default if not provided */
81: PetscStrcmp(prodTypeStr, "AP", &isAP);
82: PetscStrcmp(prodTypeStr, "AtP", &isAtP);
83: PetscStrcmp(prodTypeStr, "APt", &isAPt);
84: PetscStrcmp(prodTypeStr, "PtAP", &isPtAP);
85: PetscStrcmp(prodTypeStr, "PAPt", &isPAPt);
87: if (isAP) prodType = MATPRODUCT_AB;
88: else if (isAtP) prodType = MATPRODUCT_AtB;
89: else if (isAPt) prodType = MATPRODUCT_ABt;
90: else if (isPtAP) prodType = MATPRODUCT_PtAP;
91: else if (isPAPt) prodType = MATPRODUCT_RARt;
92: else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_USER, "Unsupported product type %s", prodTypeStr);
94: /* Read the matrix file to A2 */
95: PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileA, FILE_MODE_READ, &fdA);
96: MatCreate(PETSC_COMM_WORLD, &A2);
97: MatSetType(A2, MATAIJ);
98: MatLoad(A2, fdA);
99: PetscViewerDestroy(&fdA);
101: MatGetSize(A2, &M, &N);
102: MatGetInfo(A2, MAT_GLOBAL_SUM, &info);
103: PetscPrintf(PETSC_COMM_WORLD, "Input matrix A: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileA, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M);
105: /* Copy A2 to A and convert A to the specified type */
106: MatDuplicate(A2, MAT_COPY_VALUES, &A);
107: MatConvert(A, matTypeStr, MAT_INPLACE_MATRIX, &A);
109: /* Init P, P2 similarly */
110: if (flgP) { /* If user provided P */
111: PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileP, FILE_MODE_READ, &fdP);
112: MatCreate(PETSC_COMM_WORLD, &P2);
113: MatSetType(P2, MATAIJ);
114: MatLoad(P2, fdP);
115: PetscViewerDestroy(&fdP);
117: MatDuplicate(P2, MAT_COPY_VALUES, &P);
118: MatConvert(P, matTypeStr, MAT_INPLACE_MATRIX, &P);
120: MatGetSize(P2, &M, &N);
121: MatGetInfo(P2, MAT_GLOBAL_SUM, &info);
122: PetscPrintf(PETSC_COMM_WORLD, "Input matrix P: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileP, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M);
123: } else { /* otherwise just let P = A */
124: PetscPrintf(PETSC_COMM_WORLD, "Input matrix P = A\n");
125: P2 = A2;
126: P = A;
127: }
129: /* Compute the reference C2 */
130: MatProductCreate(A2, P2, NULL, &C2);
131: MatProductSetType(C2, prodType);
132: MatProductSetFill(C2, PETSC_DEFAULT);
133: MatProductSetFromOptions(C2);
134: MatProductSymbolic(C2);
135: MatProductNumeric(C2);
136: MatGetSize(C2, &M, &N);
137: MatGetInfo(C2, MAT_GLOBAL_SUM, &info);
138: PetscPrintf(PETSC_COMM_WORLD, "Mat product C = %s: %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", prodTypeStr, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M);
140: /* Compute C */
141: MatProductCreate(A, P, NULL, &C);
142: MatProductSetType(C, prodType);
143: MatProductSetAlgorithm(C, MATPRODUCTALGORITHMBACKEND);
144: MatProductSetFill(C, PETSC_DEFAULT);
145: MatProductSetFromOptions(C);
147: /* Measure MatProductSymbolic */
148: PetscLogStageRegister("MatProductSymbolic", &stage);
149: PetscLogStagePush(stage);
150: SyncDevice();
151: MPI_Barrier(PETSC_COMM_WORLD);
152: PetscTime(&tstart);
153: MatProductSymbolic(C);
154: SyncDevice();
155: MPI_Barrier(PETSC_COMM_WORLD);
156: PetscTime(&tend);
157: avgTime = (tend - tstart) * 1e6; /* microseconds */
158: PetscLogStagePop();
159: PetscPrintf(PETSC_COMM_WORLD, "\nMatProductSymbolic() time (us) with %d MPI ranks = %8.2f\n", size, avgTime);
161: /* Measure MatProductNumeric */
162: PetscLogStageRegister("MatProductNumeric", &stage);
163: for (i = 0; i < n + nskip; i++) {
164: if (i == nskip) {
165: SyncDevice();
166: PetscLogStagePush(stage);
167: MPI_Barrier(PETSC_COMM_WORLD);
168: PetscTime(&tstart);
169: }
170: MatProductReplaceMats(A, P, NULL, C);
171: MatProductNumeric(C);
172: }
173: SyncDevice();
174: MPI_Barrier(PETSC_COMM_WORLD);
175: PetscTime(&tend);
176: avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
177: PetscLogStagePop();
179: MatMultEqual(C, C2, 8, &equal); /* Not MatEqual() since C and C2 are not necessarily bitwise equal */
182: PetscPrintf(PETSC_COMM_WORLD, "MatProductNumeric() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime);
184: MatDestroy(&A);
185: if (flgP) MatDestroy(&P);
186: MatDestroy(&C);
188: MatDestroy(&A2);
189: if (flgP) MatDestroy(&P2);
190: MatDestroy(&C2);
192: PetscFinalize();
193: return 0;
194: }
196: /*TEST
198: testset:
199: args: -n 2 -A ${DATAFILESPATH}/matrices/small
200: nsize: 1
201: filter: grep "DOES_NOT_EXIST"
202: output_file: output/empty.out
203: requires: !complex double !single kokkos_kernels
205: test:
206: suffix: 1
207: requires: cuda
208: args: -mat_type aijcusparse
210: test:
211: suffix: 2
212: args: -mat_type aijkokkos
214: test:
215: suffix: 3
216: requires: hip
217: args: -mat_type aijhipsparse
219: TEST*/