Actual source code: ex2k.kokkos.cxx

  1: static char help[] = "Benchmarking MatProduct with AIJ and its subclass matrix types\n";

  3: /*
  4: Usage:
  5:   mpirun -n <np> ./ex2k
  6:     -A <filename>     : input petsc binary file for matrix A; one can convert a file from MatrixMarket using mat/tests/ex72.c
  7:     -P <filename>     : input petsc binary file for matrix P; optional, if not given, P = A
  8:     -mat_type  <str>  : aij or its subclass. Default is aij.
  9:     -prod_type <str>  : AP, AtP, APt, PtAP or PAPt. Default is AP.
 10:     -n <num>          : run MatProductNumeric() this many times and report average time. Default is 100.

 12: Notes:
 13:   Timings are taken on the host with PetscTime(); the device is synchronized and all ranks pass an MPI barrier before each timer reading.

 15: Examples:
 16:   On OLCF Summit (with GPU-aware MPI)
 17:     # 6 MPI ranks:
 18:     # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), and 1 GPU per RS (-g 1), 6 RSs per node (-r 6)
 19:     jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex2k -A cage12.aij -mat_type aijcusparse

 21:     # 1 MPI rank
 22:     jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex2k -A cage12.aij -mat_type aijcusparse

 24:   On OLCF Crusher:
 25:     # 1 MPI rank
 26:     # run with 1 node (-N1), 1 mpi rank (-n1), 2 hardware threads per rank (-c2)
 27:     srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos

 29:     # 8 MPI ranks
 30:     srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos
 31: */
 32: #include <petscmat.h>
 33: #include <petscdevice.h>

 35: #if defined(PETSC_HAVE_CUDA)
 36: #include <petscdevice_cuda.h>
 37:   #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
 38: #elif defined(PETSC_HAVE_HIP)
 39: #include <petscdevice_hip.h>
 40:   #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
 41: #elif defined(PETSC_HAVE_KOKKOS)
 42:   #include <Kokkos_Core.hpp>
 43:   #define SyncDevice() Kokkos::fence()
 44: #else
 45:   #define SyncDevice()
 46: #endif

 48: int main(int argc, char **args)
 49: {
 50:   Mat            A, P, C;
 51:   Mat            A2, P2, C2; /* Shadow matrices (of MATAIJ) of A,P,C for initialization and validation */
 52:   char           matTypeStr[64], prodTypeStr[32];
 53:   char           fileA[PETSC_MAX_PATH_LEN], fileP[PETSC_MAX_PATH_LEN];
 54:   PetscViewer    fdA, fdP;
 55:   PetscBool      flg, flgA, flgP, equal = PETSC_FALSE;
 56:   PetscLogStage  stage;
 57:   PetscInt       i, n = 100, nskip = 2, M, N;
 58:   MatInfo        info;
 59:   PetscLogDouble tstart = 0, tend = 0, avgTime;
 60:   PetscMPIInt    size;
 61:   MatProductType prodType;
 62:   PetscBool      isAP, isAtP, isAPt, isPtAP, isPAPt;

 64:   PetscInitialize(&argc, &args, (char *)0, help);
 65:   MPI_Comm_size(PETSC_COMM_WORLD, &size);

 67:   /* Read options -n */
 68:   PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL);

 70:   /* Load the matrix from a binary file */
 71:   PetscOptionsGetString(NULL, NULL, "-A", fileA, PETSC_MAX_PATH_LEN, &flgA);
 72:   PetscOptionsGetString(NULL, NULL, "-P", fileP, PETSC_MAX_PATH_LEN, &flgP);

 75:   PetscOptionsGetString(NULL, NULL, "-mat_type", matTypeStr, sizeof(matTypeStr), &flg);
 76:   if (!flg) PetscStrncpy(matTypeStr, MATAIJ, sizeof(matTypeStr)); /* Inject the default if not provided */

 78:   PetscOptionsGetString(NULL, NULL, "-prod_type", prodTypeStr, sizeof(prodTypeStr), &flg);
 79:   if (!flg) PetscStrncpy(prodTypeStr, "AP", sizeof(prodTypeStr)); /* Inject the default if not provided */

 81:   PetscStrcmp(prodTypeStr, "AP", &isAP);
 82:   PetscStrcmp(prodTypeStr, "AtP", &isAtP);
 83:   PetscStrcmp(prodTypeStr, "APt", &isAPt);
 84:   PetscStrcmp(prodTypeStr, "PtAP", &isPtAP);
 85:   PetscStrcmp(prodTypeStr, "PAPt", &isPAPt);

 87:   if (isAP) prodType = MATPRODUCT_AB;
 88:   else if (isAtP) prodType = MATPRODUCT_AtB;
 89:   else if (isAPt) prodType = MATPRODUCT_ABt;
 90:   else if (isPtAP) prodType = MATPRODUCT_PtAP;
 91:   else if (isPAPt) prodType = MATPRODUCT_RARt;
 92:   else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_USER, "Unsupported product type %s", prodTypeStr);

 94:   /* Read the matrix file to A2 */
 95:   PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileA, FILE_MODE_READ, &fdA);
 96:   MatCreate(PETSC_COMM_WORLD, &A2);
 97:   MatSetType(A2, MATAIJ);
 98:   MatLoad(A2, fdA);
 99:   PetscViewerDestroy(&fdA);

101:   MatGetSize(A2, &M, &N);
102:   MatGetInfo(A2, MAT_GLOBAL_SUM, &info);
103:   PetscPrintf(PETSC_COMM_WORLD, "Input matrix A: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileA, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M);

105:   /* Copy A2 to A and convert A to the specified type */
106:   MatDuplicate(A2, MAT_COPY_VALUES, &A);
107:   MatConvert(A, matTypeStr, MAT_INPLACE_MATRIX, &A);

109:   /* Init P, P2 similarly */
110:   if (flgP) { /* If user provided P */
111:     PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileP, FILE_MODE_READ, &fdP);
112:     MatCreate(PETSC_COMM_WORLD, &P2);
113:     MatSetType(P2, MATAIJ);
114:     MatLoad(P2, fdP);
115:     PetscViewerDestroy(&fdP);

117:     MatDuplicate(P2, MAT_COPY_VALUES, &P);
118:     MatConvert(P, matTypeStr, MAT_INPLACE_MATRIX, &P);

120:     MatGetSize(P2, &M, &N);
121:     MatGetInfo(P2, MAT_GLOBAL_SUM, &info);
122:     PetscPrintf(PETSC_COMM_WORLD, "Input matrix P: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileP, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M);
123:   } else { /* otherwise just let P = A */
124:     PetscPrintf(PETSC_COMM_WORLD, "Input matrix P = A\n");
125:     P2 = A2;
126:     P  = A;
127:   }

129:   /* Compute the reference C2 */
130:   MatProductCreate(A2, P2, NULL, &C2);
131:   MatProductSetType(C2, prodType);
132:   MatProductSetFill(C2, PETSC_DEFAULT);
133:   MatProductSetFromOptions(C2);
134:   MatProductSymbolic(C2);
135:   MatProductNumeric(C2);
136:   MatGetSize(C2, &M, &N);
137:   MatGetInfo(C2, MAT_GLOBAL_SUM, &info);
138:   PetscPrintf(PETSC_COMM_WORLD, "Mat product  C = %s: %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", prodTypeStr, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M);

140:   /* Compute C */
141:   MatProductCreate(A, P, NULL, &C);
142:   MatProductSetType(C, prodType);
143:   MatProductSetAlgorithm(C, MATPRODUCTALGORITHMBACKEND);
144:   MatProductSetFill(C, PETSC_DEFAULT);
145:   MatProductSetFromOptions(C);

147:   /* Measure  MatProductSymbolic */
148:   PetscLogStageRegister("MatProductSymbolic", &stage);
149:   PetscLogStagePush(stage);
150:   SyncDevice();
151:   MPI_Barrier(PETSC_COMM_WORLD);
152:   PetscTime(&tstart);
153:   MatProductSymbolic(C);
154:   SyncDevice();
155:   MPI_Barrier(PETSC_COMM_WORLD);
156:   PetscTime(&tend);
157:   avgTime = (tend - tstart) * 1e6; /* microseconds */
158:   PetscLogStagePop();
159:   PetscPrintf(PETSC_COMM_WORLD, "\nMatProductSymbolic()         time (us) with %d MPI ranks = %8.2f\n", size, avgTime);

161:   /* Measure  MatProductNumeric */
162:   PetscLogStageRegister("MatProductNumeric", &stage);
163:   for (i = 0; i < n + nskip; i++) {
164:     if (i == nskip) {
165:       SyncDevice();
166:       PetscLogStagePush(stage);
167:       MPI_Barrier(PETSC_COMM_WORLD);
168:       PetscTime(&tstart);
169:     }
170:     MatProductReplaceMats(A, P, NULL, C);
171:     MatProductNumeric(C);
172:   }
173:   SyncDevice();
174:   MPI_Barrier(PETSC_COMM_WORLD);
175:   PetscTime(&tend);
176:   avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
177:   PetscLogStagePop();

179:   MatMultEqual(C, C2, 8, &equal); /* Not MatEqual() since C and C2 are not necessarily bitwise equal */

182:   PetscPrintf(PETSC_COMM_WORLD, "MatProductNumeric()  average time (us) with %d MPI ranks = %8.2f\n", size, avgTime);

184:   MatDestroy(&A);
185:   if (flgP) MatDestroy(&P);
186:   MatDestroy(&C);

188:   MatDestroy(&A2);
189:   if (flgP) MatDestroy(&P2);
190:   MatDestroy(&C2);

192:   PetscFinalize();
193:   return 0;
194: }

196: /*TEST

198:   testset:
199:     args: -n 2 -A ${DATAFILESPATH}/matrices/small
200:     nsize: 1
201:     filter: grep "DOES_NOT_EXIST"
202:     output_file: output/empty.out
203:     requires: !complex double !single kokkos_kernels

205:     test:
206:       suffix: 1
207:       requires: cuda
208:       args: -mat_type aijcusparse

210:     test:
211:       suffix: 2
212:       args: -mat_type aijkokkos

214:     test:
215:       suffix: 3
216:       requires: hip
217:       args: -mat_type aijhipsparse

219: TEST*/