Actual source code: ex5cu.cu
1: static char help[] = "Test of CUDA matrix assemble with simple matrix.\n\n";
3: // This a minimal example of the use of the CUDA MatAIJ metadata for assembly.
4: //
5: // The matrix must be a type 'aijcusparse' and must first be assembled on the CPU to provide the nonzero pattern.
6: // Next, get a pointer to a simple CSR mirror (PetscSplitCSRDataStructure) of the matrix data on
7: // the GPU with MatCUSPARSEGetDeviceMatWrite().
8: // Then use this object to populate the matrix on the GPU with MatSetValuesDevice().
9: // Finally call MatAssemblyBegin/End() and the matrix is ready to use on the GPU without matrix data movement between the
10: // host and GPU.
12: #include <petscconf.h>
13: #include <petscmat.h>
14: #include <petscdevice_cuda.h>
15: #include <assert.h>
17: #include <petscaijdevice.h>
18: __global__ void assemble_on_gpu(PetscSplitCSRDataStructure d_mat, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
19: {
20: const PetscInt inc = blockDim.x, my0 = threadIdx.x;
21: PetscInt i;
24: for (i = start + my0; i < end + 1; i += inc) {
25: PetscInt js[] = {i - 1, i}, nn = (i == N) ? 1 : 2; // negative indices are ignored but >= N are not, so clip end
26: PetscScalar values[] = {1, 1, 1, 1};
27: MatSetValuesDevice(d_mat, nn, js, nn, js, values, ADD_VALUES);
28: if (ierr) assert(0);
29: }
30: }
32: PetscErrorCode assemble_on_cpu(Mat A, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
33: {
35: for (PetscInt i = start; i < end + 1; i++) {
36: PetscInt js[] = {i - 1, i}, nn = (i == N) ? 1 : 2;
37: PetscScalar values[] = {1, 1, 1, 1};
38: MatSetValues(A, nn, js, nn, js, values, ADD_VALUES);
39: }
40: return 0;
41: }
43: int main(int argc, char **args)
44: {
45: Mat A;
46: PetscInt N = 11, nz = 3, Istart, Iend, num_threads = 128;
47: PetscSplitCSRDataStructure d_mat;
48: PetscLogEvent event;
49: PetscMPIInt rank, size;
50: PetscBool testmpiseq = PETSC_FALSE;
51: Vec x, y;
54: PetscInitialize(&argc, &args, (char *)0, help);
55: PetscOptionsGetInt(NULL, NULL, "-n", &N, NULL);
56: PetscOptionsGetInt(NULL, NULL, "-num_threads", &num_threads, NULL);
57: PetscOptionsGetInt(NULL, NULL, "-nz_row", &nz, NULL);
58: PetscOptionsGetBool(NULL, NULL, "-testmpiseq", &testmpiseq, NULL);
59: if (nz < 3) nz = 3;
60: if (nz > N + 1) nz = N + 1;
61: MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
62: MPI_Comm_size(PETSC_COMM_WORLD, &size);
64: PetscLogEventRegister("GPU operator", MAT_CLASSID, &event);
65: MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, N, N, nz, NULL, nz - 1, NULL, &A);
66: MatSetFromOptions(A);
67: MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE);
68: MatCreateVecs(A, &x, &y);
69: MatGetOwnershipRange(A, &Istart, &Iend);
70: /* current GPU assembly code does not support offprocessor values insertion */
71: assemble_on_cpu(A, Istart, Iend, N, rank);
72: MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
73: MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
75: // test
76: VecSet(x, 1.0);
77: MatMult(A, x, y);
78: VecViewFromOptions(y, NULL, "-ex5_vec_view");
80: if (testmpiseq && size == 1) {
81: MatConvert(A, MATSEQAIJ, MAT_INPLACE_MATRIX, &A);
82: MatConvert(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A);
83: }
84: PetscLogEventBegin(event, 0, 0, 0, 0);
85: MatCUSPARSEGetDeviceMatWrite(A, &d_mat);
86: assemble_on_gpu<<<1, num_threads>>>(d_mat, Istart, Iend, N, rank);
87: cudaDeviceSynchronize();
88: MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
89: MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
90: PetscLogEventEnd(event, 0, 0, 0, 0);
92: // test
93: VecSet(x, 1.0);
94: MatMult(A, x, y);
95: VecViewFromOptions(y, NULL, "-ex5_vec_view");
97: MatDestroy(&A);
98: VecDestroy(&x);
99: VecDestroy(&y);
100: PetscFinalize();
101: return 0;
102: }
104: /*TEST
106: build:
107: requires: cuda
109: test:
110: suffix: 0
111: diff_args: -j
112: args: -n 11 -ex5_vec_view
113: nsize: 1
115: test:
116: suffix: 1
117: diff_args: -j
118: args: -n 11 -ex5_vec_view
119: nsize: 2
121: test:
122: suffix: 2
123: diff_args: -j
124: args: -n 11 -testmpiseq -ex5_vec_view
125: nsize: 1
127: TEST*/