Actual source code: vpbjacobi_cuda.cu
#include <petscdevice_cuda.h>
#include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
/* A class that manages helper arrays assisting parallel PCApply() with CUDA */
struct PC_VPBJacobi_CUDA {
  /* Cache the old sizes to check if we need to realloc */
  PetscInt n;       /* number of rows of the local matrix */
  PetscInt nblocks; /* number of point blocks */
  PetscInt nsize;   /* sum of sizes of the point blocks */

  /* Helper arrays that are pre-computed on host and then copied to device.
     bs:     [nblocks+1], "csr" version of bsizes[], i.e., its prefix sum, with bs[0] = 0 and bs[nblocks] = n.
     bs2:    [nblocks+1], "csr" version of the squares of bsizes[], with bs2[0] = 0 and bs2[nblocks] = nsize.
     matIdx: [n], row i of the local matrix belongs to block matIdx[i]
  */
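  /* For illustration (hypothetical numbers, not computed here): with nblocks = 2 and
     bsizes[] = {2, 3}, one gets n = 5, nsize = 4 + 9 = 13, and
       bs[]     = {0, 2, 5}        <- row offsets of the blocks
       bs2[]    = {0, 4, 13}       <- offsets of the blocks within diag_d[]
       matIdx[] = {0, 0, 1, 1, 1}  <- rows 0..1 lie in block 0, rows 2..4 in block 1 */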
  PetscInt *bs_h, *bs2_h, *matIdx_h;
  PetscInt *bs_d, *bs2_d, *matIdx_d;

  MatScalar *diag_d; /* [nsize], stores the inverses of the point blocks on device */

  PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize)
  {
    /* Allocate memory on host and device, then fill in the values. A constructor cannot return
       an error code, hence the Void flavors of the PETSc checking macros. */
    PetscCallVoid(PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h));
    PetscCallCUDAVoid(cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1)));
    PetscCallCUDAVoid(cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1)));
    PetscCallCUDAVoid(cudaMalloc(&matIdx_d, sizeof(PetscInt) * n));
    PetscCallCUDAVoid(cudaMalloc(&diag_d, sizeof(MatScalar) * nsize));
    PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_h));
  }

  PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h)
  {
    PetscFunctionBegin;
    PetscCall(ComputeOffsetsOnHost(bsizes));
    PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize)); /* bytes of the four copies above */
    PetscFunctionReturn(0);
  }
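
  /* Note: cudaMemcpy() with these arguments is a synchronous host-to-device copy, so the host
     buffers (including the caller's diag_h) may be reused or freed as soon as
     UpdateOffsetsOnDevice() returns. */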

  ~PC_VPBJacobi_CUDA()
  {
    PetscCallVoid(PetscFree3(bs_h, bs2_h, matIdx_h));
    PetscCallCUDAVoid(cudaFree(bs_d));
    PetscCallCUDAVoid(cudaFree(bs2_d));
    PetscCallCUDAVoid(cudaFree(matIdx_d));
    PetscCallCUDAVoid(cudaFree(diag_d));
  }

private:
  PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
  {
    PetscFunctionBegin;
    bs_h[0] = bs2_h[0] = 0;
    for (PetscInt i = 0; i < nblocks; i++) {
      bs_h[i + 1]  = bs_h[i] + bsizes[i];
      bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
      for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
    }
    PetscFunctionReturn(0);
  }
};

/* Like cublasDgemvBatched(), but with variable-sized blocks

  Input Parameters:
+ n      - number of rows of the local matrix
. bs     - [nblocks+1], prefix sum of bsizes[]
. bs2    - [nblocks+1], prefix sum of the squares of bsizes[]
. matIdx - [n], block/matrix index of each row
. A      - the point blocks, stored back to back in column-major order
- x      - the input vector

  Output Parameter:
. y - the output vector
*/
__global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt gridSize = gridDim.x * blockDim.x;
  PetscInt       tid      = blockIdx.x * blockDim.x + threadIdx.x;
  PetscInt       i, j, k, m;

  /* One row per thread in a grid-stride loop. The blocks/matrices are stored in column-major
     order. Use per-iteration local pointers; advancing the parameters A, x, y themselves would
     corrupt later iterations, which occur when the grid is capped at its maximum size. */
  for (; tid < n; tid += gridSize) {
    k = matIdx[tid];       /* k-th block */
    m = bs[k + 1] - bs[k]; /* block size of the k-th block */
    i = tid - bs[k];       /* i-th row of the block */

    const MatScalar   *Ap = A + bs2[k] + i; /* first entry of the i-th row of block k */
    const PetscScalar *xp = x + bs[k];      /* start of the block's piece of x */
    PetscScalar       *yp = y + bs[k];      /* start of the block's piece of y */

    yp[i] = 0.0;
    for (j = 0; j < m; j++) {
      yp[i] += Ap[0] * xp[j];
      Ap += m; /* entries of a row are m apart in column-major storage */
    }
  }
}
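
/* To illustrate the indexing above (hypothetical numbers): with bsizes[] = {2, 3}, thread
   tid = 3 handles row 3 of the local matrix, i.e., row i = 1 of block k = 1 (m = 3). It reads
   the entries A[bs2[1] + 1], A[bs2[1] + 4] and A[bs2[1] + 7] (stride m in column-major storage),
   dots them with x[2..4], and writes the result to y[3]. */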

static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
{
  PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
  PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
  const PetscScalar *xx;
  PetscScalar       *yy;
  PetscInt           n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCuda;
    PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
    if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
    PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PC: applying a CUDA pcapply to non-CUDA vectors");
  }

  PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
  if (n) {
    PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* cap the block count at 2^31-1, the gridDim.x limit; the kernel's grid-stride loop covers any remaining rows */
    PetscCall(VecCUDAGetArrayRead(x, &xx));
    PetscCall(VecCUDAGetArrayWrite(y, &yy));
    MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy);
    PetscCallCUDA(cudaGetLastError());
    PetscCall(VecCUDARestoreArrayRead(x, &xx));
    PetscCall(VecCUDARestoreArrayWrite(y, &yy));
  }
  PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* one multiply and one add per entry of the point blocks */
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(0);
}

static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
{
  PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;

  PetscFunctionBegin;
  delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
  PetscCall(PCDestroy_VPBJacobi(pc));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc)
{
  PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
  PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
  PetscInt           i, n, nblocks, nsize = 0;
  const PetscInt    *bsizes;

  PetscFunctionBegin;
  PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* compute the inverses of the point blocks on host for now; it might be worth doing it directly on device */
  PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
  for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
  PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));

  /* If MatSetVariableBlockSizes() has been called again and the sizes have changed (is that allowed?), delete the old helper struct and rebuild it */
  if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
    delete pcuda;
    pcuda = nullptr;
  }

  if (!pcuda) { /* allocate the struct along with the helper arrays from scratch */
    jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag);
  } else { /* sizes unchanged; only update the values */
    PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag));
  }

  pc->ops->apply   = PCApply_VPBJacobi_CUDA;
  pc->ops->destroy = PCDestroy_VPBJacobi_CUDA;
  PetscFunctionReturn(0);
}
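
/* A minimal usage sketch (illustrative; the names ksp, pc and A are assumptions, not part of
   this file): with a CUDA matrix, e.g. of type MATAIJCUSPARSE, one would set variable
   point-block sizes and select this preconditioner as

     PetscInt bsizes[] = {2, 3, 1};                     // hypothetical block sizes
     PetscCall(MatSetVariableBlockSizes(A, 3, bsizes)); // rows 0-1, 2-4 and 5 form the blocks
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCVPBJACOBI));

   PCSetUp() then dispatches to PCSetUp_VPBJacobi_CUDA() above. */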