Actual source code: vpbjacobi_cuda.cu

#include <petscdevice_cuda.h>
#include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>

/* A class that manages helper arrays assisting parallel PCApply() with CUDA */
struct PC_VPBJacobi_CUDA {
  /* Cache the old sizes to check if we need realloc */
  PetscInt n;       /* number of rows of the local matrix */
  PetscInt nblocks; /* number of point blocks */
  PetscInt nsize;   /* sum of squared sizes of the point blocks, i.e., number of entries in diag_d */

  /* Helper arrays that are pre-computed on host and then copied to device.
    bs:     [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n.
    bs2:    [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize.
    matIdx: [n], row i of the local matrix belongs to block matIdx_d[i]
  */
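  /* Hypothetical example (not part of the original source), illustrating the helper-array layout:
     with nblocks = 2 and bsizes[] = {2, 3}, so n = 5 and nsize = 2*2 + 3*3 = 13, one gets
       bs[]     = {0, 2, 5}
       bs2[]    = {0, 4, 13}
       matIdx[] = {0, 0, 1, 1, 1}
  */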
  PetscInt *bs_h, *bs2_h, *matIdx_h;
  PetscInt *bs_d, *bs2_d, *matIdx_d;

  MatScalar *diag_d; /* [nsize], inverses of the point blocks, stored on device */

  PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize)
  {
    /* malloc memory on host and device, and then update */
    PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h);
    cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1));
    cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1));
    cudaMalloc(&matIdx_d, sizeof(PetscInt) * n);
    cudaMalloc(&diag_d, sizeof(MatScalar) * nsize);
    UpdateOffsetsOnDevice(bsizes, diag_h);
  }

  PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h)
  {
    PetscCall(ComputeOffsetsOnHost(bsizes));
    PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize));
    return 0;
  }

  ~PC_VPBJacobi_CUDA()
  {
    PetscFree3(bs_h, bs2_h, matIdx_h);
    cudaFree(bs_d);
    cudaFree(bs2_d);
    cudaFree(matIdx_d);
    cudaFree(diag_d);
  }

private:
  PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
  {
    bs_h[0] = bs2_h[0] = 0;
    for (PetscInt i = 0; i < nblocks; i++) {
      bs_h[i + 1]  = bs_h[i] + bsizes[i];
      bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
      for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
    }
    return 0;
  }
};

/* Like cublasDgemvBatched() but with variable-sized blocks

  Input Parameters:
+ n       - number of rows of the local matrix
. bs      - [nblocks+1], prefix sum of bsizes[]
. bs2     - [nblocks+1], prefix sum of squares of bsizes[]
. matIdx  - [n], block/matrix index of each row
. A       - blocks of the matrix back to back in column-major order
- x       - the input vector

  Output Parameter:
. y - the output vector
*/
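/* Illustrative note (assumed, matching the description above): for the hypothetical
   bsizes[] = {2, 3}, A holds the 2x2 block in column-major order followed by the 3x3 block,
   4 + 9 = 13 entries in total. Row tid of the local matrix falls in block k = matIdx[tid] and is
   row i = tid - bs[k] of that block, whose (i,j) entry is A[bs2[k] + i + j*m] with m = bs[k+1] - bs[k],
   so each thread computes y[tid] = sum_j A[bs2[k] + i + j*m] * x[bs[k] + j]. */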
__global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt gridSize = gridDim.x * blockDim.x;
  PetscInt       tid      = blockIdx.x * blockDim.x + threadIdx.x;

  /* One row per thread. The blocks/matrices are stored in column-major order.
     Offsets are recomputed from the base pointers each iteration so that the grid-stride
     loop stays correct even when a thread handles more than one row. */
  for (; tid < n; tid += gridSize) {
    const PetscInt     k   = matIdx[tid];       /* k-th block */
    const PetscInt     m   = bs[k + 1] - bs[k]; /* block size of the k-th block */
    const PetscInt     i   = tid - bs[k];       /* i-th row of the block */
    const MatScalar   *Ap  = A + bs2[k] + i;    /* first entry of the i-th row of the block */
    const PetscScalar *xp  = x + bs[k];         /* slice of x acted on by the block */
    PetscScalar        sum = 0.0;

    /* entry (i,j) of the block is Ap[j*m] since columns are stored with stride m */
    for (PetscInt j = 0; j < m; j++) sum += Ap[j * m] * xp[j];
    y[tid] = sum;
  }
}

static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
{
  PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
  PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
  const PetscScalar *xx;
  PetscScalar       *yy;
  PetscInt           n;

  PetscCall(PetscLogGpuTimeBegin());
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCuda;
    PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
    if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
    PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PCApply_VPBJacobi_CUDA requires x and y to be CUDA vectors");
  }

  PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
  if (n) {
    PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* thread blocks of 256 threads; capped at 2^31-1, the CUDA grid-dimension limit */
    PetscCall(VecCUDAGetArrayRead(x, &xx));
    PetscCall(VecCUDAGetArrayWrite(y, &yy));
    MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy);
    PetscCallCUDA(cudaGetLastError());
    PetscCall(VecCUDARestoreArrayRead(x, &xx));
    PetscCall(VecCUDARestoreArrayWrite(y, &yy));
  }
  PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* one multiply and one add per stored entry (FMA on entries in all blocks) */
  PetscCall(PetscLogGpuTimeEnd());
  return 0;
}

static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
{
  PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;

  delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
  PetscCall(PCDestroy_VPBJacobi(pc));
  return 0;
}

PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc)
{
  PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
  PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
  PetscInt           i, n, nblocks, nsize = 0;
  const PetscInt    *bsizes;

  PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverses on host for now. Might be worth doing it on device directly */
  PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
  for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
  PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));

  /* If MatSetVariableBlockSizes() has been called multiple times with changed sizes (is that allowed?), we delete the old helper and rebuild anyway */
  if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
    delete pcuda;
    pcuda = nullptr;
  }

  if (!pcuda) { /* allocate the struct along with the helper arrays from scratch */
    jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag);
  } else { /* update the values only */
    PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag));
  }

  pc->ops->apply   = PCApply_VPBJacobi_CUDA;
  pc->ops->destroy = PCDestroy_VPBJacobi_CUDA;
  return 0;
}
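
/* A minimal usage sketch (not part of this file; the object names and sizes are hypothetical),
   showing how this code path is typically reached: register variable point-block sizes on a GPU
   matrix and select PCVPBJACOBI, so that PCSetUp() dispatches to PCSetUp_VPBJacobi_CUDA() and
   PCApply() runs the batched kernel above.

     Mat      A;                  // assembled MATAIJCUSPARSE matrix with 5 local rows
     Vec      b, x;               // VECCUDA vectors compatible with A
     KSP      ksp;
     PC       pc;
     PetscInt bsizes[] = {2, 3};  // hypothetical variable point-block sizes

     PetscCall(MatSetVariableBlockSizes(A, 2, bsizes));
     PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
     PetscCall(KSPSetOperators(ksp, A, A));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCVPBJACOBI));
     PetscCall(KSPSolve(ksp, b, x));

   The same preconditioner can be selected from the command line with -pc_type vpbjacobi, assuming
   the matrix and vectors already live on the GPU (e.g. -mat_type aijcusparse -vec_type cuda). */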