Actual source code: landaucu.cu

  1: /*
  2:   Implements the Landau kernel
  3: */
  4: #include <petscconf.h>
  5: #include <petsc/private/dmpleximpl.h>
  6: #include <petsclandau.h>
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <petscmat.h>
 10: #include <petscdevice_cuda.h>

 12: #include "../land_tensors.h"
 13: #include <petscaijdevice.h>

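/*
  LandauCUDACreateMatMaps - copy the GPU-assembly vertex maps for one grid to the device.

  A host-side P4estVertexMaps struct is filled in, its c_maps (constraint interpolation) and
  gIdx (element-to-global index) arrays are mirrored on the device with cudaMalloc/cudaMemcpy,
  and the completed struct is itself copied to the device; the device address is stored in
  maps[grid].d_self for later use by the kernels.
*/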
 15: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE], PetscInt Nf[], PetscInt Nq, PetscInt grid)
 16: {
 17:   P4estVertexMaps h_maps;
 18:   h_maps.num_elements = maps[grid].num_elements;
 19:   h_maps.num_face     = maps[grid].num_face;
 20:   h_maps.num_reduced  = maps[grid].num_reduced;
 21:   h_maps.deviceType   = maps[grid].deviceType;
 22:   h_maps.Nf           = Nf[grid];
 23:   h_maps.numgrids     = maps[grid].numgrids;
 24:   cudaMalloc((void **)&h_maps.c_maps, maps[grid].num_reduced * sizeof *pointMaps);
 25:   cudaMemcpy(h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced * sizeof *pointMaps, cudaMemcpyHostToDevice);
 26:   cudaMalloc((void **)&h_maps.gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx);
 27:   cudaMemcpy(h_maps.gIdx, maps[grid].gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice);
 28:   cudaMalloc((void **)&maps[grid].d_self, sizeof(P4estVertexMaps));
 29:   cudaMemcpy(maps[grid].d_self, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);
 30:   return 0;
 31: }

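/*
  LandauCUDADestroyMatMaps - free the device-side maps created above.

  The device-resident struct is first copied back to the host so that the nested device
  pointers (c_maps, gIdx) can be recovered and freed before the struct itself is freed.
*/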
 33: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
 34: {
 35:   for (PetscInt grid = 0; grid < num_grids; grid++) {
 36:     P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
 37:     cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);
 38:     cudaFree(h_maps.c_maps);
 39:     cudaFree(h_maps.gIdx);
 40:     cudaFree(d_maps);
 41:   }
 42:   return 0;
 43: }

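/*
  LandauCUDAStaticDataSet - stage the static (per-configuration) data on the device.

  Per-grid offsets for integration points (ip), field data (ipf) and elements are computed on
  the host, then the basis tables (B, D), species parameters (alpha, beta, invMass), quadrature
  geometry (invJ, x, y[, z], w) and the offset tables are copied into device memory in SData_d.
  Buffers for the dynamic data (Eq_m, f, dfdx, dfdy[, dfdz]) are allocated here once and reused
  on every call to LandauCUDAJacobian.
*/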
 45: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[], PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[], PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
 46: {
 47:   PetscTabulation *Tf;
 48:   PetscReal       *BB, *DD;
 49:   PetscInt         dim, Nb = Nq, szf = sizeof(PetscReal), szs = sizeof(PetscScalar), szi = sizeof(PetscInt);
 50:   PetscInt         h_ip_offset[LANDAU_MAX_GRIDS + 1], h_ipf_offset[LANDAU_MAX_GRIDS + 1], h_elem_offset[LANDAU_MAX_GRIDS + 1], nip, IPfdf_sz, Nf;
 51:   PetscDS          prob;

 53:   DMGetDimension(plex, &dim);
 54:   DMGetDS(plex, &prob);
 56:   PetscDSGetTabulation(prob, &Tf);
 57:   BB = Tf[0]->T[0];
 58:   DD = Tf[0]->T[1];
 59:   Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
 60:   nip                                                      = 0;
 61:   IPfdf_sz                                                 = 0;
 62:   for (PetscInt grid = 0; grid < num_grids; grid++) {
 63:     PetscInt nfloc          = a_species_offset[grid + 1] - a_species_offset[grid];
 64:     h_elem_offset[grid + 1] = h_elem_offset[grid] + a_numCells[grid];
 65:     nip += a_numCells[grid] * Nq;
 66:     h_ip_offset[grid + 1] = nip;
 67:     IPfdf_sz += Nq * nfloc * a_numCells[grid];
 68:     h_ipf_offset[grid + 1] = IPfdf_sz;
 69:   }
 70:   Nf = a_species_offset[num_grids];
 71:   {
 72:     cudaMalloc((void **)&SData_d->B, Nq * Nb * szf); // kernel input
 73:     cudaMemcpy(SData_d->B, BB, Nq * Nb * szf, cudaMemcpyHostToDevice);
 74:     cudaMalloc((void **)&SData_d->D, Nq * Nb * dim * szf); // kernel input
 75:     cudaMemcpy(SData_d->D, DD, Nq * Nb * dim * szf, cudaMemcpyHostToDevice);

 77:     cudaMalloc((void **)&SData_d->alpha, Nf * szf);   // kernel input
 78:     cudaMalloc((void **)&SData_d->beta, Nf * szf);    // kernel input
 79:     cudaMalloc((void **)&SData_d->invMass, Nf * szf); // kernel input

 81:     cudaMemcpy(SData_d->alpha, nu_alpha, Nf * szf, cudaMemcpyHostToDevice);
 82:     cudaMemcpy(SData_d->beta, nu_beta, Nf * szf, cudaMemcpyHostToDevice);
 83:     cudaMemcpy(SData_d->invMass, a_invMass, Nf * szf, cudaMemcpyHostToDevice);

 85:     // collect geometry
 86:     cudaMalloc((void **)&SData_d->invJ, nip * dim * dim * szf); // kernel input
 87:     cudaMemcpy(SData_d->invJ, a_invJ, nip * dim * dim * szf, cudaMemcpyHostToDevice);
 88:     cudaMalloc((void **)&SData_d->x, nip * szf); // kernel input
 89:     cudaMemcpy(SData_d->x, a_x, nip * szf, cudaMemcpyHostToDevice);
 90:     cudaMalloc((void **)&SData_d->y, nip * szf); // kernel input
 91:     cudaMemcpy(SData_d->y, a_y, nip * szf, cudaMemcpyHostToDevice);
 92: #if LANDAU_DIM == 3
 93:     cudaMalloc((void **)&SData_d->z, nip * szf); // kernel input
 94:     cudaMemcpy(SData_d->z, a_z, nip * szf, cudaMemcpyHostToDevice);
 95: #endif
 96:     cudaMalloc((void **)&SData_d->w, nip * szf); // kernel input
 97:     cudaMemcpy(SData_d->w, a_w, nip * szf, cudaMemcpyHostToDevice);

 99:     cudaMalloc((void **)&SData_d->NCells, num_grids * szi);
100:     cudaMemcpy(SData_d->NCells, a_numCells, num_grids * szi, cudaMemcpyHostToDevice);
101:     cudaMalloc((void **)&SData_d->species_offset, (num_grids + 1) * szi);
102:     cudaMemcpy(SData_d->species_offset, a_species_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
103:     cudaMalloc((void **)&SData_d->mat_offset, (num_grids + 1) * szi);
104:     cudaMemcpy(SData_d->mat_offset, a_mat_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
105:     cudaMalloc((void **)&SData_d->ip_offset, (num_grids + 1) * szi);
106:     cudaMemcpy(SData_d->ip_offset, h_ip_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
107:     cudaMalloc((void **)&SData_d->ipf_offset, (num_grids + 1) * szi);
108:     cudaMemcpy(SData_d->ipf_offset, h_ipf_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
109:     cudaMalloc((void **)&SData_d->elem_offset, (num_grids + 1) * szi);
110:     cudaMemcpy(SData_d->elem_offset, h_elem_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
111:     cudaMalloc((void **)&SData_d->maps, num_grids * sizeof(P4estVertexMaps *));
112:     // allocate space for dynamic data once
113:     cudaMalloc((void **)&SData_d->Eq_m, Nf * szf);               // this could be for each vertex (todo?)
114:     cudaMalloc((void **)&SData_d->f, nip * Nf * szs * batch_sz); // for each vertex in batch
115:     cudaMalloc((void **)&SData_d->dfdx, nip * Nf * szs * batch_sz);
116:     cudaMalloc((void **)&SData_d->dfdy, nip * Nf * szs * batch_sz);
117: #if LANDAU_DIM == 3
118:     cudaMalloc((void **)&SData_d->dfdz, nip * Nf * szs * batch_sz);
119: #endif
120:   }
121:   return 0;
122: }

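/*
  LandauCUDAStaticDataClear - release everything allocated in LandauCUDAStaticDataSet.
  SData_d->alpha doubles as the "is allocated" flag.
*/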
124: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
125: {
126:   if (SData_d->alpha) {
127:     cudaFree(SData_d->alpha);
128:     SData_d->alpha = NULL;
129:     cudaFree(SData_d->beta);
130:     cudaFree(SData_d->invMass);
131:     cudaFree(SData_d->B);
132:     cudaFree(SData_d->D);
133:     cudaFree(SData_d->invJ);
134: #if LANDAU_DIM == 3
135:     cudaFree(SData_d->z);
136: #endif
137:     cudaFree(SData_d->x);
138:     cudaFree(SData_d->y);
139:     cudaFree(SData_d->w);
140:     // dynamic data
141:     cudaFree(SData_d->Eq_m);
142:     cudaFree(SData_d->f);
143:     cudaFree(SData_d->dfdx);
144:     cudaFree(SData_d->dfdy);
145: #if LANDAU_DIM == 3
146:     cudaFree(SData_d->dfdz);
147: #endif
148:     cudaFree(SData_d->NCells);
149:     cudaFree(SData_d->species_offset);
150:     cudaFree(SData_d->mat_offset);
151:     cudaFree(SData_d->ip_offset);
152:     cudaFree(SData_d->ipf_offset);
153:     cudaFree(SData_d->elem_offset);
154:     cudaFree(SData_d->maps);
155:   }
156:   return 0;
157: }
158: //
159: // The GPU Landau kernel
160: //
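/*
  landau_form_fdf - evaluate f and its gradient at every quadrature point of every element.

  Launch layout (see LandauCUDAJacobian): blockIdx.x is the batched vertex (b_id), blockIdx.y is
  the element index across all grids, threadIdx.y is the quadrature point and threadIdx.x strides
  over the local species. Coefficients are read either directly from the closure vector
  (d_vertex_f) or gathered through the GPU-assembly maps (d_maps), including constrained points
  via c_maps (scaled combinations of global values). Results are stored in d_f/d_dfdx/d_dfdy[/d_dfdz]
  with the layout idx = b_id*IPf_sz_glb + d_ipf_offset[grid] + field*loc_nip + elem*Nq + q.
*/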
161: __global__ void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_invJ[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
162: #if LANDAU_DIM == 3
163:                                 PetscReal d_dfdz[],
164: #endif
165:                                 const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
166: {
167:   const PetscInt   Nq = blockDim.y, myQi = threadIdx.y;
168:   const PetscInt   b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
169:   const PetscReal *Bq = &BB[myQi * Nb], *Dq = &DD[myQi * Nb * dim];
170:   PetscInt         grid = 0, f, d, b, e, q;
171:   while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
172:   {
173:     const PetscInt     loc_nip = d_numCells[grid] * Nq, loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
174:     const PetscInt     moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
175:     const PetscScalar *coef;
176:     PetscReal          u_x[LANDAU_DIM];
177:     const PetscReal   *invJ = &d_invJ[(d_ip_offset[grid] + loc_elem * Nq + myQi) * dim * dim];
178:     PetscScalar        coef_buff[LANDAU_MAX_SPECIES * LANDAU_MAX_NQ];
179:     if (!d_maps) {
180:       coef = &d_vertex_f[b_id * IPf_sz_glb + d_ipf_offset[grid] + loc_elem * Nb * loc_Nf]; // closure and IP indexing are the same
181:     } else {
182:       coef = coef_buff;
183:       for (f = 0; f < loc_Nf; ++f) {
184:         LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][f][0];
185:         for (b = 0; b < Nb; ++b) {
186:           PetscInt idx = Idxs[b];
187:           if (idx >= 0) {
188:             coef_buff[f * Nb + b] = d_vertex_f[idx + moffset];
189:           } else {
190:             idx                   = -idx - 1;
191:             coef_buff[f * Nb + b] = 0;
192:             for (q = 0; q < d_maps[grid]->num_face; q++) {
193:               PetscInt  id    = d_maps[grid]->c_maps[idx][q].gid;
194:               PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
195:               if (id >= 0) coef_buff[f * Nb + b] += scale * d_vertex_f[id + moffset];
196:             }
197:           }
198:         }
199:       }
200:     }

202:     /* get f and df */
203:     for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
204:       PetscReal      refSpaceDer[LANDAU_DIM];
205:       const PetscInt idx = b_id * IPf_sz_glb + d_ipf_offset[grid] + f * loc_nip + loc_elem * Nq + myQi;
206:       d_f[idx]           = 0.0;
207:       for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
208:       for (b = 0; b < Nb; ++b) {
209:         const PetscInt cidx = b;
210:         d_f[idx] += Bq[cidx] * PetscRealPart(coef[f * Nb + cidx]);
211:         for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx * dim + d] * PetscRealPart(coef[f * Nb + cidx]);
212:       }
213:       for (d = 0; d < dim; ++d) {
214:         for (e = 0, u_x[d] = 0.0; e < dim; ++e) u_x[d] += invJ[e * dim + d] * refSpaceDer[e];
215:       }
216:       d_dfdx[idx] = u_x[0];
217:       d_dfdy[idx] = u_x[1];
218: #if LANDAU_DIM == 3
219:       d_dfdz[idx] = u_x[2];
220: #endif
221:     }
222:   }
223: }

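/*
  landau_jac_kernel - body of the Jacobian kernel for one quadrature point (jpidx) of one element.

  Step 1: the thread block sweeps all nip_global integration points (the inner integral of the
  Landau operator); per-point f and df are staged in shared memory and combined with the Landau
  tensor (LandauTensor2D/3D) into the partial sums gg2_temp/gg3_temp held in registers.
  Step 2: the partial sums are reduced across threadIdx.x with warp shuffles.
  Step 3: species factors (nu_alpha, invMass) and the electric-field term Eq_m are applied, the
  reference-to-real Jacobian invJj and quadrature weight are folded in (s_g2/s_g3), and the
  element matrix is formed: either accumulated into elemMat (CPU assembly path) or scattered
  directly into the device matrix with MatSetValuesDevice (GPU assembly path).
*/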
225: __device__ void landau_jac_kernel(const PetscInt num_grids, const PetscInt jpidx, PetscInt nip_global, const PetscInt grid, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
226:                                   PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
227:                                   PetscReal s_scale[][LANDAU_MAX_Q_FACE], PetscInt s_idx[][LANDAU_MAX_Q_FACE], PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_nu_alpha[], PetscReal s_nu_beta[], PetscReal s_invMass[], PetscReal s_f[], PetscReal s_dfx[], PetscReal s_dfy[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
228: #if LANDAU_DIM == 3
229:                                   const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
230: #endif
231:                                   const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
232: {
233:   const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
234:   const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
235:   const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
236:   const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
237:   int            delta, d, f, g, d2, dp, d3, fieldA, ipidx_b;
238:   PetscReal      gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
239: #if LANDAU_DIM == 2
240:   const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
241:   constexpr int   dim   = 2;
242: #else
243:   const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
244:   constexpr int   dim   = 3;
245: #endif
246:   const PetscInt f_off = d_species_offset[grid], Nb = Nq;
247:   // create g2 & g3
248:   for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
249:     for (d = 0; d < dim; d++) { // clear accumulation data D & K
250:       s_gg2[d][myQi][f] = 0;
251:       for (d2 = 0; d2 < dim; d2++) s_gg3[d][d2][myQi][f] = 0;
252:     }
253:   }
254: #pragma unroll
255:   for (d2 = 0; d2 < dim; d2++) {
256:     gg2_temp[d2] = 0;
257: #pragma unroll
258:     for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] = 0;
259:   }
260:   if (threadIdx.y == 0) {
261:     // copy species into shared memory
262:     for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
263:       s_nu_alpha[fieldA] = nu_alpha[fieldA];
264:       s_nu_beta[fieldA]  = nu_beta[fieldA];
265:       s_invMass[fieldA]  = invMass[fieldA];
266:     }
267:   }
268:   __syncthreads();
269:   // inner integral, collect gg2/3
270:   for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
271:     const PetscInt ipidx = ipidx_b + threadIdx.x;
272:     PetscInt       f_off_r, grid_r, loc_Nf_r, nip_loc_r, ipidx_g, fieldB, IPf_idx_r;
273:     __syncthreads();
274:     if (ipidx < nip_global) {
275:       grid_r = 0;
276:       while (ipidx >= d_ip_offset[grid_r + 1]) grid_r++;
277:       f_off_r   = d_species_offset[grid_r];
278:       ipidx_g   = ipidx - d_ip_offset[grid_r];
279:       nip_loc_r = d_numCells[grid_r] * Nq;
280:       loc_Nf_r  = d_species_offset[grid_r + 1] - d_species_offset[grid_r];
281:       IPf_idx_r = b_id * IPf_sz_glb + d_ipf_offset[grid_r] + ipidx_g;
282:       for (fieldB = threadIdx.y; fieldB < loc_Nf_r; fieldB += blockDim.y) {
283:         const PetscInt idx                       = IPf_idx_r + fieldB * nip_loc_r;
284:         s_f[fieldB * blockDim.x + threadIdx.x]   = d_f[idx]; // all vector threads get copy of data
285:         s_dfx[fieldB * blockDim.x + threadIdx.x] = d_dfdx[idx];
286:         s_dfy[fieldB * blockDim.x + threadIdx.x] = d_dfdy[idx];
287: #if LANDAU_DIM == 3
288:         s_dfz[fieldB * blockDim.x + threadIdx.x] = d_dfdz[idx];
289: #endif
290:       }
291:     }
292:     __syncthreads();
293:     if (ipidx < nip_global) {
294:       const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
295:       PetscReal       temp1[3] = {0, 0, 0}, temp2 = 0;
296: #if LANDAU_DIM == 2
297:       PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
298:       LandauTensor2D(vj, x, y, Ud, Uk, mask);
299: #else
300:       PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2] - z) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
301:       LandauTensor3D(vj, x, y, z, U, mask);
302: #endif
303:       for (int fieldB = 0; fieldB < loc_Nf_r; fieldB++) {
304:         temp1[0] += s_dfx[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r];
305:         temp1[1] += s_dfy[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r];
306: #if LANDAU_DIM == 3
307:         temp1[2] += s_dfz[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r];
308: #endif
309:         temp2 += s_f[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r];
310:       }
311:       temp1[0] *= wi;
312:       temp1[1] *= wi;
313: #if LANDAU_DIM == 3
314:       temp1[2] *= wi;
315: #endif
316:       temp2 *= wi;
317: #if LANDAU_DIM == 2
318:   #pragma unroll
319:       for (d2 = 0; d2 < 2; d2++) {
320:   #pragma unroll
321:         for (d3 = 0; d3 < 2; ++d3) {
322:           /* K = U * grad(f): g2=e: i,A */
323:           gg2_temp[d2] += Uk[d2][d3] * temp1[d3];
324:           /* D = -U * (I \kron (fx)): g3=f: i,j,A */
325:           gg3_temp[d2][d3] += Ud[d2][d3] * temp2;
326:         }
327:       }
328: #else
329:   #pragma unroll
330:       for (d2 = 0; d2 < 3; ++d2) {
331:   #pragma unroll
332:         for (d3 = 0; d3 < 3; ++d3) {
333:           /* K = U * grad(f): g2 = e: i,A */
334:           gg2_temp[d2] += U[d2][d3] * temp1[d3];
335:           /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
336:           gg3_temp[d2][d3] += U[d2][d3] * temp2;
337:         }
338:       }
339: #endif
340:     }
341:   } /* IPs */

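  /* the butterfly reduction below assumes blockDim.x is a power of two and no wider than a warp;
     the host code rounds nnn down to a power of two and caps it at 16, so __shfl_xor_sync can
     combine the per-thread partial sums without going through shared memory */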
343:   /* reduce gg temp sums across threads */
344:   for (delta = blockDim.x / 2; delta > 0; delta /= 2) {
345: #pragma unroll
346:     for (d2 = 0; d2 < dim; d2++) {
347:       gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
348: #pragma unroll
349:       for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
350:     }
351:   }
352:   // add alpha and put in gg2/3
353:   for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
354: #pragma unroll
355:     for (d2 = 0; d2 < dim; d2++) {
356:       s_gg2[d2][myQi][fieldA] += gg2_temp[d2] * s_nu_alpha[fieldA + f_off];
357: #pragma unroll
358:       for (d3 = 0; d3 < dim; d3++) s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3] * s_nu_alpha[fieldA + f_off] * s_invMass[fieldA + f_off];
359:     }
360:   }
361:   __syncthreads();
362:   /* add electric field term once per IP */
363:   for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) s_gg2[dim - 1][myQi][fieldA] += Eq_m[fieldA + f_off];
364:   __syncthreads();
365:   /* Jacobian transform - g2 */
366:   for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
367:     PetscReal wj = ww[jpidx];
368:     for (d = 0; d < dim; ++d) {
369:       s_g2[d][myQi][fieldA] = 0.0;
370:       for (d2 = 0; d2 < dim; ++d2) {
371:         s_g2[d][myQi][fieldA] += invJj[d * dim + d2] * s_gg2[d2][myQi][fieldA];
372:         s_g3[d][d2][myQi][fieldA] = 0.0;
373:         for (d3 = 0; d3 < dim; ++d3) {
374:           for (dp = 0; dp < dim; ++dp) s_g3[d][d2][myQi][fieldA] += invJj[d * dim + d3] * s_gg3[d3][dp][myQi][fieldA] * invJj[d2 * dim + dp];
375:         }
376:         s_g3[d][d2][myQi][fieldA] *= wj;
377:       }
378:       s_g2[d][myQi][fieldA] *= wj;
379:     }
380:   }
381:   __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
382:   /* FE matrix construction */
383:   {
384:     int fieldA, d, qj, d2, q, idx, totDim = Nb * loc_Nf;
385:     /* assemble */
386:     for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
387:       for (f = threadIdx.y; f < Nb; f += blockDim.y) {
388:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
389:           PetscScalar t = 0;
390:           for (qj = 0; qj < Nq; qj++) {
391:             const PetscReal *BJq = &BB[qj * Nb], *DIq = &DD[qj * Nb * dim];
392:             for (d = 0; d < dim; ++d) {
393:               t += DIq[f * dim + d] * s_g2[d][qj][fieldA] * BJq[g];
394:               for (d2 = 0; d2 < dim; ++d2) t += DIq[f * dim + d] * s_g3[d][d2][qj][fieldA] * DIq[g * dim + d2];
395:             }
396:           }
397:           if (elemMat) {
398:             const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
 399:             elemMat[fOff] += t; // accumulate into the element matrix (CPU assembly path)
400:           } else s_fieldMats[f][g] = t;
401:         }
402:       }
403:       if (s_fieldMats) {
404:         PetscScalar            vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
405:         PetscInt               nr, nc;
406:         const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
407:         __syncthreads();
408:         if (threadIdx.y == 0) {
409:           for (f = threadIdx.x; f < Nb; f += blockDim.x) {
410:             idx = Idxs[f];
411:             if (idx >= 0) {
412:               s_idx[f][0]   = idx + moffset;
413:               s_scale[f][0] = 1.;
414:             } else {
415:               idx = -idx - 1;
416:               for (q = 0; q < d_maps[grid]->num_face; q++) {
417:                 if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
418:                 else s_idx[f][q] = -1;
419:                 s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
420:               }
421:             }
422:           }
423:         }
424:         __syncthreads();
425:         for (f = threadIdx.y; f < Nb; f += blockDim.y) {
426:           idx = Idxs[f];
427:           if (idx >= 0) {
428:             nr = 1;
429:           } else {
430:             nr = d_maps[grid]->num_face;
431:           }
432:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
433:             idx = Idxs[g];
434:             if (idx >= 0) {
435:               nc = 1;
436:             } else {
437:               nc = d_maps[grid]->num_face;
438:             }
439:             for (q = 0; q < nr; q++) {
440:               for (d = 0; d < nc; d++) vals[q * nc + d] = s_scale[f][q] * s_scale[g][d] * s_fieldMats[f][g];
441:             }
442:             MatSetValuesDevice(d_mat, nr, s_idx[f], nc, s_idx[g], vals, ADD_VALUES);
443:           }
444:         }
445:         __syncthreads();
446:       }
447:     }
448:   }
449: }

451: //
452: // The CUDA Landau kernel
453: //
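/*
  landau_jacobian - driver kernel: carve up the dynamically sized shared memory and call
  landau_jac_kernel for this block's quadrature point.

  The single extern __shared__ array smem[] is partitioned (counted in PetscReal-sized words) into
  the g2/g3 and gg2/gg3 accumulators, the species-parameter copies, and per-thread staging for
  f/dfdx/dfdy[/dfdz]; when GPU-assembly maps are present the tail of the same buffer is reused for
  s_fieldMats/s_scale/s_idx. The required size (variable ii in LandauCUDAJacobian, ii*szf bytes)
  is computed on the host and passed as the launch's dynamic shared memory.
*/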
454: __global__ void __launch_bounds__(256, 2) landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
455: #if LANDAU_DIM == 3
456:                                                           const PetscReal zz[], PetscReal d_dfdz[],
457: #endif
458:                                                           const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
459: {
460:   extern __shared__ PetscReal smem[];
461:   int                         size                                = 0;
462:   PetscReal(*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
463:   size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
464:   PetscReal(*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
465:   size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
466:   PetscReal(*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
467:   size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
468:   PetscReal(*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
469:   size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
470:   PetscReal *s_nu_alpha = &smem[size];
471:   size += LANDAU_MAX_SPECIES;
472:   PetscReal *s_nu_beta = &smem[size];
473:   size += LANDAU_MAX_SPECIES;
474:   PetscReal *s_invMass = &smem[size];
475:   size += LANDAU_MAX_SPECIES;
476:   PetscReal *s_f = &smem[size];
477:   size += blockDim.x * LANDAU_MAX_SPECIES;
478:   PetscReal *s_dfx = &smem[size];
479:   size += blockDim.x * LANDAU_MAX_SPECIES;
480:   PetscReal *s_dfy = &smem[size];
481:   size += blockDim.x * LANDAU_MAX_SPECIES;
482: #if LANDAU_DIM == 3
483:   PetscReal *s_dfz = &smem[size];
484:   size += blockDim.x * LANDAU_MAX_SPECIES;
485: #endif
486:   PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
487:   PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
488:   PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
489:   const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x;
490:   PetscInt       Nq = blockDim.y, grid = 0; // Nq == Nb
491:   PetscScalar   *elemMat = NULL;            /* my output */
492:   while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
493:   {
494:     const PetscInt   loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
495:     const PetscInt   myQi  = threadIdx.y;
496:     const PetscInt   jpidx = d_ip_offset[grid] + myQi + loc_elem * Nq;
497:     const PetscReal *invJ  = &invJj[jpidx * dim * dim];
498:     if (d_elem_mats) {
499:       PetscInt totDim = loc_Nf * Nb;
 500:       elemMat         = d_elem_mats; // start at the beginning and advance to my element matrix
501:       for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
502:         for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
503:           PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
 504:           elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
505:         }
506:       }
507:       for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
508:         PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
509:         elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
510:       }
511:       elemMat += loc_elem * totDim * totDim; // index into local matrix & zero out
512:       for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
513:     }
514:     __syncthreads();
515:     if (d_maps) {
516:       // reuse the space for fieldMats
517:       s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
518:       size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
519:       s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
520:       size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
521:       s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
 522:       size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // over-allocated: s_idx holds PetscInt but is counted here in PetscReal-sized words
523:     } else {
524:       s_fieldMats = NULL;
525:     }
526:     __syncthreads();
527:     landau_jac_kernel(num_grids, jpidx, nip_global, grid, xx, yy, ww, invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD, elemMat, d_maps, d_mat, *s_fieldMats, *s_scale, *s_idx, *s_g2, *s_g3, *s_gg2, *s_gg3, s_nu_alpha, s_nu_beta, s_invMass, s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
528: #if LANDAU_DIM == 3
529:                       zz, s_dfz, d_dfdz,
530: #endif
531:                       d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
532:   }
533: }

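/*
  landau_mass - build the mass matrix (block diagonal over species), scaled by shift.
  Each entry is sum_q B_f(q) * w(q) * shift * B_g(q), with an extra factor of 2*pi in 2D
  (the axisymmetric formulation). Results go either into d_elem_mats (CPU assembly path) or
  directly into d_mat through the GPU-assembly maps, exactly as in landau_jacobian.
*/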
535: __global__ void __launch_bounds__(256, 4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_w[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift, const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_elem_offset[])
536: {
537:   extern __shared__ PetscReal smem[];
538:   const PetscInt              Nq = blockDim.y, b_elem_idx = blockIdx.y, b_id = blockIdx.x;
539:   PetscScalar                *elemMat = NULL; /* my output */
540:   PetscInt                    fieldA, d, qj, q, idx, f, g, grid = 0, size = 0;
541:   PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
542:   PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
543:   PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
544:   if (d_maps) {
545:     // reuse the space for fieldMats
546:     s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
547:     size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
548:     s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
549:     size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
550:     s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
 551:     size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // over-allocated: s_idx holds PetscInt but is counted here in PetscReal-sized words
552:   } else {
553:     s_fieldMats = NULL;
554:   }
555:   while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
556:   {
557:     const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
558:     const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset), totDim = loc_Nf * Nq;
559:     if (d_elem_mats) {
 560:       elemMat = d_elem_mats; // start at the beginning
561:       for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
562:         for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
563:           PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
 564:           elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
565:         }
566:       }
567:       for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
568:         PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
 569:         elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
570:       }
571:       elemMat += loc_elem * totDim * totDim;
572:       for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
573:     }
574:     __syncthreads();
575:     /* FE mass matrix construction */
576:     for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
577:       PetscScalar vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
578:       PetscInt    nr, nc;
579:       for (f = threadIdx.y; f < Nb; f += blockDim.y) {
580:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
581:           PetscScalar t = 0;
582:           for (qj = 0; qj < Nq; qj++) {
583:             const PetscReal *BJq   = &BB[qj * Nb];
584:             const PetscInt   jpidx = d_ip_offset[grid] + qj + loc_elem * Nq;
585:             if (dim == 2) {
586:               t += BJq[f] * d_w[jpidx] * shift * BJq[g] * 2. * PETSC_PI;
587:             } else {
588:               t += BJq[f] * d_w[jpidx] * shift * BJq[g];
589:             }
590:           }
591:           if (elemMat) {
592:             const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
 593:             elemMat[fOff] += t; // accumulate into the element matrix (CPU assembly path)
594:           } else (*s_fieldMats)[f][g] = t;
595:         }
596:       }
597:       if (!elemMat) {
598:         const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
599:         __syncthreads();
600:         if (threadIdx.y == 0) {
601:           for (f = threadIdx.x; f < Nb; f += blockDim.x) {
602:             idx = Idxs[f];
603:             if (idx >= 0) {
604:               (*s_idx)[f][0]   = idx + moffset;
605:               (*s_scale)[f][0] = 1.;
606:             } else {
607:               idx = -idx - 1;
608:               for (q = 0; q < d_maps[grid]->num_face; q++) {
609:                 if (d_maps[grid]->c_maps[idx][q].gid >= 0) (*s_idx)[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
610:                 else (*s_idx)[f][q] = -1;
611:                 (*s_scale)[f][q] = d_maps[grid]->c_maps[idx][q].scale;
612:               }
613:             }
614:           }
615:         }
616:         __syncthreads();
617:         for (f = threadIdx.y; f < Nb; f += blockDim.y) {
618:           idx = Idxs[f];
619:           if (idx >= 0) {
620:             nr = 1;
621:           } else {
622:             nr = d_maps[grid]->num_face;
623:           }
624:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
625:             idx = Idxs[g];
626:             if (idx >= 0) {
627:               nc = 1;
628:             } else {
629:               nc = d_maps[grid]->num_face;
630:             }
631:             for (q = 0; q < nr; q++) {
632:               for (d = 0; d < nc; d++) vals[q * nc + d] = (*s_scale)[f][q] * (*s_scale)[g][d] * (*s_fieldMats)[f][g];
633:             }
634:             MatSetValuesDevice(d_mat, nr, (*s_idx)[f], nc, (*s_idx)[g], vals, ADD_VALUES);
635:           }
636:         }
637:       }
638:       __syncthreads();
639:     }
640:   }
641: }

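/*
  LandauCUDAJacobian - host driver for the batched Landau Jacobian (or mass matrix).

  When a_elem_closure or a_xarray is given, the coefficient vector is staged on the device,
  landau_form_fdf builds f and its gradients, and landau_jacobian forms the collision operator;
  otherwise only landau_mass is launched. If GPU-assembly maps are available (queried from the
  "assembly_maps" container on JacP) the kernels scatter directly into the device matrix;
  otherwise element matrices are copied back and assembled on the host at the end of this routine.
*/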
643: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[], const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift, const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
644: {
645:   cudaError_t  cerr;
646:   PetscInt     Nb = Nq, dim, nip_global, num_cells_batch, elem_mat_size_tot;
647:   PetscInt    *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
648:   PetscInt     szf = sizeof(PetscReal), szs = sizeof(PetscScalar), Nftot = a_species_offset[num_grids];
649:   PetscReal   *d_BB = NULL, *d_DD = NULL, *d_invJj = NULL, *d_nu_alpha = NULL, *d_nu_beta = NULL, *d_invMass = NULL, *d_Eq_m = NULL, *d_x = NULL, *d_y = NULL, *d_w = NULL;
650:   PetscScalar *d_elem_mats = NULL, *d_vertex_f = NULL;
651:   PetscReal   *d_f = NULL, *d_dfdx = NULL, *d_dfdy = NULL;
652: #if LANDAU_DIM == 3
653:   PetscReal *d_dfdz = NULL, *d_z = NULL;
654: #endif
655:   LandauCtx                 *ctx;
656:   PetscSplitCSRDataStructure d_mat = NULL;
657:   P4estVertexMaps          **d_maps, *maps[LANDAU_MAX_GRIDS];
658:   int                        nnn = 256 / Nq; // machine dependent
659:   PetscContainer             container;

661:   PetscLogEventBegin(events[3], 0, 0, 0, 0);
 662:   while (nnn & (nnn - 1)) nnn = nnn & (nnn - 1); // round the vector length down to a power of two (the warp-shuffle reduction assumes this)
 663:   if (nnn > 16) nnn = 16;
664:   DMGetApplicationContext(plex[0], &ctx);
666:   DMGetDimension(plex[0], &dim);
668:   if (ctx->gpu_assembly) {
669:     PetscObjectQuery((PetscObject)JacP, "assembly_maps", (PetscObject *)&container);
 670:     if (container) { // the container is not attached on the first call, so this branch runs on later calls
 671:       static int init = 0; // hack: copy the maps only once; alternatively do it every time, put it in setup (but that is base class code), or add an init_maps flag
672:       if (!init++) {
673:         P4estVertexMaps *h_maps = NULL;
674:         PetscContainerGetPointer(container, (void **)&h_maps);
675:         for (PetscInt grid = 0; grid < num_grids; grid++) {
676:           if (h_maps[grid].d_self) {
677:             maps[grid] = h_maps[grid].d_self;
678:           } else {
679:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
680:           }
681:         }
682:         cudaMemcpy(SData_d->maps, maps, num_grids * sizeof(P4estVertexMaps *), cudaMemcpyHostToDevice);
683:       }
684:       d_maps = (P4estVertexMaps **)SData_d->maps;
685:       // this does the setup the first time called
686:       MatCUSPARSEGetDeviceMatWrite(JacP, &d_mat);
687:     } else {
688:       d_maps = NULL;
689:     }
690:   } else {
691:     container = NULL;
692:     d_maps    = NULL;
693:   }
694:   PetscLogEventEnd(events[3], 0, 0, 0, 0);
695:   {
696:     PetscInt elem_mat_size = 0;
697:     nip_global = num_cells_batch = 0;
698:     for (PetscInt grid = 0; grid < num_grids; grid++) {
699:       PetscInt Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nb;
700:       nip_global += a_numCells[grid] * Nq;
701:       num_cells_batch += a_numCells[grid];                 // is in d_elem_offset, but not on host
702:       elem_mat_size += a_numCells[grid] * totDim * totDim; // could save in an offset here -- batch major ordering
703:     }
704:     elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
705:   }
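  // one thread block per (batched vertex, element): blockIdx.x = b_id in [0, batch_sz),
  // blockIdx.y = element index across all grids, decoded with d_elem_offset inside the kernels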
706:   dim3 dimGrid(batch_sz, num_cells_batch);
707:   if (elem_mat_size_tot) {
708:     cudaMalloc((void **)&d_elem_mats, batch_sz * elem_mat_size_tot * szs); // kernel output - first call is on CPU
709:   } else d_elem_mats = NULL;
710:   // create data
711:   d_BB = (PetscReal *)SData_d->B;
712:   d_DD = (PetscReal *)SData_d->D;
713:   if (a_elem_closure || a_xarray) { // form f and df
714:     PetscLogEventBegin(events[1], 0, 0, 0, 0);
715:     cudaMemcpy(SData_d->Eq_m, a_Eq_m, Nftot * szf, cudaMemcpyHostToDevice);
716:     d_invJj    = (PetscReal *)SData_d->invJ;
717:     d_nu_alpha = (PetscReal *)SData_d->alpha;
718:     d_nu_beta  = (PetscReal *)SData_d->beta;
719:     d_invMass  = (PetscReal *)SData_d->invMass;
720:     d_x        = (PetscReal *)SData_d->x;
721:     d_y        = (PetscReal *)SData_d->y;
722:     d_w        = (PetscReal *)SData_d->w;
723:     d_Eq_m     = (PetscReal *)SData_d->Eq_m;
724:     d_dfdx     = (PetscReal *)SData_d->dfdx;
725:     d_dfdy     = (PetscReal *)SData_d->dfdy;
726: #if LANDAU_DIM == 3
727:     d_dfdz = (PetscReal *)SData_d->dfdz;
728:     d_z    = (PetscReal *)SData_d->z;
729: #endif
730:     d_f = (PetscReal *)SData_d->f;
731:     // get a d_vertex_f
732:     if (a_elem_closure) {
733:       PetscInt closure_sz = 0; // argh, don't have this on the host!!!
734:       for (PetscInt grid = 0; grid < num_grids; grid++) {
735:         PetscInt nfloc = a_species_offset[grid + 1] - a_species_offset[grid];
736:         closure_sz += Nq * nfloc * a_numCells[grid];
737:       }
738:       closure_sz *= batch_sz;
739:       cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure));
740:       cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz * sizeof(*a_elem_closure), cudaMemcpyHostToDevice);
741:     } else {
742:       d_vertex_f = (PetscScalar *)a_xarray;
743:     }
744:     PetscLogEventEnd(events[1], 0, 0, 0, 0);
745:   } else {
746:     d_w = (PetscReal *)SData_d->w; // mass just needs the weights
747:   }
748:   //
749:   d_numCells       = (PetscInt *)SData_d->NCells; // redundant -- remove
750:   d_species_offset = (PetscInt *)SData_d->species_offset;
751:   d_mat_offset     = (PetscInt *)SData_d->mat_offset;
752:   d_ip_offset      = (PetscInt *)SData_d->ip_offset;
753:   d_ipf_offset     = (PetscInt *)SData_d->ipf_offset;
754:   d_elem_offset    = (PetscInt *)SData_d->elem_offset;
755:   if (a_elem_closure || a_xarray) { // form f and df
756:     dim3 dimBlockFDF(nnn > Nftot ? Nftot : nnn, Nq), dimBlock((nip_global > nnn) ? nnn : nip_global, Nq);
757:     PetscLogEventBegin(events[8], 0, 0, 0, 0);
758:     PetscLogGpuTimeBegin();
759:     PetscInfo(plex[0], "Form F and dF/dx vectors: nip_global=%" PetscInt_FMT " num_grids=%" PetscInt_FMT "\n", nip_global, num_grids);
760:     landau_form_fdf<<<dimGrid, dimBlockFDF>>>(dim, Nb, num_grids, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
761: #if LANDAU_DIM == 3
762:                                               d_dfdz,
763: #endif
764:                                               d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
765:     PetscCUDACheckLaunch;
766:     PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(2 * Nb * (1 + dim)));
767:     if (a_elem_closure) {
768:       cudaFree(d_vertex_f);
769:       d_vertex_f = NULL;
770:     }
771:     PetscLogGpuTimeEnd();
772:     PetscLogEventEnd(events[8], 0, 0, 0, 0);
773:     // Jacobian
774:     PetscLogEventBegin(events[4], 0, 0, 0, 0);
775:     PetscLogGpuTimeBegin();
776:     PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(a_elem_closure ? (nip_global * (11 * Nftot + 4 * dim * dim) + 6 * Nftot * dim * dim * dim + 10 * Nftot * dim * dim + 4 * Nftot * dim + Nb * Nftot * Nb * Nq * dim * dim * 5) : Nb * Nftot * Nb * Nq * 4));
777:     PetscInt ii = 2 * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM * (1 + LANDAU_DIM) + 3 * LANDAU_MAX_SPECIES + (1 + LANDAU_DIM) * dimBlock.x * LANDAU_MAX_SPECIES + LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
778:     if (ii * szf >= 49152) {
 779:       cerr = cudaFuncSetAttribute(landau_jacobian, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
 780:       if (cerr != cudaSuccess) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cudaFuncSetAttribute() failed: %s", cudaGetErrorString(cerr));
781:     }
 782:     PetscInfo(plex[0], "Jacobian shared memory size: %" PetscInt_FMT " bytes, d_elem_mats=%p d_maps=%p\n", ii * szf, d_elem_mats, d_maps);
783:     landau_jacobian<<<dimGrid, dimBlock, ii * szf>>>(nip_global, dim, Nb, num_grids, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m, d_BB, d_DD, d_x, d_y, d_w, d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
784: #if LANDAU_DIM == 3
785:                                                      d_z, d_dfdz,
786: #endif
787:                                                      d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
788:     PetscCUDACheckLaunch; // has sync
789:     PetscLogGpuTimeEnd();
790:     PetscLogEventEnd(events[4], 0, 0, 0, 0);
791:   } else { // mass
792:     dim3     dimBlock(nnn, Nq);
793:     PetscInt ii = LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
794:     if (ii * szf >= 49152) {
 795:       cerr = cudaFuncSetAttribute(landau_mass, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
 796:       if (cerr != cudaSuccess) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cudaFuncSetAttribute() failed: %s", cudaGetErrorString(cerr));
797:     }
798:     PetscInfo(plex[0], "Mass d_maps = %p. Nq=%" PetscInt_FMT ", vector size %d num_cells_batch=%" PetscInt_FMT ", %" PetscInt_FMT " shared memory words\n", d_maps, Nq, nnn, num_cells_batch, ii);
799:     PetscLogEventBegin(events[16], 0, 0, 0, 0);
800:     PetscLogGpuTimeBegin();
801:     landau_mass<<<dimGrid, dimBlock, ii * szf>>>(dim, Nb, num_grids, d_w, d_BB, d_DD, d_elem_mats, d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_elem_offset);
802:     PetscCUDACheckLaunch; // has sync
803:     PetscLogGpuTimeEnd();
804:     PetscLogEventEnd(events[16], 0, 0, 0, 0);
805:   }
 806:   // First-time (host) assembly of element matrices: always used when GPU assembly is off, and on the first call (before the assembly maps exist) when it is on
807:   if (d_elem_mats) {
808:     PetscInt elem_mats_idx = 0;
809:     for (PetscInt b_id = 0; b_id < batch_sz; b_id++) {    // OpenMP (once)
810:       for (PetscInt grid = 0; grid < num_grids; grid++) { // elem_mats_idx += totDim*totDim*a_numCells[grid];
811:         const PetscInt     Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nq;
812:         PetscScalar       *elemMats = NULL, *elMat;
813:         PetscSection       section, globalSection;
814:         PetscInt           cStart, cEnd, ej;
815:         PetscInt           moffset = LAND_MOFFSET(b_id, grid, batch_sz, num_grids, a_mat_offset), nloc, nzl, colbuf[1024], row;
816:         const PetscInt    *cols;
817:         const PetscScalar *vals;
818:         Mat                B = subJ[LAND_PACK_IDX(b_id, grid)];
819:         PetscLogEventBegin(events[5], 0, 0, 0, 0);
820:         DMPlexGetHeightStratum(plex[grid], 0, &cStart, &cEnd);
821:         DMGetLocalSection(plex[grid], &section);
822:         DMGetGlobalSection(plex[grid], &globalSection);
823:         PetscMalloc1(totDim * totDim * a_numCells[grid], &elemMats);
824:         cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim * totDim * a_numCells[grid] * sizeof(*elemMats), cudaMemcpyDeviceToHost);
825:         PetscLogEventEnd(events[5], 0, 0, 0, 0);
826:         PetscLogEventBegin(events[6], 0, 0, 0, 0);
827:         for (ej = cStart, elMat = elemMats; ej < cEnd; ++ej, elMat += totDim * totDim) {
828:           DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES);
829:           if (ej == -1) {
830:             int d, f;
831:             PetscPrintf(PETSC_COMM_SELF, "GPU Element matrix\n");
832:             for (d = 0; d < totDim; ++d) {
833:               for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF, " %12.5e", PetscRealPart(elMat[d * totDim + f]));
834:               PetscPrintf(PETSC_COMM_SELF, "\n");
835:             }
836:           }
837:         }
838:         PetscFree(elemMats);
839:         MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
840:         MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
841:         // move nest matrix to global JacP
842:         MatGetSize(B, &nloc, NULL);
843:         for (int i = 0; i < nloc; i++) {
 844:           MatGetRow(B, i, &nzl, &cols, &vals);
 845:           if (nzl > 1024) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row too large: %" PetscInt_FMT " > 1024", nzl);
 846:           for (int j = 0; j < nzl; j++) colbuf[j] = cols[j] + moffset;
847:           row = i + moffset;
848:           MatSetValues(JacP, 1, &row, nzl, colbuf, vals, ADD_VALUES);
849:           MatRestoreRow(B, i, &nzl, &cols, &vals);
850:         }
851:         MatDestroy(&B);
852:         PetscLogEventEnd(events[6], 0, 0, 0, 0);
853:         elem_mats_idx += totDim * totDim * a_numCells[grid]; // this can be a stored offset?
854:       }                                                      // grids
855:     }
857:     cudaFree(d_elem_mats);
858:   }

860:   return 0;
861: }