Actual source code: landaucu.cu
1: /*
2: Implements the Landau kernel
3: */
4: #include <petscconf.h>
5: #include <petsc/private/dmpleximpl.h>
6: #include <petsclandau.h>
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <petscmat.h>
10: #include <petscdevice_cuda.h>
12: #include "../land_tensors.h"
13: #include <petscaijdevice.h>
15: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE], PetscInt Nf[], PetscInt Nq, PetscInt grid)
16: {
17: P4estVertexMaps h_maps;
18: h_maps.num_elements = maps[grid].num_elements;
19: h_maps.num_face = maps[grid].num_face;
20: h_maps.num_reduced = maps[grid].num_reduced;
21: h_maps.deviceType = maps[grid].deviceType;
22: h_maps.Nf = Nf[grid];
23: h_maps.numgrids = maps[grid].numgrids;
24: cudaMalloc((void **)&h_maps.c_maps, maps[grid].num_reduced * sizeof *pointMaps);
25: cudaMemcpy(h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced * sizeof *pointMaps, cudaMemcpyHostToDevice);
26: cudaMalloc((void **)&h_maps.gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx);
27: cudaMemcpy(h_maps.gIdx, maps[grid].gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice);
28: cudaMalloc((void **)&maps[grid].d_self, sizeof(P4estVertexMaps));
29: cudaMemcpy(maps[grid].d_self, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);
30: return 0;
31: }
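// LandauCUDACreateMatMaps mirrors the host-side maps[grid] onto the device: c_maps and gIdx are
// copied with cudaMemcpy, and the address of the device-resident P4estVertexMaps is stored in
// maps[grid].d_self so the kernels can be handed a plain device pointer to the assembly maps.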
33: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
34: {
35: for (PetscInt grid = 0; grid < num_grids; grid++) {
36: P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
37: cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);
38: cudaFree(h_maps.c_maps);
39: cudaFree(h_maps.gIdx);
40: cudaFree(d_maps);
41: }
42: return 0;
43: }
45: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[], PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[], PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
46: {
47: PetscTabulation *Tf;
48: PetscReal *BB, *DD;
49: PetscInt dim, Nb = Nq, szf = sizeof(PetscReal), szs = sizeof(PetscScalar), szi = sizeof(PetscInt);
50: PetscInt h_ip_offset[LANDAU_MAX_GRIDS + 1], h_ipf_offset[LANDAU_MAX_GRIDS + 1], h_elem_offset[LANDAU_MAX_GRIDS + 1], nip, IPfdf_sz, Nf;
51: PetscDS prob;
53: DMGetDimension(plex, &dim);
54: DMGetDS(plex, &prob);
56: PetscDSGetTabulation(prob, &Tf);
57: BB = Tf[0]->T[0];
58: DD = Tf[0]->T[1];
59: Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
60: nip = 0;
61: IPfdf_sz = 0;
62: for (PetscInt grid = 0; grid < num_grids; grid++) {
63: PetscInt nfloc = a_species_offset[grid + 1] - a_species_offset[grid];
64: h_elem_offset[grid + 1] = h_elem_offset[grid] + a_numCells[grid];
65: nip += a_numCells[grid] * Nq;
66: h_ip_offset[grid + 1] = nip;
67: IPfdf_sz += Nq * nfloc * a_numCells[grid];
68: h_ipf_offset[grid + 1] = IPfdf_sz;
69: }
70: Nf = a_species_offset[num_grids];
71: {
72: cudaMalloc((void **)&SData_d->B, Nq * Nb * szf); // kernel input
73: cudaMemcpy(SData_d->B, BB, Nq * Nb * szf, cudaMemcpyHostToDevice);
74: cudaMalloc((void **)&SData_d->D, Nq * Nb * dim * szf); // kernel input
75: cudaMemcpy(SData_d->D, DD, Nq * Nb * dim * szf, cudaMemcpyHostToDevice);
77: cudaMalloc((void **)&SData_d->alpha, Nf * szf); // kernel input
78: cudaMalloc((void **)&SData_d->beta, Nf * szf); // kernel input
79: cudaMalloc((void **)&SData_d->invMass, Nf * szf); // kernel input
81: cudaMemcpy(SData_d->alpha, nu_alpha, Nf * szf, cudaMemcpyHostToDevice);
82: cudaMemcpy(SData_d->beta, nu_beta, Nf * szf, cudaMemcpyHostToDevice);
83: cudaMemcpy(SData_d->invMass, a_invMass, Nf * szf, cudaMemcpyHostToDevice);
85: // collect geometry
86: cudaMalloc((void **)&SData_d->invJ, nip * dim * dim * szf); // kernel input
87: cudaMemcpy(SData_d->invJ, a_invJ, nip * dim * dim * szf, cudaMemcpyHostToDevice);
88: cudaMalloc((void **)&SData_d->x, nip * szf); // kernel input
89: cudaMemcpy(SData_d->x, a_x, nip * szf, cudaMemcpyHostToDevice);
90: cudaMalloc((void **)&SData_d->y, nip * szf); // kernel input
91: cudaMemcpy(SData_d->y, a_y, nip * szf, cudaMemcpyHostToDevice);
92: #if LANDAU_DIM == 3
93: cudaMalloc((void **)&SData_d->z, nip * szf); // kernel input
94: cudaMemcpy(SData_d->z, a_z, nip * szf, cudaMemcpyHostToDevice);
95: #endif
96: cudaMalloc((void **)&SData_d->w, nip * szf); // kernel input
97: cudaMemcpy(SData_d->w, a_w, nip * szf, cudaMemcpyHostToDevice);
99: cudaMalloc((void **)&SData_d->NCells, num_grids * szi);
100: cudaMemcpy(SData_d->NCells, a_numCells, num_grids * szi, cudaMemcpyHostToDevice);
101: cudaMalloc((void **)&SData_d->species_offset, (num_grids + 1) * szi);
102: cudaMemcpy(SData_d->species_offset, a_species_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
103: cudaMalloc((void **)&SData_d->mat_offset, (num_grids + 1) * szi);
104: cudaMemcpy(SData_d->mat_offset, a_mat_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
105: cudaMalloc((void **)&SData_d->ip_offset, (num_grids + 1) * szi);
106: cudaMemcpy(SData_d->ip_offset, h_ip_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
107: cudaMalloc((void **)&SData_d->ipf_offset, (num_grids + 1) * szi);
108: cudaMemcpy(SData_d->ipf_offset, h_ipf_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
109: cudaMalloc((void **)&SData_d->elem_offset, (num_grids + 1) * szi);
110: cudaMemcpy(SData_d->elem_offset, h_elem_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice);
111: cudaMalloc((void **)&SData_d->maps, num_grids * sizeof(P4estVertexMaps *));
112: // allocate space for dynamic data once
113: cudaMalloc((void **)&SData_d->Eq_m, Nf * szf); // this could be for each vertex (todo?)
114: cudaMalloc((void **)&SData_d->f, nip * Nf * szs * batch_sz); // for each vertex in batch
115: cudaMalloc((void **)&SData_d->dfdx, nip * Nf * szs * batch_sz);
116: cudaMalloc((void **)&SData_d->dfdy, nip * Nf * szs * batch_sz);
117: #if LANDAU_DIM == 3
118: cudaMalloc((void **)&SData_d->dfdz, nip * Nf * szs * batch_sz);
119: #endif
120: }
121: return 0;
122: }
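// LandauCUDAStaticDataSet fills two kinds of device arrays in SData_d: the "kernel input" data
// (B, D, alpha, beta, invMass, the geometry/weights, and the per-grid offset tables) is copied
// from the host once here, while Eq_m, f, dfdx, dfdy (and dfdz in 3D) are per-batch workspace
// that is only allocated here and is filled later by LandauCUDAJacobian and landau_form_fdf.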
124: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
125: {
126: if (SData_d->alpha) {
127: cudaFree(SData_d->alpha);
128: SData_d->alpha = NULL;
129: cudaFree(SData_d->beta);
130: cudaFree(SData_d->invMass);
131: cudaFree(SData_d->B);
132: cudaFree(SData_d->D);
133: cudaFree(SData_d->invJ);
134: #if LANDAU_DIM == 3
135: cudaFree(SData_d->z);
136: #endif
137: cudaFree(SData_d->x);
138: cudaFree(SData_d->y);
139: cudaFree(SData_d->w);
140: // dynamic data
141: cudaFree(SData_d->Eq_m);
142: cudaFree(SData_d->f);
143: cudaFree(SData_d->dfdx);
144: cudaFree(SData_d->dfdy);
145: #if LANDAU_DIM == 3
146: cudaFree(SData_d->dfdz);
147: #endif
148: cudaFree(SData_d->NCells);
149: cudaFree(SData_d->species_offset);
150: cudaFree(SData_d->mat_offset);
151: cudaFree(SData_d->ip_offset);
152: cudaFree(SData_d->ipf_offset);
153: cudaFree(SData_d->elem_offset);
154: cudaFree(SData_d->maps);
155: }
156: return 0;
157: }
158: //
159: // The GPU Landau kernel
160: //
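// Launch geometry used by the kernels below (set up in LandauCUDAJacobian): blockIdx.x is the
// batched-vertex index b_id, blockIdx.y is the element index across all grids (decoded via
// d_elem_offset), threadIdx.y runs over the Nq quadrature points (blockDim.y == Nq == Nb), and
// threadIdx.x is a short vector dimension that strides over species or integration points.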
161: __global__ void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_invJ[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
162: #if LANDAU_DIM == 3
163: PetscReal d_dfdz[],
164: #endif
165: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
166: {
167: const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
168: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
169: const PetscReal *Bq = &BB[myQi * Nb], *Dq = &DD[myQi * Nb * dim];
170: PetscInt grid = 0, f, d, b, e, q;
171: while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
172: {
173: const PetscInt loc_nip = d_numCells[grid] * Nq, loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
174: const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
175: const PetscScalar *coef;
176: PetscReal u_x[LANDAU_DIM];
177: const PetscReal *invJ = &d_invJ[(d_ip_offset[grid] + loc_elem * Nq + myQi) * dim * dim];
178: PetscScalar coef_buff[LANDAU_MAX_SPECIES * LANDAU_MAX_NQ];
179: if (!d_maps) {
180: coef = &d_vertex_f[b_id * IPf_sz_glb + d_ipf_offset[grid] + loc_elem * Nb * loc_Nf]; // closure and IP indexing are the same
181: } else {
182: coef = coef_buff;
183: for (f = 0; f < loc_Nf; ++f) {
184: LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][f][0];
185: for (b = 0; b < Nb; ++b) {
186: PetscInt idx = Idxs[b];
187: if (idx >= 0) {
188: coef_buff[f * Nb + b] = d_vertex_f[idx + moffset];
189: } else {
190: idx = -idx - 1;
191: coef_buff[f * Nb + b] = 0;
192: for (q = 0; q < d_maps[grid]->num_face; q++) {
193: PetscInt id = d_maps[grid]->c_maps[idx][q].gid;
194: PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
195: if (id >= 0) coef_buff[f * Nb + b] += scale * d_vertex_f[id + moffset];
196: }
197: }
198: }
199: }
200: }
202: /* get f and df */
203: for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
204: PetscReal refSpaceDer[LANDAU_DIM];
205: const PetscInt idx = b_id * IPf_sz_glb + d_ipf_offset[grid] + f * loc_nip + loc_elem * Nq + myQi;
206: d_f[idx] = 0.0;
207: for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
208: for (b = 0; b < Nb; ++b) {
209: const PetscInt cidx = b;
210: d_f[idx] += Bq[cidx] * PetscRealPart(coef[f * Nb + cidx]);
211: for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx * dim + d] * PetscRealPart(coef[f * Nb + cidx]);
212: }
213: for (d = 0; d < dim; ++d) {
214: for (e = 0, u_x[d] = 0.0; e < dim; ++e) u_x[d] += invJ[e * dim + d] * refSpaceDer[e];
215: }
216: d_dfdx[idx] = u_x[0];
217: d_dfdy[idx] = u_x[1];
218: #if LANDAU_DIM == 3
219: d_dfdz[idx] = u_x[2];
220: #endif
221: }
222: }
223: }
225: __device__ void landau_jac_kernel(const PetscInt num_grids, const PetscInt jpidx, PetscInt nip_global, const PetscInt grid, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
226: PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
227: PetscReal s_scale[][LANDAU_MAX_Q_FACE], PetscInt s_idx[][LANDAU_MAX_Q_FACE], PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_nu_alpha[], PetscReal s_nu_beta[], PetscReal s_invMass[], PetscReal s_f[], PetscReal s_dfx[], PetscReal s_dfy[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
228: #if LANDAU_DIM == 3
229: const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
230: #endif
231: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
232: {
233: const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
234: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
235: const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
236: const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
237: int delta, d, f, g, d2, dp, d3, fieldA, ipidx_b;
238: PetscReal gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
239: #if LANDAU_DIM == 2
240: const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
241: constexpr int dim = 2;
242: #else
243: const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
244: constexpr int dim = 3;
245: #endif
246: const PetscInt f_off = d_species_offset[grid], Nb = Nq;
247: // create g2 & g3
248: for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
249: for (d = 0; d < dim; d++) { // clear accumulation data D & K
250: s_gg2[d][myQi][f] = 0;
251: for (d2 = 0; d2 < dim; d2++) s_gg3[d][d2][myQi][f] = 0;
252: }
253: }
254: #pragma unroll
255: for (d2 = 0; d2 < dim; d2++) {
256: gg2_temp[d2] = 0;
257: #pragma unroll
258: for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] = 0;
259: }
260: if (threadIdx.y == 0) {
261: // copy species into shared memory
262: for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
263: s_nu_alpha[fieldA] = nu_alpha[fieldA];
264: s_nu_beta[fieldA] = nu_beta[fieldA];
265: s_invMass[fieldA] = invMass[fieldA];
266: }
267: }
268: __syncthreads();
269: // inner integral, collect gg2/3
270: for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
271: const PetscInt ipidx = ipidx_b + threadIdx.x;
272: PetscInt f_off_r, grid_r, loc_Nf_r, nip_loc_r, ipidx_g, fieldB, IPf_idx_r;
273: __syncthreads();
274: if (ipidx < nip_global) {
275: grid_r = 0;
276: while (ipidx >= d_ip_offset[grid_r + 1]) grid_r++;
277: f_off_r = d_species_offset[grid_r];
278: ipidx_g = ipidx - d_ip_offset[grid_r];
279: nip_loc_r = d_numCells[grid_r] * Nq;
280: loc_Nf_r = d_species_offset[grid_r + 1] - d_species_offset[grid_r];
281: IPf_idx_r = b_id * IPf_sz_glb + d_ipf_offset[grid_r] + ipidx_g;
282: for (fieldB = threadIdx.y; fieldB < loc_Nf_r; fieldB += blockDim.y) {
283: const PetscInt idx = IPf_idx_r + fieldB * nip_loc_r;
284: s_f[fieldB * blockDim.x + threadIdx.x] = d_f[idx]; // all vector threads get copy of data
285: s_dfx[fieldB * blockDim.x + threadIdx.x] = d_dfdx[idx];
286: s_dfy[fieldB * blockDim.x + threadIdx.x] = d_dfdy[idx];
287: #if LANDAU_DIM == 3
288: s_dfz[fieldB * blockDim.x + threadIdx.x] = d_dfdz[idx];
289: #endif
290: }
291: }
292: __syncthreads();
293: if (ipidx < nip_global) {
294: const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
295: PetscReal temp1[3] = {0, 0, 0}, temp2 = 0;
296: #if LANDAU_DIM == 2
297: PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
298: LandauTensor2D(vj, x, y, Ud, Uk, mask);
299: #else
300: PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2] - z) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
301: LandauTensor3D(vj, x, y, z, U, mask);
302: #endif
303: for (int fieldB = 0; fieldB < loc_Nf_r; fieldB++) {
304: temp1[0] += s_dfx[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r];
305: temp1[1] += s_dfy[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r];
306: #if LANDAU_DIM == 3
307: temp1[2] += s_dfz[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r];
308: #endif
309: temp2 += s_f[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r];
310: }
311: temp1[0] *= wi;
312: temp1[1] *= wi;
313: #if LANDAU_DIM == 3
314: temp1[2] *= wi;
315: #endif
316: temp2 *= wi;
317: #if LANDAU_DIM == 2
318: #pragma unroll
319: for (d2 = 0; d2 < 2; d2++) {
320: #pragma unroll
321: for (d3 = 0; d3 < 2; ++d3) {
322: /* K = U * grad(f): g2=e: i,A */
323: gg2_temp[d2] += Uk[d2][d3] * temp1[d3];
324: /* D = -U * (I \kron (fx)): g3=f: i,j,A */
325: gg3_temp[d2][d3] += Ud[d2][d3] * temp2;
326: }
327: }
328: #else
329: #pragma unroll
330: for (d2 = 0; d2 < 3; ++d2) {
331: #pragma unroll
332: for (d3 = 0; d3 < 3; ++d3) {
333: /* K = U * grad(f): g2 = e: i,A */
334: gg2_temp[d2] += U[d2][d3] * temp1[d3];
335: /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
336: gg3_temp[d2][d3] += U[d2][d3] * temp2;
337: }
338: }
339: #endif
340: }
341: } /* IPs */
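// The butterfly reduction below sums the per-lane partials gg2_temp/gg3_temp across the
// blockDim.x vector lanes with __shfl_xor_sync. It relies on blockDim.x being a power of two no
// larger than a warp; the host rounds the vector length nnn down to a power of two and caps it
// at 16 with this in mind.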
343: /* reduce gg temp sums across threads */
344: for (delta = blockDim.x / 2; delta > 0; delta /= 2) {
345: #pragma unroll
346: for (d2 = 0; d2 < dim; d2++) {
347: gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
348: #pragma unroll
349: for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
350: }
351: }
352: // add alpha and put in gg2/3
353: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
354: #pragma unroll
355: for (d2 = 0; d2 < dim; d2++) {
356: s_gg2[d2][myQi][fieldA] += gg2_temp[d2] * s_nu_alpha[fieldA + f_off];
357: #pragma unroll
358: for (d3 = 0; d3 < dim; d3++) s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3] * s_nu_alpha[fieldA + f_off] * s_invMass[fieldA + f_off];
359: }
360: }
361: __syncthreads();
362: /* add electric field term once per IP */
363: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) s_gg2[dim - 1][myQi][fieldA] += Eq_m[fieldA + f_off];
364: __syncthreads();
365: /* Jacobian transform - g2 */
366: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
367: PetscReal wj = ww[jpidx];
368: for (d = 0; d < dim; ++d) {
369: s_g2[d][myQi][fieldA] = 0.0;
370: for (d2 = 0; d2 < dim; ++d2) {
371: s_g2[d][myQi][fieldA] += invJj[d * dim + d2] * s_gg2[d2][myQi][fieldA];
372: s_g3[d][d2][myQi][fieldA] = 0.0;
373: for (d3 = 0; d3 < dim; ++d3) {
374: for (dp = 0; dp < dim; ++dp) s_g3[d][d2][myQi][fieldA] += invJj[d * dim + d3] * s_gg3[d3][dp][myQi][fieldA] * invJj[d2 * dim + dp];
375: }
376: s_g3[d][d2][myQi][fieldA] *= wj;
377: }
378: s_g2[d][myQi][fieldA] *= wj;
379: }
380: }
381: __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
382: /* FE matrix construction */
383: {
384: int fieldA, d, qj, d2, q, idx, totDim = Nb * loc_Nf;
385: /* assemble */
386: for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
387: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
388: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
389: PetscScalar t = 0;
390: for (qj = 0; qj < Nq; qj++) {
391: const PetscReal *BJq = &BB[qj * Nb], *DIq = &DD[qj * Nb * dim];
392: for (d = 0; d < dim; ++d) {
393: t += DIq[f * dim + d] * s_g2[d][qj][fieldA] * BJq[g];
394: for (d2 = 0; d2 < dim; ++d2) t += DIq[f * dim + d] * s_g3[d][d2][qj][fieldA] * DIq[g * dim + d2];
395: }
396: }
397: if (elemMat) {
398: const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
399: elemMat[fOff] += t; // ????
400: } else s_fieldMats[f][g] = t;
401: }
402: }
403: if (s_fieldMats) {
404: PetscScalar vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
405: PetscInt nr, nc;
406: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
407: __syncthreads();
408: if (threadIdx.y == 0) {
409: for (f = threadIdx.x; f < Nb; f += blockDim.x) {
410: idx = Idxs[f];
411: if (idx >= 0) {
412: s_idx[f][0] = idx + moffset;
413: s_scale[f][0] = 1.;
414: } else {
415: idx = -idx - 1;
416: for (q = 0; q < d_maps[grid]->num_face; q++) {
417: if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
418: else s_idx[f][q] = -1;
419: s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
420: }
421: }
422: }
423: }
424: __syncthreads();
425: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
426: idx = Idxs[f];
427: if (idx >= 0) {
428: nr = 1;
429: } else {
430: nr = d_maps[grid]->num_face;
431: }
432: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
433: idx = Idxs[g];
434: if (idx >= 0) {
435: nc = 1;
436: } else {
437: nc = d_maps[grid]->num_face;
438: }
439: for (q = 0; q < nr; q++) {
440: for (d = 0; d < nc; d++) vals[q * nc + d] = s_scale[f][q] * s_scale[g][d] * s_fieldMats[f][g];
441: }
442: MatSetValuesDevice(d_mat, nr, s_idx[f], nc, s_idx[g], vals, ADD_VALUES);
443: }
444: }
445: __syncthreads();
446: }
447: }
448: }
449: }
451: //
452: // The CUDA Landau kernel
453: //
454: __global__ void __launch_bounds__(256, 2) landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
455: #if LANDAU_DIM == 3
456: const PetscReal zz[], PetscReal d_dfdz[],
457: #endif
458: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
459: {
460: extern __shared__ PetscReal smem[];
461: int size = 0;
462: PetscReal(*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
463: size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
464: PetscReal(*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
465: size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
466: PetscReal(*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
467: size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
468: PetscReal(*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
469: size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
470: PetscReal *s_nu_alpha = &smem[size];
471: size += LANDAU_MAX_SPECIES;
472: PetscReal *s_nu_beta = &smem[size];
473: size += LANDAU_MAX_SPECIES;
474: PetscReal *s_invMass = &smem[size];
475: size += LANDAU_MAX_SPECIES;
476: PetscReal *s_f = &smem[size];
477: size += blockDim.x * LANDAU_MAX_SPECIES;
478: PetscReal *s_dfx = &smem[size];
479: size += blockDim.x * LANDAU_MAX_SPECIES;
480: PetscReal *s_dfy = &smem[size];
481: size += blockDim.x * LANDAU_MAX_SPECIES;
482: #if LANDAU_DIM == 3
483: PetscReal *s_dfz = &smem[size];
484: size += blockDim.x * LANDAU_MAX_SPECIES;
485: #endif
486: PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
487: PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
488: PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
489: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x;
490: PetscInt Nq = blockDim.y, grid = 0; // Nq == Nb
491: PetscScalar *elemMat = NULL; /* my output */
492: while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
493: {
494: const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
495: const PetscInt myQi = threadIdx.y;
496: const PetscInt jpidx = d_ip_offset[grid] + myQi + loc_elem * Nq;
497: const PetscReal *invJ = &invJj[jpidx * dim * dim];
498: if (d_elem_mats) {
499: PetscInt totDim = loc_Nf * Nb;
500: elemMat = d_elem_mats; // start at the beginning and get to my element matrix
501: for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
502: for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
503: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
504: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past earlier grids; could be precomputed as an offset
505: }
506: }
507: for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
508: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
509: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
510: }
511: elemMat += loc_elem * totDim * totDim; // index into local matrix & zero out
512: for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
513: }
514: __syncthreads();
515: if (d_maps) {
516: // reuse the space for fieldMats
517: s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
518: size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
519: s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
520: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
521: s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
522: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
523: } else {
524: s_fieldMats = NULL;
525: }
526: __syncthreads();
527: landau_jac_kernel(num_grids, jpidx, nip_global, grid, xx, yy, ww, invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD, elemMat, d_maps, d_mat, *s_fieldMats, *s_scale, *s_idx, *s_g2, *s_g3, *s_gg2, *s_gg3, s_nu_alpha, s_nu_beta, s_invMass, s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
528: #if LANDAU_DIM == 3
529: zz, s_dfz, d_dfdz,
530: #endif
531: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
532: }
533: }
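//
// The CUDA Landau mass kernel: no velocity-space inner integral, just weighted products of basis
// functions at each quadrature point (with an extra factor of 2*pi in 2D, presumably the azimuthal
// factor of the cylindrical-coordinate formulation), assembled into d_elem_mats or, when assembly
// maps are available, directly into d_mat.
//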
535: __global__ void __launch_bounds__(256, 4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_w[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift, const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_elem_offset[])
536: {
537: extern __shared__ PetscReal smem[];
538: const PetscInt Nq = blockDim.y, b_elem_idx = blockIdx.y, b_id = blockIdx.x;
539: PetscScalar *elemMat = NULL; /* my output */
540: PetscInt fieldA, d, qj, q, idx, f, g, grid = 0, size = 0;
541: PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
542: PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
543: PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
544: if (d_maps) {
545: // reuse the space for fieldMats
546: s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
547: size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
548: s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
549: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
550: s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
551: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
552: } else {
553: s_fieldMats = NULL;
554: }
555: while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
556: {
557: const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
558: const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset), totDim = loc_Nf * Nq;
559: if (d_elem_mats) {
560: elemMat = d_elem_mats; // start at the beginning
561: for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
562: for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
563: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
564: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past earlier grids; could be precomputed as an offset
565: }
566: }
567: for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
568: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
569: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past earlier grids; could be precomputed as an offset
570: }
571: elemMat += loc_elem * totDim * totDim;
572: for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
573: }
574: __syncthreads();
575: /* FE mass matrix construction */
576: for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
577: PetscScalar vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
578: PetscInt nr, nc;
579: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
580: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
581: PetscScalar t = 0;
582: for (qj = 0; qj < Nq; qj++) {
583: const PetscReal *BJq = &BB[qj * Nb];
584: const PetscInt jpidx = d_ip_offset[grid] + qj + loc_elem * Nq;
585: if (dim == 2) {
586: t += BJq[f] * d_w[jpidx] * shift * BJq[g] * 2. * PETSC_PI;
587: } else {
588: t += BJq[f] * d_w[jpidx] * shift * BJq[g];
589: }
590: }
591: if (elemMat) {
592: const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
593: elemMat[fOff] += t; // ????
594: } else (*s_fieldMats)[f][g] = t;
595: }
596: }
597: if (!elemMat) {
598: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
599: __syncthreads();
600: if (threadIdx.y == 0) {
601: for (f = threadIdx.x; f < Nb; f += blockDim.x) {
602: idx = Idxs[f];
603: if (idx >= 0) {
604: (*s_idx)[f][0] = idx + moffset;
605: (*s_scale)[f][0] = 1.;
606: } else {
607: idx = -idx - 1;
608: for (q = 0; q < d_maps[grid]->num_face; q++) {
609: if (d_maps[grid]->c_maps[idx][q].gid >= 0) (*s_idx)[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
610: else (*s_idx)[f][q] = -1;
611: (*s_scale)[f][q] = d_maps[grid]->c_maps[idx][q].scale;
612: }
613: }
614: }
615: }
616: __syncthreads();
617: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
618: idx = Idxs[f];
619: if (idx >= 0) {
620: nr = 1;
621: } else {
622: nr = d_maps[grid]->num_face;
623: }
624: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
625: idx = Idxs[g];
626: if (idx >= 0) {
627: nc = 1;
628: } else {
629: nc = d_maps[grid]->num_face;
630: }
631: for (q = 0; q < nr; q++) {
632: for (d = 0; d < nc; d++) vals[q * nc + d] = (*s_scale)[f][q] * (*s_scale)[g][d] * (*s_fieldMats)[f][g];
633: }
634: MatSetValuesDevice(d_mat, nr, (*s_idx)[f], nc, (*s_idx)[g], vals, ADD_VALUES);
635: }
636: }
637: }
638: __syncthreads();
639: }
640: }
641: }
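// Host driver: when a_elem_closure or a_xarray is provided, LandauCUDAJacobian first forms f and
// its derivatives with landau_form_fdf and then launches landau_jacobian; otherwise it launches
// landau_mass. With GPU-assembly maps the kernels add directly into d_mat; without them the
// element matrices are copied back to the host and assembled with DMPlexMatSetClosure.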
643: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[], const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift, const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
644: {
645: cudaError_t cerr;
646: PetscInt Nb = Nq, dim, nip_global, num_cells_batch, elem_mat_size_tot;
647: PetscInt *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
648: PetscInt szf = sizeof(PetscReal), szs = sizeof(PetscScalar), Nftot = a_species_offset[num_grids];
649: PetscReal *d_BB = NULL, *d_DD = NULL, *d_invJj = NULL, *d_nu_alpha = NULL, *d_nu_beta = NULL, *d_invMass = NULL, *d_Eq_m = NULL, *d_x = NULL, *d_y = NULL, *d_w = NULL;
650: PetscScalar *d_elem_mats = NULL, *d_vertex_f = NULL;
651: PetscReal *d_f = NULL, *d_dfdx = NULL, *d_dfdy = NULL;
652: #if LANDAU_DIM == 3
653: PetscReal *d_dfdz = NULL, *d_z = NULL;
654: #endif
655: LandauCtx *ctx;
656: PetscSplitCSRDataStructure d_mat = NULL;
657: P4estVertexMaps **d_maps, *maps[LANDAU_MAX_GRIDS];
658: int nnn = 256 / Nq; // machine dependent
659: PetscContainer container;
661: PetscLogEventBegin(events[3], 0, 0, 0, 0);
662: while (nnn & nnn - 1) nnn = nnn & nnn - 1; // round nnn down to a power of two (the warp shuffle reduction assumes this)
663: if (nnn > 16) nnn = 16; // cap the vector length
664: DMGetApplicationContext(plex[0], &ctx);
666: DMGetDimension(plex[0], &dim);
668: if (ctx->gpu_assembly) {
669: PetscObjectQuery((PetscObject)JacP, "assembly_maps", (PetscObject *)&container);
670: if (container) { // the assembly-maps container is not attached yet on the first call
671: static int init = 0; // hack: just do this every time, or put it in setup (but that is in base-class code), or add an init_maps flag
672: if (!init++) {
673: P4estVertexMaps *h_maps = NULL;
674: PetscContainerGetPointer(container, (void **)&h_maps);
675: for (PetscInt grid = 0; grid < num_grids; grid++) {
676: if (h_maps[grid].d_self) {
677: maps[grid] = h_maps[grid].d_self;
678: } else {
679: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
680: }
681: }
682: cudaMemcpy(SData_d->maps, maps, num_grids * sizeof(P4estVertexMaps *), cudaMemcpyHostToDevice);
683: }
684: d_maps = (P4estVertexMaps **)SData_d->maps;
685: // this does the setup the first time called
686: MatCUSPARSEGetDeviceMatWrite(JacP, &d_mat);
687: } else {
688: d_maps = NULL;
689: }
690: } else {
691: container = NULL;
692: d_maps = NULL;
693: }
694: PetscLogEventEnd(events[3], 0, 0, 0, 0);
695: {
696: PetscInt elem_mat_size = 0;
697: nip_global = num_cells_batch = 0;
698: for (PetscInt grid = 0; grid < num_grids; grid++) {
699: PetscInt Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nb;
700: nip_global += a_numCells[grid] * Nq;
701: num_cells_batch += a_numCells[grid]; // is in d_elem_offset, but not on host
702: elem_mat_size += a_numCells[grid] * totDim * totDim; // could save in an offset here -- batch major ordering
703: }
704: elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
705: }
706: dim3 dimGrid(batch_sz, num_cells_batch);
707: if (elem_mat_size_tot) {
708: cudaMalloc((void **)&d_elem_mats, batch_sz * elem_mat_size_tot * szs); // kernel output - first call is on CPU
709: } else d_elem_mats = NULL;
710: // create data
711: d_BB = (PetscReal *)SData_d->B;
712: d_DD = (PetscReal *)SData_d->D;
713: if (a_elem_closure || a_xarray) { // form f and df
714: PetscLogEventBegin(events[1], 0, 0, 0, 0);
715: cudaMemcpy(SData_d->Eq_m, a_Eq_m, Nftot * szf, cudaMemcpyHostToDevice);
716: d_invJj = (PetscReal *)SData_d->invJ;
717: d_nu_alpha = (PetscReal *)SData_d->alpha;
718: d_nu_beta = (PetscReal *)SData_d->beta;
719: d_invMass = (PetscReal *)SData_d->invMass;
720: d_x = (PetscReal *)SData_d->x;
721: d_y = (PetscReal *)SData_d->y;
722: d_w = (PetscReal *)SData_d->w;
723: d_Eq_m = (PetscReal *)SData_d->Eq_m;
724: d_dfdx = (PetscReal *)SData_d->dfdx;
725: d_dfdy = (PetscReal *)SData_d->dfdy;
726: #if LANDAU_DIM == 3
727: d_dfdz = (PetscReal *)SData_d->dfdz;
728: d_z = (PetscReal *)SData_d->z;
729: #endif
730: d_f = (PetscReal *)SData_d->f;
731: // get a d_vertex_f
732: if (a_elem_closure) {
733: PetscInt closure_sz = 0; // argh, don't have this on the host!!!
734: for (PetscInt grid = 0; grid < num_grids; grid++) {
735: PetscInt nfloc = a_species_offset[grid + 1] - a_species_offset[grid];
736: closure_sz += Nq * nfloc * a_numCells[grid];
737: }
738: closure_sz *= batch_sz;
739: cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure));
740: cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz * sizeof(*a_elem_closure), cudaMemcpyHostToDevice);
741: } else {
742: d_vertex_f = (PetscScalar *)a_xarray;
743: }
744: PetscLogEventEnd(events[1], 0, 0, 0, 0);
745: } else {
746: d_w = (PetscReal *)SData_d->w; // mass just needs the weights
747: }
748: //
749: d_numCells = (PetscInt *)SData_d->NCells; // redundant -- remove
750: d_species_offset = (PetscInt *)SData_d->species_offset;
751: d_mat_offset = (PetscInt *)SData_d->mat_offset;
752: d_ip_offset = (PetscInt *)SData_d->ip_offset;
753: d_ipf_offset = (PetscInt *)SData_d->ipf_offset;
754: d_elem_offset = (PetscInt *)SData_d->elem_offset;
755: if (a_elem_closure || a_xarray) { // form f and df
756: dim3 dimBlockFDF(nnn > Nftot ? Nftot : nnn, Nq), dimBlock((nip_global > nnn) ? nnn : nip_global, Nq);
757: PetscLogEventBegin(events[8], 0, 0, 0, 0);
758: PetscLogGpuTimeBegin();
759: PetscInfo(plex[0], "Form F and dF/dx vectors: nip_global=%" PetscInt_FMT " num_grids=%" PetscInt_FMT "\n", nip_global, num_grids);
760: landau_form_fdf<<<dimGrid, dimBlockFDF>>>(dim, Nb, num_grids, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
761: #if LANDAU_DIM == 3
762: d_dfdz,
763: #endif
764: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
765: PetscCUDACheckLaunch;
766: PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(2 * Nb * (1 + dim)));
767: if (a_elem_closure) {
768: cudaFree(d_vertex_f);
769: d_vertex_f = NULL;
770: }
771: PetscLogGpuTimeEnd();
772: PetscLogEventEnd(events[8], 0, 0, 0, 0);
773: // Jacobian
774: PetscLogEventBegin(events[4], 0, 0, 0, 0);
775: PetscLogGpuTimeBegin();
776: PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(a_elem_closure ? (nip_global * (11 * Nftot + 4 * dim * dim) + 6 * Nftot * dim * dim * dim + 10 * Nftot * dim * dim + 4 * Nftot * dim + Nb * Nftot * Nb * Nq * dim * dim * 5) : Nb * Nftot * Nb * Nq * 4));
777: PetscInt ii = 2 * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM * (1 + LANDAU_DIM) + 3 * LANDAU_MAX_SPECIES + (1 + LANDAU_DIM) * dimBlock.x * LANDAU_MAX_SPECIES + LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
778: if (ii * szf >= 49152) {
779: cerr = cudaFuncSetAttribute(landau_jacobian, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
780: PetscCallCUDA(cerr);
781: }
782: PetscInfo(plex[0], "Jacobian shared memory size: %" PetscInt_FMT " words, d_elem_mats=%p d_maps=%p\n", ii, d_elem_mats, d_maps);
783: landau_jacobian<<<dimGrid, dimBlock, ii * szf>>>(nip_global, dim, Nb, num_grids, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m, d_BB, d_DD, d_x, d_y, d_w, d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
784: #if LANDAU_DIM == 3
785: d_z, d_dfdz,
786: #endif
787: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
788: PetscCUDACheckLaunch; // has sync
789: PetscLogGpuTimeEnd();
790: PetscLogEventEnd(events[4], 0, 0, 0, 0);
791: } else { // mass
792: dim3 dimBlock(nnn, Nq);
793: PetscInt ii = LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
794: if (ii * szf >= 49152) {
795: cerr = cudaFuncSetAttribute(landau_mass, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
796: PetscCallCUDA(cerr);
797: }
798: PetscInfo(plex[0], "Mass d_maps = %p. Nq=%" PetscInt_FMT ", vector size %d num_cells_batch=%" PetscInt_FMT ", %" PetscInt_FMT " shared memory words\n", d_maps, Nq, nnn, num_cells_batch, ii);
799: PetscLogEventBegin(events[16], 0, 0, 0, 0);
800: PetscLogGpuTimeBegin();
801: landau_mass<<<dimGrid, dimBlock, ii * szf>>>(dim, Nb, num_grids, d_w, d_BB, d_DD, d_elem_mats, d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_elem_offset);
802: PetscCUDACheckLaunch; // has sync
803: PetscLogGpuTimeEnd();
804: PetscLogEventEnd(events[16], 0, 0, 0, 0);
805: }
806: // First time assembly with or without GPU assembly
807: if (d_elem_mats) {
808: PetscInt elem_mats_idx = 0;
809: for (PetscInt b_id = 0; b_id < batch_sz; b_id++) { // OpenMP (once)
810: for (PetscInt grid = 0; grid < num_grids; grid++) { // elem_mats_idx += totDim*totDim*a_numCells[grid];
811: const PetscInt Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nq;
812: PetscScalar *elemMats = NULL, *elMat;
813: PetscSection section, globalSection;
814: PetscInt cStart, cEnd, ej;
815: PetscInt moffset = LAND_MOFFSET(b_id, grid, batch_sz, num_grids, a_mat_offset), nloc, nzl, colbuf[1024], row;
816: const PetscInt *cols;
817: const PetscScalar *vals;
818: Mat B = subJ[LAND_PACK_IDX(b_id, grid)];
819: PetscLogEventBegin(events[5], 0, 0, 0, 0);
820: DMPlexGetHeightStratum(plex[grid], 0, &cStart, &cEnd);
821: DMGetLocalSection(plex[grid], &section);
822: DMGetGlobalSection(plex[grid], &globalSection);
823: PetscMalloc1(totDim * totDim * a_numCells[grid], &elemMats);
824: cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim * totDim * a_numCells[grid] * sizeof(*elemMats), cudaMemcpyDeviceToHost);
825: PetscLogEventEnd(events[5], 0, 0, 0, 0);
826: PetscLogEventBegin(events[6], 0, 0, 0, 0);
827: for (ej = cStart, elMat = elemMats; ej < cEnd; ++ej, elMat += totDim * totDim) {
828: DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES);
829: if (ej == -1) {
830: int d, f;
831: PetscPrintf(PETSC_COMM_SELF, "GPU Element matrix\n");
832: for (d = 0; d < totDim; ++d) {
833: for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF, " %12.5e", PetscRealPart(elMat[d * totDim + f]));
834: PetscPrintf(PETSC_COMM_SELF, "\n");
835: }
836: }
837: }
838: PetscFree(elemMats);
839: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
840: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
841: // move nest matrix to global JacP
842: MatGetSize(B, &nloc, NULL);
843: for (int i = 0; i < nloc; i++) {
844: MatGetRow(B, i, &nzl, &cols, &vals);
846: for (int j = 0; j < nzl; j++) colbuf[j] = cols[j] + moffset;
847: row = i + moffset;
848: MatSetValues(JacP, 1, &row, nzl, colbuf, vals, ADD_VALUES);
849: MatRestoreRow(B, i, &nzl, &cols, &vals);
850: }
851: MatDestroy(&B);
852: PetscLogEventEnd(events[6], 0, 0, 0, 0);
853: elem_mats_idx += totDim * totDim * a_numCells[grid]; // this can be a stored offset?
854: } // grids
855: }
857: cudaFree(d_elem_mats);
858: }
860: return 0;
861: }
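// A rough sketch of the expected call sequence from the host side (the actual caller and its
// variables live in the DMPlex Landau driver, not in this file; the names below are placeholders):
//   LandauCUDACreateMatMaps(maps, pointMaps, Nf, Nq, grid);            // per grid, if GPU assembly is used
//   LandauCUDAStaticDataSet(plex, Nq, batch_sz, num_grids, numCells, species_off, mat_off,
//                           nu_alpha, nu_beta, invMass, invJ, x, y, z, w, &SData_d);
//   LandauCUDAJacobian(plexes, Nq, batch_sz, num_grids, numCells, Eq_m, elem_closure, xarray,
//                      &SData_d, shift, events, mat_off, species_off, subJ, JacP);  // per (re)assembly
//   LandauCUDAStaticDataClear(&SData_d);
//   LandauCUDADestroyMatMaps(maps, num_grids);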