Actual source code: feopencl.c

  1: #include <petsc/private/petscfeimpl.h>

  3: #if defined(PETSC_HAVE_OPENCL)

  5: static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
  6: {
  7:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;

  9:   clReleaseCommandQueue(ocl->queue_id);
 10:   ocl->queue_id = 0;
 11:   clReleaseContext(ocl->ctx_id);
 12:   ocl->ctx_id = 0;
 13:   PetscFree(ocl);
 14:   return 0;
 15: }

 17:   #define PetscCallSTR(err) \
 18:     do { \
 19:       err; \
 20:       string_tail += count; \
 22:     } while (0)
 23: enum {
 24:   LAPLACIAN  = 0,
 25:   ELASTICITY = 1
 26: };

 28: /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
 29: /* dim     Number of spatial dimensions:          2                   */
 30: /* N_b     Number of basis functions:             generated           */
 31: /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
 32: /* N_q     Number of quadrature points:           generated           */
 33: /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
 34: /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
 35: /* N_{bl}  Number of concurrent blocks            generated           */
 36: /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
 37: /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
 38: /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
 39: /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
 40: /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
 41: /* N_{cb}  Number of serial cell batches:         input               */
 42: /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
 43: static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
 44: {
 45:   PetscFE_OpenCL  *ocl = (PetscFE_OpenCL *)fem->data;
 46:   PetscQuadrature  q;
 47:   char            *string_tail   = *string_buffer;
 48:   char            *end_of_buffer = *string_buffer + buffer_length;
 49:   char             float_str[] = "float", double_str[] = "double";
 50:   char            *numeric_str    = &(float_str[0]);
 51:   PetscInt         op             = ocl->op;
 52:   PetscBool        useField       = PETSC_FALSE;
 53:   PetscBool        useFieldDer    = PETSC_TRUE;
 54:   PetscBool        useFieldAux    = useAux;
 55:   PetscBool        useFieldDerAux = PETSC_FALSE;
 56:   PetscBool        useF0          = PETSC_TRUE;
 57:   PetscBool        useF1          = PETSC_TRUE;
 58:   const PetscReal *points, *weights;
 59:   PetscTabulation  T;
 60:   PetscInt         dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
 61:   size_t           count;

 63:   PetscFEGetSpatialDimension(fem, &dim);
 64:   PetscFEGetDimension(fem, &N_b);
 65:   PetscFEGetNumComponents(fem, &N_c);
 66:   PetscFEGetQuadrature(fem, &q);
 67:   PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
 69:   N_t = N_b * N_c * N_q * N_bl;
 70:   /* Enable device extension for double precision */
 71:   if (ocl->realType == PETSC_DOUBLE) {
 72:     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 73:                                     "#if defined(cl_khr_fp64)\n"
 74:                                     "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
 75:                                     "#elif defined(cl_amd_fp64)\n"
 76:                                     "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
 77:                                     "#endif\n",
 78:                                     &count));
 79:     numeric_str = &(double_str[0]);
 80:   }
 81:   /* Kernel API */
 82:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 83:                                   "\n"
 84:                                   "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
 85:                                   "{\n",
 86:                                   &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str));
 87:   /* Quadrature */
 88:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 89:                                   "  /* Quadrature points\n"
 90:                                   "   - (x1,y1,x2,y2,...) */\n"
 91:                                   "  const %s points[%d] = {\n",
 92:                                   &count, numeric_str, N_q * dim));
 93:   for (p = 0; p < N_q; ++p) {
 94:     for (d = 0; d < dim; ++d) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p * dim + d]);
 95:   }
 96:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
 97:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 98:                                   "  /* Quadrature weights\n"
 99:                                   "   - (v1,v2,...) */\n"
100:                                   "  const %s weights[%d] = {\n",
101:                                   &count, numeric_str, N_q));
102:   for (p = 0; p < N_q; ++p) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);
103:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
104:   /* Basis Functions */
105:   PetscFEGetCellTabulation(fem, 1, &T);
106:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
107:                                   "  /* Nodal basis function evaluations\n"
108:                                   "    - basis component is fastest varying, the basis function, then point */\n"
109:                                   "  const %s Basis[%d] = {\n",
110:                                   &count, numeric_str, N_q * N_b * N_c));
111:   for (p = 0; p < N_q; ++p) {
112:     for (b = 0; b < N_b; ++b) {
113:       for (c = 0; c < N_c; ++c) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p * N_b + b) * N_c + c]);
114:     }
115:   }
116:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
117:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
118:                                   "\n"
119:                                   "  /* Nodal basis function derivative evaluations,\n"
120:                                   "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
121:                                   "  const %s%d BasisDerivatives[%d] = {\n",
122:                                   &count, numeric_str, dim, N_q * N_b * N_c));
123:   for (p = 0; p < N_q; ++p) {
124:     for (b = 0; b < N_b; ++b) {
125:       for (c = 0; c < N_c; ++c) {
126:         PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);
127:         for (d = 0; d < dim; ++d) {
128:           if (d > 0) {
129:             PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]);
130:           } else {
131:             PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p * N_b + b) * dim + d) * N_c + c]);
132:           }
133:         }
134:         PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);
135:       }
136:     }
137:   }
138:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);
139:   /* Sizes */
140:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
141:                                   "  const int dim    = %d;                           // The spatial dimension\n"
142:                                   "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
143:                                   "  const int N_b    = %d;                           // The number of basis functions\n"
144:                                   "  const int N_comp = %d;                           // The number of basis function components\n"
145:                                   "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
146:                                   "  const int N_q    = %d;                           // The number of quadrature points\n"
147:                                   "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
148:                                   "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
149:                                   "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
150:                                   "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
151:                                   "  const int N_sqc  = N_bst / N_bt;\n"
152:                                   "  /*const int N_c    = N_cb * N_bc;*/\n"
153:                                   "\n"
154:                                   "  /* Calculated indices */\n"
155:                                   "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
156:                                   "  const int tidx    = get_local_id(0);\n"
157:                                   "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
158:                                   "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
159:                                   "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
160:                                   "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
161:                                   "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
162:                                   "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
163:                                   "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
164:                                   "  const int Goffset = gidx*N_cb*N_bc;\n",
165:                                   &count, dim, N_bl, N_b, N_c, N_q));
166:   /* Local memory */
167:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
168:                                   "\n"
169:                                   "  /* Quadrature data */\n"
170:                                   "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
171:                                   "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
172:                                   "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
173:                                   "  /* Geometric data */\n"
174:                                   "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
175:                                   "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
176:                                   &count, numeric_str, numeric_str, N_b * N_c * N_q, numeric_str, dim, N_b * N_c * N_q, numeric_str, N_t, numeric_str, N_t * dim * dim));
177:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
178:                                   "  /* FEM data */\n"
179:                                   "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
180:                                   &count, numeric_str, N_t * N_b * N_c));
181:   if (useAux) {
182:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n", &count, numeric_str, N_t);
183:   }
184:   if (useF0) {
185:     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
186:                                     "  /* Intermediate calculations */\n"
187:                                     "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
188:                                     &count, numeric_str, N_t * N_q));
189:   }
190:   if (useF1) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n", &count, numeric_str, dim, N_t * N_q);
191:   /* TODO: If using elasticity, put in mu/lambda coefficients */
192:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
193:                                   "  /* Output data */\n"
194:                                   "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
195:                                   &count, numeric_str));
196:   /* One-time loads */
197:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
198:                                   "  /* These should be generated inline */\n"
199:                                   "  /* Load quadrature weights */\n"
200:                                   "  w = weights[qidx];\n"
201:                                   "  /* Load basis tabulation \\phi_i for this cell */\n"
202:                                   "  if (tidx < N_bt*N_q) {\n"
203:                                   "    phi_i[tidx]    = Basis[tidx];\n"
204:                                   "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
205:                                   "  }\n\n",
206:                                   &count));
207:   /* Batch loads */
208:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
209:                                   "  for (int batch = 0; batch < N_cb; ++batch) {\n"
210:                                   "    /* Load geometry */\n"
211:                                   "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
212:                                   "    for (int n = 0; n < dim*dim; ++n) {\n"
213:                                   "      const int offset = n*N_t;\n"
214:                                   "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
215:                                   "    }\n"
216:                                   "    /* Load coefficients u_i for this cell */\n"
217:                                   "    for (int n = 0; n < N_bt; ++n) {\n"
218:                                   "      const int offset = n*N_t;\n"
219:                                   "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
220:                                   "    }\n",
221:                                   &count));
222:   if (useAux) {
223:     PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
224:                                     "    /* Load coefficients a_i for this cell */\n"
225:                                     "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
226:                                     "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
227:                                     &count));
228:   }
229:   /* Quadrature phase */
230:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
231:                                   "    barrier(CLK_LOCAL_MEM_FENCE);\n"
232:                                   "\n"
233:                                   "    /* Map coefficients to values at quadrature points */\n"
234:                                   "    for (int c = 0; c < N_sqc; ++c) {\n"
235:                                   "      const int cell          = c*N_bl*N_b + blqidx;\n"
236:                                   "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
237:                                   &count));
238:   if (useField) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n", &count, numeric_str, N_c);
239:   if (useFieldDer) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n", &count, numeric_str, dim, N_c);
240:   if (useFieldAux) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n", &count, numeric_str, 1);
241:   if (useFieldDerAux) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n", &count, numeric_str, dim, 1);
242:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
243:                                   "\n"
244:                                   "      for (int comp = 0; comp < N_comp; ++comp) {\n",
245:                                   &count));
246:   if (useField) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count);
247:   if (useFieldDer) {
248:     switch (dim) {
249:     case 1:
250:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count);
251:       break;
252:     case 2:
253:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);
254:       break;
255:     case 3:
256:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);
257:       break;
258:     }
259:   }
260:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      }\n", &count);
261:   if (useFieldAux) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count);
262:   if (useFieldDerAux) {
263:     switch (dim) {
264:     case 1:
265:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count);
266:       break;
267:     case 2:
268:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);
269:       break;
270:     case 3:
271:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);
272:       break;
273:     }
274:   }
275:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
276:                                   "      /* Get field and derivatives at this quadrature point */\n"
277:                                   "      for (int i = 0; i < N_b; ++i) {\n"
278:                                   "        for (int comp = 0; comp < N_comp; ++comp) {\n"
279:                                   "          const int b    = i*N_comp+comp;\n"
280:                                   "          const int pidx = qidx*N_bt + b;\n"
281:                                   "          const int uidx = cell*N_bt + b;\n"
282:                                   "          %s%d   realSpaceDer;\n\n",
283:                                   &count, numeric_str, dim));
284:   if (useField) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);
285:   if (useFieldDer) {
286:     switch (dim) {
287:     case 2:
288:       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
289:                                       "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
290:                                       "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
291:                                       "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
292:                                       "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
293:                                       &count));
294:       break;
295:     case 3:
296:       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
297:                                       "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
298:                                       "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
299:                                       "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
300:                                       "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
301:                                       "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
302:                                       "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
303:                                       &count));
304:       break;
305:     }
306:   }
307:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
308:                                   "        }\n"
309:                                   "      }\n",
310:                                   &count));
311:   if (useFieldAux) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "          a[0] += a_i[cell];\n", &count);
312:   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
313:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      /* Process values at quadrature points */\n", &count);
314:   switch (op) {
315:   case LAPLACIAN:
316:     if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);
317:     if (useF1) {
318:       if (useAux) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count);
319:       else PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count);
320:     }
321:     break;
322:   case ELASTICITY:
323:     if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);
324:     if (useF1) {
325:       switch (dim) {
326:       case 2:
327:         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
328:                                         "      switch (cidx) {\n"
329:                                         "      case 0:\n"
330:                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
331:                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
332:                                         "        break;\n"
333:                                         "      case 1:\n"
334:                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
335:                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
336:                                         "      }\n",
337:                                         &count));
338:         break;
339:       case 3:
340:         PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
341:                                         "      switch (cidx) {\n"
342:                                         "      case 0:\n"
343:                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
344:                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
345:                                         "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
346:                                         "        break;\n"
347:                                         "      case 1:\n"
348:                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
349:                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
350:                                         "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
351:                                         "        break;\n"
352:                                         "      case 2:\n"
353:                                         "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
354:                                         "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
355:                                         "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
356:                                         "      }\n",
357:                                         &count));
358:         break;
359:       }
360:     }
361:     break;
362:   default:
363:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
364:   }
365:   if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] *= detJ[cell]*w;\n", &count);
366:   if (useF1) {
367:     switch (dim) {
368:     case 1:
369:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w;\n", &count);
370:       break;
371:     case 2:
372:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);
373:       break;
374:     case 3:
375:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);
376:       break;
377:     }
378:   }
379:   /* Thread transpose */
380:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
381:                                   "    }\n\n"
382:                                   "    /* ==== TRANSPOSE THREADS ==== */\n"
383:                                   "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
384:                                   &count));
385:   /* Basis phase */
386:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
387:                                   "    /* Map values at quadrature points to coefficients */\n"
388:                                   "    for (int c = 0; c < N_sbc; ++c) {\n"
389:                                   "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
390:                                   "\n"
391:                                   "      e_i = 0.0;\n"
392:                                   "      for (int q = 0; q < N_q; ++q) {\n"
393:                                   "        const int pidx = q*N_bt + bidx;\n"
394:                                   "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
395:                                   "        %s%d   realSpaceDer;\n\n",
396:                                   &count, numeric_str, dim));

398:   if (useF0) PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        e_i += phi_i[pidx]*f_0[fidx];\n", &count);
399:   if (useF1) {
400:     switch (dim) {
401:     case 2:
402:       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
403:                                       "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
404:                                       "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
405:                                       "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
406:                                       "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
407:                                       &count));
408:       break;
409:     case 3:
410:       PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
411:                                       "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
412:                                       "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
413:                                       "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
414:                                       "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
415:                                       "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
416:                                       "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
417:                                       &count));
418:       break;
419:     }
420:   }
421:   PetscCallSTR(PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
422:                                   "      }\n"
423:                                   "      /* Write element vector for N_{cbc} cells at a time */\n"
424:                                   "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
425:                                   "    }\n"
426:                                   "    /* ==== Could do one write per batch ==== */\n"
427:                                   "  }\n"
428:                                   "  return;\n"
429:                                   "}\n",
430:                                   &count));
431:   return 0;
432: }

434: static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
435: {
436:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;
437:   PetscInt        dim, N_bl;
438:   PetscBool       flg;
439:   char           *buffer;
440:   size_t          len;
441:   char            errMsg[8192];
442:   cl_int          err;

444:   PetscFEGetSpatialDimension(fem, &dim);
445:   PetscMalloc1(8192, &buffer);
446:   PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);
447:   PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);
448:   PetscOptionsHasName(((PetscObject)fem)->options, ((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);
449:   if (flg) PetscPrintf(PetscObjectComm((PetscObject)fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);
450:   PetscStrlen(buffer, &len);
451:   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **)&buffer, &len, &err);
452:   err;
453:   err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
454:   if (err != CL_SUCCESS) {
455:     err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192 * sizeof(char), &errMsg, NULL);
456:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
457:   }
458:   PetscFree(buffer);
459:   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &err);
460:   return 0;
461: }

463: static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
464: {
465:   const PetscInt Nblocks = N / blockSize;

468:   *z = 1;
469:   *y = 1;
470:   for (*x = (size_t)(PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
471:     *y = Nblocks / *x;
472:     if (*x * *y == (size_t)Nblocks) break;
473:   }
475:   return 0;
476: }

478: static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
479: {
480:   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *)fem->data;
481:   PetscStageLog     stageLog;
482:   PetscEventPerfLog eventLog = NULL;
483:   int               stage;

485:   PetscLogGetStageLog(&stageLog);
486:   PetscStageLogGetCurrent(stageLog, &stage);
487:   PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
488:   /* Log performance info */
489:   eventLog->eventInfo[ocl->residualEvent].count++;
490:   eventLog->eventInfo[ocl->residualEvent].time += time;
491:   eventLog->eventInfo[ocl->residualEvent].flops += flops;
492:   return 0;
493: }

495: static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom, const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
496: {
497:   /* Nbc = batchSize */
498:   PetscFE         fem;
499:   PetscFE_OpenCL *ocl;
500:   PetscPointFunc  f0_func;
501:   PetscPointFunc  f1_func;
502:   PetscQuadrature q;
503:   PetscInt        dim, qNc;
504:   PetscInt        N_b;    /* The number of basis functions */
505:   PetscInt        N_comp; /* The number of basis function components */
506:   PetscInt        N_bt;   /* The total number of scalar basis functions */
507:   PetscInt        N_q;    /* The number of quadrature points */
508:   PetscInt        N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
509:   PetscInt        N_t;    /* The number of threads, N_bst * N_bl */
510:   PetscInt        N_bl;   /* The number of blocks */
511:   PetscInt        N_bc;   /* The batch size, N_bl*N_q*N_b */
512:   PetscInt        N_cb;   /* The number of batches */
513:   const PetscInt  field = key.field;
514:   PetscInt        numFlops, f0Flops = 0, f1Flops = 0;
515:   PetscBool       useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
516:   PetscBool       useField    = PETSC_FALSE;
517:   PetscBool       useFieldDer = PETSC_TRUE;
518:   PetscBool       useF0       = PETSC_TRUE;
519:   PetscBool       useF1       = PETSC_TRUE;
520:   /* OpenCL variables */
521:   cl_program       ocl_prog;
522:   cl_kernel        ocl_kernel;
523:   cl_event         ocl_ev;   /* The event for tracking kernel execution */
524:   cl_ulong         ns_start; /* Nanoseconds counter on GPU at kernel start */
525:   cl_ulong         ns_end;   /* Nanoseconds counter on GPU at kernel stop */
526:   cl_mem           o_jacobianInverses, o_jacobianDeterminants;
527:   cl_mem           o_coefficients, o_coefficientsAux, o_elemVec;
528:   float           *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
529:   double          *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
530:   PetscReal       *r_invJ = NULL, *r_detJ = NULL;
531:   void            *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
532:   size_t           local_work_size[3], global_work_size[3];
533:   size_t           realSize, x, y, z;
534:   const PetscReal *points, *weights;
535:   int              err;

537:   PetscDSGetDiscretization(prob, field, (PetscObject *)&fem);
538:   ocl = (PetscFE_OpenCL *)fem->data;
539:   if (!Ne) {
540:     PetscFEOpenCLLogResidual(fem, 0.0, 0.0);
541:     return 0;
542:   }
543:   PetscFEGetSpatialDimension(fem, &dim);
544:   PetscFEGetQuadrature(fem, &q);
545:   PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
547:   PetscFEGetDimension(fem, &N_b);
548:   PetscFEGetNumComponents(fem, &N_comp);
549:   PetscDSGetResidual(prob, field, &f0_func, &f1_func);
550:   PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);
551:   N_bt  = N_b * N_comp;
552:   N_bst = N_bt * N_q;
553:   N_t   = N_bst * N_bl;
555:   /* Calculate layout */
556:   if (Ne % (N_cb * N_bc)) { /* Remainder cells */
557:     PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);
558:     return 0;
559:   }
560:   PetscFEOpenCLCalculateGrid(fem, Ne, N_cb * N_bc, &x, &y, &z);
561:   local_work_size[0]  = N_bc * N_comp;
562:   local_work_size[1]  = 1;
563:   local_work_size[2]  = 1;
564:   global_work_size[0] = x * local_work_size[0];
565:   global_work_size[1] = y * local_work_size[1];
566:   global_work_size[2] = z * local_work_size[2];
567:   PetscInfo(fem, "GPU layout grid(%zu,%zu,%zu) block(%zu,%zu,%zu) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);
568:   PetscInfo(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
569:   /* Generate code */
570:   if (probAux) {
571:     PetscSpace P;
572:     PetscInt   NfAux, order, f;

574:     PetscDSGetNumFields(probAux, &NfAux);
575:     for (f = 0; f < NfAux; ++f) {
576:       PetscFE feAux;

578:       PetscDSGetDiscretization(probAux, f, (PetscObject *)&feAux);
579:       PetscFEGetBasisSpace(feAux, &P);
580:       PetscSpaceGetDegree(P, &order, NULL);
582:     }
583:   }
584:   PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);
585:   /* Create buffers on the device and send data over */
586:   PetscDataTypeGetSize(ocl->realType, &realSize);
588:   if (sizeof(PetscReal) != realSize) {
589:     switch (ocl->realType) {
590:     case PETSC_FLOAT: {
591:       PetscInt c, b, d;

593:       PetscMalloc4(Ne * N_bt, &f_coeff, Ne, &f_coeffAux, Ne * dim * dim, &f_invJ, Ne, &f_detJ);
594:       for (c = 0; c < Ne; ++c) {
595:         f_detJ[c] = (float)cgeom->detJ[c];
596:         for (d = 0; d < dim * dim; ++d) f_invJ[c * dim * dim + d] = (float)cgeom->invJ[c * dim * dim + d];
597:         for (b = 0; b < N_bt; ++b) f_coeff[c * N_bt + b] = (float)coefficients[c * N_bt + b];
598:       }
599:       if (coefficientsAux) { /* Assume P0 */
600:         for (c = 0; c < Ne; ++c) f_coeffAux[c] = (float)coefficientsAux[c];
601:       }
602:       oclCoeff = (void *)f_coeff;
603:       if (coefficientsAux) {
604:         oclCoeffAux = (void *)f_coeffAux;
605:       } else {
606:         oclCoeffAux = NULL;
607:       }
608:       oclInvJ = (void *)f_invJ;
609:       oclDetJ = (void *)f_detJ;
610:     } break;
611:     case PETSC_DOUBLE: {
612:       PetscInt c, b, d;

614:       PetscMalloc4(Ne * N_bt, &d_coeff, Ne, &d_coeffAux, Ne * dim * dim, &d_invJ, Ne, &d_detJ);
615:       for (c = 0; c < Ne; ++c) {
616:         d_detJ[c] = (double)cgeom->detJ[c];
617:         for (d = 0; d < dim * dim; ++d) d_invJ[c * dim * dim + d] = (double)cgeom->invJ[c * dim * dim + d];
618:         for (b = 0; b < N_bt; ++b) d_coeff[c * N_bt + b] = (double)coefficients[c * N_bt + b];
619:       }
620:       if (coefficientsAux) { /* Assume P0 */
621:         for (c = 0; c < Ne; ++c) d_coeffAux[c] = (double)coefficientsAux[c];
622:       }
623:       oclCoeff = (void *)d_coeff;
624:       if (coefficientsAux) {
625:         oclCoeffAux = (void *)d_coeffAux;
626:       } else {
627:         oclCoeffAux = NULL;
628:       }
629:       oclInvJ = (void *)d_invJ;
630:       oclDetJ = (void *)d_detJ;
631:     } break;
632:     default:
633:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
634:     }
635:   } else {
636:     PetscInt c, d;

638:     PetscMalloc2(Ne * dim * dim, &r_invJ, Ne, &r_detJ);
639:     for (c = 0; c < Ne; ++c) {
640:       r_detJ[c] = cgeom->detJ[c];
641:       for (d = 0; d < dim * dim; ++d) r_invJ[c * dim * dim + d] = cgeom->invJ[c * dim * dim + d];
642:     }
643:     oclCoeff    = (void *)coefficients;
644:     oclCoeffAux = (void *)coefficientsAux;
645:     oclInvJ     = (void *)r_invJ;
646:     oclDetJ     = (void *)r_detJ;
647:   }
648:   o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * N_bt * realSize, oclCoeff, &err);
649:   if (coefficientsAux) {
650:     o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &err);
651:   } else {
652:     o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &err);
653:   }
654:   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * dim * dim * realSize, oclInvJ, &err);
655:   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &err);
656:   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne * N_bt * realSize, NULL, &err);
657:   /* Kernel launch */
658:   clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void *)&N_cb);
659:   clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&o_coefficients);
660:   clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&o_coefficientsAux);
661:   clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void *)&o_jacobianInverses);
662:   clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void *)&o_jacobianDeterminants);
663:   clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void *)&o_elemVec);
664:   clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);
665:   /* Read data back from device */
666:   if (sizeof(PetscReal) != realSize) {
667:     switch (ocl->realType) {
668:     case PETSC_FLOAT: {
669:       float   *elem;
670:       PetscInt c, b;

672:       PetscFree4(f_coeff, f_coeffAux, f_invJ, f_detJ);
673:       PetscMalloc1(Ne * N_bt, &elem);
674:       clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL);
675:       for (c = 0; c < Ne; ++c) {
676:         for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b];
677:       }
678:       PetscFree(elem);
679:     } break;
680:     case PETSC_DOUBLE: {
681:       double  *elem;
682:       PetscInt c, b;

684:       PetscFree4(d_coeff, d_coeffAux, d_invJ, d_detJ);
685:       PetscMalloc1(Ne * N_bt, &elem);
686:       clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elem, 0, NULL, NULL);
687:       for (c = 0; c < Ne; ++c) {
688:         for (b = 0; b < N_bt; ++b) elemVec[c * N_bt + b] = (PetscScalar)elem[c * N_bt + b];
689:       }
690:       PetscFree(elem);
691:     } break;
692:     default:
693:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
694:     }
695:   } else {
696:     PetscFree2(r_invJ, r_detJ);
697:     clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne * N_bt * realSize, elemVec, 0, NULL, NULL);
698:   }
699:   /* Log performance */
700:   clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
701:   clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);
702:   f0Flops = 0;
703:   switch (ocl->op) {
704:   case LAPLACIAN:
705:     f1Flops = useAux ? dim : 0;
706:     break;
707:   case ELASTICITY:
708:     f1Flops = 2 * dim * dim;
709:     break;
710:   }
711:   numFlops = Ne * (N_q * (N_b * N_comp * ((useField ? 2 : 0) + (useFieldDer ? 2 * dim * (dim + 1) : 0))
712:                           /*+
713:        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
714:                           + N_comp * ((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2 * dim : 0))) +
715:                    N_b * ((useF0 ? 2 : 0) + (useF1 ? 2 * dim * (dim + 1) : 0)));
716:   PetscFEOpenCLLogResidual(fem, (ns_end - ns_start) * 1.0e-9, numFlops);
717:   /* Cleanup */
718:   clReleaseMemObject(o_coefficients);
719:   clReleaseMemObject(o_coefficientsAux);
720:   clReleaseMemObject(o_jacobianInverses);
721:   clReleaseMemObject(o_jacobianDeterminants);
722:   clReleaseMemObject(o_elemVec);
723:   clReleaseKernel(ocl_kernel);
724:   clReleaseProgram(ocl_prog);
725:   return 0;
726: }

728: PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
729: PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal[], PetscInt, PetscTabulation);

731: static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
732: {
733:   fem->ops->setfromoptions          = NULL;
734:   fem->ops->setup                   = PetscFESetUp_Basic;
735:   fem->ops->view                    = NULL;
736:   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
737:   fem->ops->getdimension            = PetscFEGetDimension_Basic;
738:   fem->ops->createtabulation        = PetscFECreateTabulation_Basic;
739:   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
740:   fem->ops->integratebdresidual     = NULL /* PetscFEIntegrateBdResidual_OpenCL */;
741:   fem->ops->integratejacobianaction = NULL /* PetscFEIntegrateJacobianAction_OpenCL */;
742:   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
743:   return 0;
744: }

746: /*MC
747:   PETSCFEOPENCL = "opencl" - A `PetscFEType` that integrates using a vectorized OpenCL implementation

749:   Level: intermediate

751: .seealso: `PetscFEType`, `PetscFECreate()`, `PetscFESetType()`
752: M*/

754: PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
755: {
756:   PetscFE_OpenCL *ocl;
757:   cl_uint         num_platforms;
758:   cl_platform_id  platform_ids[42];
759:   cl_uint         num_devices;
760:   cl_device_id    device_ids[42];
761:   cl_int          err;

764:   PetscNew(&ocl);
765:   fem->data = ocl;

767:   /* Init Platform */
768:   clGetPlatformIDs(42, platform_ids, &num_platforms);
770:   ocl->pf_id = platform_ids[0];
771:   /* Init Device */
772:   clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);
774:   ocl->dev_id = device_ids[0];
775:   /* Create context with one command queue */
776:   ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);
777:   err;
778:   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);
779:   err;
780:   /* Types */
781:   ocl->realType = PETSC_FLOAT;
782:   /* Register events */
783:   PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);
784:   /* Equation handling */
785:   ocl->op = LAPLACIAN;

787:   PetscFEInitialize_OpenCL(fem);
788:   return 0;
789: }

791: /*@
792:   PetscFEOpenCLSetRealType - Set the scalar type for running on the OpenCL accelerator

794:   Input Parameters:
795: + fem      - The `PetscFE`
796: - realType - The scalar type

798:   Level: developer

800: .seealso: `PetscFE`, `PetscFEOpenCLGetRealType()`
801: @*/
802: PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
803: {
804:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;

807:   ocl->realType = realType;
808:   return 0;
809: }

811: /*@
812:   PetscFEOpenCLGetRealType - Get the scalar type for running on the OpenCL accelerator

814:   Input Parameter:
815: . fem - The `PetscFE`

817:   Output Parameter:
818: . realType - The scalar type

820:   Level: developer

822: .seealso: `PetscFE`, `PetscFEOpenCLSetRealType()`
823: @*/
824: PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
825: {
826:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *)fem->data;

830:   *realType = ocl->realType;
831:   return 0;
832: }

834: #endif /* PETSC_HAVE_OPENCL */