Actual source code: bddcschurs.c

  1: #include <petsc/private/pcbddcimpl.h>
  2: #include <petsc/private/pcbddcprivateimpl.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <petscblaslapack.h>

  6: static inline PetscErrorCode PCBDDCAdjGetNextLayer_Private(PetscInt *, PetscInt, PetscBT, PetscInt *, PetscInt *, PetscInt *);
  7: static PetscErrorCode        PCBDDCComputeExplicitSchur(Mat, PetscBool, MatReuse, Mat *);
  8: static PetscErrorCode        PCBDDCReuseSolvers_Interior(PC, Vec, Vec);
  9: static PetscErrorCode        PCBDDCReuseSolvers_Correction(PC, Vec, Vec);

 11: /* if v2 is not present, correction is done in-place */
 12: PetscErrorCode PCBDDCReuseSolversBenignAdapt(PCBDDCReuseSolvers ctx, Vec v, Vec v2, PetscBool sol, PetscBool full)
 13: {
 14:   PetscScalar *array;
 15:   PetscScalar *array2;

 17:   if (!ctx->benign_n) return 0;
 18:   if (sol && full) {
 19:     PetscInt n_I, size_schur;

 21:     /* get sizes */
 22:     MatGetSize(ctx->benign_csAIB, &size_schur, NULL);
 23:     VecGetSize(v, &n_I);
 24:     n_I = n_I - size_schur;
 25:     /* get schur sol from array */
 26:     VecGetArray(v, &array);
 27:     VecPlaceArray(ctx->benign_dummy_schur_vec, array + n_I);
 28:     VecRestoreArray(v, &array);
 29:     /* apply interior sol correction */
 30:     MatMultTranspose(ctx->benign_csAIB, ctx->benign_dummy_schur_vec, ctx->benign_corr_work);
 31:     VecResetArray(ctx->benign_dummy_schur_vec);
 32:     MatMultAdd(ctx->benign_AIIm1ones, ctx->benign_corr_work, v, v);
 33:   }
 34:   if (v2) {
 35:     PetscInt nl;

 37:     VecGetArrayRead(v, (const PetscScalar **)&array);
 38:     VecGetLocalSize(v2, &nl);
 39:     VecGetArray(v2, &array2);
 40:     PetscArraycpy(array2, array, nl);
 41:   } else {
 42:     VecGetArray(v, &array);
 43:     array2 = array;
 44:   }
 45:   if (!sol) { /* change rhs */
 46:     PetscInt n;
 47:     for (n = 0; n < ctx->benign_n; n++) {
 48:       PetscScalar     sum = 0.;
 49:       const PetscInt *cols;
 50:       PetscInt        nz, i;

 52:       ISGetLocalSize(ctx->benign_zerodiag_subs[n], &nz);
 53:       ISGetIndices(ctx->benign_zerodiag_subs[n], &cols);
 54:       for (i = 0; i < nz - 1; i++) sum += array[cols[i]];
 55: #if defined(PETSC_USE_COMPLEX)
 56:       sum = -(PetscRealPart(sum) / nz + PETSC_i * (PetscImaginaryPart(sum) / nz));
 57: #else
 58:       sum = -sum / nz;
 59: #endif
 60:       for (i = 0; i < nz - 1; i++) array2[cols[i]] += sum;
 61:       ctx->benign_save_vals[n] = array2[cols[nz - 1]];
 62:       array2[cols[nz - 1]]     = sum;
 63:       ISRestoreIndices(ctx->benign_zerodiag_subs[n], &cols);
 64:     }
 65:   } else {
 66:     PetscInt n;
 67:     for (n = 0; n < ctx->benign_n; n++) {
 68:       PetscScalar     sum = 0.;
 69:       const PetscInt *cols;
 70:       PetscInt        nz, i;
 71:       ISGetLocalSize(ctx->benign_zerodiag_subs[n], &nz);
 72:       ISGetIndices(ctx->benign_zerodiag_subs[n], &cols);
 73:       for (i = 0; i < nz - 1; i++) sum += array[cols[i]];
 74: #if defined(PETSC_USE_COMPLEX)
 75:       sum = -(PetscRealPart(sum) / nz + PETSC_i * (PetscImaginaryPart(sum) / nz));
 76: #else
 77:       sum = -sum / nz;
 78: #endif
 79:       for (i = 0; i < nz - 1; i++) array2[cols[i]] += sum;
 80:       array2[cols[nz - 1]] = ctx->benign_save_vals[n];
 81:       ISRestoreIndices(ctx->benign_zerodiag_subs[n], &cols);
 82:     }
 83:   }
 84:   if (v2) {
 85:     VecRestoreArrayRead(v, (const PetscScalar **)&array);
 86:     VecRestoreArray(v2, &array2);
 87:   } else {
 88:     VecRestoreArray(v, &array);
 89:   }
 90:   if (!sol && full) {
 91:     Vec      usedv;
 92:     PetscInt n_I, size_schur;

 94:     /* get sizes */
 95:     MatGetSize(ctx->benign_csAIB, &size_schur, NULL);
 96:     VecGetSize(v, &n_I);
 97:     n_I = n_I - size_schur;
 98:     /* compute schur rhs correction */
 99:     if (v2) {
100:       usedv = v2;
101:     } else {
102:       usedv = v;
103:     }
104:     /* apply schur rhs correction */
105:     MatMultTranspose(ctx->benign_AIIm1ones, usedv, ctx->benign_corr_work);
106:     VecGetArrayRead(usedv, (const PetscScalar **)&array);
107:     VecPlaceArray(ctx->benign_dummy_schur_vec, array + n_I);
108:     VecRestoreArrayRead(usedv, (const PetscScalar **)&array);
109:     MatMultAdd(ctx->benign_csAIB, ctx->benign_corr_work, ctx->benign_dummy_schur_vec, ctx->benign_dummy_schur_vec);
110:     VecResetArray(ctx->benign_dummy_schur_vec);
111:   }
112:   return 0;
113: }

115: static PetscErrorCode PCBDDCReuseSolvers_Solve_Private(PC pc, Vec rhs, Vec sol, PetscBool transpose, PetscBool full)
116: {
117:   PCBDDCReuseSolvers ctx;
118:   PetscBool          copy = PETSC_FALSE;

120:   PCShellGetContext(pc, &ctx);
121:   if (full) {
122: #if defined(PETSC_HAVE_MUMPS)
123:     MatMumpsSetIcntl(ctx->F, 26, -1);
124: #endif
125: #if defined(PETSC_HAVE_MKL_PARDISO)
126:     MatMkl_PardisoSetCntl(ctx->F, 70, 0);
127: #endif
128:     copy = ctx->has_vertices;
129:   } else { /* interior solver */
130: #if defined(PETSC_HAVE_MUMPS)
131:     MatMumpsSetIcntl(ctx->F, 26, 0);
132: #endif
133: #if defined(PETSC_HAVE_MKL_PARDISO)
134:     MatMkl_PardisoSetCntl(ctx->F, 70, 1);
135: #endif
136:     copy = PETSC_TRUE;
137:   }
138:   /* copy rhs into factored matrix workspace */
139:   if (copy) {
140:     PetscInt     n;
141:     PetscScalar *array, *array_solver;

143:     VecGetLocalSize(rhs, &n);
144:     VecGetArrayRead(rhs, (const PetscScalar **)&array);
145:     VecGetArray(ctx->rhs, &array_solver);
146:     PetscArraycpy(array_solver, array, n);
147:     VecRestoreArray(ctx->rhs, &array_solver);
148:     VecRestoreArrayRead(rhs, (const PetscScalar **)&array);

150:     PCBDDCReuseSolversBenignAdapt(ctx, ctx->rhs, NULL, PETSC_FALSE, full);
151:     if (transpose) {
152:       MatSolveTranspose(ctx->F, ctx->rhs, ctx->sol);
153:     } else {
154:       MatSolve(ctx->F, ctx->rhs, ctx->sol);
155:     }
156:     PCBDDCReuseSolversBenignAdapt(ctx, ctx->sol, NULL, PETSC_TRUE, full);

158:     /* get back data to caller worskpace */
159:     VecGetArrayRead(ctx->sol, (const PetscScalar **)&array_solver);
160:     VecGetArray(sol, &array);
161:     PetscArraycpy(array, array_solver, n);
162:     VecRestoreArray(sol, &array);
163:     VecRestoreArrayRead(ctx->sol, (const PetscScalar **)&array_solver);
164:   } else {
165:     if (ctx->benign_n) {
166:       PCBDDCReuseSolversBenignAdapt(ctx, rhs, ctx->rhs, PETSC_FALSE, full);
167:       if (transpose) {
168:         MatSolveTranspose(ctx->F, ctx->rhs, sol);
169:       } else {
170:         MatSolve(ctx->F, ctx->rhs, sol);
171:       }
172:       PCBDDCReuseSolversBenignAdapt(ctx, sol, NULL, PETSC_TRUE, full);
173:     } else {
174:       if (transpose) {
175:         MatSolveTranspose(ctx->F, rhs, sol);
176:       } else {
177:         MatSolve(ctx->F, rhs, sol);
178:       }
179:     }
180:   }
181:   /* restore defaults */
182: #if defined(PETSC_HAVE_MUMPS)
183:   MatMumpsSetIcntl(ctx->F, 26, -1);
184: #endif
185: #if defined(PETSC_HAVE_MKL_PARDISO)
186:   MatMkl_PardisoSetCntl(ctx->F, 70, 0);
187: #endif
188:   return 0;
189: }

191: static PetscErrorCode PCBDDCReuseSolvers_Correction(PC pc, Vec rhs, Vec sol)
192: {
193:   PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_FALSE, PETSC_TRUE);
194:   return 0;
195: }

197: static PetscErrorCode PCBDDCReuseSolvers_CorrectionTranspose(PC pc, Vec rhs, Vec sol)
198: {
199:   PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_TRUE, PETSC_TRUE);
200:   return 0;
201: }

203: static PetscErrorCode PCBDDCReuseSolvers_Interior(PC pc, Vec rhs, Vec sol)
204: {
205:   PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_FALSE, PETSC_FALSE);
206:   return 0;
207: }

209: static PetscErrorCode PCBDDCReuseSolvers_InteriorTranspose(PC pc, Vec rhs, Vec sol)
210: {
211:   PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_TRUE, PETSC_FALSE);
212:   return 0;
213: }

215: static PetscErrorCode PCBDDCReuseSolvers_View(PC pc, PetscViewer viewer)
216: {
217:   PCBDDCReuseSolvers ctx;
218:   PetscBool          iascii;

220:   PCShellGetContext(pc, &ctx);
221:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
222:   if (iascii) PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_INFO);
223:   MatView(ctx->F, viewer);
224:   if (iascii) PetscViewerPopFormat(viewer);
225:   return 0;
226: }

228: static PetscErrorCode PCBDDCReuseSolversReset(PCBDDCReuseSolvers reuse)
229: {
230:   PetscInt i;

232:   MatDestroy(&reuse->F);
233:   VecDestroy(&reuse->sol);
234:   VecDestroy(&reuse->rhs);
235:   PCDestroy(&reuse->interior_solver);
236:   PCDestroy(&reuse->correction_solver);
237:   ISDestroy(&reuse->is_R);
238:   ISDestroy(&reuse->is_B);
239:   VecScatterDestroy(&reuse->correction_scatter_B);
240:   VecDestroy(&reuse->sol_B);
241:   VecDestroy(&reuse->rhs_B);
242:   for (i = 0; i < reuse->benign_n; i++) ISDestroy(&reuse->benign_zerodiag_subs[i]);
243:   PetscFree(reuse->benign_zerodiag_subs);
244:   PetscFree(reuse->benign_save_vals);
245:   MatDestroy(&reuse->benign_csAIB);
246:   MatDestroy(&reuse->benign_AIIm1ones);
247:   VecDestroy(&reuse->benign_corr_work);
248:   VecDestroy(&reuse->benign_dummy_schur_vec);
249:   return 0;
250: }

252: static PetscErrorCode PCBDDCReuseSolvers_Destroy(PC pc)
253: {
254:   PCBDDCReuseSolvers ctx;

256:   PCShellGetContext(pc, &ctx);
257:   PCBDDCReuseSolversReset(ctx);
258:   PetscFree(ctx);
259:   PCShellSetContext(pc, NULL);
260:   return 0;
261: }

263: static PetscErrorCode PCBDDCComputeExplicitSchur(Mat M, PetscBool issym, MatReuse reuse, Mat *S)
264: {
265:   Mat         B, C, D, Bd, Cd, AinvBd;
266:   KSP         ksp;
267:   PC          pc;
268:   PetscBool   isLU, isILU, isCHOL, Bdense, Cdense;
269:   PetscReal   fill = 2.0;
270:   PetscInt    n_I;
271:   PetscMPIInt size;

273:   MPI_Comm_size(PetscObjectComm((PetscObject)M), &size);
275:   if (reuse == MAT_REUSE_MATRIX) {
276:     PetscBool Sdense;

278:     PetscObjectTypeCompare((PetscObject)*S, MATSEQDENSE, &Sdense);
280:   }
281:   MatSchurComplementGetSubMatrices(M, NULL, NULL, &B, &C, &D);
282:   MatSchurComplementGetKSP(M, &ksp);
283:   KSPGetPC(ksp, &pc);
284:   PetscObjectTypeCompare((PetscObject)pc, PCLU, &isLU);
285:   PetscObjectTypeCompare((PetscObject)pc, PCILU, &isILU);
286:   PetscObjectTypeCompare((PetscObject)pc, PCCHOLESKY, &isCHOL);
287:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &Bdense);
288:   PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &Cdense);
289:   MatGetSize(B, &n_I, NULL);
290:   if (n_I) {
291:     if (!Bdense) {
292:       MatConvert(B, MATSEQDENSE, MAT_INITIAL_MATRIX, &Bd);
293:     } else {
294:       Bd = B;
295:     }

297:     if (isLU || isILU || isCHOL) {
298:       Mat fact;
299:       KSPSetUp(ksp);
300:       PCFactorGetMatrix(pc, &fact);
301:       MatDuplicate(Bd, MAT_DO_NOT_COPY_VALUES, &AinvBd);
302:       MatMatSolve(fact, Bd, AinvBd);
303:     } else {
304:       PetscBool ex = PETSC_TRUE;

306:       if (ex) {
307:         Mat Ainvd;

309:         PCComputeOperator(pc, MATDENSE, &Ainvd);
310:         MatMatMult(Ainvd, Bd, MAT_INITIAL_MATRIX, fill, &AinvBd);
311:         MatDestroy(&Ainvd);
312:       } else {
313:         Vec          sol, rhs;
314:         PetscScalar *arrayrhs, *arraysol;
315:         PetscInt     i, nrhs, n;

317:         MatDuplicate(Bd, MAT_DO_NOT_COPY_VALUES, &AinvBd);
318:         MatGetSize(Bd, &n, &nrhs);
319:         MatDenseGetArray(Bd, &arrayrhs);
320:         MatDenseGetArray(AinvBd, &arraysol);
321:         KSPGetSolution(ksp, &sol);
322:         KSPGetRhs(ksp, &rhs);
323:         for (i = 0; i < nrhs; i++) {
324:           VecPlaceArray(rhs, arrayrhs + i * n);
325:           VecPlaceArray(sol, arraysol + i * n);
326:           KSPSolve(ksp, rhs, sol);
327:           VecResetArray(rhs);
328:           VecResetArray(sol);
329:         }
330:         MatDenseRestoreArray(Bd, &arrayrhs);
331:         MatDenseRestoreArray(AinvBd, &arrayrhs);
332:       }
333:     }
334:     if (!Bdense & !issym) MatDestroy(&Bd);

336:     if (!issym) {
337:       if (!Cdense) {
338:         MatConvert(C, MATSEQDENSE, MAT_INITIAL_MATRIX, &Cd);
339:       } else {
340:         Cd = C;
341:       }
342:       MatMatMult(Cd, AinvBd, reuse, fill, S);
343:       if (!Cdense) MatDestroy(&Cd);
344:     } else {
345:       MatTransposeMatMult(Bd, AinvBd, reuse, fill, S);
346:       if (!Bdense) MatDestroy(&Bd);
347:     }
348:     MatDestroy(&AinvBd);
349:   }

351:   if (D) {
352:     Mat       Dd;
353:     PetscBool Ddense;

355:     PetscObjectTypeCompare((PetscObject)D, MATSEQDENSE, &Ddense);
356:     if (!Ddense) {
357:       MatConvert(D, MATSEQDENSE, MAT_INITIAL_MATRIX, &Dd);
358:     } else {
359:       Dd = D;
360:     }
361:     if (n_I) {
362:       MatAYPX(*S, -1.0, Dd, SAME_NONZERO_PATTERN);
363:     } else {
364:       if (reuse == MAT_INITIAL_MATRIX) {
365:         MatDuplicate(Dd, MAT_COPY_VALUES, S);
366:       } else {
367:         MatCopy(Dd, *S, SAME_NONZERO_PATTERN);
368:       }
369:     }
370:     if (!Ddense) MatDestroy(&Dd);
371:   } else {
372:     MatScale(*S, -1.0);
373:   }
374:   return 0;
375: }

377: PetscErrorCode PCBDDCSubSchursSetUp(PCBDDCSubSchurs sub_schurs, Mat Ain, Mat Sin, PetscBool exact_schur, PetscInt xadj[], PetscInt adjncy[], PetscInt nlayers, Vec scaling, PetscBool compute_Stilda, PetscBool reuse_solvers, PetscBool benign_trick, PetscInt benign_n, PetscInt benign_p0_lidx[], IS benign_zerodiag_subs[], Mat change, IS change_primal)
378: {
379:   Mat          F, A_II, A_IB, A_BI, A_BB, AE_II;
380:   Mat          S_all;
381:   Vec          gstash, lstash;
382:   VecScatter   sstash;
383:   IS           is_I, is_I_layer;
384:   IS           all_subsets, all_subsets_mult, all_subsets_n;
385:   PetscScalar *stasharray, *Bwork;
386:   PetscInt    *nnz, *all_local_idx_N;
387:   PetscInt    *auxnum1, *auxnum2;
388:   PetscInt     i, subset_size, max_subset_size;
389:   PetscInt     n_B, extra, local_size, global_size;
390:   PetscInt     local_stash_size;
391:   PetscBLASInt B_N, B_ierr, B_lwork, *pivots;
392:   MPI_Comm     comm_n;
393:   PetscBool    deluxe   = PETSC_TRUE;
394:   PetscBool    use_potr = PETSC_FALSE, use_sytr = PETSC_FALSE;
395:   PetscViewer  matl_dbg_viewer = NULL;
396:   PetscBool    flg;

398:   MatDestroy(&sub_schurs->A);
399:   MatDestroy(&sub_schurs->S);
400:   if (Ain) {
401:     PetscObjectReference((PetscObject)Ain);
402:     sub_schurs->A = Ain;
403:   }

405:   PetscObjectReference((PetscObject)Sin);
406:   sub_schurs->S = Sin;
407:   if (sub_schurs->schur_explicit) sub_schurs->schur_explicit = (PetscBool)(!!sub_schurs->A);

409:   /* preliminary checks */

412:   if (benign_trick) sub_schurs->is_posdef = PETSC_FALSE;

414:   /* debug (MATLAB) */
415:   if (sub_schurs->debug) {
416:     PetscMPIInt size, rank;
417:     PetscInt    nr, *print_schurs_ranks, print_schurs = PETSC_FALSE;

419:     MPI_Comm_size(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &size);
420:     MPI_Comm_rank(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &rank);
421:     nr = size;
422:     PetscMalloc1(nr, &print_schurs_ranks);
423:     PetscOptionsBegin(PetscObjectComm((PetscObject)sub_schurs->l2gmap), sub_schurs->prefix, "BDDC sub_schurs options", "PC");
424:     PetscOptionsIntArray("-sub_schurs_debug_ranks", "Ranks to debug (all if the option is not used)", NULL, print_schurs_ranks, &nr, &flg);
425:     if (!flg) print_schurs = PETSC_TRUE;
426:     else {
427:       print_schurs = PETSC_FALSE;
428:       for (i = 0; i < nr; i++)
429:         if (print_schurs_ranks[i] == (PetscInt)rank) {
430:           print_schurs = PETSC_TRUE;
431:           break;
432:         }
433:     }
434:     PetscOptionsEnd();
435:     PetscFree(print_schurs_ranks);
436:     if (print_schurs) {
437:       char filename[256];

439:       PetscSNPrintf(filename, sizeof(filename), "sub_schurs_Schur_r%d.m", PetscGlobalRank);
440:       PetscViewerASCIIOpen(PETSC_COMM_SELF, filename, &matl_dbg_viewer);
441:       PetscViewerPushFormat(matl_dbg_viewer, PETSC_VIEWER_ASCII_MATLAB);
442:     }
443:   }

445:   /* restrict work on active processes */
446:   if (sub_schurs->restrict_comm) {
447:     PetscSubcomm subcomm;
448:     PetscMPIInt  color, rank;

450:     color = 0;
451:     if (!sub_schurs->n_subs) color = 1; /* this can happen if we are in a multilevel case or if the subdomain is disconnected */
452:     MPI_Comm_rank(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &rank);
453:     PetscSubcommCreate(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &subcomm);
454:     PetscSubcommSetNumber(subcomm, 2);
455:     PetscSubcommSetTypeGeneral(subcomm, color, rank);
456:     PetscCommDuplicate(PetscSubcommChild(subcomm), &comm_n, NULL);
457:     PetscSubcommDestroy(&subcomm);
458:     if (!sub_schurs->n_subs) {
459:       PetscCommDestroy(&comm_n);
460:       return 0;
461:     }
462:   } else {
463:     PetscCommDuplicate(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &comm_n, NULL);
464:   }

466:   /* get Schur complement matrices */
467:   if (!sub_schurs->schur_explicit) {
468:     Mat       tA_IB, tA_BI, tA_BB;
469:     PetscBool isseqsbaij;
470:     MatSchurComplementGetSubMatrices(sub_schurs->S, &A_II, NULL, &tA_IB, &tA_BI, &tA_BB);
471:     PetscObjectTypeCompare((PetscObject)tA_BB, MATSEQSBAIJ, &isseqsbaij);
472:     if (isseqsbaij) {
473:       MatConvert(tA_BB, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_BB);
474:       MatConvert(tA_IB, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_IB);
475:       MatConvert(tA_BI, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_BI);
476:     } else {
477:       PetscObjectReference((PetscObject)tA_BB);
478:       A_BB = tA_BB;
479:       PetscObjectReference((PetscObject)tA_IB);
480:       A_IB = tA_IB;
481:       PetscObjectReference((PetscObject)tA_BI);
482:       A_BI = tA_BI;
483:     }
484:   } else {
485:     A_II = NULL;
486:     A_IB = NULL;
487:     A_BI = NULL;
488:     A_BB = NULL;
489:   }
490:   S_all = NULL;

492:   /* determine interior problems */
493:   ISGetLocalSize(sub_schurs->is_I, &i);
494:   if (nlayers >= 0 && i) { /* Interior problems can be different from the original one */
495:     PetscBT         touched;
496:     const PetscInt *idx_B;
497:     PetscInt        n_I, n_B, n_local_dofs, n_prev_added, j, layer, *local_numbering;

500:     /* get sizes */
501:     ISGetLocalSize(sub_schurs->is_I, &n_I);
502:     ISGetLocalSize(sub_schurs->is_B, &n_B);

504:     PetscMalloc1(n_I + n_B, &local_numbering);
505:     PetscBTCreate(n_I + n_B, &touched);
506:     PetscBTMemzero(n_I + n_B, touched);

508:     /* all boundary dofs must be skipped when adding layers */
509:     ISGetIndices(sub_schurs->is_B, &idx_B);
510:     for (j = 0; j < n_B; j++) PetscBTSet(touched, idx_B[j]);
511:     PetscArraycpy(local_numbering, idx_B, n_B);
512:     ISRestoreIndices(sub_schurs->is_B, &idx_B);

514:     /* add prescribed number of layers of dofs */
515:     n_local_dofs = n_B;
516:     n_prev_added = n_B;
517:     for (layer = 0; layer < nlayers; layer++) {
518:       PetscInt n_added = 0;
519:       if (n_local_dofs == n_I + n_B) break;
521:       PCBDDCAdjGetNextLayer_Private(local_numbering + n_local_dofs, n_prev_added, touched, xadj, adjncy, &n_added);
522:       n_prev_added = n_added;
523:       n_local_dofs += n_added;
524:       if (!n_added) break;
525:     }
526:     PetscBTDestroy(&touched);

528:     /* IS for I layer dofs in original numbering */
529:     ISCreateGeneral(PetscObjectComm((PetscObject)sub_schurs->is_I), n_local_dofs - n_B, local_numbering + n_B, PETSC_COPY_VALUES, &is_I_layer);
530:     PetscFree(local_numbering);
531:     ISSort(is_I_layer);
532:     /* IS for I layer dofs in I numbering */
533:     if (!sub_schurs->schur_explicit) {
534:       ISLocalToGlobalMapping ItoNmap;
535:       ISLocalToGlobalMappingCreateIS(sub_schurs->is_I, &ItoNmap);
536:       ISGlobalToLocalMappingApplyIS(ItoNmap, IS_GTOLM_DROP, is_I_layer, &is_I);
537:       ISLocalToGlobalMappingDestroy(&ItoNmap);

539:       /* II block */
540:       MatCreateSubMatrix(A_II, is_I, is_I, MAT_INITIAL_MATRIX, &AE_II);
541:     }
542:   } else {
543:     PetscInt n_I;

545:     /* IS for I dofs in original numbering */
546:     PetscObjectReference((PetscObject)sub_schurs->is_I);
547:     is_I_layer = sub_schurs->is_I;

549:     /* IS for I dofs in I numbering (strided 1) */
550:     if (!sub_schurs->schur_explicit) {
551:       ISGetSize(sub_schurs->is_I, &n_I);
552:       ISCreateStride(PetscObjectComm((PetscObject)sub_schurs->is_I), n_I, 0, 1, &is_I);

554:       /* II block is the same */
555:       PetscObjectReference((PetscObject)A_II);
556:       AE_II = A_II;
557:     }
558:   }

560:   /* Get info on subset sizes and sum of all subsets sizes */
561:   max_subset_size = 0;
562:   local_size      = 0;
563:   for (i = 0; i < sub_schurs->n_subs; i++) {
564:     ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
565:     max_subset_size = PetscMax(subset_size, max_subset_size);
566:     local_size += subset_size;
567:   }

569:   /* Work arrays for local indices */
570:   extra = 0;
571:   ISGetLocalSize(sub_schurs->is_B, &n_B);
572:   if (sub_schurs->schur_explicit && is_I_layer) ISGetLocalSize(is_I_layer, &extra);
573:   PetscMalloc1(n_B + extra, &all_local_idx_N);
574:   if (extra) {
575:     const PetscInt *idxs;
576:     ISGetIndices(is_I_layer, &idxs);
577:     PetscArraycpy(all_local_idx_N, idxs, extra);
578:     ISRestoreIndices(is_I_layer, &idxs);
579:   }
580:   PetscMalloc1(sub_schurs->n_subs, &auxnum1);
581:   PetscMalloc1(sub_schurs->n_subs, &auxnum2);

583:   /* Get local indices in local numbering */
584:   local_size       = 0;
585:   local_stash_size = 0;
586:   for (i = 0; i < sub_schurs->n_subs; i++) {
587:     const PetscInt *idxs;

589:     ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
590:     ISGetIndices(sub_schurs->is_subs[i], &idxs);
591:     /* start (smallest in global ordering) and multiplicity */
592:     auxnum1[i] = idxs[0];
593:     auxnum2[i] = subset_size * subset_size;
594:     /* subset indices in local numbering */
595:     PetscArraycpy(all_local_idx_N + local_size + extra, idxs, subset_size);
596:     ISRestoreIndices(sub_schurs->is_subs[i], &idxs);
597:     local_size += subset_size;
598:     local_stash_size += subset_size * subset_size;
599:   }

601:   /* allocate extra workspace needed only for GETRI or SYTRF */
602:   use_potr = use_sytr = PETSC_FALSE;
603:   if (benign_trick || (sub_schurs->is_hermitian && sub_schurs->is_posdef)) {
604:     use_potr = PETSC_TRUE;
605:   } else if (sub_schurs->is_symmetric) {
606:     use_sytr = PETSC_TRUE;
607:   }
608:   if (local_size && !use_potr) {
609:     PetscScalar  lwork, dummyscalar = 0.;
610:     PetscBLASInt dummyint = 0;

612:     B_lwork = -1;
613:     PetscBLASIntCast(local_size, &B_N);
614:     PetscFPTrapPush(PETSC_FP_TRAP_OFF);
615:     if (use_sytr) {
616:       PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, &dummyscalar, &B_N, &dummyint, &lwork, &B_lwork, &B_ierr));
618:     } else {
619:       PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, &dummyscalar, &B_N, &dummyint, &lwork, &B_lwork, &B_ierr));
621:     }
622:     PetscFPTrapPop();
623:     PetscBLASIntCast((PetscInt)PetscRealPart(lwork), &B_lwork);
624:     PetscMalloc2(B_lwork, &Bwork, B_N, &pivots);
625:   } else {
626:     Bwork  = NULL;
627:     pivots = NULL;
628:   }

630:   /* prepare data for summing up properly schurs on subsets */
631:   ISCreateGeneral(comm_n, sub_schurs->n_subs, auxnum1, PETSC_OWN_POINTER, &all_subsets_n);
632:   ISLocalToGlobalMappingApplyIS(sub_schurs->l2gmap, all_subsets_n, &all_subsets);
633:   ISDestroy(&all_subsets_n);
634:   ISCreateGeneral(comm_n, sub_schurs->n_subs, auxnum2, PETSC_OWN_POINTER, &all_subsets_mult);
635:   ISRenumber(all_subsets, all_subsets_mult, &global_size, &all_subsets_n);
636:   ISDestroy(&all_subsets);
637:   ISDestroy(&all_subsets_mult);
638:   ISGetLocalSize(all_subsets_n, &i);
640:   VecCreateSeqWithArray(PETSC_COMM_SELF, 1, local_stash_size, NULL, &lstash);
641:   VecCreateMPI(comm_n, PETSC_DECIDE, global_size, &gstash);
642:   VecScatterCreate(lstash, NULL, gstash, all_subsets_n, &sstash);
643:   ISDestroy(&all_subsets_n);

645:   /* subset indices in local boundary numbering */
646:   if (!sub_schurs->is_Ej_all) {
647:     PetscInt *all_local_idx_B;

649:     PetscMalloc1(local_size, &all_local_idx_B);
650:     ISGlobalToLocalMappingApply(sub_schurs->BtoNmap, IS_GTOLM_DROP, local_size, all_local_idx_N + extra, &subset_size, all_local_idx_B);
652:     ISCreateGeneral(PETSC_COMM_SELF, local_size, all_local_idx_B, PETSC_OWN_POINTER, &sub_schurs->is_Ej_all);
653:   }

655:   if (change) {
656:     ISLocalToGlobalMapping BtoS;
657:     IS                     change_primal_B;
658:     IS                     change_primal_all;

662:     PetscMalloc1(sub_schurs->n_subs, &sub_schurs->change_primal_sub);
663:     for (i = 0; i < sub_schurs->n_subs; i++) {
664:       ISLocalToGlobalMapping NtoS;
665:       ISLocalToGlobalMappingCreateIS(sub_schurs->is_subs[i], &NtoS);
666:       ISGlobalToLocalMappingApplyIS(NtoS, IS_GTOLM_DROP, change_primal, &sub_schurs->change_primal_sub[i]);
667:       ISLocalToGlobalMappingDestroy(&NtoS);
668:     }
669:     ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, change_primal, &change_primal_B);
670:     ISLocalToGlobalMappingCreateIS(sub_schurs->is_Ej_all, &BtoS);
671:     ISGlobalToLocalMappingApplyIS(BtoS, IS_GTOLM_DROP, change_primal_B, &change_primal_all);
672:     ISLocalToGlobalMappingDestroy(&BtoS);
673:     ISDestroy(&change_primal_B);
674:     PetscMalloc1(sub_schurs->n_subs, &sub_schurs->change);
675:     for (i = 0; i < sub_schurs->n_subs; i++) {
676:       Mat change_sub;

678:       ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
679:       KSPCreate(PETSC_COMM_SELF, &sub_schurs->change[i]);
680:       KSPSetType(sub_schurs->change[i], KSPPREONLY);
681:       if (!sub_schurs->change_with_qr) {
682:         MatCreateSubMatrix(change, sub_schurs->is_subs[i], sub_schurs->is_subs[i], MAT_INITIAL_MATRIX, &change_sub);
683:       } else {
684:         Mat change_subt;
685:         MatCreateSubMatrix(change, sub_schurs->is_subs[i], sub_schurs->is_subs[i], MAT_INITIAL_MATRIX, &change_subt);
686:         MatConvert(change_subt, MATSEQDENSE, MAT_INITIAL_MATRIX, &change_sub);
687:         MatDestroy(&change_subt);
688:       }
689:       KSPSetOperators(sub_schurs->change[i], change_sub, change_sub);
690:       MatDestroy(&change_sub);
691:       KSPSetOptionsPrefix(sub_schurs->change[i], sub_schurs->prefix);
692:       KSPAppendOptionsPrefix(sub_schurs->change[i], "sub_schurs_change_");
693:     }
694:     ISDestroy(&change_primal_all);
695:   }

697:   /* Local matrix of all local Schur on subsets (transposed) */
698:   if (!sub_schurs->S_Ej_all) {
699:     Mat          T;
700:     PetscScalar *v;
701:     PetscInt    *ii, *jj;
702:     PetscInt     cum, i, j, k;

704:     /* MatSeqAIJSetPreallocation + MatSetValues is slow for these kind of matrices (may have large blocks)
705:        Allocate properly a representative matrix and duplicate */
706:     PetscMalloc3(local_size + 1, &ii, local_stash_size, &jj, local_stash_size, &v);
707:     ii[0] = 0;
708:     cum   = 0;
709:     for (i = 0; i < sub_schurs->n_subs; i++) {
710:       ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
711:       for (j = 0; j < subset_size; j++) {
712:         const PetscInt row = cum + j;
713:         PetscInt       col = cum;

715:         ii[row + 1] = ii[row] + subset_size;
716:         for (k = ii[row]; k < ii[row + 1]; k++) {
717:           jj[k] = col;
718:           col++;
719:         }
720:       }
721:       cum += subset_size;
722:     }
723:     MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, local_size, local_size, ii, jj, v, &T);
724:     MatDuplicate(T, MAT_DO_NOT_COPY_VALUES, &sub_schurs->S_Ej_all);
725:     MatDestroy(&T);
726:     PetscFree3(ii, jj, v);
727:   }
728:   /* matrices for deluxe scaling and adaptive selection */
729:   if (compute_Stilda) {
730:     if (!sub_schurs->sum_S_Ej_tilda_all) MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_tilda_all);
731:     if (!sub_schurs->sum_S_Ej_inv_all && deluxe) MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_inv_all);
732:   }

734:   /* Compute Schur complements explicitly */
735:   F = NULL;
736:   if (!sub_schurs->schur_explicit) {
737:     /* this code branch is used when MatFactor with Schur complement support is not present or when explicitly requested;
738:        it is not efficient, unless the economic version of the scaling is used */
739:     Mat          S_Ej_expl;
740:     PetscScalar *work;
741:     PetscInt     j, *dummy_idx;
742:     PetscBool    Sdense;

744:     PetscMalloc2(max_subset_size, &dummy_idx, max_subset_size * max_subset_size, &work);
745:     local_size = 0;
746:     for (i = 0; i < sub_schurs->n_subs; i++) {
747:       IS  is_subset_B;
748:       Mat AE_EE, AE_IE, AE_EI, S_Ej;

750:       /* subsets in original and boundary numbering */
751:       ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, sub_schurs->is_subs[i], &is_subset_B);
752:       /* EE block */
753:       MatCreateSubMatrix(A_BB, is_subset_B, is_subset_B, MAT_INITIAL_MATRIX, &AE_EE);
754:       /* IE block */
755:       MatCreateSubMatrix(A_IB, is_I, is_subset_B, MAT_INITIAL_MATRIX, &AE_IE);
756:       /* EI block */
757:       if (sub_schurs->is_symmetric) {
758:         MatCreateTranspose(AE_IE, &AE_EI);
759:       } else if (sub_schurs->is_hermitian) {
760:         MatCreateHermitianTranspose(AE_IE, &AE_EI);
761:       } else {
762:         MatCreateSubMatrix(A_BI, is_subset_B, is_I, MAT_INITIAL_MATRIX, &AE_EI);
763:       }
764:       ISDestroy(&is_subset_B);
765:       MatCreateSchurComplement(AE_II, AE_II, AE_IE, AE_EI, AE_EE, &S_Ej);
766:       MatDestroy(&AE_EE);
767:       MatDestroy(&AE_IE);
768:       MatDestroy(&AE_EI);
769:       if (AE_II == A_II) { /* we can reuse the same ksp */
770:         KSP ksp;
771:         MatSchurComplementGetKSP(sub_schurs->S, &ksp);
772:         MatSchurComplementSetKSP(S_Ej, ksp);
773:       } else { /* build new ksp object which inherits ksp and pc types from the original one */
774:         KSP       origksp, schurksp;
775:         PC        origpc, schurpc;
776:         KSPType   ksp_type;
777:         PetscInt  n_internal;
778:         PetscBool ispcnone;

780:         MatSchurComplementGetKSP(sub_schurs->S, &origksp);
781:         MatSchurComplementGetKSP(S_Ej, &schurksp);
782:         KSPGetType(origksp, &ksp_type);
783:         KSPSetType(schurksp, ksp_type);
784:         KSPGetPC(schurksp, &schurpc);
785:         KSPGetPC(origksp, &origpc);
786:         PetscObjectTypeCompare((PetscObject)origpc, PCNONE, &ispcnone);
787:         if (!ispcnone) {
788:           PCType pc_type;
789:           PCGetType(origpc, &pc_type);
790:           PCSetType(schurpc, pc_type);
791:         } else {
792:           PCSetType(schurpc, PCLU);
793:         }
794:         ISGetSize(is_I, &n_internal);
795:         if (!n_internal) { /* UMFPACK gives error with 0 sized problems */
796:           MatSolverType solver = NULL;
797:           PCFactorGetMatSolverType(origpc, (MatSolverType *)&solver);
798:           if (solver) PCFactorSetMatSolverType(schurpc, solver);
799:         }
800:         KSPSetUp(schurksp);
801:       }
802:       ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
803:       MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &S_Ej_expl);
804:       PCBDDCComputeExplicitSchur(S_Ej, sub_schurs->is_symmetric, MAT_REUSE_MATRIX, &S_Ej_expl);
805:       PetscObjectTypeCompare((PetscObject)S_Ej_expl, MATSEQDENSE, &Sdense);
806:       if (Sdense) {
807:         for (j = 0; j < subset_size; j++) dummy_idx[j] = local_size + j;
808:         MatSetValues(sub_schurs->S_Ej_all, subset_size, dummy_idx, subset_size, dummy_idx, work, INSERT_VALUES);
809:       } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented for sparse matrices");
810:       MatDestroy(&S_Ej);
811:       MatDestroy(&S_Ej_expl);
812:       local_size += subset_size;
813:     }
814:     PetscFree2(dummy_idx, work);
815:     /* free */
816:     ISDestroy(&is_I);
817:     MatDestroy(&AE_II);
818:     PetscFree(all_local_idx_N);
819:   } else {
820:     Mat                A, cs_AIB_mat = NULL, benign_AIIm1_ones_mat = NULL;
821:     Mat               *gdswA;
822:     Vec                Dall = NULL;
823:     IS                 is_A_all, *is_p_r = NULL;
824:     MatType            Stype;
825:     PetscScalar       *work, *S_data, *schur_factor, infty = PETSC_MAX_REAL;
826:     PetscScalar       *SEj_arr = NULL, *SEjinv_arr = NULL;
827:     const PetscScalar *rS_data;
828:     PetscInt           n, n_I, size_schur, size_active_schur, cum, cum2;
829:     PetscBool          economic, solver_S, S_lower_triangular = PETSC_FALSE;
830:     PetscBool          schur_has_vertices, factor_workaround;
831:     PetscBool          use_cholesky;
832: #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
833:     PetscBool oldpin;
834: #endif

836:     /* get sizes */
837:     n_I = 0;
838:     if (is_I_layer) ISGetLocalSize(is_I_layer, &n_I);
839:     economic = PETSC_FALSE;
840:     ISGetLocalSize(sub_schurs->is_I, &cum);
841:     if (cum != n_I) economic = PETSC_TRUE;
842:     MatGetLocalSize(sub_schurs->A, &n, NULL);
843:     size_active_schur = local_size;

845:     /* import scaling vector (wrong formulation if we have 3D edges) */
846:     if (scaling && compute_Stilda) {
847:       const PetscScalar *array;
848:       PetscScalar       *array2;
849:       const PetscInt    *idxs;
850:       PetscInt           i;

852:       ISGetIndices(sub_schurs->is_Ej_all, &idxs);
853:       VecCreateSeq(PETSC_COMM_SELF, size_active_schur, &Dall);
854:       VecGetArrayRead(scaling, &array);
855:       VecGetArray(Dall, &array2);
856:       for (i = 0; i < size_active_schur; i++) array2[i] = array[idxs[i]];
857:       VecRestoreArray(Dall, &array2);
858:       VecRestoreArrayRead(scaling, &array);
859:       ISRestoreIndices(sub_schurs->is_Ej_all, &idxs);
860:       deluxe = PETSC_FALSE;
861:     }

863:     /* size active schurs does not count any dirichlet or vertex dof on the interface */
864:     factor_workaround  = PETSC_FALSE;
865:     schur_has_vertices = PETSC_FALSE;
866:     cum                = n_I + size_active_schur;
867:     if (sub_schurs->is_dir) {
868:       const PetscInt *idxs;
869:       PetscInt        n_dir;

871:       ISGetLocalSize(sub_schurs->is_dir, &n_dir);
872:       ISGetIndices(sub_schurs->is_dir, &idxs);
873:       PetscArraycpy(all_local_idx_N + cum, idxs, n_dir);
874:       ISRestoreIndices(sub_schurs->is_dir, &idxs);
875:       cum += n_dir;
876:       if (!sub_schurs->gdsw) factor_workaround = PETSC_TRUE;
877:     }
878:     /* include the primal vertices in the Schur complement */
879:     if (exact_schur && sub_schurs->is_vertices && (compute_Stilda || benign_n)) {
880:       PetscInt n_v;

882:       ISGetLocalSize(sub_schurs->is_vertices, &n_v);
883:       if (n_v) {
884:         const PetscInt *idxs;

886:         ISGetIndices(sub_schurs->is_vertices, &idxs);
887:         PetscArraycpy(all_local_idx_N + cum, idxs, n_v);
888:         ISRestoreIndices(sub_schurs->is_vertices, &idxs);
889:         cum += n_v;
890:         if (!sub_schurs->gdsw) factor_workaround = PETSC_TRUE;
891:         schur_has_vertices = PETSC_TRUE;
892:       }
893:     }
894:     size_schur = cum - n_I;
895:     ISCreateGeneral(PETSC_COMM_SELF, cum, all_local_idx_N, PETSC_OWN_POINTER, &is_A_all);
896: #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
897:     oldpin = sub_schurs->A->boundtocpu;
898:     MatBindToCPU(sub_schurs->A, PETSC_TRUE);
899: #endif
900:     if (cum == n) {
901:       ISSetPermutation(is_A_all);
902:       MatPermute(sub_schurs->A, is_A_all, is_A_all, &A);
903:     } else {
904:       MatCreateSubMatrix(sub_schurs->A, is_A_all, is_A_all, MAT_INITIAL_MATRIX, &A);
905:     }
906: #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
907:     MatBindToCPU(sub_schurs->A, oldpin);
908: #endif
909:     MatSetOptionsPrefixFactor(A, sub_schurs->prefix);
910:     MatAppendOptionsPrefixFactor(A, "sub_schurs_");

912:     /* if we actually change the basis for the pressures, LDL^T factors will use a lot of memory
913:        this is a workaround */
914:     if (benign_n) {
915:       Vec                    v, benign_AIIm1_ones;
916:       ISLocalToGlobalMapping N_to_reor;
917:       IS                     is_p0, is_p0_p;
918:       PetscScalar           *cs_AIB, *AIIm1_data;
919:       PetscInt               sizeA;

921:       ISLocalToGlobalMappingCreateIS(is_A_all, &N_to_reor);
922:       ISCreateGeneral(PETSC_COMM_SELF, benign_n, benign_p0_lidx, PETSC_COPY_VALUES, &is_p0);
923:       ISGlobalToLocalMappingApplyIS(N_to_reor, IS_GTOLM_DROP, is_p0, &is_p0_p);
924:       ISDestroy(&is_p0);
925:       MatCreateVecs(A, &v, &benign_AIIm1_ones);
926:       VecGetSize(v, &sizeA);
927:       MatCreateSeqDense(PETSC_COMM_SELF, sizeA, benign_n, NULL, &benign_AIIm1_ones_mat);
928:       MatCreateSeqDense(PETSC_COMM_SELF, size_schur, benign_n, NULL, &cs_AIB_mat);
929:       MatDenseGetArray(cs_AIB_mat, &cs_AIB);
930:       MatDenseGetArray(benign_AIIm1_ones_mat, &AIIm1_data);
931:       PetscMalloc1(benign_n, &is_p_r);
932:       /* compute colsum of A_IB restricted to pressures */
933:       for (i = 0; i < benign_n; i++) {
934:         const PetscScalar *array;
935:         const PetscInt    *idxs;
936:         PetscInt           j, nz;

938:         ISGlobalToLocalMappingApplyIS(N_to_reor, IS_GTOLM_DROP, benign_zerodiag_subs[i], &is_p_r[i]);
939:         ISGetLocalSize(is_p_r[i], &nz);
940:         ISGetIndices(is_p_r[i], &idxs);
941:         for (j = 0; j < nz; j++) AIIm1_data[idxs[j] + sizeA * i] = 1.;
942:         ISRestoreIndices(is_p_r[i], &idxs);
943:         VecPlaceArray(benign_AIIm1_ones, AIIm1_data + sizeA * i);
944:         MatMult(A, benign_AIIm1_ones, v);
945:         VecResetArray(benign_AIIm1_ones);
946:         VecGetArrayRead(v, &array);
947:         for (j = 0; j < size_schur; j++) {
948: #if defined(PETSC_USE_COMPLEX)
949:           cs_AIB[i * size_schur + j] = (PetscRealPart(array[j + n_I]) / nz + PETSC_i * (PetscImaginaryPart(array[j + n_I]) / nz));
950: #else
951:           cs_AIB[i * size_schur + j] = array[j + n_I] / nz;
952: #endif
953:         }
954:         VecRestoreArrayRead(v, &array);
955:       }
956:       MatDenseRestoreArray(cs_AIB_mat, &cs_AIB);
957:       MatDenseRestoreArray(benign_AIIm1_ones_mat, &AIIm1_data);
958:       VecDestroy(&v);
959:       VecDestroy(&benign_AIIm1_ones);
960:       MatSetOption(A, MAT_KEEP_NONZERO_PATTERN, PETSC_FALSE);
961:       MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_FALSE);
962:       MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
963:       MatZeroRowsColumnsIS(A, is_p0_p, 1.0, NULL, NULL);
964:       ISDestroy(&is_p0_p);
965:       ISLocalToGlobalMappingDestroy(&N_to_reor);
966:     }
967:     MatSetOption(A, MAT_SYMMETRIC, sub_schurs->is_symmetric);
968:     MatSetOption(A, MAT_HERMITIAN, sub_schurs->is_hermitian);
969:     MatSetOption(A, MAT_SPD, sub_schurs->is_posdef);

971:     /* for complexes, symmetric and hermitian at the same time implies null imaginary part */
972:     use_cholesky = (PetscBool)((use_potr || use_sytr) && sub_schurs->is_hermitian && sub_schurs->is_symmetric);

974:     /* when using the benign subspace trick, the local Schur complements are SPD */
975:     /* MKL_PARDISO does not handle well the computation of a Schur complement from a symmetric indefinite factorization
976:        Use LU and adapt pivoting perturbation (still, solution is not as accurate as with using MUMPS) */
977:     if (benign_trick) {
978:       sub_schurs->is_posdef = PETSC_TRUE;
979:       PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, &flg);
980:       if (flg) use_cholesky = PETSC_FALSE;
981:     }

983:     if (n_I) {
984:       IS        is_schur;
985:       char      stype[64];
986:       PetscBool gpu = PETSC_FALSE;

988:       if (use_cholesky) {
989:         MatGetFactor(A, sub_schurs->mat_solver_type, MAT_FACTOR_CHOLESKY, &F);
990:       } else {
991:         MatGetFactor(A, sub_schurs->mat_solver_type, MAT_FACTOR_LU, &F);
992:       }
993:       MatSetErrorIfFailure(A, PETSC_TRUE);
994: #if defined(PETSC_HAVE_MKL_PARDISO)
995:       if (benign_trick) MatMkl_PardisoSetCntl(F, 10, 10);
996: #endif
997:       /* subsets ordered last */
998:       ISCreateStride(PETSC_COMM_SELF, size_schur, n_I, 1, &is_schur);
999:       MatFactorSetSchurIS(F, is_schur);
1000:       ISDestroy(&is_schur);

1002:       /* factorization step */
1003:       if (use_cholesky) {
1004:         MatCholeskyFactorSymbolic(F, A, NULL, NULL);
1005: #if defined(PETSC_HAVE_MUMPS) /* be sure that icntl 19 is not set by command line */
1006:         MatMumpsSetIcntl(F, 19, 2);
1007: #endif
1008:         MatCholeskyFactorNumeric(F, A, NULL);
1009:         S_lower_triangular = PETSC_TRUE;
1010:       } else {
1011:         MatLUFactorSymbolic(F, A, NULL, NULL, NULL);
1012: #if defined(PETSC_HAVE_MUMPS) /* be sure that icntl 19 is not set by command line */
1013:         MatMumpsSetIcntl(F, 19, 3);
1014: #endif
1015:         MatLUFactorNumeric(F, A, NULL);
1016:       }
1017:       MatViewFromOptions(F, (PetscObject)A, "-mat_factor_view");

1019:       if (matl_dbg_viewer) {
1020:         Mat S;
1021:         IS  is;

1023:         PetscObjectSetName((PetscObject)A, "A");
1024:         MatView(A, matl_dbg_viewer);
1025:         MatFactorCreateSchurComplement(F, &S, NULL);
1026:         PetscObjectSetName((PetscObject)S, "S");
1027:         MatView(S, matl_dbg_viewer);
1028:         MatDestroy(&S);
1029:         ISCreateStride(PETSC_COMM_SELF, n_I, 0, 1, &is);
1030:         PetscObjectSetName((PetscObject)is, "I");
1031:         ISView(is, matl_dbg_viewer);
1032:         ISDestroy(&is);
1033:         ISCreateStride(PETSC_COMM_SELF, size_schur, n_I, 1, &is);
1034:         PetscObjectSetName((PetscObject)is, "B");
1035:         ISView(is, matl_dbg_viewer);
1036:         ISDestroy(&is);
1037:         PetscObjectSetName((PetscObject)is_A_all, "IA");
1038:         ISView(is_A_all, matl_dbg_viewer);
1039:         for (i = 0, cum = 0; i < sub_schurs->n_subs; i++) {
1040:           IS   is;
1041:           char name[16];

1043:           PetscSNPrintf(name, sizeof(name), "IE%" PetscInt_FMT, i);
1044:           ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
1045:           ISCreateStride(PETSC_COMM_SELF, subset_size, cum, 1, &is);
1046:           PetscObjectSetName((PetscObject)is, name);
1047:           ISView(is, matl_dbg_viewer);
1048:           ISDestroy(&is);
1049:           if (sub_schurs->change) {
1050:             Mat T;

1052:             PetscSNPrintf(name, sizeof(name), "TE%" PetscInt_FMT, i);
1053:             KSPGetOperators(sub_schurs->change[i], &T, NULL);
1054:             PetscObjectSetName((PetscObject)T, name);
1055:             MatView(T, matl_dbg_viewer);
1056:             PetscSNPrintf(name, sizeof(name), "ITE%" PetscInt_FMT, i);
1057:             PetscObjectSetName((PetscObject)sub_schurs->change_primal_sub[i], name);
1058:             ISView(sub_schurs->change_primal_sub[i], matl_dbg_viewer);
1059:           }
1060:           cum += subset_size;
1061:         }
1062:         PetscViewerFlush(matl_dbg_viewer);
1063:       }

1065:       /* get explicit Schur Complement computed during numeric factorization */
1066:       MatFactorGetSchurComplement(F, &S_all, NULL);
1067:       PetscStrncpy(stype, MATSEQDENSE, sizeof(stype));
1068: #if defined(PETSC_HAVE_CUDA)
1069:       PetscObjectTypeCompareAny((PetscObject)A, &gpu, MATSEQAIJVIENNACL, MATSEQAIJCUSPARSE, "");
1070: #endif
1071:       if (gpu) PetscStrncpy(stype, MATSEQDENSECUDA, sizeof(stype));
1072:       PetscOptionsGetString(NULL, sub_schurs->prefix, "-sub_schurs_schur_mat_type", stype, sizeof(stype), NULL);
1073:       MatConvert(S_all, stype, MAT_INPLACE_MATRIX, &S_all);
1074:       MatSetOption(S_all, MAT_SPD, sub_schurs->is_posdef);
1075:       MatSetOption(S_all, MAT_HERMITIAN, sub_schurs->is_hermitian);
1076:       MatGetType(S_all, &Stype);

1078:       /* we can reuse the solvers if we are not using the economic version */
1079:       reuse_solvers = (PetscBool)(reuse_solvers && !economic);
1080:       if (!sub_schurs->gdsw) {
1081:         factor_workaround = (PetscBool)(reuse_solvers && factor_workaround);
1082:         if (!sub_schurs->is_posdef && factor_workaround && compute_Stilda && size_active_schur) reuse_solvers = factor_workaround = PETSC_FALSE;
1083:       }
1084:       solver_S = PETSC_TRUE;

1086:       /* update the Schur complement with the change of basis on the pressures */
1087:       if (benign_n) {
1088:         const PetscScalar *cs_AIB;
1089:         PetscScalar       *S_data, *AIIm1_data;
1090:         Mat                S2 = NULL, S3 = NULL; /* dbg */
1091:         PetscScalar       *S2_data, *S3_data;    /* dbg */
1092:         Vec                v, benign_AIIm1_ones;
1093:         PetscInt           sizeA;

1095:         MatDenseGetArray(S_all, &S_data);
1096:         MatCreateVecs(A, &v, &benign_AIIm1_ones);
1097:         VecGetSize(v, &sizeA);
1098: #if defined(PETSC_HAVE_MUMPS)
1099:         MatMumpsSetIcntl(F, 26, 0);
1100: #endif
1101: #if defined(PETSC_HAVE_MKL_PARDISO)
1102:         MatMkl_PardisoSetCntl(F, 70, 1);
1103: #endif
1104:         MatDenseGetArrayRead(cs_AIB_mat, &cs_AIB);
1105:         MatDenseGetArray(benign_AIIm1_ones_mat, &AIIm1_data);
1106:         if (matl_dbg_viewer) {
1107:           MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S2);
1108:           MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S3);
1109:           MatDenseGetArray(S2, &S2_data);
1110:           MatDenseGetArray(S3, &S3_data);
1111:         }
1112:         for (i = 0; i < benign_n; i++) {
1113:           PetscScalar    *array, sum = 0., one = 1., *sums;
1114:           const PetscInt *idxs;
1115:           PetscInt        k, j, nz;
1116:           PetscBLASInt    B_k, B_n;

1118:           PetscCalloc1(benign_n, &sums);
1119:           VecPlaceArray(benign_AIIm1_ones, AIIm1_data + sizeA * i);
1120:           VecCopy(benign_AIIm1_ones, v);
1121:           MatSolve(F, v, benign_AIIm1_ones);
1122:           MatMult(A, benign_AIIm1_ones, v);
1123:           VecResetArray(benign_AIIm1_ones);
1124:           /* p0 dofs (eliminated) are excluded from the sums */
1125:           for (k = 0; k < benign_n; k++) {
1126:             ISGetLocalSize(is_p_r[k], &nz);
1127:             ISGetIndices(is_p_r[k], &idxs);
1128:             for (j = 0; j < nz - 1; j++) sums[k] -= AIIm1_data[idxs[j] + sizeA * i];
1129:             ISRestoreIndices(is_p_r[k], &idxs);
1130:           }
1131:           VecGetArrayRead(v, (const PetscScalar **)&array);
1132:           if (matl_dbg_viewer) {
1133:             Vec  vv;
1134:             char name[16];

1136:             VecCreateSeqWithArray(PETSC_COMM_SELF, 1, size_schur, array + n_I, &vv);
1137:             PetscSNPrintf(name, sizeof(name), "Pvs%" PetscInt_FMT, i);
1138:             PetscObjectSetName((PetscObject)vv, name);
1139:             VecView(vv, matl_dbg_viewer);
1140:           }
1141:           /* perform sparse rank updates on symmetric Schur (TODO: move outside of the loop?) */
1142:           /* cs_AIB already scaled by 1./nz */
1143:           B_k = 1;
1144:           for (k = 0; k < benign_n; k++) {
1145:             sum = sums[k];
1146:             PetscBLASIntCast(size_schur, &B_n);

1148:             if (PetscAbsScalar(sum) == 0.0) continue;
1149:             if (k == i) {
1150:               PetscCallBLAS("BLASsyrk", BLASsyrk_("L", "N", &B_n, &B_k, &sum, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
1151:               if (matl_dbg_viewer) PetscCallBLAS("BLASsyrk", BLASsyrk_("L", "N", &B_n, &B_k, &sum, cs_AIB + i * size_schur, &B_n, &one, S3_data, &B_n));
1152:             } else { /* XXX Is it correct to use symmetric rank-2 update with half of the sum? */
1153:               sum /= 2.0;
1154:               PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, cs_AIB + k * size_schur, &B_n, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
1155:               if (matl_dbg_viewer) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, cs_AIB + k * size_schur, &B_n, cs_AIB + i * size_schur, &B_n, &one, S3_data, &B_n));
1156:             }
1157:           }
1158:           sum = 1.;
1159:           PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, array + n_I, &B_n, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
1160:           if (matl_dbg_viewer) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, array + n_I, &B_n, cs_AIB + i * size_schur, &B_n, &one, S2_data, &B_n));
1161:           VecRestoreArrayRead(v, (const PetscScalar **)&array);
1162:           /* set p0 entry of AIIm1_ones to zero */
1163:           ISGetLocalSize(is_p_r[i], &nz);
1164:           ISGetIndices(is_p_r[i], &idxs);
1165:           for (j = 0; j < benign_n; j++) AIIm1_data[idxs[nz - 1] + sizeA * j] = 0.;
1166:           ISRestoreIndices(is_p_r[i], &idxs);
1167:           PetscFree(sums);
1168:         }
1169:         VecDestroy(&benign_AIIm1_ones);
1170:         if (matl_dbg_viewer) {
1171:           MatDenseRestoreArray(S2, &S2_data);
1172:           MatDenseRestoreArray(S3, &S3_data);
1173:         }
1174:         if (!S_lower_triangular) { /* I need to expand the upper triangular data (column oriented) */
1175:           PetscInt k, j;
1176:           for (k = 0; k < size_schur; k++) {
1177:             for (j = k; j < size_schur; j++) S_data[j * size_schur + k] = PetscConj(S_data[k * size_schur + j]);
1178:           }
1179:         }

1181:         /* restore defaults */
1182: #if defined(PETSC_HAVE_MUMPS)
1183:         MatMumpsSetIcntl(F, 26, -1);
1184: #endif
1185: #if defined(PETSC_HAVE_MKL_PARDISO)
1186:         MatMkl_PardisoSetCntl(F, 70, 0);
1187: #endif
1188:         MatDenseRestoreArrayRead(cs_AIB_mat, &cs_AIB);
1189:         MatDenseRestoreArray(benign_AIIm1_ones_mat, &AIIm1_data);
1190:         VecDestroy(&v);
1191:         MatDenseRestoreArray(S_all, &S_data);
1192:         if (matl_dbg_viewer) {
1193:           Mat S;

1195:           MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED);
1196:           MatFactorCreateSchurComplement(F, &S, NULL);
1197:           PetscObjectSetName((PetscObject)S, "Sb");
1198:           MatView(S, matl_dbg_viewer);
1199:           MatDestroy(&S);
1200:           PetscObjectSetName((PetscObject)S2, "S2P");
1201:           MatView(S2, matl_dbg_viewer);
1202:           PetscObjectSetName((PetscObject)S3, "S3P");
1203:           MatView(S3, matl_dbg_viewer);
1204:           PetscObjectSetName((PetscObject)cs_AIB_mat, "cs");
1205:           MatView(cs_AIB_mat, matl_dbg_viewer);
1206:           MatFactorGetSchurComplement(F, &S_all, NULL);
1207:         }
1208:         MatDestroy(&S2);
1209:         MatDestroy(&S3);
1210:       }
1211:       if (!reuse_solvers) {
1212:         for (i = 0; i < benign_n; i++) ISDestroy(&is_p_r[i]);
1213:         PetscFree(is_p_r);
1214:         MatDestroy(&cs_AIB_mat);
1215:         MatDestroy(&benign_AIIm1_ones_mat);
1216:       }
1217:     } else { /* we can't use MatFactor when size_schur == size_of_the_problem */
1218:       MatConvert(A, MATSEQDENSE, MAT_INITIAL_MATRIX, &S_all);
1219:       MatGetType(S_all, &Stype);
1220:       reuse_solvers     = PETSC_FALSE; /* TODO: why we can't reuse the solvers here? */
1221:       factor_workaround = PETSC_FALSE;
1222:       solver_S          = PETSC_FALSE;
1223:     }

1225:     if (reuse_solvers) {
1226:       Mat                A_II, pA_II, Afake;
1227:       Vec                vec1_B;
1228:       PCBDDCReuseSolvers msolv_ctx;
1229:       PetscInt           n_R;

1231:       if (sub_schurs->reuse_solver) {
1232:         PCBDDCReuseSolversReset(sub_schurs->reuse_solver);
1233:       } else {
1234:         PetscNew(&sub_schurs->reuse_solver);
1235:       }
1236:       msolv_ctx = sub_schurs->reuse_solver;
1237:       MatSchurComplementGetSubMatrices(sub_schurs->S, &A_II, &pA_II, NULL, NULL, NULL);
1238:       PetscObjectReference((PetscObject)F);
1239:       msolv_ctx->F = F;
1240:       MatCreateVecs(F, &msolv_ctx->sol, NULL);
1241:       /* currently PETSc has no support for MatSolve(F,x,x), so cheat and let rhs and sol share the same memory */
1242:       {
1243:         PetscScalar *array;
1244:         PetscInt     n;

1246:         VecGetLocalSize(msolv_ctx->sol, &n);
1247:         VecGetArray(msolv_ctx->sol, &array);
1248:         VecCreateSeqWithArray(PetscObjectComm((PetscObject)msolv_ctx->sol), 1, n, array, &msolv_ctx->rhs);
1249:         VecRestoreArray(msolv_ctx->sol, &array);
1250:       }
1251:       msolv_ctx->has_vertices = schur_has_vertices;

1253:       /* interior solver */
1254:       PCCreate(PetscObjectComm((PetscObject)A_II), &msolv_ctx->interior_solver);
1255:       PCSetOperators(msolv_ctx->interior_solver, A_II, pA_II);
1256:       PCSetType(msolv_ctx->interior_solver, PCSHELL);
1257:       PCShellSetName(msolv_ctx->interior_solver, "Interior solver (w/o Schur factorization)");
1258:       PCShellSetContext(msolv_ctx->interior_solver, msolv_ctx);
1259:       PCShellSetView(msolv_ctx->interior_solver, PCBDDCReuseSolvers_View);
1260:       PCShellSetApply(msolv_ctx->interior_solver, PCBDDCReuseSolvers_Interior);
1261:       PCShellSetApplyTranspose(msolv_ctx->interior_solver, PCBDDCReuseSolvers_InteriorTranspose);
1262:       if (sub_schurs->gdsw) PCShellSetDestroy(msolv_ctx->interior_solver, PCBDDCReuseSolvers_Destroy);

1264:       /* correction solver */
1265:       if (!sub_schurs->gdsw) {
1266:         PCCreate(PetscObjectComm((PetscObject)A_II), &msolv_ctx->correction_solver);
1267:         PCSetType(msolv_ctx->correction_solver, PCSHELL);
1268:         PCShellSetName(msolv_ctx->correction_solver, "Correction solver (with Schur factorization)");
1269:         PCShellSetContext(msolv_ctx->correction_solver, msolv_ctx);
1270:         PCShellSetView(msolv_ctx->interior_solver, PCBDDCReuseSolvers_View);
1271:         PCShellSetApply(msolv_ctx->correction_solver, PCBDDCReuseSolvers_Correction);
1272:         PCShellSetApplyTranspose(msolv_ctx->correction_solver, PCBDDCReuseSolvers_CorrectionTranspose);

1274:         /* scatter and vecs for Schur complement solver */
1275:         MatCreateVecs(S_all, &msolv_ctx->sol_B, &msolv_ctx->rhs_B);
1276:         MatCreateVecs(sub_schurs->S, &vec1_B, NULL);
1277:         if (!schur_has_vertices) {
1278:           ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, is_A_all, &msolv_ctx->is_B);
1279:           VecScatterCreate(vec1_B, msolv_ctx->is_B, msolv_ctx->sol_B, NULL, &msolv_ctx->correction_scatter_B);
1280:           PetscObjectReference((PetscObject)is_A_all);
1281:           msolv_ctx->is_R = is_A_all;
1282:         } else {
1283:           IS              is_B_all;
1284:           const PetscInt *idxs;
1285:           PetscInt        dual, n_v, n;

1287:           ISGetLocalSize(sub_schurs->is_vertices, &n_v);
1288:           dual = size_schur - n_v;
1289:           ISGetLocalSize(is_A_all, &n);
1290:           ISGetIndices(is_A_all, &idxs);
1291:           ISCreateGeneral(PetscObjectComm((PetscObject)is_A_all), dual, idxs + n_I, PETSC_COPY_VALUES, &is_B_all);
1292:           ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, is_B_all, &msolv_ctx->is_B);
1293:           ISDestroy(&is_B_all);
1294:           ISCreateStride(PetscObjectComm((PetscObject)is_A_all), dual, 0, 1, &is_B_all);
1295:           VecScatterCreate(vec1_B, msolv_ctx->is_B, msolv_ctx->sol_B, is_B_all, &msolv_ctx->correction_scatter_B);
1296:           ISDestroy(&is_B_all);
1297:           ISCreateGeneral(PetscObjectComm((PetscObject)is_A_all), n - n_v, idxs, PETSC_COPY_VALUES, &msolv_ctx->is_R);
1298:           ISRestoreIndices(is_A_all, &idxs);
1299:         }
1300:         ISGetLocalSize(msolv_ctx->is_R, &n_R);
1301:         MatCreateSeqAIJ(PETSC_COMM_SELF, n_R, n_R, 0, NULL, &Afake);
1302:         MatAssemblyBegin(Afake, MAT_FINAL_ASSEMBLY);
1303:         MatAssemblyEnd(Afake, MAT_FINAL_ASSEMBLY);
1304:         PCSetOperators(msolv_ctx->correction_solver, Afake, Afake);
1305:         MatDestroy(&Afake);
1306:         VecDestroy(&vec1_B);
1307:       }
1308:       /* communicate benign info to solver context */
1309:       if (benign_n) {
1310:         PetscScalar *array;

1312:         msolv_ctx->benign_n             = benign_n;
1313:         msolv_ctx->benign_zerodiag_subs = is_p_r;
1314:         PetscMalloc1(benign_n, &msolv_ctx->benign_save_vals);
1315:         msolv_ctx->benign_csAIB = cs_AIB_mat;
1316:         MatCreateVecs(cs_AIB_mat, &msolv_ctx->benign_corr_work, NULL);
1317:         VecGetArray(msolv_ctx->benign_corr_work, &array);
1318:         VecCreateSeqWithArray(PETSC_COMM_SELF, 1, size_schur, array, &msolv_ctx->benign_dummy_schur_vec);
1319:         VecRestoreArray(msolv_ctx->benign_corr_work, &array);
1320:         msolv_ctx->benign_AIIm1ones = benign_AIIm1_ones_mat;
1321:       }
1322:     } else {
1323:       if (sub_schurs->reuse_solver) PCBDDCReuseSolversReset(sub_schurs->reuse_solver);
1324:       PetscFree(sub_schurs->reuse_solver);
1325:     }
1326:     MatDestroy(&A);
1327:     ISDestroy(&is_A_all);

1329:     /* Work arrays */
1330:     PetscMalloc1(max_subset_size * max_subset_size, &work);

1332:     /* S_Ej_all */
1333:     cum = cum2 = 0;
1334:     MatDenseGetArrayRead(S_all, &rS_data);
1335:     MatSeqAIJGetArray(sub_schurs->S_Ej_all, &SEj_arr);
1336:     if (sub_schurs->sum_S_Ej_inv_all) MatSeqAIJGetArray(sub_schurs->sum_S_Ej_inv_all, &SEjinv_arr);
1337:     if (sub_schurs->gdsw) MatCreateSubMatrices(sub_schurs->A, sub_schurs->n_subs, sub_schurs->is_subs, sub_schurs->is_subs, MAT_INITIAL_MATRIX, &gdswA);
1338:     for (i = 0; i < sub_schurs->n_subs; i++) {
1339:       PetscInt j;

1341:       /* get S_E (or K^i_EE for GDSW) */
1342:       ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
1343:       if (sub_schurs->gdsw) {
1344:         Mat T;

1346:         MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &T);
1347:         MatConvert(gdswA[i], MATDENSE, MAT_REUSE_MATRIX, &T);
1348:         MatDestroy(&T);
1349:       } else {
1350:         if (S_lower_triangular) { /* I need to expand the upper triangular data (column oriented) */
1351:           PetscInt k;
1352:           for (k = 0; k < subset_size; k++) {
1353:             for (j = k; j < subset_size; j++) {
1354:               work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1355:               work[j * subset_size + k] = PetscConj(rS_data[cum2 + k * size_schur + j]);
1356:             }
1357:           }
1358:         } else { /* just copy to workspace */
1359:           PetscInt k;
1360:           for (k = 0; k < subset_size; k++) {
1361:             for (j = 0; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1362:           }
1363:         }
1364:       }
1365:       /* insert S_E values */
1366:       if (sub_schurs->change) {
1367:         Mat change_sub, SEj, T;

1369:         /* change basis */
1370:         KSPGetOperators(sub_schurs->change[i], &change_sub, NULL);
1371:         MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &SEj);
1372:         if (!sub_schurs->change_with_qr) { /* currently there's no support for PtAP with P SeqAIJ */
1373:           Mat T2;
1374:           MatTransposeMatMult(change_sub, SEj, MAT_INITIAL_MATRIX, 1.0, &T2);
1375:           MatMatMult(T2, change_sub, MAT_INITIAL_MATRIX, 1.0, &T);
1376:           MatConvert(T, MATSEQDENSE, MAT_INPLACE_MATRIX, &T);
1377:           MatDestroy(&T2);
1378:         } else {
1379:           MatPtAP(SEj, change_sub, MAT_INITIAL_MATRIX, 1.0, &T);
1380:         }
1381:         MatCopy(T, SEj, SAME_NONZERO_PATTERN);
1382:         MatDestroy(&T);
1383:         MatZeroRowsColumnsIS(SEj, sub_schurs->change_primal_sub[i], 1.0, NULL, NULL);
1384:         MatDestroy(&SEj);
1385:       }
1386:       PetscArraycpy(SEj_arr, work, subset_size * subset_size);
1387:       if (compute_Stilda) {
1388:         if (deluxe) { /* if adaptivity is requested, invert S_E blocks */
1389:           Mat                M;
1390:           const PetscScalar *vals;
1391:           PetscBool          isdense, isdensecuda;

1393:           MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &M);
1394:           MatSetOption(M, MAT_SPD, sub_schurs->is_posdef);
1395:           MatSetOption(M, MAT_HERMITIAN, sub_schurs->is_hermitian);
1396:           if (!PetscBTLookup(sub_schurs->is_edge, i)) MatSetType(M, Stype);
1397:           PetscObjectTypeCompare((PetscObject)M, MATSEQDENSE, &isdense);
1398:           PetscObjectTypeCompare((PetscObject)M, MATSEQDENSECUDA, &isdensecuda);
1399:           if (use_cholesky) {
1400:             MatCholeskyFactor(M, NULL, NULL);
1401:           } else {
1402:             MatLUFactor(M, NULL, NULL, NULL);
1403:           }
1404:           if (isdense) {
1405:             MatSeqDenseInvertFactors_Private(M);
1406: #if defined(PETSC_HAVE_CUDA)
1407:           } else if (isdensecuda) {
1408:             MatSeqDenseCUDAInvertFactors_Private(M);
1409: #endif
1410:           } else SETERRQ(PetscObjectComm((PetscObject)M), PETSC_ERR_SUP, "Not implemented for type %s", Stype);
1411:           MatDenseGetArrayRead(M, &vals);
1412:           PetscArraycpy(SEjinv_arr, vals, subset_size * subset_size);
1413:           MatDenseRestoreArrayRead(M, &vals);
1414:           MatDestroy(&M);
1415:         } else if (scaling) { /* not using deluxe */
1416:           Mat          SEj;
1417:           Vec          D;
1418:           PetscScalar *array;

1420:           MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &SEj);
1421:           VecGetArray(Dall, &array);
1422:           VecCreateSeqWithArray(PETSC_COMM_SELF, 1, subset_size, array + cum, &D);
1423:           VecRestoreArray(Dall, &array);
1424:           VecShift(D, -1.);
1425:           MatDiagonalScale(SEj, D, D);
1426:           MatDestroy(&SEj);
1427:           VecDestroy(&D);
1428:           PetscArraycpy(SEj_arr, work, subset_size * subset_size);
1429:         }
1430:       }
1431:       cum += subset_size;
1432:       cum2 += subset_size * (size_schur + 1);
1433:       SEj_arr += subset_size * subset_size;
1434:       if (SEjinv_arr) SEjinv_arr += subset_size * subset_size;
1435:     }
1436:     if (sub_schurs->gdsw) MatDestroySubMatrices(sub_schurs->n_subs, &gdswA);
1437:     MatDenseRestoreArrayRead(S_all, &rS_data);
1438:     MatSeqAIJRestoreArray(sub_schurs->S_Ej_all, &SEj_arr);
1439:     if (sub_schurs->sum_S_Ej_inv_all) MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_inv_all, &SEjinv_arr);
1440:     if (solver_S) MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED);

1442:       /* may prevent from unneeded copies, since MUMPS or MKL_Pardiso always use CPU memory
1443:        however, preliminary tests indicate using GPUs is still faster in the solve phase */
1444: #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
1445:     if (reuse_solvers) {
1446:       Mat                  St;
1447:       MatFactorSchurStatus st;

1449:       flg = PETSC_FALSE;
1450:       PetscOptionsGetBool(NULL, sub_schurs->prefix, "-sub_schurs_schur_pin_to_cpu", &flg, NULL);
1451:       MatFactorGetSchurComplement(F, &St, &st);
1452:       MatBindToCPU(St, flg);
1453:       MatFactorRestoreSchurComplement(F, &St, st);
1454:     }
1455: #endif

1457:     schur_factor = NULL;
1458:     if (compute_Stilda && size_active_schur) {
1459:       if (sub_schurs->n_subs == 1 && size_schur == size_active_schur && deluxe) { /* we already computed the inverse */
1460:         MatSeqAIJGetArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr);
1461:         PetscArraycpy(SEjinv_arr, work, size_schur * size_schur);
1462:         MatSeqAIJRestoreArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr);
1463:       } else {
1464:         Mat S_all_inv = NULL;

1466:         if (solver_S && !sub_schurs->gdsw) {
1467:           /* for adaptive selection we need S^-1; for solver reusage we need S_\Delta\Delta^-1.
1468:              The latter is not the principal subminor for S^-1. However, the factors can be reused since S_\Delta\Delta is the leading principal submatrix of S */
1469:           if (factor_workaround) { /* invert without calling MatFactorInvertSchurComplement, since we are hacking */
1470:             PetscScalar *data;
1471:             PetscInt     nd = 0;

1474:             MatFactorGetSchurComplement(F, &S_all_inv, NULL);
1475:             MatDenseGetArray(S_all_inv, &data);
1476:             if (sub_schurs->is_dir) { /* dirichlet dofs could have different scalings */
1477:               ISGetLocalSize(sub_schurs->is_dir, &nd);
1478:             }

1480:             /* factor and invert activedofs and vertices (dirichlet dofs does not contribute) */
1481:             if (schur_has_vertices) {
1482:               Mat          M;
1483:               PetscScalar *tdata;
1484:               PetscInt     nv = 0, news;

1486:               ISGetLocalSize(sub_schurs->is_vertices, &nv);
1487:               news = size_active_schur + nv;
1488:               PetscCalloc1(news * news, &tdata);
1489:               for (i = 0; i < size_active_schur; i++) {
1490:                 PetscArraycpy(tdata + i * (news + 1), data + i * (size_schur + 1), size_active_schur - i);
1491:                 PetscArraycpy(tdata + i * (news + 1) + size_active_schur - i, data + i * size_schur + size_active_schur + nd, nv);
1492:               }
1493:               for (i = 0; i < nv; i++) {
1494:                 PetscInt k = i + size_active_schur;
1495:                 PetscArraycpy(tdata + k * (news + 1), data + (k + nd) * (size_schur + 1), nv - i);
1496:               }

1498:               MatCreateSeqDense(PETSC_COMM_SELF, news, news, tdata, &M);
1499:               MatSetOption(M, MAT_SPD, PETSC_TRUE);
1500:               MatCholeskyFactor(M, NULL, NULL);
1501:               /* save the factors */
1502:               cum = 0;
1503:               PetscMalloc1((size_active_schur * (size_active_schur + 1)) / 2 + nd, &schur_factor);
1504:               for (i = 0; i < size_active_schur; i++) {
1505:                 PetscArraycpy(schur_factor + cum, tdata + i * (news + 1), size_active_schur - i);
1506:                 cum += size_active_schur - i;
1507:               }
1508:               for (i = 0; i < nd; i++) schur_factor[cum + i] = PetscSqrtReal(PetscRealPart(data[(i + size_active_schur) * (size_schur + 1)]));
1509:               MatSeqDenseInvertFactors_Private(M);
1510:               /* move back just the active dofs to the Schur complement */
1511:               for (i = 0; i < size_active_schur; i++) PetscArraycpy(data + i * size_schur, tdata + i * news, size_active_schur);
1512:               PetscFree(tdata);
1513:               MatDestroy(&M);
1514:             } else { /* we can factorize and invert just the activedofs */
1515:               Mat          M;
1516:               PetscScalar *aux;

1518:               PetscMalloc1(nd, &aux);
1519:               for (i = 0; i < nd; i++) aux[i] = 1.0 / data[(i + size_active_schur) * (size_schur + 1)];
1520:               MatCreateSeqDense(PETSC_COMM_SELF, size_active_schur, size_active_schur, data, &M);
1521:               MatDenseSetLDA(M, size_schur);
1522:               MatSetOption(M, MAT_SPD, PETSC_TRUE);
1523:               MatCholeskyFactor(M, NULL, NULL);
1524:               MatSeqDenseInvertFactors_Private(M);
1525:               MatDestroy(&M);
1526:               MatCreateSeqDense(PETSC_COMM_SELF, size_schur, nd, data + size_active_schur * size_schur, &M);
1527:               MatZeroEntries(M);
1528:               MatDestroy(&M);
1529:               MatCreateSeqDense(PETSC_COMM_SELF, nd, size_schur, data + size_active_schur, &M);
1530:               MatDenseSetLDA(M, size_schur);
1531:               MatZeroEntries(M);
1532:               MatDestroy(&M);
1533:               for (i = 0; i < nd; i++) data[(i + size_active_schur) * (size_schur + 1)] = aux[i];
1534:               PetscFree(aux);
1535:             }
1536:             MatDenseRestoreArray(S_all_inv, &data);
1537:           } else { /* use MatFactor calls to invert S */
1538:             MatFactorInvertSchurComplement(F);
1539:             MatFactorGetSchurComplement(F, &S_all_inv, NULL);
1540:           }
1541:         } else if (!sub_schurs->gdsw) { /* we need to invert explicitly since we are not using MatFactor for S */
1542:           PetscObjectReference((PetscObject)S_all);
1543:           S_all_inv = S_all;
1544:           MatDenseGetArray(S_all_inv, &S_data);
1545:           PetscBLASIntCast(size_schur, &B_N);
1546:           PetscFPTrapPush(PETSC_FP_TRAP_OFF);
1547:           if (use_potr) {
1548:             PetscCallBLAS("LAPACKpotrf", LAPACKpotrf_("L", &B_N, S_data, &B_N, &B_ierr));
1550:             PetscCallBLAS("LAPACKpotri", LAPACKpotri_("L", &B_N, S_data, &B_N, &B_ierr));
1552:           } else if (use_sytr) {
1553:             PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, S_data, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
1555:             PetscCallBLAS("LAPACKsytri", LAPACKsytri_("L", &B_N, S_data, &B_N, pivots, Bwork, &B_ierr));
1557:           } else {
1558:             PetscCallBLAS("LAPACKgetrf", LAPACKgetrf_(&B_N, &B_N, S_data, &B_N, pivots, &B_ierr));
1560:             PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, S_data, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
1562:           }
1563:           PetscLogFlops(1.0 * size_schur * size_schur * size_schur);
1564:           PetscFPTrapPop();
1565:           MatDenseRestoreArray(S_all_inv, &S_data);
1566:         } else if (sub_schurs->gdsw) {
1567:           Mat      tS, tX, SEj, S_II, S_IE, S_EE;
1568:           KSP      pS_II;
1569:           PC       pS_II_pc;
1570:           IS       EE, II;
1571:           PetscInt nS;

1573:           MatFactorCreateSchurComplement(F, &tS, NULL);
1574:           MatGetSize(tS, &nS, NULL);
1575:           MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr);
1576:           for (i = 0, cum = 0; i < sub_schurs->n_subs; i++) { /* naive implementation */
1577:             ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
1578:             MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, SEjinv_arr, &SEj);

1580:             ISCreateStride(PETSC_COMM_SELF, subset_size, cum, 1, &EE);
1581:             ISComplement(EE, 0, nS, &II);
1582:             MatCreateSubMatrix(tS, II, II, MAT_INITIAL_MATRIX, &S_II);
1583:             MatCreateSubMatrix(tS, II, EE, MAT_INITIAL_MATRIX, &S_IE);
1584:             MatCreateSubMatrix(tS, EE, EE, MAT_INITIAL_MATRIX, &S_EE);
1585:             ISDestroy(&II);
1586:             ISDestroy(&EE);

1588:             KSPCreate(PETSC_COMM_SELF, &pS_II);
1589:             KSPSetType(pS_II, KSPPREONLY);
1590:             KSPGetPC(pS_II, &pS_II_pc);
1591:             PCSetType(pS_II_pc, PCSVD);
1592:             KSPSetOptionsPrefix(pS_II, sub_schurs->prefix);
1593:             KSPAppendOptionsPrefix(pS_II, "pseudo_");
1594:             KSPSetOperators(pS_II, S_II, S_II);
1595:             MatDestroy(&S_II);
1596:             KSPSetFromOptions(pS_II);
1597:             KSPSetUp(pS_II);
1598:             MatDuplicate(S_IE, MAT_DO_NOT_COPY_VALUES, &tX);
1599:             KSPMatSolve(pS_II, S_IE, tX);
1600:             KSPDestroy(&pS_II);

1602:             MatTransposeMatMult(S_IE, tX, MAT_REUSE_MATRIX, PETSC_DEFAULT, &SEj);
1603:             MatDestroy(&S_IE);
1604:             MatDestroy(&tX);
1605:             MatAYPX(SEj, -1, S_EE, SAME_NONZERO_PATTERN);
1606:             MatDestroy(&S_EE);

1608:             MatDestroy(&SEj);
1609:             cum += subset_size;
1610:             SEjinv_arr += subset_size * subset_size;
1611:           }
1612:           MatDestroy(&tS);
1613:           MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr);
1614:         }
1615:         /* S_Ej_tilda_all */
1616:         cum = cum2 = 0;
1617:         rS_data    = NULL;
1618:         if (S_all_inv) MatDenseGetArrayRead(S_all_inv, &rS_data);
1619:         MatSeqAIJGetArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr);
1620:         for (i = 0; i < sub_schurs->n_subs; i++) {
1621:           PetscInt j;

1623:           ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
1624:           /* get (St^-1)_E */
1625:           /* Unless we are changing the variables, I don't need to expand to upper triangular since St^-1
1626:              will be properly accessed later during adaptive selection */
1627:           if (rS_data) {
1628:             if (S_lower_triangular) {
1629:               PetscInt k;
1630:               if (sub_schurs->change) {
1631:                 for (k = 0; k < subset_size; k++) {
1632:                   for (j = k; j < subset_size; j++) {
1633:                     work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1634:                     work[j * subset_size + k] = work[k * subset_size + j];
1635:                   }
1636:                 }
1637:               } else {
1638:                 for (k = 0; k < subset_size; k++) {
1639:                   for (j = k; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1640:                 }
1641:               }
1642:             } else {
1643:               PetscInt k;
1644:               for (k = 0; k < subset_size; k++) {
1645:                 for (j = 0; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1646:               }
1647:             }
1648:           }
1649:           if (sub_schurs->change) {
1650:             Mat         change_sub, SEj, T;
1651:             PetscScalar val = sub_schurs->gdsw ? PETSC_SMALL : 1. / PETSC_SMALL;

1653:             /* change basis */
1654:             KSPGetOperators(sub_schurs->change[i], &change_sub, NULL);
1655:             MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, rS_data ? work : SEjinv_arr, &SEj);
1656:             if (!sub_schurs->change_with_qr) { /* currently there's no support for PtAP with P SeqAIJ */
1657:               Mat T2;
1658:               MatTransposeMatMult(change_sub, SEj, MAT_INITIAL_MATRIX, 1.0, &T2);
1659:               MatMatMult(T2, change_sub, MAT_INITIAL_MATRIX, 1.0, &T);
1660:               MatDestroy(&T2);
1661:               MatConvert(T, MATSEQDENSE, MAT_INPLACE_MATRIX, &T);
1662:             } else {
1663:               MatPtAP(SEj, change_sub, MAT_INITIAL_MATRIX, 1.0, &T);
1664:             }
1665:             MatCopy(T, SEj, SAME_NONZERO_PATTERN);
1666:             MatDestroy(&T);
1667:             MatZeroRowsColumnsIS(SEj, sub_schurs->change_primal_sub[i], val, NULL, NULL);
1668:             MatDestroy(&SEj);
1669:           }
1670:           if (rS_data) PetscArraycpy(SEjinv_arr, work, subset_size * subset_size);
1671:           cum += subset_size;
1672:           cum2 += subset_size * (size_schur + 1);
1673:           SEjinv_arr += subset_size * subset_size;
1674:         }
1675:         MatSeqAIJRestoreArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr);
1676:         if (S_all_inv) {
1677:           MatDenseRestoreArrayRead(S_all_inv, &rS_data);
1678:           if (solver_S) {
1679:             if (schur_has_vertices) {
1680:               MatFactorRestoreSchurComplement(F, &S_all_inv, MAT_FACTOR_SCHUR_FACTORED);
1681:             } else {
1682:               MatFactorRestoreSchurComplement(F, &S_all_inv, MAT_FACTOR_SCHUR_INVERTED);
1683:             }
1684:           }
1685:         }
1686:         MatDestroy(&S_all_inv);
1687:       }

1689:       /* move back factors if needed */
1690:       if (schur_has_vertices && factor_workaround && !sub_schurs->gdsw) {
1691:         Mat      S_tmp;
1692:         PetscInt nd = 0;

1695:         MatFactorGetSchurComplement(F, &S_tmp, NULL);
1696:         if (use_potr) {
1697:           PetscScalar *data;

1699:           MatDenseGetArray(S_tmp, &data);
1700:           PetscArrayzero(data, size_schur * size_schur);

1702:           if (S_lower_triangular) {
1703:             cum = 0;
1704:             for (i = 0; i < size_active_schur; i++) {
1705:               PetscArraycpy(data + i * (size_schur + 1), schur_factor + cum, size_active_schur - i);
1706:               cum += size_active_schur - i;
1707:             }
1708:           } else {
1709:             PetscArraycpy(data, schur_factor, size_schur * size_schur);
1710:           }
1711:           if (sub_schurs->is_dir) {
1712:             ISGetLocalSize(sub_schurs->is_dir, &nd);
1713:             for (i = 0; i < nd; i++) data[(i + size_active_schur) * (size_schur + 1)] = schur_factor[cum + i];
1714:           }
1715:           /* workaround: since I cannot modify the matrices used inside the solvers for the forward and backward substitutions,
1716:              set the diagonal entry of the Schur factor to a very large value */
1717:           for (i = size_active_schur + nd; i < size_schur; i++) data[i * (size_schur + 1)] = infty;
1718:           MatDenseRestoreArray(S_tmp, &data);
1719:         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor update not yet implemented for non SPD matrices");
1720:         MatFactorRestoreSchurComplement(F, &S_tmp, MAT_FACTOR_SCHUR_FACTORED);
1721:       }
1722:     } else if (factor_workaround && !sub_schurs->gdsw) { /* we need to eliminate any unneeded coupling */
1723:       PetscScalar *data;
1724:       PetscInt     nd = 0;

1726:       if (sub_schurs->is_dir) { /* dirichlet dofs could have different scalings */
1727:         ISGetLocalSize(sub_schurs->is_dir, &nd);
1728:       }
1729:       MatFactorGetSchurComplement(F, &S_all, NULL);
1730:       MatDenseGetArray(S_all, &data);
1731:       for (i = 0; i < size_active_schur; i++) PetscArrayzero(data + i * size_schur + size_active_schur, size_schur - size_active_schur);
1732:       for (i = size_active_schur + nd; i < size_schur; i++) {
1733:         PetscArrayzero(data + i * size_schur + size_active_schur, size_schur - size_active_schur);
1734:         data[i * (size_schur + 1)] = infty;
1735:       }
1736:       MatDenseRestoreArray(S_all, &data);
1737:       MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED);
1738:     }
1739:     PetscFree(work);
1740:     PetscFree(schur_factor);
1741:     VecDestroy(&Dall);
1742:   }
1743:   ISDestroy(&is_I_layer);
1744:   MatDestroy(&S_all);
1745:   MatDestroy(&A_BB);
1746:   MatDestroy(&A_IB);
1747:   MatDestroy(&A_BI);
1748:   MatDestroy(&F);

1750:   PetscMalloc1(sub_schurs->n_subs, &nnz);
1751:   for (i = 0; i < sub_schurs->n_subs; i++) ISGetLocalSize(sub_schurs->is_subs[i], &nnz[i]);
1752:   ISCreateGeneral(PETSC_COMM_SELF, sub_schurs->n_subs, nnz, PETSC_OWN_POINTER, &is_I_layer);
1753:   MatSetVariableBlockSizes(sub_schurs->S_Ej_all, sub_schurs->n_subs, nnz);
1754:   MatAssemblyBegin(sub_schurs->S_Ej_all, MAT_FINAL_ASSEMBLY);
1755:   MatAssemblyEnd(sub_schurs->S_Ej_all, MAT_FINAL_ASSEMBLY);
1756:   if (compute_Stilda) {
1757:     MatSetVariableBlockSizes(sub_schurs->sum_S_Ej_tilda_all, sub_schurs->n_subs, nnz);
1758:     MatAssemblyBegin(sub_schurs->sum_S_Ej_tilda_all, MAT_FINAL_ASSEMBLY);
1759:     MatAssemblyEnd(sub_schurs->sum_S_Ej_tilda_all, MAT_FINAL_ASSEMBLY);
1760:     if (deluxe) {
1761:       MatSetVariableBlockSizes(sub_schurs->sum_S_Ej_inv_all, sub_schurs->n_subs, nnz);
1762:       MatAssemblyBegin(sub_schurs->sum_S_Ej_inv_all, MAT_FINAL_ASSEMBLY);
1763:       MatAssemblyEnd(sub_schurs->sum_S_Ej_inv_all, MAT_FINAL_ASSEMBLY);
1764:     }
1765:   }
1766:   ISDestroy(&is_I_layer);

1768:   /* Get local part of (\sum_j S_Ej) */
1769:   if (!sub_schurs->sum_S_Ej_all) MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_all);
1770:   VecSet(gstash, 0.0);
1771:   MatSeqAIJGetArray(sub_schurs->S_Ej_all, &stasharray);
1772:   VecPlaceArray(lstash, stasharray);
1773:   VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD);
1774:   VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD);
1775:   MatSeqAIJRestoreArray(sub_schurs->S_Ej_all, &stasharray);
1776:   VecResetArray(lstash);
1777:   MatSeqAIJGetArray(sub_schurs->sum_S_Ej_all, &stasharray);
1778:   VecPlaceArray(lstash, stasharray);
1779:   VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE);
1780:   VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE);
1781:   MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_all, &stasharray);
1782:   VecResetArray(lstash);

1784:   /* Get local part of (\sum_j S^-1_Ej) (\sum_j St^-1_Ej) */
1785:   if (compute_Stilda) {
1786:     VecSet(gstash, 0.0);
1787:     MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &stasharray);
1788:     VecPlaceArray(lstash, stasharray);
1789:     VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD);
1790:     VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD);
1791:     VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE);
1792:     VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE);
1793:     MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &stasharray);
1794:     VecResetArray(lstash);
1795:     if (deluxe) {
1796:       VecSet(gstash, 0.0);
1797:       MatSeqAIJGetArray(sub_schurs->sum_S_Ej_inv_all, &stasharray);
1798:       VecPlaceArray(lstash, stasharray);
1799:       VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD);
1800:       VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD);
1801:       VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE);
1802:       VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE);
1803:       MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_inv_all, &stasharray);
1804:       VecResetArray(lstash);
1805:     } else if (!sub_schurs->gdsw) {
1806:       PetscScalar *array;
1807:       PetscInt     cum;

1809:       MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &array);
1810:       cum = 0;
1811:       for (i = 0; i < sub_schurs->n_subs; i++) {
1812:         ISGetLocalSize(sub_schurs->is_subs[i], &subset_size);
1813:         PetscBLASIntCast(subset_size, &B_N);
1814:         PetscFPTrapPush(PETSC_FP_TRAP_OFF);
1815:         if (use_potr) {
1816:           PetscCallBLAS("LAPACKpotrf", LAPACKpotrf_("L", &B_N, array + cum, &B_N, &B_ierr));
1818:           PetscCallBLAS("LAPACKpotri", LAPACKpotri_("L", &B_N, array + cum, &B_N, &B_ierr));
1820:         } else if (use_sytr) {
1821:           PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, array + cum, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
1823:           PetscCallBLAS("LAPACKsytri", LAPACKsytri_("L", &B_N, array + cum, &B_N, pivots, Bwork, &B_ierr));
1825:         } else {
1826:           PetscCallBLAS("LAPACKgetrf", LAPACKgetrf_(&B_N, &B_N, array + cum, &B_N, pivots, &B_ierr));
1828:           PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, array + cum, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
1830:         }
1831:         PetscLogFlops(1.0 * subset_size * subset_size * subset_size);
1832:         PetscFPTrapPop();
1833:         cum += subset_size * subset_size;
1834:       }
1835:       MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &array);
1836:       PetscObjectReference((PetscObject)sub_schurs->sum_S_Ej_all);
1837:       MatDestroy(&sub_schurs->sum_S_Ej_inv_all);
1838:       sub_schurs->sum_S_Ej_inv_all = sub_schurs->sum_S_Ej_all;
1839:     }
1840:   }
1841:   VecDestroy(&lstash);
1842:   VecDestroy(&gstash);
1843:   VecScatterDestroy(&sstash);

1845:   if (matl_dbg_viewer) {
1846:     if (sub_schurs->S_Ej_all) {
1847:       PetscObjectSetName((PetscObject)sub_schurs->S_Ej_all, "SE");
1848:       MatView(sub_schurs->S_Ej_all, matl_dbg_viewer);
1849:     }
1850:     if (sub_schurs->sum_S_Ej_all) {
1851:       PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_all, "SSE");
1852:       MatView(sub_schurs->sum_S_Ej_all, matl_dbg_viewer);
1853:     }
1854:     if (sub_schurs->sum_S_Ej_inv_all) {
1855:       PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_inv_all, "SSEm");
1856:       MatView(sub_schurs->sum_S_Ej_inv_all, matl_dbg_viewer);
1857:     }
1858:     if (sub_schurs->sum_S_Ej_tilda_all) {
1859:       PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_tilda_all, "SSEt");
1860:       MatView(sub_schurs->sum_S_Ej_tilda_all, matl_dbg_viewer);
1861:     }
1862:   }

1864:   /* free workspace */
1865:   if (matl_dbg_viewer) PetscViewerFlush(matl_dbg_viewer);
1866:   if (sub_schurs->debug) MPI_Barrier(comm_n);
1867:   PetscViewerDestroy(&matl_dbg_viewer);
1868:   PetscFree2(Bwork, pivots);
1869:   PetscCommDestroy(&comm_n);
1870:   return 0;
1871: }

1873: PetscErrorCode PCBDDCSubSchursInit(PCBDDCSubSchurs sub_schurs, const char *prefix, IS is_I, IS is_B, PCBDDCGraph graph, ISLocalToGlobalMapping BtoNmap, PetscBool copycc, PetscBool gdsw)
1874: {
1875:   IS       *faces, *edges, *all_cc, vertices;
1876:   PetscInt  s, i, n_faces, n_edges, n_all_cc;
1877:   PetscBool is_sorted, ispardiso, ismumps;

1879:   ISSorted(is_I, &is_sorted);
1881:   ISSorted(is_B, &is_sorted);

1884:   /* reset any previous data */
1885:   PCBDDCSubSchursReset(sub_schurs);

1887:   sub_schurs->gdsw = gdsw;

1889:   /* get index sets for faces and edges (already sorted by global ordering) */
1890:   PCBDDCGraphGetCandidatesIS(graph, &n_faces, &faces, &n_edges, &edges, &vertices);
1891:   n_all_cc = n_faces + n_edges;
1892:   PetscBTCreate(n_all_cc, &sub_schurs->is_edge);
1893:   PetscMalloc1(n_all_cc, &all_cc);
1894:   n_all_cc = 0;
1895:   for (i = 0; i < n_faces; i++) {
1896:     ISGetSize(faces[i], &s);
1897:     if (!s) continue;
1898:     if (copycc) {
1899:       ISDuplicate(faces[i], &all_cc[n_all_cc]);
1900:     } else {
1901:       PetscObjectReference((PetscObject)faces[i]);
1902:       all_cc[n_all_cc] = faces[i];
1903:     }
1904:     n_all_cc++;
1905:   }
1906:   for (i = 0; i < n_edges; i++) {
1907:     ISGetSize(edges[i], &s);
1908:     if (!s) continue;
1909:     if (copycc) {
1910:       ISDuplicate(edges[i], &all_cc[n_all_cc]);
1911:     } else {
1912:       PetscObjectReference((PetscObject)edges[i]);
1913:       all_cc[n_all_cc] = edges[i];
1914:     }
1915:     PetscBTSet(sub_schurs->is_edge, n_all_cc);
1916:     n_all_cc++;
1917:   }
1918:   PetscObjectReference((PetscObject)vertices);
1919:   sub_schurs->is_vertices = vertices;
1920:   PCBDDCGraphRestoreCandidatesIS(graph, &n_faces, &faces, &n_edges, &edges, &vertices);
1921:   sub_schurs->is_dir = NULL;
1922:   PCBDDCGraphGetDirichletDofsB(graph, &sub_schurs->is_dir);

1924:   /* Determine if MatFactor can be used */
1925:   PetscStrallocpy(prefix, &sub_schurs->prefix);
1926: #if defined(PETSC_HAVE_MUMPS)
1927:   PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERMUMPS, sizeof(sub_schurs->mat_solver_type));
1928: #elif defined(PETSC_HAVE_MKL_PARDISO)
1929:   PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, sizeof(sub_schurs->mat_solver_type));
1930: #else
1931:   PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERPETSC, sizeof(sub_schurs->mat_solver_type));
1932: #endif
1933: #if defined(PETSC_USE_COMPLEX)
1934:   sub_schurs->is_hermitian = PETSC_FALSE; /* Hermitian Cholesky is not supported by PETSc and external packages */
1935: #else
1936:   sub_schurs->is_hermitian = PETSC_TRUE;
1937: #endif
1938:   sub_schurs->is_posdef     = PETSC_TRUE;
1939:   sub_schurs->is_symmetric  = PETSC_TRUE;
1940:   sub_schurs->debug         = PETSC_FALSE;
1941:   sub_schurs->restrict_comm = PETSC_FALSE;
1942:   PetscOptionsBegin(PetscObjectComm((PetscObject)graph->l2gmap), sub_schurs->prefix, "BDDC sub_schurs options", "PC");
1943:   PetscOptionsString("-sub_schurs_mat_solver_type", "Specific direct solver to use", NULL, sub_schurs->mat_solver_type, sub_schurs->mat_solver_type, sizeof(sub_schurs->mat_solver_type), NULL);
1944:   PetscOptionsBool("-sub_schurs_symmetric", "Symmetric problem", NULL, sub_schurs->is_symmetric, &sub_schurs->is_symmetric, NULL);
1945:   PetscOptionsBool("-sub_schurs_hermitian", "Hermitian problem", NULL, sub_schurs->is_hermitian, &sub_schurs->is_hermitian, NULL);
1946:   PetscOptionsBool("-sub_schurs_posdef", "Positive definite problem", NULL, sub_schurs->is_posdef, &sub_schurs->is_posdef, NULL);
1947:   PetscOptionsBool("-sub_schurs_restrictcomm", "Restrict communicator on active processes only", NULL, sub_schurs->restrict_comm, &sub_schurs->restrict_comm, NULL);
1948:   PetscOptionsBool("-sub_schurs_debug", "Debug output", NULL, sub_schurs->debug, &sub_schurs->debug, NULL);
1949:   PetscOptionsEnd();
1950:   PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMUMPS, &ismumps);
1951:   PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, &ispardiso);
1952:   sub_schurs->schur_explicit = (PetscBool)(ispardiso || ismumps);

1954:   /* for reals, symmetric and hermitian are synonims */
1955: #if !defined(PETSC_USE_COMPLEX)
1956:   sub_schurs->is_symmetric = (PetscBool)(sub_schurs->is_symmetric && sub_schurs->is_hermitian);
1957:   sub_schurs->is_hermitian = sub_schurs->is_symmetric;
1958: #endif

1960:   PetscObjectReference((PetscObject)is_I);
1961:   sub_schurs->is_I = is_I;
1962:   PetscObjectReference((PetscObject)is_B);
1963:   sub_schurs->is_B = is_B;
1964:   PetscObjectReference((PetscObject)graph->l2gmap);
1965:   sub_schurs->l2gmap = graph->l2gmap;
1966:   PetscObjectReference((PetscObject)BtoNmap);
1967:   sub_schurs->BtoNmap            = BtoNmap;
1968:   sub_schurs->n_subs             = n_all_cc;
1969:   sub_schurs->is_subs            = all_cc;
1970:   sub_schurs->S_Ej_all           = NULL;
1971:   sub_schurs->sum_S_Ej_all       = NULL;
1972:   sub_schurs->sum_S_Ej_inv_all   = NULL;
1973:   sub_schurs->sum_S_Ej_tilda_all = NULL;
1974:   sub_schurs->is_Ej_all          = NULL;
1975:   return 0;
1976: }

1978: PetscErrorCode PCBDDCSubSchursCreate(PCBDDCSubSchurs *sub_schurs)
1979: {
1980:   PCBDDCSubSchurs schurs_ctx;

1982:   PetscNew(&schurs_ctx);
1983:   schurs_ctx->n_subs = 0;
1984:   *sub_schurs        = schurs_ctx;
1985:   return 0;
1986: }

1988: PetscErrorCode PCBDDCSubSchursReset(PCBDDCSubSchurs sub_schurs)
1989: {
1990:   PetscInt i;

1992:   if (!sub_schurs) return 0;
1993:   PetscFree(sub_schurs->prefix);
1994:   MatDestroy(&sub_schurs->A);
1995:   MatDestroy(&sub_schurs->S);
1996:   ISDestroy(&sub_schurs->is_I);
1997:   ISDestroy(&sub_schurs->is_B);
1998:   ISLocalToGlobalMappingDestroy(&sub_schurs->l2gmap);
1999:   ISLocalToGlobalMappingDestroy(&sub_schurs->BtoNmap);
2000:   MatDestroy(&sub_schurs->S_Ej_all);
2001:   MatDestroy(&sub_schurs->sum_S_Ej_all);
2002:   MatDestroy(&sub_schurs->sum_S_Ej_inv_all);
2003:   MatDestroy(&sub_schurs->sum_S_Ej_tilda_all);
2004:   ISDestroy(&sub_schurs->is_Ej_all);
2005:   ISDestroy(&sub_schurs->is_vertices);
2006:   ISDestroy(&sub_schurs->is_dir);
2007:   PetscBTDestroy(&sub_schurs->is_edge);
2008:   for (i = 0; i < sub_schurs->n_subs; i++) ISDestroy(&sub_schurs->is_subs[i]);
2009:   if (sub_schurs->n_subs) PetscFree(sub_schurs->is_subs);
2010:   if (sub_schurs->reuse_solver) PCBDDCReuseSolversReset(sub_schurs->reuse_solver);
2011:   PetscFree(sub_schurs->reuse_solver);
2012:   if (sub_schurs->change) {
2013:     for (i = 0; i < sub_schurs->n_subs; i++) {
2014:       KSPDestroy(&sub_schurs->change[i]);
2015:       ISDestroy(&sub_schurs->change_primal_sub[i]);
2016:     }
2017:   }
2018:   PetscFree(sub_schurs->change);
2019:   PetscFree(sub_schurs->change_primal_sub);
2020:   sub_schurs->n_subs = 0;
2021:   return 0;
2022: }

2024: PetscErrorCode PCBDDCSubSchursDestroy(PCBDDCSubSchurs *sub_schurs)
2025: {
2026:   PCBDDCSubSchursReset(*sub_schurs);
2027:   PetscFree(*sub_schurs);
2028:   return 0;
2029: }

2031: static inline PetscErrorCode PCBDDCAdjGetNextLayer_Private(PetscInt *queue_tip, PetscInt n_prev, PetscBT touched, PetscInt *xadj, PetscInt *adjncy, PetscInt *n_added)
2032: {
2033:   PetscInt i, j, n;

2035:   n = 0;
2036:   for (i = -n_prev; i < 0; i++) {
2037:     PetscInt start_dof = queue_tip[i];
2038:     for (j = xadj[start_dof]; j < xadj[start_dof + 1]; j++) {
2039:       PetscInt dof = adjncy[j];
2040:       if (!PetscBTLookup(touched, dof)) {
2041:         PetscBTSet(touched, dof);
2042:         queue_tip[n] = dof;
2043:         n++;
2044:       }
2045:     }
2046:   }
2047:   *n_added = n;
2048:   return 0;
2049: }