Actual source code: math2opus.cu
1: #include <h2opusconf.h>
2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
4: #include <h2opus.h>
5: #if defined(H2OPUS_USE_MPI)
6: #include <h2opus/distributed/distributed_h2opus_handle.h>
7: #include <h2opus/distributed/distributed_geometric_construction.h>
8: #include <h2opus/distributed/distributed_hgemv.h>
9: #include <h2opus/distributed/distributed_horthog.h>
10: #include <h2opus/distributed/distributed_hcompress.h>
11: #endif
12: #include <h2opus/util/boxentrygen.h>
13: #include <petsc/private/matimpl.h>
14: #include <petsc/private/vecimpl.h>
15: #include <petsc/private/deviceimpl.h>
16: #include <petscsf.h>
18: /* math2opusutils */
19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
20: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
21: PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
22: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
23: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
25: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
27: /* Use GPU only if H2OPUS is configured for GPU */
28: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29: #define PETSC_H2OPUS_USE_GPU
30: #endif
31: #if defined(PETSC_H2OPUS_USE_GPU)
32: #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
33: #else
34: #define MatH2OpusUpdateIfNeeded(A, B) 0
35: #endif
37: // TODO H2OPUS:
38: // DistributedHMatrix
39: // unsymmetric ?
40: // transpose for distributed_hgemv?
41: // clearData()
42: // Unify interface for sequential and parallel?
43: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44: //
45: template <class T>
46: class PetscPointCloud : public H2OpusDataSet<T> {
47: private:
48: int dimension;
49: size_t num_points;
50: std::vector<T> pts;
52: public:
53: PetscPointCloud(int dim, size_t num_pts, const T coords[])
54: {
55: dim = dim > 0 ? dim : 1;
56: this->dimension = dim;
57: this->num_points = num_pts;
59: pts.resize(num_pts * dim);
60: if (coords) {
61: for (size_t n = 0; n < num_pts; n++)
62: for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
63: } else {
64: PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
65: for (size_t n = 0; n < num_pts; n++) {
66: pts[n * dim] = n * h;
67: for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
68: }
69: }
70: }
72: PetscPointCloud(const PetscPointCloud<T> &other)
73: {
74: size_t N = other.dimension * other.num_points;
75: this->dimension = other.dimension;
76: this->num_points = other.num_points;
77: this->pts.resize(N);
78: for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
79: }
81: int getDimension() const { return dimension; }
83: size_t getDataSetSize() const { return num_points; }
85: T getDataPoint(size_t idx, int dim) const
86: {
87: assert(dim < dimension && idx < num_points);
88: return pts[idx * dimension + dim];
89: }
91: void Print(std::ostream &out = std::cout)
92: {
93: out << "Dimension: " << dimension << std::endl;
94: out << "NumPoints: " << num_points << std::endl;
95: for (size_t n = 0; n < num_points; n++) {
96: for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
97: out << std::endl;
98: }
99: }
100: };
102: template <class T>
103: class PetscFunctionGenerator {
104: private:
105: MatH2OpusKernel k;
106: int dim;
107: void *ctx;
109: public:
110: PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
111: {
112: this->k = k;
113: this->dim = dim;
114: this->ctx = ctx;
115: }
116: PetscFunctionGenerator(PetscFunctionGenerator &other)
117: {
118: this->k = other.k;
119: this->dim = other.dim;
120: this->ctx = other.ctx;
121: }
122: T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
123: };
125: #include <../src/mat/impls/h2opus/math2opussampler.hpp>
127: /* just to not clutter the code */
128: #if !defined(H2OPUS_USE_GPU)
129: typedef HMatrix HMatrix_GPU;
130: #if defined(H2OPUS_USE_MPI)
131: typedef DistributedHMatrix DistributedHMatrix_GPU;
132: #endif
133: #endif
135: typedef struct {
136: #if defined(H2OPUS_USE_MPI)
137: distributedH2OpusHandle_t handle;
138: #else
139: h2opusHandle_t handle;
140: #endif
141: /* Sequential and parallel matrices are two different classes at the moment */
142: HMatrix *hmatrix;
143: #if defined(H2OPUS_USE_MPI)
144: DistributedHMatrix *dist_hmatrix;
145: #else
146: HMatrix *dist_hmatrix; /* just to not clutter the code */
147: #endif
148: /* May use permutations */
149: PetscSF sf;
150: PetscLayout h2opus_rmap, h2opus_cmap;
151: IS h2opus_indexmap;
152: thrust::host_vector<PetscScalar> *xx, *yy;
153: PetscInt xxs, yys;
154: PetscBool multsetup;
156: /* GPU */
157: HMatrix_GPU *hmatrix_gpu;
158: #if defined(H2OPUS_USE_MPI)
159: DistributedHMatrix_GPU *dist_hmatrix_gpu;
160: #else
161: HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
162: #endif
163: #if defined(PETSC_H2OPUS_USE_GPU)
164: thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
165: PetscInt xxs_gpu, yys_gpu;
166: #endif
168: /* construction from matvecs */
169: PetscMatrixSampler *sampler;
170: PetscBool nativemult;
172: /* Admissibility */
173: PetscReal eta;
174: PetscInt leafsize;
176: /* for dof reordering */
177: PetscPointCloud<PetscReal> *ptcloud;
179: /* kernel for generating matrix entries */
180: PetscFunctionGenerator<PetscScalar> *kernel;
182: /* basis orthogonalized? */
183: PetscBool orthogonal;
185: /* customization */
186: PetscInt basisord;
187: PetscInt max_rank;
188: PetscInt bs;
189: PetscReal rtol;
190: PetscInt norm_max_samples;
191: PetscBool check_construction;
192: PetscBool hara_verbose;
193: PetscBool resize;
195: /* keeps track of MatScale values */
196: PetscScalar s;
197: } Mat_H2OPUS;
199: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200: {
201: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
203: #if defined(H2OPUS_USE_MPI)
204: h2opusDestroyDistributedHandle(a->handle);
205: #else
206: h2opusDestroyHandle(a->handle);
207: #endif
208: delete a->dist_hmatrix;
209: delete a->hmatrix;
210: PetscSFDestroy(&a->sf);
211: PetscLayoutDestroy(&a->h2opus_rmap);
212: PetscLayoutDestroy(&a->h2opus_cmap);
213: ISDestroy(&a->h2opus_indexmap);
214: delete a->xx;
215: delete a->yy;
216: delete a->hmatrix_gpu;
217: delete a->dist_hmatrix_gpu;
218: #if defined(PETSC_H2OPUS_USE_GPU)
219: delete a->xx_gpu;
220: delete a->yy_gpu;
221: #endif
222: delete a->sampler;
223: delete a->ptcloud;
224: delete a->kernel;
225: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL);
226: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL);
227: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL);
228: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL);
229: PetscObjectChangeTypeName((PetscObject)A, NULL);
230: PetscFree(A->data);
231: return 0;
232: }
234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237: PetscBool ish2opus;
241: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
242: if (ish2opus) {
243: if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
244: if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
245: PetscLayout t;
246: t = A->rmap;
247: A->rmap = a->h2opus_rmap;
248: a->h2opus_rmap = t;
249: t = A->cmap;
250: A->cmap = a->h2opus_cmap;
251: a->h2opus_cmap = t;
252: }
253: }
254: a->nativemult = nm;
255: }
256: return 0;
257: }
259: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
260: {
261: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
262: PetscBool ish2opus;
266: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
268: *nm = a->nativemult;
269: return 0;
270: }
272: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
273: {
274: PetscBool ish2opus;
275: PetscInt nmax = PETSC_DECIDE;
276: Mat_H2OPUS *a = NULL;
277: PetscBool mult = PETSC_FALSE;
279: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
280: if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
281: a = (Mat_H2OPUS *)A->data;
283: nmax = a->norm_max_samples;
284: mult = a->nativemult;
285: MatH2OpusSetNativeMult(A, PETSC_TRUE);
286: } else {
287: PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL);
288: }
289: MatApproximateNorm_Private(A, normtype, nmax, n);
290: if (a) MatH2OpusSetNativeMult(A, mult);
291: return 0;
292: }
294: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
295: {
296: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
297: PetscInt n;
298: PetscBool boundtocpu = PETSC_TRUE;
300: #if defined(PETSC_H2OPUS_USE_GPU)
301: boundtocpu = A->boundtocpu;
302: #endif
303: PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL);
304: if (boundtocpu) {
305: if (h2opus->xxs < xN) {
306: h2opus->xx->resize(n * xN);
307: h2opus->xxs = xN;
308: }
309: if (h2opus->yys < yN) {
310: h2opus->yy->resize(n * yN);
311: h2opus->yys = yN;
312: }
313: }
314: #if defined(PETSC_H2OPUS_USE_GPU)
315: if (!boundtocpu) {
316: if (h2opus->xxs_gpu < xN) {
317: h2opus->xx_gpu->resize(n * xN);
318: h2opus->xxs_gpu = xN;
319: }
320: if (h2opus->yys_gpu < yN) {
321: h2opus->yy_gpu->resize(n * yN);
322: h2opus->yys_gpu = yN;
323: }
324: }
325: #endif
326: return 0;
327: }
329: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
330: {
331: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
332: #if defined(H2OPUS_USE_MPI)
333: h2opusHandle_t handle = h2opus->handle->handle;
334: #else
335: h2opusHandle_t handle = h2opus->handle;
336: #endif
337: PetscBool boundtocpu = PETSC_TRUE;
338: PetscScalar *xx, *yy, *uxx, *uyy;
339: PetscInt blda, clda;
340: PetscMPIInt size;
341: PetscSF bsf, csf;
342: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
344: HLibProfile::clear();
345: #if defined(PETSC_H2OPUS_USE_GPU)
346: boundtocpu = A->boundtocpu;
347: #endif
348: MatDenseGetLDA(B, &blda);
349: MatDenseGetLDA(C, &clda);
350: if (usesf) {
351: PetscInt n;
353: MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf);
354: MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf);
356: MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N);
357: PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL);
358: blda = n;
359: clda = n;
360: }
361: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
362: if (boundtocpu) {
363: MatDenseGetArrayRead(B, (const PetscScalar **)&xx);
364: MatDenseGetArrayWrite(C, &yy);
365: if (usesf) {
366: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
367: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
368: PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
369: PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
370: } else {
371: uxx = xx;
372: uyy = yy;
373: }
374: if (size > 1) {
377: #if defined(H2OPUS_USE_MPI)
378: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
379: #endif
380: } else {
382: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
383: }
384: MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx);
385: if (usesf) {
386: PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
387: PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
388: }
389: MatDenseRestoreArrayWrite(C, &yy);
390: #if defined(PETSC_H2OPUS_USE_GPU)
391: } else {
392: PetscBool ciscuda, biscuda;
394: /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
395: PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
396: if (!biscuda) MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B);
397: PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
398: if (!ciscuda) {
399: C->assembled = PETSC_TRUE;
400: MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C);
401: }
402: MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx);
403: MatDenseCUDAGetArrayWrite(C, &yy);
404: if (usesf) {
405: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
406: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
407: PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
408: PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
409: } else {
410: uxx = xx;
411: uyy = yy;
412: }
413: PetscLogGpuTimeBegin();
414: if (size > 1) {
417: #if defined(H2OPUS_USE_MPI)
418: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
419: #endif
420: } else {
422: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
423: }
424: PetscLogGpuTimeEnd();
425: MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx);
426: if (usesf) {
427: PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
428: PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
429: }
430: MatDenseCUDARestoreArrayWrite(C, &yy);
431: if (!biscuda) MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B);
432: if (!ciscuda) MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C);
433: #endif
434: }
435: { /* log flops */
436: double gops, time, perf, dev;
437: HLibProfile::getHgemvPerf(gops, time, perf, dev);
438: #if defined(PETSC_H2OPUS_USE_GPU)
439: if (boundtocpu) {
440: PetscLogFlops(1e9 * gops);
441: } else {
442: PetscLogGpuFlops(1e9 * gops);
443: }
444: #else
445: PetscLogFlops(1e9 * gops);
446: #endif
447: }
448: return 0;
449: }
451: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
452: {
453: Mat_Product *product = C->product;
455: MatCheckProduct(C, 1);
456: switch (product->type) {
457: case MATPRODUCT_AB:
458: MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C);
459: break;
460: case MATPRODUCT_AtB:
461: MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C);
462: break;
463: default:
464: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
465: }
466: return 0;
467: }
469: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
470: {
471: Mat_Product *product = C->product;
472: PetscBool cisdense;
473: Mat A, B;
475: MatCheckProduct(C, 1);
476: A = product->A;
477: B = product->B;
478: switch (product->type) {
479: case MATPRODUCT_AB:
480: MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N);
481: MatSetBlockSizesFromMats(C, product->A, product->B);
482: PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
483: if (!cisdense) MatSetType(C, ((PetscObject)product->B)->type_name);
484: MatSetUp(C);
485: break;
486: case MATPRODUCT_AtB:
487: MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N);
488: MatSetBlockSizesFromMats(C, product->A, product->B);
489: PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
490: if (!cisdense) MatSetType(C, ((PetscObject)product->B)->type_name);
491: MatSetUp(C);
492: break;
493: default:
494: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
495: }
496: C->ops->productsymbolic = NULL;
497: C->ops->productnumeric = MatProductNumeric_H2OPUS;
498: return 0;
499: }
501: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
502: {
503: MatCheckProduct(C, 1);
504: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
505: return 0;
506: }
508: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
509: {
510: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
511: #if defined(H2OPUS_USE_MPI)
512: h2opusHandle_t handle = h2opus->handle->handle;
513: #else
514: h2opusHandle_t handle = h2opus->handle;
515: #endif
516: PetscBool boundtocpu = PETSC_TRUE;
517: PetscInt n;
518: PetscScalar *xx, *yy, *uxx, *uyy;
519: PetscMPIInt size;
520: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
522: HLibProfile::clear();
523: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
524: #if defined(PETSC_H2OPUS_USE_GPU)
525: boundtocpu = A->boundtocpu;
526: #endif
527: if (usesf) {
528: PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL);
529: } else n = A->rmap->n;
530: if (boundtocpu) {
531: VecGetArrayRead(x, (const PetscScalar **)&xx);
532: if (sy == 0.0) {
533: VecGetArrayWrite(y, &yy);
534: } else {
535: VecGetArray(y, &yy);
536: }
537: if (usesf) {
538: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
539: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
541: PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
542: PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
543: if (sy != 0.0) {
544: PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
545: PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
546: }
547: } else {
548: uxx = xx;
549: uyy = yy;
550: }
551: if (size > 1) {
554: #if defined(H2OPUS_USE_MPI)
555: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
556: #endif
557: } else {
559: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
560: }
561: VecRestoreArrayRead(x, (const PetscScalar **)&xx);
562: if (usesf) {
563: PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
564: PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
565: }
566: if (sy == 0.0) {
567: VecRestoreArrayWrite(y, &yy);
568: } else {
569: VecRestoreArray(y, &yy);
570: }
571: #if defined(PETSC_H2OPUS_USE_GPU)
572: } else {
573: VecCUDAGetArrayRead(x, (const PetscScalar **)&xx);
574: if (sy == 0.0) {
575: VecCUDAGetArrayWrite(y, &yy);
576: } else {
577: VecCUDAGetArray(y, &yy);
578: }
579: if (usesf) {
580: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
581: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
583: PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
584: PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
585: if (sy != 0.0) {
586: PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
587: PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
588: }
589: } else {
590: uxx = xx;
591: uyy = yy;
592: }
593: PetscLogGpuTimeBegin();
594: if (size > 1) {
597: #if defined(H2OPUS_USE_MPI)
598: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
599: #endif
600: } else {
602: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
603: }
604: PetscLogGpuTimeEnd();
605: VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx);
606: if (usesf) {
607: PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
608: PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
609: }
610: if (sy == 0.0) {
611: VecCUDARestoreArrayWrite(y, &yy);
612: } else {
613: VecCUDARestoreArray(y, &yy);
614: }
615: #endif
616: }
617: { /* log flops */
618: double gops, time, perf, dev;
619: HLibProfile::getHgemvPerf(gops, time, perf, dev);
620: #if defined(PETSC_H2OPUS_USE_GPU)
621: if (boundtocpu) {
622: PetscLogFlops(1e9 * gops);
623: } else {
624: PetscLogGpuFlops(1e9 * gops);
625: }
626: #else
627: PetscLogFlops(1e9 * gops);
628: #endif
629: }
630: return 0;
631: }
633: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
634: {
635: PetscBool xiscuda, yiscuda;
637: PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
638: PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "");
639: MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda);
640: MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE);
641: return 0;
642: }
644: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
645: {
646: PetscBool xiscuda, yiscuda;
648: PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
649: PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "");
650: MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda);
651: MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE);
652: return 0;
653: }
655: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
656: {
657: PetscBool xiscuda, ziscuda;
659: VecCopy(y, z);
660: PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
661: PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "");
662: MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda);
663: MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE);
664: return 0;
665: }
667: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
668: {
669: PetscBool xiscuda, ziscuda;
671: VecCopy(y, z);
672: PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
673: PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "");
674: MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda);
675: MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE);
676: return 0;
677: }
679: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
680: {
681: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
683: a->s *= s;
684: return 0;
685: }
687: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
688: {
689: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
691: PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
692: PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL);
693: PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL);
694: PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL);
695: PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL);
696: PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL);
697: PetscOptionsInt("-mat_h2opus_normsamples", "Maximum bumber of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL);
698: PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL);
699: PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL);
700: PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL);
701: PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL);
702: PetscOptionsHeadEnd();
703: return 0;
704: }
706: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);
708: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
709: {
710: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
711: Vec c;
712: PetscInt spacedim;
713: const PetscScalar *coords;
715: if (a->ptcloud) return 0;
716: PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c);
717: if (!c && a->sampler) {
718: Mat S = a->sampler->GetSamplingMat();
720: PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c);
721: }
722: if (!c) {
723: MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL);
724: } else {
725: VecGetArrayRead(c, &coords);
726: VecGetBlockSize(c, &spacedim);
727: MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL);
728: VecRestoreArrayRead(c, &coords);
729: }
730: return 0;
731: }
733: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
734: {
735: MPI_Comm comm;
736: PetscMPIInt size;
737: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
738: PetscInt n = 0, *idx = NULL;
739: int *iidx = NULL;
740: PetscCopyMode own;
741: PetscBool rid;
743: if (a->multsetup) return 0;
744: if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
745: PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL);
746: #if defined(PETSC_H2OPUS_USE_GPU)
747: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
748: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
749: a->xxs_gpu = 1;
750: a->yys_gpu = 1;
751: #endif
752: a->xx = new thrust::host_vector<PetscScalar>(n);
753: a->yy = new thrust::host_vector<PetscScalar>(n);
754: a->xxs = 1;
755: a->yys = 1;
756: } else {
757: IS is;
758: PetscObjectGetComm((PetscObject)A, &comm);
759: MPI_Comm_size(comm, &size);
760: if (!a->h2opus_indexmap) {
761: if (size > 1) {
763: #if defined(H2OPUS_USE_MPI)
764: iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
765: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
766: #endif
767: } else {
768: iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
769: n = a->hmatrix->u_basis_tree.index_map.size();
770: }
772: if (PetscDefined(USE_64BIT_INDICES)) {
773: PetscInt i;
775: own = PETSC_OWN_POINTER;
776: PetscMalloc1(n, &idx);
777: for (i = 0; i < n; i++) idx[i] = iidx[i];
778: } else {
779: own = PETSC_COPY_VALUES;
780: idx = (PetscInt *)iidx;
781: }
782: ISCreateGeneral(comm, n, idx, own, &is);
783: ISSetPermutation(is);
784: ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view");
785: a->h2opus_indexmap = is;
786: }
787: ISGetLocalSize(a->h2opus_indexmap, &n);
788: ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx);
789: rid = (PetscBool)(n == A->rmap->n);
790: MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm);
791: if (rid) ISIdentity(a->h2opus_indexmap, &rid);
792: if (!rid) {
793: if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
794: PetscLayoutCreate(comm, &a->h2opus_rmap);
795: PetscLayoutSetLocalSize(a->h2opus_rmap, n);
796: PetscLayoutSetUp(a->h2opus_rmap);
797: PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap);
798: }
799: PetscSFCreate(comm, &a->sf);
800: PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx);
801: PetscSFSetUp(a->sf);
802: PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view");
803: #if defined(PETSC_H2OPUS_USE_GPU)
804: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
805: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
806: a->xxs_gpu = 1;
807: a->yys_gpu = 1;
808: #endif
809: a->xx = new thrust::host_vector<PetscScalar>(n);
810: a->yy = new thrust::host_vector<PetscScalar>(n);
811: a->xxs = 1;
812: a->yys = 1;
813: }
814: ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx);
815: }
816: a->multsetup = PETSC_TRUE;
817: return 0;
818: }
820: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
821: {
822: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
823: #if defined(H2OPUS_USE_MPI)
824: h2opusHandle_t handle = a->handle->handle;
825: #else
826: h2opusHandle_t handle = a->handle;
827: #endif
828: PetscBool kernel = PETSC_FALSE;
829: PetscBool boundtocpu = PETSC_TRUE;
830: PetscBool samplingdone = PETSC_FALSE;
831: MPI_Comm comm;
832: PetscMPIInt size;
834: PetscObjectGetComm((PetscObject)A, &comm);
838: /* XXX */
839: a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
841: MPI_Comm_size(comm, &size);
842: /* TODO REUSABILITY of geometric construction */
843: delete a->hmatrix;
844: delete a->dist_hmatrix;
845: #if defined(PETSC_H2OPUS_USE_GPU)
846: delete a->hmatrix_gpu;
847: delete a->dist_hmatrix_gpu;
848: #endif
849: a->orthogonal = PETSC_FALSE;
851: /* TODO: other? */
852: H2OpusBoxCenterAdmissibility adm(a->eta);
854: PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0);
855: if (size > 1) {
856: #if defined(H2OPUS_USE_MPI)
857: a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
858: #else
859: a->dist_hmatrix = NULL;
860: #endif
861: } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
862: MatH2OpusInferCoordinates_Private(A);
864: if (a->kernel) {
865: BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
866: if (size > 1) {
868: #if defined(H2OPUS_USE_MPI)
869: buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
870: #endif
871: } else {
872: buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
873: }
874: kernel = PETSC_TRUE;
875: } else {
877: buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
878: }
879: MatSetUpMultiply_H2OPUS(A);
881: #if defined(PETSC_H2OPUS_USE_GPU)
882: boundtocpu = A->boundtocpu;
883: if (!boundtocpu) {
884: if (size > 1) {
886: #if defined(H2OPUS_USE_MPI)
887: a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
888: #endif
889: } else {
890: a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
891: }
892: }
893: #endif
894: if (size == 1) {
895: if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
896: PetscReal Anorm;
897: bool verbose;
899: PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL);
900: verbose = a->hara_verbose;
901: MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm);
902: if (a->hara_verbose) PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs);
903: if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
904: a->sampler->SetStream(handle->getMainStream());
905: if (boundtocpu) {
906: a->sampler->SetGPUSampling(false);
907: hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
908: #if defined(PETSC_H2OPUS_USE_GPU)
909: } else {
910: a->sampler->SetGPUSampling(true);
911: hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
912: #endif
913: }
914: samplingdone = PETSC_TRUE;
915: }
916: }
917: #if defined(PETSC_H2OPUS_USE_GPU)
918: if (!boundtocpu) {
919: delete a->hmatrix;
920: delete a->dist_hmatrix;
921: a->hmatrix = NULL;
922: a->dist_hmatrix = NULL;
923: }
924: A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
925: #endif
926: PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0);
928: if (!a->s) a->s = 1.0;
929: A->assembled = PETSC_TRUE;
931: if (samplingdone) {
932: PetscBool check = a->check_construction;
933: PetscBool checke = PETSC_FALSE;
935: PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL);
936: PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL);
937: if (check) {
938: Mat E, Ae;
939: PetscReal n1, ni, n2;
940: PetscReal n1A, niA, n2A;
941: void (*normfunc)(void);
943: Ae = a->sampler->GetSamplingMat();
944: MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E);
945: MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS);
946: MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN);
947: MatNorm(E, NORM_1, &n1);
948: MatNorm(E, NORM_INFINITY, &ni);
949: MatNorm(E, NORM_2, &n2);
950: if (checke) {
951: Mat eA, eE, eAe;
953: MatComputeOperator(A, MATAIJ, &eA);
954: MatComputeOperator(E, MATAIJ, &eE);
955: MatComputeOperator(Ae, MATAIJ, &eAe);
956: MatChop(eA, PETSC_SMALL);
957: MatChop(eE, PETSC_SMALL);
958: MatChop(eAe, PETSC_SMALL);
959: PetscObjectSetName((PetscObject)eA, "H2Mat");
960: MatView(eA, NULL);
961: PetscObjectSetName((PetscObject)eAe, "S");
962: MatView(eAe, NULL);
963: PetscObjectSetName((PetscObject)eE, "H2Mat - S");
964: MatView(eE, NULL);
965: MatDestroy(&eA);
966: MatDestroy(&eE);
967: MatDestroy(&eAe);
968: }
970: MatGetOperation(Ae, MATOP_NORM, &normfunc);
971: MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS);
972: MatNorm(Ae, NORM_1, &n1A);
973: MatNorm(Ae, NORM_INFINITY, &niA);
974: MatNorm(Ae, NORM_2, &n2A);
975: n1A = PetscMax(n1A, PETSC_SMALL);
976: n2A = PetscMax(n2A, PETSC_SMALL);
977: niA = PetscMax(niA, PETSC_SMALL);
978: MatSetOperation(Ae, MATOP_NORM, normfunc);
979: PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A));
980: MatDestroy(&E);
981: }
982: a->sampler->SetSamplingMat(NULL);
983: }
984: return 0;
985: }
987: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
988: {
989: PetscMPIInt size;
990: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
992: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
994: a->hmatrix->clearData();
995: #if defined(PETSC_H2OPUS_USE_GPU)
996: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
997: #endif
998: return 0;
999: }
1001: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1002: {
1003: Mat A;
1004: Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1005: #if defined(PETSC_H2OPUS_USE_GPU)
1006: PetscBool iscpu = PETSC_FALSE;
1007: #else
1008: PetscBool iscpu = PETSC_TRUE;
1009: #endif
1010: MPI_Comm comm;
1012: PetscObjectGetComm((PetscObject)B, &comm);
1013: MatCreate(comm, &A);
1014: MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N);
1015: MatSetType(A, MATH2OPUS);
1016: MatPropagateSymmetryOptions(B, A);
1017: a = (Mat_H2OPUS *)A->data;
1019: a->eta = b->eta;
1020: a->leafsize = b->leafsize;
1021: a->basisord = b->basisord;
1022: a->max_rank = b->max_rank;
1023: a->bs = b->bs;
1024: a->rtol = b->rtol;
1025: a->norm_max_samples = b->norm_max_samples;
1026: if (op == MAT_COPY_VALUES) a->s = b->s;
1028: a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1029: if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1031: #if defined(H2OPUS_USE_MPI)
1032: if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1033: #if defined(PETSC_H2OPUS_USE_GPU)
1034: if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1035: #endif
1036: #endif
1037: if (b->hmatrix) {
1038: a->hmatrix = new HMatrix(*b->hmatrix);
1039: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1040: }
1041: #if defined(PETSC_H2OPUS_USE_GPU)
1042: if (b->hmatrix_gpu) {
1043: a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1044: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1045: }
1046: #endif
1047: if (b->sf) {
1048: PetscObjectReference((PetscObject)b->sf);
1049: a->sf = b->sf;
1050: }
1051: if (b->h2opus_indexmap) {
1052: PetscObjectReference((PetscObject)b->h2opus_indexmap);
1053: a->h2opus_indexmap = b->h2opus_indexmap;
1054: }
1056: MatSetUp(A);
1057: MatSetUpMultiply_H2OPUS(A);
1058: if (op == MAT_COPY_VALUES) {
1059: A->assembled = PETSC_TRUE;
1060: a->orthogonal = b->orthogonal;
1061: #if defined(PETSC_H2OPUS_USE_GPU)
1062: A->offloadmask = B->offloadmask;
1063: #endif
1064: }
1065: #if defined(PETSC_H2OPUS_USE_GPU)
1066: iscpu = B->boundtocpu;
1067: #endif
1068: MatBindToCPU(A, iscpu);
1070: *nA = A;
1071: return 0;
1072: }
1074: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1075: {
1076: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1077: PetscBool isascii, vieweps;
1078: PetscMPIInt size;
1079: PetscViewerFormat format;
1081: PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii);
1082: PetscViewerGetFormat(view, &format);
1083: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1084: if (isascii) {
1085: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1086: if (size == 1) {
1087: FILE *fp;
1088: PetscViewerASCIIGetPointer(view, &fp);
1089: dumpHMatrix(*h2opus->hmatrix, 6, fp);
1090: }
1091: } else {
1092: PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat");
1093: PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1094: PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta);
1095: if (!h2opus->kernel) {
1096: PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol);
1097: } else {
1098: PetscViewerASCIIPrintf(view, " Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord);
1099: }
1100: PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples);
1101: if (size == 1) {
1102: double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1103: double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1104: #if defined(PETSC_HAVE_CUDA)
1105: double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1106: double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1107: #endif
1108: PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1109: #if defined(PETSC_HAVE_CUDA)
1110: PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1111: #endif
1112: } else {
1113: #if defined(PETSC_HAVE_CUDA)
1114: double matrix_mem[4] = {0., 0., 0., 0.};
1115: PetscMPIInt rsize = 4;
1116: #else
1117: double matrix_mem[2] = {0., 0.};
1118: PetscMPIInt rsize = 2;
1119: #endif
1120: #if defined(H2OPUS_USE_MPI)
1121: matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1122: matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1123: #if defined(PETSC_HAVE_CUDA)
1124: matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1125: matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1126: #endif
1127: #endif
1128: MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A));
1129: PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1130: #if defined(PETSC_HAVE_CUDA)
1131: PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1132: #endif
1133: }
1134: }
1135: }
1136: vieweps = PETSC_FALSE;
1137: PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL);
1138: if (vieweps) {
1139: char filename[256];
1140: const char *name;
1142: PetscObjectGetName((PetscObject)A, &name);
1143: PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name);
1144: PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL);
1145: outputEps(*h2opus->hmatrix, filename);
1146: }
1147: return 0;
1148: }
1150: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1151: {
1152: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1153: PetscReal *gcoords;
1154: PetscInt N;
1155: MPI_Comm comm;
1156: PetscMPIInt size;
1157: PetscBool cong;
1159: PetscLayoutSetUp(A->rmap);
1160: PetscLayoutSetUp(A->cmap);
1161: PetscObjectGetComm((PetscObject)A, &comm);
1162: MatHasCongruentLayouts(A, &cong);
1164: N = A->rmap->N;
1165: MPI_Comm_size(comm, &size);
1166: if (spacedim > 0 && size > 1 && cdist) {
1167: PetscSF sf;
1168: MPI_Datatype dtype;
1170: MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype);
1171: MPI_Type_commit(&dtype);
1173: PetscSFCreate(comm, &sf);
1174: PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER);
1175: PetscMalloc1(spacedim * N, &gcoords);
1176: PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE);
1177: PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE);
1178: PetscSFDestroy(&sf);
1179: MPI_Type_free(&dtype);
1180: } else gcoords = (PetscReal *)coords;
1182: delete h2opus->ptcloud;
1183: delete h2opus->kernel;
1184: h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1185: if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1186: if (gcoords != coords) PetscFree(gcoords);
1187: A->preallocated = PETSC_TRUE;
1188: return 0;
1189: }
1191: #if defined(PETSC_H2OPUS_USE_GPU)
1192: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1193: {
1194: PetscMPIInt size;
1195: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1197: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1198: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1199: if (size > 1) {
1201: #if defined(H2OPUS_USE_MPI)
1202: if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1203: else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1204: #endif
1205: } else {
1207: if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1208: else *a->hmatrix = *a->hmatrix_gpu;
1209: }
1210: delete a->hmatrix_gpu;
1211: delete a->dist_hmatrix_gpu;
1212: a->hmatrix_gpu = NULL;
1213: a->dist_hmatrix_gpu = NULL;
1214: A->offloadmask = PETSC_OFFLOAD_CPU;
1215: } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1216: if (size > 1) {
1218: #if defined(H2OPUS_USE_MPI)
1219: if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1220: else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1221: #endif
1222: } else {
1224: if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1225: else *a->hmatrix_gpu = *a->hmatrix;
1226: }
1227: delete a->hmatrix;
1228: delete a->dist_hmatrix;
1229: a->hmatrix = NULL;
1230: a->dist_hmatrix = NULL;
1231: A->offloadmask = PETSC_OFFLOAD_GPU;
1232: }
1233: PetscFree(A->defaultvectype);
1234: if (!flg) {
1235: PetscStrallocpy(VECCUDA, &A->defaultvectype);
1236: } else {
1237: PetscStrallocpy(VECSTANDARD, &A->defaultvectype);
1238: }
1239: A->boundtocpu = flg;
1240: return 0;
1241: }
1242: #endif
1244: /*MC
1245: MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1247: Options Database Keys:
1248: . -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()
1250: Notes:
1251: H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPUs and NVIDIA GPUs.
1253: For CPU-only builds, configure PETSc with ./configure --download-h2opus --download-thrust.
1254: To run on NVIDIA GPUs, configure with ./configure --download-h2opus --download-magma --download-kblas.
1256: Level: beginner
1258: Reference:
1259: . * - "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451
1261: .seealso: `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1262: M*/
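/* Editorial sketch, not part of the original source: selecting this type from the options
   database. The executable name "./app" and the option values are illustrative.

     ./app -mat_type h2opus -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 -mat_h2opus_rtol 1e-6
*/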
1263: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1264: {
1265: Mat_H2OPUS *a;
1266: PetscMPIInt size;
1268: #if defined(PETSC_H2OPUS_USE_GPU)
1269: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1270: #endif
1271: PetscNew(&a);
1272: A->data = (void *)a;
1274: a->eta = 0.9;
1275: a->leafsize = 32;
1276: a->basisord = 4;
1277: a->max_rank = 64;
1278: a->bs = 32;
1279: a->rtol = 1.e-4;
1280: a->s = 1.0;
1281: a->norm_max_samples = 10;
1282: a->resize = PETSC_TRUE; /* reallocate after compression */
1283: #if defined(H2OPUS_USE_MPI)
1284: h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1285: #else
1286: h2opusCreateHandle(&a->handle);
1287: #endif
1288: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1289: PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS);
1290: PetscMemzero(A->ops, sizeof(struct _MatOps));
1292: A->ops->destroy = MatDestroy_H2OPUS;
1293: A->ops->view = MatView_H2OPUS;
1294: A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1295: A->ops->mult = MatMult_H2OPUS;
1296: A->ops->multtranspose = MatMultTranspose_H2OPUS;
1297: A->ops->multadd = MatMultAdd_H2OPUS;
1298: A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1299: A->ops->scale = MatScale_H2OPUS;
1300: A->ops->duplicate = MatDuplicate_H2OPUS;
1301: A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1302: A->ops->norm = MatNorm_H2OPUS;
1303: A->ops->zeroentries = MatZeroEntries_H2OPUS;
1304: #if defined(PETSC_H2OPUS_USE_GPU)
1305: A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1306: #endif
1308: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS);
1309: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS);
1310: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS);
1311: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS);
1312: #if defined(PETSC_H2OPUS_USE_GPU)
1313: PetscFree(A->defaultvectype);
1314: PetscStrallocpy(VECCUDA, &A->defaultvectype);
1315: #endif
1316: return 0;
1317: }
1319: /*@C
1320: MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1322: Input Parameter:
1323: . A - the matrix
1325: Level: intermediate
1327: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1328: @*/
1329: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1330: {
1331: PetscBool ish2opus;
1332: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1333: PetscMPIInt size;
1334: PetscBool boundtocpu = PETSC_TRUE;
1338: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1339: if (!ish2opus) return 0;
1340: if (a->orthogonal) return 0;
1341: HLibProfile::clear();
1342: PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0);
1343: #if defined(PETSC_H2OPUS_USE_GPU)
1344: boundtocpu = A->boundtocpu;
1345: #endif
1346: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1347: if (size > 1) {
1348: if (boundtocpu) {
1350: #if defined(H2OPUS_USE_MPI)
1351: distributed_horthog(*a->dist_hmatrix, a->handle);
1352: #endif
1353: #if defined(PETSC_H2OPUS_USE_GPU)
1354: A->offloadmask = PETSC_OFFLOAD_CPU;
1355: } else {
1357: PetscLogGpuTimeBegin();
1358: #if defined(H2OPUS_USE_MPI)
1359: distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1360: #endif
1361: PetscLogGpuTimeEnd();
1362: #endif
1363: }
1364: } else {
1365: #if defined(H2OPUS_USE_MPI)
1366: h2opusHandle_t handle = a->handle->handle;
1367: #else
1368: h2opusHandle_t handle = a->handle;
1369: #endif
1370: if (boundtocpu) {
1372: horthog(*a->hmatrix, handle);
1373: #if defined(PETSC_H2OPUS_USE_GPU)
1374: A->offloadmask = PETSC_OFFLOAD_CPU;
1375: } else {
1377: PetscLogGpuTimeBegin();
1378: horthog(*a->hmatrix_gpu, handle);
1379: PetscLogGpuTimeEnd();
1380: #endif
1381: }
1382: }
1383: a->orthogonal = PETSC_TRUE;
1384: { /* log flops */
1385: double gops, time, perf, dev;
1386: HLibProfile::getHorthogPerf(gops, time, perf, dev);
1387: #if defined(PETSC_H2OPUS_USE_GPU)
1388: if (boundtocpu) {
1389: PetscLogFlops(1e9 * gops);
1390: } else {
1391: PetscLogGpuFlops(1e9 * gops);
1392: }
1393: #else
1394: PetscLogFlops(1e9 * gops);
1395: #endif
1396: }
1397: PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0);
1398: return 0;
1399: }
1401: /*@C
1402: MatH2OpusCompress - Compress a hierarchical matrix.
1404: Input Parameters:
1405: + A - the matrix
1406: - tol - the absolute truncation threshold
1408: Level: intermediate
1410: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1411: @*/
1412: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1413: {
1414: PetscBool ish2opus;
1415: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1416: PetscMPIInt size;
1417: PetscBool boundtocpu = PETSC_TRUE;
1422: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1423: if (!ish2opus || tol <= 0.0) return 0;
1424: MatH2OpusOrthogonalize(A);
1425: HLibProfile::clear();
1426: PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0);
1427: #if defined(PETSC_H2OPUS_USE_GPU)
1428: boundtocpu = A->boundtocpu;
1429: #endif
1430: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1431: if (size > 1) {
1432: if (boundtocpu) {
1434: #if defined(H2OPUS_USE_MPI)
1435: distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1436: if (a->resize) {
1437: DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1438: delete a->dist_hmatrix;
1439: a->dist_hmatrix = dist_hmatrix;
1440: }
1441: #endif
1442: #if defined(PETSC_H2OPUS_USE_GPU)
1443: A->offloadmask = PETSC_OFFLOAD_CPU;
1444: } else {
1446: PetscLogGpuTimeBegin();
1447: #if defined(H2OPUS_USE_MPI)
1448: distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1450: if (a->resize) {
1451: DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1452: delete a->dist_hmatrix_gpu;
1453: a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1454: }
1455: #endif
1456: PetscLogGpuTimeEnd();
1457: #endif
1458: }
1459: } else {
1460: #if defined(H2OPUS_USE_MPI)
1461: h2opusHandle_t handle = a->handle->handle;
1462: #else
1463: h2opusHandle_t handle = a->handle;
1464: #endif
1465: if (boundtocpu) {
1467: hcompress(*a->hmatrix, tol, handle);
1469: if (a->resize) {
1470: HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1471: delete a->hmatrix;
1472: a->hmatrix = hmatrix;
1473: }
1474: #if defined(PETSC_H2OPUS_USE_GPU)
1475: A->offloadmask = PETSC_OFFLOAD_CPU;
1476: } else {
1478: PetscLogGpuTimeBegin();
1479: hcompress(*a->hmatrix_gpu, tol, handle);
1480: PetscLogGpuTimeEnd();
1482: if (a->resize) {
1483: HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1484: delete a->hmatrix_gpu;
1485: a->hmatrix_gpu = hmatrix_gpu;
1486: }
1487: #endif
1488: }
1489: }
1490: { /* log flops */
1491: double gops, time, perf, dev;
1492: HLibProfile::getHcompressPerf(gops, time, perf, dev);
1493: #if defined(PETSC_H2OPUS_USE_GPU)
1494: if (boundtocpu) {
1495: PetscLogFlops(1e9 * gops);
1496: } else {
1497: PetscLogGpuFlops(1e9 * gops);
1498: }
1499: #else
1500: PetscLogFlops(1e9 * gops);
1501: #endif
1502: }
1503: PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0);
1504: return 0;
1505: }
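/* Editorial sketch, not part of the original source: typical use after assembly.
   MatH2OpusCompress() calls MatH2OpusOrthogonalize() itself (see line 1424 above), so a
   separate orthogonalization call is not required; the tolerance below is illustrative.

     MatH2OpusCompress(A, 1.e-6);   // absolute truncation threshold; a no-op if tol <= 0.0
*/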
1507: /*@C
1508: MatH2OpusSetSamplingMat - Set a matrix to be sampled via matrix-vector products when constructing a hierarchical matrix.
1510: Input Parameters:
1511: + A - the hierarchical matrix
1512: . B - the matrix to be sampled
1513: . bs - maximum number of samples to be taken concurrently
1514: - tol - relative tolerance for construction
1516: Notes:
1517: You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1519: Level: intermediate
1521: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1522: @*/
1523: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1524: {
1525: PetscBool ish2opus;
1532: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1533: if (ish2opus) {
1534: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1536: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1537: a->sampler->SetSamplingMat(B);
1538: if (bs > 0) a->bs = bs;
1539: if (tol > 0.) a->rtol = tol;
1540: delete a->kernel;
1541: }
1542: return 0;
1543: }
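/* Editorial sketch, not part of the original source: re-sampling an existing MATH2OPUS
   matrix A from a new operator Bnew (a hypothetical Mat). As the Note above says, the
   actual sampling happens during assembly (see MatAssemblyEnd_H2OPUS).

     MatH2OpusSetSamplingMat(A, Bnew, 32, 1.e-6); // bs = 32 concurrent samples, rtol = 1e-6
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
*/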
1545: /*@C
1546: MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1548: Input Parameters:
1549: + comm - MPI communicator
1550: . m - number of local rows (or `PETSC_DECIDE` to have it calculated if M is given)
1551: . n - number of local columns (or `PETSC_DECIDE` to have it calculated if N is given)
1552: . M - number of global rows (or `PETSC_DETERMINE` to have it calculated if m is given)
1553: . N - number of global columns (or `PETSC_DETERMINE` to have it calculated if n is given)
1554: . spacedim - dimension of the space coordinates
1555: . coords - coordinates of the points
1556: . cdist - whether or not coordinates are distributed
1557: . kernel - computational kernel (or NULL)
1558: . kernelctx - kernel context
1559: . eta - admissibility condition tolerance
1560: . leafsize - leaf size in cluster tree
1561: - basisord - approximation order for Chebychev interpolation of low-rank blocks
1563: Output Parameter:
1564: . nA - matrix
1566: Options Database Keys:
1567: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1568: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1569: . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order
1570: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1572: Level: intermediate
1574: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1575: @*/
1576: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1577: {
1578: Mat A;
1579: Mat_H2OPUS *h2opus;
1580: #if defined(PETSC_H2OPUS_USE_GPU)
1581: PetscBool iscpu = PETSC_FALSE;
1582: #else
1583: PetscBool iscpu = PETSC_TRUE;
1584: #endif
1587: MatCreate(comm, &A);
1588: MatSetSizes(A, m, n, M, N);
1590: MatSetType(A, MATH2OPUS);
1591: MatBindToCPU(A, iscpu);
1592: MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx);
1594: h2opus = (Mat_H2OPUS *)A->data;
1595: if (eta > 0.) h2opus->eta = eta;
1596: if (leafsize > 0) h2opus->leafsize = leafsize;
1597: if (basisord > 0) h2opus->basisord = basisord;
1599: *nA = A;
1600: return 0;
1601: }
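/* Editorial sketch, not part of the original source: building a MATH2OPUS from a
   user-supplied kernel. GaussKernel, n and coords are illustrative assumptions; the
   kernel signature follows the call made in PetscFunctionGenerator::operator() above.

     static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
       return PetscExpReal(-r2); // smooth, symmetric kernel, well suited to low-rank compression
     }

     // n local points, coords[] = local coordinates (cdist = PETSC_TRUE), 2D space,
     // PETSC_DEFAULT keeps the default eta/leafsize/basisord
     MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, PETSC_DETERMINE, PETSC_DETERMINE, 2, coords, PETSC_TRUE, GaussKernel, NULL, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT, &A);
     MatSetOption(A, MAT_SYMMETRIC, PETSC_TRUE); // sequential construction checks A->symmetric (line 861)
     MatSetFromOptions(A);
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);    // the hierarchical matrix is actually built here
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
*/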
1603: /*@C
1604: MatCreateH2OpusFromMat - Creates a `MATH2OPUS` matrix by sampling a user-supplied operator.
1606: Input Parameters:
1607: + B - the matrix to be sampled
1608: . spacedim - dimension of the space coordinates
1609: . coords - coordinates of the points
1610: . cdist - whether or not coordinates are distributed
1611: . eta - admissibility condition tolerance
1612: . leafsize - leaf size in cluster tree
1613: . maxrank - maximum rank allowed
1614: . bs - maximum number of samples to be taken concurrently
1615: - rtol - relative tolerance for construction
1617: Output Parameter:
1618: . nA - matrix
1620: Options Database Keys:
1621: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1622: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1623: . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1624: . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1625: . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1626: . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd()
1627: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1628: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1630: Note:
1631: Not available in parallel
1633: Level: intermediate
1635: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1636: @*/
1637: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1638: {
1639: Mat A;
1640: Mat_H2OPUS *h2opus;
1641: MPI_Comm comm;
1642: PetscBool boundtocpu = PETSC_TRUE;
1653: PetscObjectGetComm((PetscObject)B, &comm);
1656: MatCreate(comm, &A);
1657: MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N);
1658: #if defined(PETSC_H2OPUS_USE_GPU)
1659: {
1660: PetscBool iscuda;
1661: VecType vtype;
1663: MatGetVecType(B, &vtype);
1664: PetscStrcmp(vtype, VECCUDA, &iscuda);
1665: if (!iscuda) {
1666: PetscStrcmp(vtype, VECSEQCUDA, &iscuda);
1667: if (!iscuda) PetscStrcmp(vtype, VECMPICUDA, &iscuda);
1668: }
1669: if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1670: }
1671: #endif
1672: MatSetType(A, MATH2OPUS);
1673: MatBindToCPU(A, boundtocpu);
1674: if (spacedim) MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL);
1675: MatPropagateSymmetryOptions(B, A);
1678: h2opus = (Mat_H2OPUS *)A->data;
1679: h2opus->sampler = new PetscMatrixSampler(B);
1680: if (eta > 0.) h2opus->eta = eta;
1681: if (leafsize > 0) h2opus->leafsize = leafsize;
1682: if (maxrank > 0) h2opus->max_rank = maxrank;
1683: if (bs > 0) h2opus->bs = bs;
1684: if (rtol > 0.) h2opus->rtol = rtol;
1685: *nA = A;
1686: A->preallocated = PETSC_TRUE;
1687: return 0;
1688: }
1690: /*@C
1691: MatH2OpusGetIndexMap - Accesses the reordering index set.
1693: Input Parameter:
1694: . A - the matrix
1696: Output Parameter:
1697: . indexmap - the index set for the reordering
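   Example usage, a minimal sketch:
.vb
   IS indexmap;

   MatH2OpusGetIndexMap(A, &indexmap);
   ISView(indexmap, PETSC_VIEWER_STDOUT_WORLD);
   // the index set is owned by the matrix: do not destroy it
.ve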
1699: Level: intermediate
1701: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1702: @*/
1703: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1704: {
1705: PetscBool ish2opus;
1706: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1712: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1714: *indexmap = a->h2opus_indexmap;
1715: return 0;
1716: }
1718: /*@C
1719: MatH2OpusMapVec - Maps a vector between the PETSc and H2Opus orderings
1721: Input Parameters:
1722: + A - the matrix
1723: . nativetopetsc - if `PETSC_TRUE`, maps from H2Opus ordering to PETSc ordering; if `PETSC_FALSE`, applies the reverse map
1724: - in - the vector to be mapped
1726: Output Parameter:
1727: . out - the newly created mapped vector
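   Example usage, a minimal sketch (x is a vector in PETSc ordering):
.vb
   Vec xnative;

   // permute x into H2Opus ordering; the result is a new vector owned by the caller
   MatH2OpusMapVec(A, PETSC_FALSE, x, &xnative);
   // ... use xnative ...
   VecDestroy(&xnative);
.ve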
1729: Level: intermediate
1731: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1732: @*/
1733: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1734: {
1735: PetscBool ish2opus;
1736: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1737: PetscScalar *xin, *xout;
1738: PetscBool nm;
1746: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1748: nm = a->nativemult;
1749: MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc);
1750: MatCreateVecs(A, out, NULL);
1751: MatH2OpusSetNativeMult(A, nm);
1752: if (!a->sf) { /* same ordering */
1753: VecCopy(in, *out);
1754: return 0;
1755: }
1756: VecGetArrayRead(in, (const PetscScalar **)&xin);
1757: VecGetArrayWrite(*out, &xout);
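  /* the SF roots are in PETSc ordering and the leaves in H2Opus ordering:
     a reduce maps native to PETSc, a bcast maps PETSc to native */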
1758: if (nativetopetsc) {
1759: PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1760: PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1761: } else {
1762: PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1763: PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1764: }
1765: VecRestoreArrayRead(in, (const PetscScalar **)&xin);
1766: VecRestoreArrayWrite(*out, &xout);
1767: return 0;
1768: }
1770: /*@C
1771: MatH2OpusLowRankUpdate - Performs a low-rank update of the form A = A + s * U * V^T
1773: Input Parameters:
1774: + A - the hierarchical `MATH2OPUS` matrix
1775: . U - the dense low-rank update matrix
1776: . V - (optional) the dense low-rank update matrix (if `NULL`, then V = U is assumed)
1777: - s - the scaling factor
1779: Note:
1780: The `U` and `V` matrices must be in dense (`MATDENSE`) format
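   Example usage, a minimal sketch (U is an illustrative dense matrix with row layout compatible with A):
.vb
   // symmetric rank-k update A <- A + 0.5 * U * U^T (passing NULL for V implies V = U)
   MatH2OpusLowRankUpdate(A, U, NULL, 0.5);
.ve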
1782: Level: intermediate
1784: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1785: @*/
1786: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1787: {
1788: PetscBool flg;
1795: if (V) {
1798: }
1801: if (!V) V = U;
1803: if (!U->cmap->N) return 0;
1804: PetscLayoutCompare(U->rmap, A->rmap, &flg);
1806: PetscLayoutCompare(V->rmap, A->cmap, &flg);
1808: PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg);
1809: if (flg) {
1810: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1811: const PetscScalar *u, *v, *uu, *vv;
1812: PetscInt ldu, ldv;
1813: PetscMPIInt size;
1814: #if defined(H2OPUS_USE_MPI)
1815: h2opusHandle_t handle = a->handle->handle;
1816: #else
1817: h2opusHandle_t handle = a->handle;
1818: #endif
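    /* if the PETSc and H2Opus orderings differ (and native multiplication is not in use),
       the dense factors must be permuted into H2Opus ordering before the update */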
1819: PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1820: PetscSF usf, vsf;
1822: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1824: PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0);
1825: PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, "");
1827: PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, "");
1829: MatDenseGetLDA(U, &ldu);
1830: MatDenseGetLDA(V, &ldv);
1831: MatBoundToCPU(A, &flg);
1832: if (usesf) {
1833: PetscInt n;
1835: MatDenseGetH2OpusVectorSF(U, a->sf, &usf);
1836: MatDenseGetH2OpusVectorSF(V, a->sf, &vsf);
1837: MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N);
1838: PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL);
1839: ldu = n;
1840: ldv = n;
1841: }
1842: if (flg) {
1844: MatDenseGetArrayRead(U, &u);
1845: MatDenseGetArrayRead(V, &v);
1846: if (usesf) {
1847: vv = MatH2OpusGetThrustPointer(*a->yy);
1848: PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1849: PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1850: if (U != V) {
1851: uu = MatH2OpusGetThrustPointer(*a->xx);
1852: PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1853: PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1854: } else uu = vv;
1855: } else {
1856: uu = u;
1857: vv = v;
1858: }
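      /* low-rank update of the CPU hierarchical matrix: A += s * uu * vv^T */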
1859: hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1860: MatDenseRestoreArrayRead(U, &u);
1861: MatDenseRestoreArrayRead(V, &v);
1862: } else {
1863: #if defined(PETSC_H2OPUS_USE_GPU)
1864: PetscBool flgU, flgV;
1867: PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, "");
1868: if (flgU) MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U);
1869: PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, "");
1870: if (flgV) MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V);
1871: MatDenseCUDAGetArrayRead(U, &u);
1872: MatDenseCUDAGetArrayRead(V, &v);
1873: if (usesf) {
1874: vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1875: PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1876: PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1877: if (U != V) {
1878: uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1879: PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1880: PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1881: } else uu = vv;
1882: } else {
1883: uu = u;
1884: vv = v;
1885: }
1886: #else
1887: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1888: #endif
1889: hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1890: #if defined(PETSC_H2OPUS_USE_GPU)
1891: MatDenseCUDARestoreArrayRead(U, &u);
1892: MatDenseCUDARestoreArrayRead(V, &v);
1893: if (flgU) MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U);
1894: if (flgV) MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V);
1895: #endif
1896: }
1897: PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0);
1898: a->orthogonal = PETSC_FALSE;
1899: }
1900: return 0;
1901: }
1902: #endif