Actual source code: mpiptap.c
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
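/*
  Usage sketch (not part of this file): the kernels below are normally reached through
  the public MatPtAP() interface on MPIAIJ matrices. The helper function here is
  illustrative only, a minimal sketch of how these routines are typically driven.

      #include <petscmat.h>

      static PetscErrorCode FormCoarseOperator(Mat A, Mat P, Mat *C)
      {
        PetscFunctionBeginUser;
        // first call: symbolic + numeric product, caches the Mat_APMPI intermediates on C
        PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, PETSC_DEFAULT, C));
        // later calls with new values in A reuse the cached symbolic data
        PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, PETSC_DEFAULT, C));
        PetscFunctionReturn(0);
      }
*/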
16: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17: {
18: PetscBool iascii;
19: PetscViewerFormat format;
20: Mat_APMPI *ptap;
22: MatCheckProduct(A, 1);
23: ptap = (Mat_APMPI *)A->product->data;
24: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
25: if (iascii) {
26: PetscViewerGetFormat(viewer, &format);
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n");
30: } else if (ptap->algType == 1) {
31: PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n");
32: } else if (ptap->algType == 2) {
33: PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n");
34: } else if (ptap->algType == 3) {
35: PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n");
36: }
37: }
38: }
39: return 0;
40: }
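/* Free every intermediate product cached in the Mat_APMPI context (transposes Rd/Ro,
   AP_loc, C_loc, C_oth, P_loc, P_oth, communication buffers, SF, and merge data),
   depending on which algorithm created it. Invoked through C->product->destroy. */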
42: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
43: {
44: Mat_APMPI *ptap = (Mat_APMPI *)data;
45: Mat_Merge_SeqsToMPI *merge;
47: PetscFree2(ptap->startsj_s, ptap->startsj_r);
48: PetscFree(ptap->bufa);
49: MatDestroy(&ptap->P_loc);
50: MatDestroy(&ptap->P_oth);
51: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
52: MatDestroy(&ptap->Rd);
53: MatDestroy(&ptap->Ro);
54: if (ptap->AP_loc) { /* used by alg_rap */
55: Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
56: PetscFree(ap->i);
57: PetscFree2(ap->j, ap->a);
58: MatDestroy(&ptap->AP_loc);
59: } else { /* used by alg_ptap */
60: PetscFree(ptap->api);
61: PetscFree(ptap->apj);
62: }
63: MatDestroy(&ptap->C_loc);
64: MatDestroy(&ptap->C_oth);
65: if (ptap->apa) PetscFree(ptap->apa);
67: MatDestroy(&ptap->Pt);
69: merge = ptap->merge;
70: if (merge) { /* used by alg_ptap */
71: PetscFree(merge->id_r);
72: PetscFree(merge->len_s);
73: PetscFree(merge->len_r);
74: PetscFree(merge->bi);
75: PetscFree(merge->bj);
76: PetscFree(merge->buf_ri[0]);
77: PetscFree(merge->buf_ri);
78: PetscFree(merge->buf_rj[0]);
79: PetscFree(merge->buf_rj);
80: PetscFree(merge->coi);
81: PetscFree(merge->coj);
82: PetscFree(merge->owners_co);
83: PetscLayoutDestroy(&merge->rowmap);
84: PetscFree(ptap->merge);
85: }
86: ISLocalToGlobalMappingDestroy(&ptap->ltog);
88: PetscSFDestroy(&ptap->sf);
89: PetscFree(ptap->c_othi);
90: PetscFree(ptap->c_rmti);
91: PetscFree(ptap);
92: return 0;
93: }
95: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
96: {
97: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
98: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
99: Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
100: Mat_APMPI *ptap;
101: Mat AP_loc, C_loc, C_oth;
102: PetscInt i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
103: PetscScalar *apa;
104: const PetscInt *cols;
105: const PetscScalar *vals;
107: MatCheckProduct(C, 3);
108: ptap = (Mat_APMPI *)C->product->data;
112: MatZeroEntries(C);
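/* Overall numeric phase (scalable algorithm):
     AP_loc = A_loc * P     with A_loc = (Ad, Ao) and P gathered as (P_loc, P_oth)
     C_loc  = Rd * AP_loc   rows of C owned by this rank       (Rd = Pd^T)
     C_oth  = Ro * AP_loc   rows of C owned by other ranks     (Ro = Po^T)
   C_loc and C_oth are then added into C with ADD_VALUES. */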
114: /* 1) get Rd = Pd^T, Ro = Po^T */
115: if (ptap->reuse == MAT_REUSE_MATRIX) {
116: MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd);
117: MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro);
118: }
120: /* 2) get AP_loc */
121: AP_loc = ptap->AP_loc;
122: ap = (Mat_SeqAIJ *)AP_loc->data;
124: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
125: /*-----------------------------------------------------*/
126: if (ptap->reuse == MAT_REUSE_MATRIX) {
127: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
128: MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth);
129: MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc);
130: }
132: /* 2-2) compute numeric A_loc*P - dominating part */
133: /* ---------------------------------------------- */
134: /* get data from symbolic products */
135: p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
136: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
138: api = ap->i;
139: apj = ap->j;
140: ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj);
141: for (i = 0; i < am; i++) {
142: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
143: apnz = api[i + 1] - api[i];
144: apa = ap->a + api[i];
145: PetscArrayzero(apa, apnz);
146: AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147: }
148: ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj);
151: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152: /* Always use the scalable SeqAIJ product since this is the scalable MatPtAP implementation */
153: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc);
154: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth);
156: C_loc = ptap->C_loc;
157: C_oth = ptap->C_oth;
159: /* add C_loc and C_oth to C */
160: MatGetOwnershipRange(C, &rstart, &rend);
162: /* C_loc -> C */
163: cm = C_loc->rmap->N;
164: c_seq = (Mat_SeqAIJ *)C_loc->data;
165: cols = c_seq->j;
166: vals = c_seq->a;
167: ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j);
169: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170: /* when there are no off-processor parts. */
171: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172: /* is no longer valid. The more general MatSetValues_MPIAIJ() must be used instead, which looks the column index up in */
173: /* a table and performs additional bookkeeping. */
174: if (C->assembled) {
175: C->was_assembled = PETSC_TRUE;
176: C->assembled = PETSC_FALSE;
177: }
178: if (C->was_assembled) {
179: for (i = 0; i < cm; i++) {
180: ncols = c_seq->i[i + 1] - c_seq->i[i];
181: row = rstart + i;
182: MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES);
183: cols += ncols;
184: vals += ncols;
185: }
186: } else {
187: MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a);
188: }
189: ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j);
192: /* Co -> C, off-processor part */
193: cm = C_oth->rmap->N;
194: c_seq = (Mat_SeqAIJ *)C_oth->data;
195: cols = c_seq->j;
196: vals = c_seq->a;
197: ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j);
198: for (i = 0; i < cm; i++) {
199: ncols = c_seq->i[i + 1] - c_seq->i[i];
200: row = p->garray[i];
201: MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES);
202: cols += ncols;
203: vals += ncols;
204: }
205: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
206: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
208: ptap->reuse = MAT_REUSE_MATRIX;
210: ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j);
212: return 0;
213: }
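/* Symbolic phase of the scalable algorithm (algType 0):
   (0) gather P_oth/P_loc and form Rd = Pd^T, Ro = Po^T;
   (1) compute the nonzero structure of AP_loc = A_loc*P with a condensed linked list;
   (2) compute symbolic C_loc = Rd*AP_loc and C_oth = Ro*AP_loc;
   (3)-(4) ship the rows of C_oth (j-structure, then i-structure) to their owning ranks;
   (5) merge local and received rows to preallocate the parallel matrix Cmpi. */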
215: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
216: {
217: Mat_APMPI *ptap;
218: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
219: MPI_Comm comm;
220: PetscMPIInt size, rank;
221: Mat P_loc, P_oth;
222: PetscFreeSpaceList free_space = NULL, current_space = NULL;
223: PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
224: PetscInt *lnk, i, k, pnz, row, nsend;
225: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
226: PETSC_UNUSED PetscMPIInt icompleted = 0;
227: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
228: const PetscInt *owners;
229: PetscInt len, proc, *dnz, *onz, nzi, nspacedouble;
230: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
231: MPI_Request *swaits, *rwaits;
232: MPI_Status *sstatus, rstatus;
233: PetscLayout rowmap;
234: PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
235: PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
236: PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
237: Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
238: PetscScalar *apv;
239: PetscTable ta;
240: MatType mtype;
241: const char *prefix;
242: #if defined(PETSC_USE_INFO)
243: PetscReal apfill;
244: #endif
246: MatCheckProduct(Cmpi, 4);
248: PetscObjectGetComm((PetscObject)A, &comm);
249: MPI_Comm_size(comm, &size);
250: MPI_Comm_rank(comm, &rank);
252: if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
254: /* create symbolic parallel matrix Cmpi */
255: MatGetType(A, &mtype);
256: MatSetType(Cmpi, mtype);
258: /* create struct Mat_APMPI and attach it to C later */
259: PetscNew(&ptap);
260: ptap->reuse = MAT_INITIAL_MATRIX;
261: ptap->algType = 0;
263: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
264: MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth);
265: /* get P_loc by taking all local rows of P */
266: MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc);
268: ptap->P_loc = P_loc;
269: ptap->P_oth = P_oth;
271: /* (0) compute Rd = Pd^T, Ro = Po^T */
272: /* --------------------------------- */
273: MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd);
274: MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro);
276: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277: /* ----------------------------------------------------------------- */
278: p_loc = (Mat_SeqAIJ *)P_loc->data;
279: if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
281: /* create and initialize a linked list */
282: PetscTableCreate(pn, pN, &ta); /* for compute AP_loc and Cmpi */
283: MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
284: MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285: PetscTableGetCount(ta, &Crmax); /* Crmax = nnz(sum of Prows) */
287: PetscLLCondensedCreate_Scalable(Crmax, &lnk);
289: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
290: if (ao) {
291: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space);
292: } else {
293: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space);
294: }
295: current_space = free_space;
296: nspacedouble = 0;
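/* For each local row i of A, merge the column patterns of the P rows selected by
   Ad[i,:] and Ao[i,:] into the sorted condensed linked list; lnk[0] holds the number
   of distinct columns collected so far, which becomes the row length api[i+1]-api[i]. */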
298: PetscMalloc1(am + 1, &api);
299: api[0] = 0;
300: for (i = 0; i < am; i++) {
301: /* diagonal portion: Ad[i,:]*P */
302: ai = ad->i;
303: pi = p_loc->i;
304: nzi = ai[i + 1] - ai[i];
305: aj = ad->j + ai[i];
306: for (j = 0; j < nzi; j++) {
307: row = aj[j];
308: pnz = pi[row + 1] - pi[row];
309: Jptr = p_loc->j + pi[row];
310: /* add non-zero cols of P into the sorted linked list lnk */
311: PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk);
312: }
313: /* off-diagonal portion: Ao[i,:]*P */
314: if (ao) {
315: ai = ao->i;
316: pi = p_oth->i;
317: nzi = ai[i + 1] - ai[i];
318: aj = ao->j + ai[i];
319: for (j = 0; j < nzi; j++) {
320: row = aj[j];
321: pnz = pi[row + 1] - pi[row];
322: Jptr = p_oth->j + pi[row];
323: PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk);
324: }
325: }
326: apnz = lnk[0];
327: api[i + 1] = api[i] + apnz;
329: /* if free space is not available, double the total space in the list */
330: if (current_space->local_remaining < apnz) {
331: PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space);
332: nspacedouble++;
333: }
335: /* Copy data into free space, then initialize lnk */
336: PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk);
338: current_space->array += apnz;
339: current_space->local_used += apnz;
340: current_space->local_remaining -= apnz;
341: }
342: /* Allocate space for apj and apv, initialize apj, and */
343: /* destroy list of free space and other temporary array(s) */
344: PetscCalloc2(api[am], &apj, api[am], &apv);
345: PetscFreeSpaceContiguous(&free_space, apj);
346: PetscLLCondensedDestroy_Scalable(lnk);
348: /* Create AP_loc for reuse */
349: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc);
350: MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);
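/* AP_loc is stored with compacted column indices; ptap->ltog maps them back to global
   columns of P. This is why the numeric phase brackets its work on apj, C_loc, and C_oth
   with ISLocalToGlobalMappingApply()/ISGlobalToLocalMappingApply(). */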
352: #if defined(PETSC_USE_INFO)
353: if (ao) {
354: apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
355: } else {
356: apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
357: }
358: ptap->AP_loc->info.mallocs = nspacedouble;
359: ptap->AP_loc->info.fill_ratio_given = fill;
360: ptap->AP_loc->info.fill_ratio_needed = apfill;
362: if (api[am]) {
363: PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill);
364: PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill);
365: } else {
366: PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty\n");
367: }
368: #endif
370: /* (2-1) compute symbolic Co = Ro*AP_loc */
371: /* -------------------------------------- */
372: MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth);
373: MatGetOptionsPrefix(A, &prefix);
374: MatSetOptionsPrefix(ptap->C_oth, prefix);
375: MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_");
377: MatProductSetType(ptap->C_oth, MATPRODUCT_AB);
378: MatProductSetAlgorithm(ptap->C_oth, "sorted");
379: MatProductSetFill(ptap->C_oth, fill);
380: MatProductSetFromOptions(ptap->C_oth);
381: MatProductSymbolic(ptap->C_oth);
383: /* (3) send coj of C_oth to other processors */
384: /* ------------------------------------------ */
385: /* determine row ownership */
386: PetscLayoutCreate(comm, &rowmap);
387: PetscLayoutSetLocalSize(rowmap, pn);
388: PetscLayoutSetBlockSize(rowmap, 1);
389: PetscLayoutSetUp(rowmap);
390: PetscLayoutGetRanges(rowmap, &owners);
392: /* determine the number of messages to send, their lengths */
393: PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co);
394: PetscArrayzero(len_s, size);
395: PetscArrayzero(len_si, size);
397: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
398: coi = c_oth->i;
399: coj = c_oth->j;
400: con = ptap->C_oth->rmap->n;
401: proc = 0;
402: ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj);
403: for (i = 0; i < con; i++) {
404: while (prmap[i] >= owners[proc + 1]) proc++;
405: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
406: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
407: }
409: len = 0; /* max length of buf_si[], see (4) */
410: owners_co[0] = 0;
411: nsend = 0;
412: for (proc = 0; proc < size; proc++) {
413: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
414: if (len_s[proc]) {
415: nsend++;
416: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
417: len += len_si[proc];
418: }
419: }
421: /* determine the number and length of messages to receive for coi and coj */
422: PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv);
423: PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri);
425: /* post the Irecv and Isend of coj */
426: PetscCommGetNewTag(comm, &tagj);
427: PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits);
428: PetscMalloc1(nsend + 1, &swaits);
429: for (proc = 0, k = 0; proc < size; proc++) {
430: if (!len_s[proc]) continue;
431: i = owners_co[proc];
432: MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k);
433: k++;
434: }
436: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
437: /* ---------------------------------------- */
438: MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc);
439: MatProductSetType(ptap->C_loc, MATPRODUCT_AB);
440: MatProductSetAlgorithm(ptap->C_loc, "default");
441: MatProductSetFill(ptap->C_loc, fill);
443: MatSetOptionsPrefix(ptap->C_loc, prefix);
444: MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_");
446: MatProductSetFromOptions(ptap->C_loc);
447: MatProductSymbolic(ptap->C_loc);
449: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
450: ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j);
452: /* receives coj are complete */
453: for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
454: PetscFree(rwaits);
455: if (nsend) MPI_Waitall(nsend, swaits, sstatus);
457: /* add received column indices into ta to update Crmax */
458: for (k = 0; k < nrecv; k++) { /* k-th received message */
459: Jptr = buf_rj[k];
460: for (j = 0; j < len_r[k]; j++) PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES);
461: }
462: PetscTableGetCount(ta, &Crmax);
463: PetscTableDestroy(&ta);
465: /* (4) send and recv coi */
466: /*-----------------------*/
467: PetscCommGetNewTag(comm, &tagi);
468: PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits);
469: PetscMalloc1(len + 1, &buf_s);
470: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
471: for (proc = 0, k = 0; proc < size; proc++) {
472: if (!len_s[proc]) continue;
473: /* form outgoing message for i-structure:
474: buf_si[0]: nrows to be sent
475: [1:nrows]: row index (global)
476: [nrows+1:2*nrows+1]: i-structure index
477: */
478: /*-------------------------------------------*/
479: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
480: buf_si_i = buf_si + nrows + 1;
481: buf_si[0] = nrows;
482: buf_si_i[0] = 0;
483: nrows = 0;
484: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
485: nzi = coi[i + 1] - coi[i];
486: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
487: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
488: nrows++;
489: }
490: MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k);
491: k++;
492: buf_si += len_si[proc];
493: }
494: for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
495: PetscFree(rwaits);
496: if (nsend) MPI_Waitall(nsend, swaits, sstatus);
498: PetscFree4(len_s, len_si, sstatus, owners_co);
499: PetscFree(len_ri);
500: PetscFree(swaits);
501: PetscFree(buf_s);
503: /* (5) compute the local portion of Cmpi */
504: /* ------------------------------------------ */
505: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
506: PetscFreeSpaceGet(Crmax, &free_space);
507: current_space = free_space;
509: PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci);
510: for (k = 0; k < nrecv; k++) {
511: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
512: nrows = *buf_ri_k[k];
513: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
514: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
515: }
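/* Row i of Cmpi merges row i of C_loc with any received C_oth rows whose local row
   index equals i; the merged column sets feed MatPreallocateSet() to build the
   diagonal/off-diagonal preallocation counts dnz/onz. */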
517: MatPreallocateBegin(comm, pn, pn, dnz, onz);
518: PetscLLCondensedCreate_Scalable(Crmax, &lnk);
519: for (i = 0; i < pn; i++) {
520: /* add C_loc into Cmpi */
521: nzi = c_loc->i[i + 1] - c_loc->i[i];
522: Jptr = c_loc->j + c_loc->i[i];
523: PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk);
525: /* add received col data into lnk */
526: for (k = 0; k < nrecv; k++) { /* k-th received message */
527: if (i == *nextrow[k]) { /* i-th row */
528: nzi = *(nextci[k] + 1) - *nextci[k];
529: Jptr = buf_rj[k] + *nextci[k];
530: PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk);
531: nextrow[k]++;
532: nextci[k]++;
533: }
534: }
535: nzi = lnk[0];
537: /* copy data into free space, then initialize lnk */
538: PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk);
539: MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz);
540: }
541: PetscFree3(buf_ri_k, nextrow, nextci);
542: PetscLLCondensedDestroy_Scalable(lnk);
543: PetscFreeSpaceDestroy(free_space);
545: /* local sizes and preallocation */
546: MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
547: if (P->cmap->bs > 0) {
548: PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs);
549: PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs);
550: }
551: MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
552: MatPreallocateEnd(dnz, onz);
554: /* members in merge */
555: PetscFree(id_r);
556: PetscFree(len_r);
557: PetscFree(buf_ri[0]);
558: PetscFree(buf_ri);
559: PetscFree(buf_rj[0]);
560: PetscFree(buf_rj);
561: PetscLayoutDestroy(&rowmap);
563: nout = 0;
564: ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j);
566: ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j);
569: /* attach the supporting struct to Cmpi for reuse */
570: Cmpi->product->data = ptap;
571: Cmpi->product->view = MatView_MPIAIJ_PtAP;
572: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
574: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
575: Cmpi->assembled = PETSC_FALSE;
576: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
577: return 0;
578: }
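/* Compute the column pattern of row i of A*P: columns that fall inside this rank's
   column ownership range of P go into the hash set dht, the rest into oht. When dof > 1
   the matrices are treated as dof interlaced fields, hence the offset = row % dof logic. */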
580: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
581: {
582: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
583: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
584: PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
585: PetscInt pcstart, pcend, column, offset;
587: pcstart = P->cmap->rstart;
588: pcstart *= dof;
589: pcend = P->cmap->rend;
590: pcend *= dof;
591: /* diagonal portion: Ad[i,:]*P */
592: ai = ad->i;
593: nzi = ai[i + 1] - ai[i];
594: aj = ad->j + ai[i];
595: for (j = 0; j < nzi; j++) {
596: row = aj[j];
597: offset = row % dof;
598: row /= dof;
599: nzpi = pd->i[row + 1] - pd->i[row];
600: pj = pd->j + pd->i[row];
601: for (k = 0; k < nzpi; k++) PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart);
602: }
603: /* off diag P */
604: for (j = 0; j < nzi; j++) {
605: row = aj[j];
606: offset = row % dof;
607: row /= dof;
608: nzpi = po->i[row + 1] - po->i[row];
609: pj = po->j + po->i[row];
610: for (k = 0; k < nzpi; k++) PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset);
611: }
613: /* off diagonal part: Ao[i, :]*P_oth */
614: if (ao) {
615: ai = ao->i;
616: pi = p_oth->i;
617: nzi = ai[i + 1] - ai[i];
618: aj = ao->j + ai[i];
619: for (j = 0; j < nzi; j++) {
620: row = aj[j];
621: offset = a->garray[row] % dof;
622: row = map[row];
623: pnz = pi[row + 1] - pi[row];
624: p_othcols = p_oth->j + pi[row];
625: for (col = 0; col < pnz; col++) {
626: column = p_othcols[col] * dof + offset;
627: if (column >= pcstart && column < pcend) {
628: PetscHSetIAdd(dht, column);
629: } else {
630: PetscHSetIAdd(oht, column);
631: }
632: }
633: }
634: } /* end if (ao) */
635: return 0;
636: }
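/* Numeric counterpart of the routine above: the same traversal of Ad[i,:], Ao[i,:] and
   the corresponding P rows, but accumulating the values ra*pa[k] into the hash map hmap
   keyed by global column index instead of only recording the sparsity pattern. */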
638: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
639: {
640: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
641: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
642: PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
643: PetscScalar ra, *aa, *pa;
645: pcstart = P->cmap->rstart;
646: pcstart *= dof;
648: /* diagonal portion: Ad[i,:]*P */
649: ai = ad->i;
650: nzi = ai[i + 1] - ai[i];
651: aj = ad->j + ai[i];
652: aa = ad->a + ai[i];
653: for (j = 0; j < nzi; j++) {
654: ra = aa[j];
655: row = aj[j];
656: offset = row % dof;
657: row /= dof;
658: nzpi = pd->i[row + 1] - pd->i[row];
659: pj = pd->j + pd->i[row];
660: pa = pd->a + pd->i[row];
661: for (k = 0; k < nzpi; k++) PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]);
662: PetscLogFlops(2.0 * nzpi);
663: }
664: for (j = 0; j < nzi; j++) {
665: ra = aa[j];
666: row = aj[j];
667: offset = row % dof;
668: row /= dof;
669: nzpi = po->i[row + 1] - po->i[row];
670: pj = po->j + po->i[row];
671: pa = po->a + po->i[row];
672: for (k = 0; k < nzpi; k++) PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]);
673: PetscLogFlops(2.0 * nzpi);
674: }
676: /* off diagonal part: Ao[i, :]*P_oth */
677: if (ao) {
678: ai = ao->i;
679: pi = p_oth->i;
680: nzi = ai[i + 1] - ai[i];
681: aj = ao->j + ai[i];
682: aa = ao->a + ai[i];
683: for (j = 0; j < nzi; j++) {
684: row = aj[j];
685: offset = a->garray[row] % dof;
686: row = map[row];
687: ra = aa[j];
688: pnz = pi[row + 1] - pi[row];
689: p_othcols = p_oth->j + pi[row];
690: pa = p_oth->a + pi[row];
691: for (col = 0; col < pnz; col++) PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]);
692: PetscLogFlops(2.0 * pnz);
693: }
694: } /* end if (ao) */
696: return 0;
697: }
699: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
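/* All-at-once numeric phase: for each local row of A, one row of AP is accumulated in a
   hash map, then scaled by the entries of Po[ii,:] to build remote contributions
   (c_rmtj/c_rmta, reduced to the owning ranks through ptap->sf) and by the entries of
   Pd[ii,:] to add local contributions directly with MatSetValues(). */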
701: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
702: {
703: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
704: Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
705: Mat_APMPI *ptap;
706: PetscHMapIV hmap;
707: PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
708: PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
709: PetscInt offset, ii, pocol;
710: const PetscInt *mappingindices;
711: IS map;
713: MatCheckProduct(C, 4);
714: ptap = (Mat_APMPI *)C->product->data;
718: MatZeroEntries(C);
720: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
721: /*-----------------------------------------------------*/
722: if (ptap->reuse == MAT_REUSE_MATRIX) {
723: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
724: MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth);
725: }
726: PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);
728: MatGetLocalSize(p->B, NULL, &pon);
729: pon *= dof;
730: PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta);
731: MatGetLocalSize(A, &am, NULL);
732: cmaxr = 0;
733: for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
734: PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc);
735: PetscHMapIVCreate(&hmap);
736: PetscHMapIVResize(hmap, cmaxr);
737: ISGetIndices(map, &mappingindices);
738: for (i = 0; i < am && pon; i++) {
739: PetscHMapIVClear(hmap);
740: offset = i % dof;
741: ii = i / dof;
742: nzi = po->i[ii + 1] - po->i[ii];
743: if (!nzi) continue;
744: MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap);
745: voff = 0;
746: PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues);
747: if (!voff) continue;
749: /* Form C(ii, :) */
750: poj = po->j + po->i[ii];
751: poa = po->a + po->i[ii];
752: for (j = 0; j < nzi; j++) {
753: pocol = poj[j] * dof + offset;
754: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
755: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
756: for (jj = 0; jj < voff; jj++) {
757: apvaluestmp[jj] = apvalues[jj] * poa[j];
758: /* If the row is empty */
759: if (!c_rmtc[pocol]) {
760: c_rmtjj[jj] = apindices[jj];
761: c_rmtaa[jj] = apvaluestmp[jj];
762: c_rmtc[pocol]++;
763: } else {
764: PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc);
765: if (loc >= 0) { /* hit */
766: c_rmtaa[loc] += apvaluestmp[jj];
767: PetscLogFlops(1.0);
768: } else { /* new element */
769: loc = -(loc + 1);
770: /* Move data backward */
771: for (kk = c_rmtc[pocol]; kk > loc; kk--) {
772: c_rmtjj[kk] = c_rmtjj[kk - 1];
773: c_rmtaa[kk] = c_rmtaa[kk - 1];
774: } /* End kk */
775: c_rmtjj[loc] = apindices[jj];
776: c_rmtaa[loc] = apvaluestmp[jj];
777: c_rmtc[pocol]++;
778: }
779: }
780: PetscLogFlops(voff);
781: } /* End jj */
782: } /* End j */
783: } /* End i */
785: PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc);
786: PetscHMapIVDestroy(&hmap);
788: MatGetLocalSize(P, NULL, &pn);
789: pn *= dof;
790: PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha);
792: PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
793: PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
794: MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
795: pcstart = pcstart * dof;
796: pcend = pcend * dof;
797: cd = (Mat_SeqAIJ *)(c->A)->data;
798: co = (Mat_SeqAIJ *)(c->B)->data;
800: cmaxr = 0;
801: for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
802: PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ);
803: PetscHMapIVCreate(&hmap);
804: PetscHMapIVResize(hmap, cmaxr);
805: for (i = 0; i < am && pn; i++) {
806: PetscHMapIVClear(hmap);
807: offset = i % dof;
808: ii = i / dof;
809: nzi = pd->i[ii + 1] - pd->i[ii];
810: if (!nzi) continue;
811: MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap);
812: voff = 0;
813: PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues);
814: if (!voff) continue;
815: /* Form C(ii, :) */
816: pdj = pd->j + pd->i[ii];
817: pda = pd->a + pd->i[ii];
818: for (j = 0; j < nzi; j++) {
819: row = pcstart + pdj[j] * dof + offset;
820: for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
821: PetscLogFlops(voff);
822: MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES);
823: }
824: }
825: ISRestoreIndices(map, &mappingindices);
826: MatGetOwnershipRangeColumn(C, &ccstart, &ccend);
827: PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ);
828: PetscHMapIVDestroy(&hmap);
829: PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
830: PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
831: PetscFree2(c_rmtj, c_rmta);
833: /* Add contributions from remote */
834: for (i = 0; i < pn; i++) {
835: row = i + pcstart;
836: MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES);
837: }
838: PetscFree2(c_othj, c_otha);
840: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
841: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
843: ptap->reuse = MAT_REUSE_MATRIX;
844: return 0;
845: }
847: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
848: {
850: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C);
851: return 0;
852: }
854: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
855: {
856: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
857: Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
858: Mat_APMPI *ptap;
859: PetscHMapIV hmap;
860: PetscInt i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
861: PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
862: PetscInt offset, ii, pocol;
863: const PetscInt *mappingindices;
864: IS map;
866: MatCheckProduct(C, 4);
867: ptap = (Mat_APMPI *)C->product->data;
871: MatZeroEntries(C);
873: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
874: /*-----------------------------------------------------*/
875: if (ptap->reuse == MAT_REUSE_MATRIX) {
876: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
877: MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth);
878: }
879: PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);
880: MatGetLocalSize(p->B, NULL, &pon);
881: pon *= dof;
882: MatGetLocalSize(P, NULL, &pn);
883: pn *= dof;
885: PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta);
886: MatGetLocalSize(A, &am, NULL);
887: MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
888: pcstart *= dof;
889: pcend *= dof;
890: cmaxr = 0;
891: for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
892: cd = (Mat_SeqAIJ *)(c->A)->data;
893: co = (Mat_SeqAIJ *)(c->B)->data;
894: for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
895: PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc);
896: PetscHMapIVCreate(&hmap);
897: PetscHMapIVResize(hmap, cmaxr);
898: ISGetIndices(map, &mappingindices);
899: for (i = 0; i < am && (pon || pn); i++) {
900: PetscHMapIVClear(hmap);
901: offset = i % dof;
902: ii = i / dof;
903: nzi = po->i[ii + 1] - po->i[ii];
904: dnzi = pd->i[ii + 1] - pd->i[ii];
905: if (!nzi && !dnzi) continue;
906: MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap);
907: voff = 0;
908: PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues);
909: if (!voff) continue;
911: /* Form remote C(ii, :) */
912: poj = po->j + po->i[ii];
913: poa = po->a + po->i[ii];
914: for (j = 0; j < nzi; j++) {
915: pocol = poj[j] * dof + offset;
916: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
917: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
918: for (jj = 0; jj < voff; jj++) {
919: apvaluestmp[jj] = apvalues[jj] * poa[j];
920: /* If the row is empty */
921: if (!c_rmtc[pocol]) {
922: c_rmtjj[jj] = apindices[jj];
923: c_rmtaa[jj] = apvaluestmp[jj];
924: c_rmtc[pocol]++;
925: } else {
926: PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc);
927: if (loc >= 0) { /* hit */
928: c_rmtaa[loc] += apvaluestmp[jj];
929: PetscLogFlops(1.0);
930: } else { /* new element */
931: loc = -(loc + 1);
932: /* Move data backward */
933: for (kk = c_rmtc[pocol]; kk > loc; kk--) {
934: c_rmtjj[kk] = c_rmtjj[kk - 1];
935: c_rmtaa[kk] = c_rmtaa[kk - 1];
936: } /* End kk */
937: c_rmtjj[loc] = apindices[jj];
938: c_rmtaa[loc] = apvaluestmp[jj];
939: c_rmtc[pocol]++;
940: }
941: }
942: } /* End jj */
943: PetscLogFlops(voff);
944: } /* End j */
946: /* Form local C(ii, :) */
947: pdj = pd->j + pd->i[ii];
948: pda = pd->a + pd->i[ii];
949: for (j = 0; j < dnzi; j++) {
950: row = pcstart + pdj[j] * dof + offset;
951: for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
952: PetscLogFlops(voff);
953: MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES);
954: } /* End j */
955: } /* End i */
957: ISRestoreIndices(map, &mappingindices);
958: PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc);
959: PetscHMapIVDestroy(&hmap);
960: PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha);
962: PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
963: PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
964: PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
965: PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE);
966: PetscFree2(c_rmtj, c_rmta);
968: /* Add contributions from remote */
969: for (i = 0; i < pn; i++) {
970: row = i + pcstart;
971: MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES);
972: }
973: PetscFree2(c_othj, c_otha);
975: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
976: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
978: ptap->reuse = MAT_REUSE_MATRIX;
979: return 0;
980: }
982: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
983: {
985: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C);
986: return 0;
987: }
989: /* TODO: move algorithm selection to MatProductSetFromOptions */
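/* All-at-once symbolic phase (algType 2): a first pass over the rows of A estimates, per
   off-diagonal column of P, the set of C columns that must be sent to the owning rank
   (c_rmti/c_rmtj); a PetscSF is then built to route those contributions, and a second
   pass plus the received columns produce the dnz/onz preallocation for Cmpi. */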
990: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
991: {
992: Mat_APMPI *ptap;
993: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
994: MPI_Comm comm;
995: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
996: MatType mtype;
997: PetscSF sf;
998: PetscSFNode *iremote;
999: PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1000: const PetscInt *rootdegrees;
1001: PetscHSetI ht, oht, *hta, *hto;
1002: PetscInt pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1003: PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1004: PetscInt nalg = 2, alg = 0, offset, ii;
1005: PetscMPIInt owner;
1006: const PetscInt *mappingindices;
1007: PetscBool flg;
1008: const char *algTypes[2] = {"overlapping", "merged"};
1009: IS map;
1011: MatCheckProduct(Cmpi, 5);
1013: PetscObjectGetComm((PetscObject)A, &comm);
1015: /* Create symbolic parallel matrix Cmpi */
1016: MatGetLocalSize(P, NULL, &pn);
1017: pn *= dof;
1018: MatGetType(A, &mtype);
1019: MatSetType(Cmpi, mtype);
1020: MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1022: PetscNew(&ptap);
1023: ptap->reuse = MAT_INITIAL_MATRIX;
1024: ptap->algType = 2;
1026: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1027: MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth);
1028: PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);
1029: /* This equals the number of off-diagonal columns in P */
1030: MatGetLocalSize(p->B, NULL, &pon);
1031: pon *= dof;
1032: /* offsets */
1033: PetscMalloc1(pon + 1, &ptap->c_rmti);
1034: /* The number of columns we will send to remote ranks */
1035: PetscMalloc1(pon, &c_rmtc);
1036: PetscMalloc1(pon, &hta);
1037: for (i = 0; i < pon; i++) PetscHSetICreate(&hta[i]);
1038: MatGetLocalSize(A, &am, NULL);
1039: MatGetOwnershipRange(A, &arstart, &arend);
1040: /* Create hash table to merge all columns for C(i, :) */
1041: PetscHSetICreate(&ht);
1043: ISGetIndices(map, &mappingindices);
1044: ptap->c_rmti[0] = 0;
1045: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1046: for (i = 0; i < am && pon; i++) {
1047: /* Form one row of AP */
1048: PetscHSetIClear(ht);
1049: offset = i % dof;
1050: ii = i / dof;
1051: /* If the off diag is empty, we should not do any calculation */
1052: nzi = po->i[ii + 1] - po->i[ii];
1053: if (!nzi) continue;
1055: MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht);
1056: PetscHSetIGetSize(ht, &htsize);
1057: /* If AP is empty, just continue */
1058: if (!htsize) continue;
1059: /* Form C(ii, :) */
1060: poj = po->j + po->i[ii];
1061: for (j = 0; j < nzi; j++) PetscHSetIUpdate(hta[poj[j] * dof + offset], ht);
1062: }
1064: for (i = 0; i < pon; i++) {
1065: PetscHSetIGetSize(hta[i], &htsize);
1066: ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1067: c_rmtc[i] = htsize;
1068: }
1070: PetscMalloc1(ptap->c_rmti[pon], &c_rmtj);
1072: for (i = 0; i < pon; i++) {
1073: off = 0;
1074: PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]);
1075: PetscHSetIDestroy(&hta[i]);
1076: }
1077: PetscFree(hta);
1079: PetscMalloc1(pon, &iremote);
1080: for (i = 0; i < pon; i++) {
1081: owner = 0;
1082: lidx = 0;
1083: offset = i % dof;
1084: ii = i / dof;
1085: PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx);
1086: iremote[i].index = lidx * dof + offset;
1087: iremote[i].rank = owner;
1088: }
1090: PetscSFCreate(comm, &sf);
1091: PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1092: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1093: PetscSFSetRankOrder(sf, PETSC_TRUE);
1094: PetscSFSetFromOptions(sf);
1095: PetscSFSetUp(sf);
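/* The temporary sf has one leaf per local off-diagonal column of P, rooted at the rank
   that owns the corresponding column of C. Its root degrees count how many remote
   contributors each owned row has; gathering c_rmtc yields the per-contributor column
   counts (rootspace), and scattering the prefix offsets back (c_rmtoffsets) tells every
   sender where its slice starts in the receiver's c_othj buffer. */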
1096: /* How many neighbors have contributions to my rows? */
1097: PetscSFComputeDegreeBegin(sf, &rootdegrees);
1098: PetscSFComputeDegreeEnd(sf, &rootdegrees);
1099: rootspacesize = 0;
1100: for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1101: PetscMalloc1(rootspacesize, &rootspace);
1102: PetscMalloc1(rootspacesize + 1, &rootspaceoffsets);
1103: /* Get information from leaves
1104: * Number of columns other people contribute to my rows
1105: * */
1106: PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace);
1107: PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace);
1108: PetscFree(c_rmtc);
1109: PetscCalloc1(pn + 1, &ptap->c_othi);
1110: /* The number of columns received for each row */
1111: ptap->c_othi[0] = 0;
1112: rootspacesize = 0;
1113: rootspaceoffsets[0] = 0;
1114: for (i = 0; i < pn; i++) {
1115: rcvncols = 0;
1116: for (j = 0; j < rootdegrees[i]; j++) {
1117: rcvncols += rootspace[rootspacesize];
1118: rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1119: rootspacesize++;
1120: }
1121: ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1122: }
1123: PetscFree(rootspace);
1125: PetscMalloc1(pon, &c_rmtoffsets);
1126: PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1127: PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1128: PetscSFDestroy(&sf);
1129: PetscFree(rootspaceoffsets);
1131: PetscCalloc1(ptap->c_rmti[pon], &iremote);
1132: nleaves = 0;
1133: for (i = 0; i < pon; i++) {
1134: owner = 0;
1135: ii = i / dof;
1136: PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL);
1137: sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1138: for (j = 0; j < sendncols; j++) {
1139: iremote[nleaves].rank = owner;
1140: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1141: }
1142: }
1143: PetscFree(c_rmtoffsets);
1144: PetscCalloc1(ptap->c_othi[pn], &c_othj);
1146: PetscSFCreate(comm, &ptap->sf);
1147: PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1148: PetscSFSetFromOptions(ptap->sf);
1149: /* One to one map */
1150: PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1152: PetscMalloc2(pn, &dnz, pn, &onz);
1153: PetscHSetICreate(&oht);
1154: MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
1155: pcstart *= dof;
1156: pcend *= dof;
1157: PetscMalloc2(pn, &hta, pn, &hto);
1158: for (i = 0; i < pn; i++) {
1159: PetscHSetICreate(&hta[i]);
1160: PetscHSetICreate(&hto[i]);
1161: }
1162: /* Work on local part */
1163: /* 4) Pass 1: Estimate memory for C_loc */
1164: for (i = 0; i < am && pn; i++) {
1165: PetscHSetIClear(ht);
1166: PetscHSetIClear(oht);
1167: offset = i % dof;
1168: ii = i / dof;
1169: nzi = pd->i[ii + 1] - pd->i[ii];
1170: if (!nzi) continue;
1172: MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht);
1173: PetscHSetIGetSize(ht, &htsize);
1174: PetscHSetIGetSize(oht, &htosize);
1175: if (!(htsize + htosize)) continue;
1176: /* Form C(ii, :) */
1177: pdj = pd->j + pd->i[ii];
1178: for (j = 0; j < nzi; j++) {
1179: PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht);
1180: PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht);
1181: }
1182: }
1184: ISRestoreIndices(map, &mappingindices);
1186: PetscHSetIDestroy(&ht);
1187: PetscHSetIDestroy(&oht);
1189: /* Get remote data */
1190: PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1191: PetscFree(c_rmtj);
1193: for (i = 0; i < pn; i++) {
1194: nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1195: rdj = c_othj + ptap->c_othi[i];
1196: for (j = 0; j < nzi; j++) {
1197: col = rdj[j];
1198: /* diag part */
1199: if (col >= pcstart && col < pcend) {
1200: PetscHSetIAdd(hta[i], col);
1201: } else { /* off diag */
1202: PetscHSetIAdd(hto[i], col);
1203: }
1204: }
1205: PetscHSetIGetSize(hta[i], &htsize);
1206: dnz[i] = htsize;
1207: PetscHSetIDestroy(&hta[i]);
1208: PetscHSetIGetSize(hto[i], &htsize);
1209: onz[i] = htsize;
1210: PetscHSetIDestroy(&hto[i]);
1211: }
1213: PetscFree2(hta, hto);
1214: PetscFree(c_othj);
1216: /* local sizes and preallocation */
1217: MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1218: MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs);
1219: MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
1220: MatSetUp(Cmpi);
1221: PetscFree2(dnz, onz);
1223: /* attach the supporting struct to Cmpi for reuse */
1224: Cmpi->product->data = ptap;
1225: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1226: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1228: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1229: Cmpi->assembled = PETSC_FALSE;
1230: /* pick an algorithm */
1231: PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1232: alg = 0;
1233: PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg);
1234: PetscOptionsEnd();
1235: switch (alg) {
1236: case 0:
1237: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1238: break;
1239: case 1:
1240: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1241: break;
1242: default:
1243: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1244: }
1245: return 0;
1246: }
1248: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1249: {
1250: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C);
1251: return 0;
1252: }
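/* Merged all-at-once variant (algType 3): identical communication setup, but the remote
   (Po-driven) and local (Pd-driven) symbolic passes share a single traversal of the rows
   of A instead of two separate loops. */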
1254: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1255: {
1256: Mat_APMPI *ptap;
1257: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1258: MPI_Comm comm;
1259: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1260: MatType mtype;
1261: PetscSF sf;
1262: PetscSFNode *iremote;
1263: PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1264: const PetscInt *rootdegrees;
1265: PetscHSetI ht, oht, *hta, *hto, *htd;
1266: PetscInt pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1267: PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1268: PetscInt nalg = 2, alg = 0, offset, ii;
1269: PetscMPIInt owner;
1270: PetscBool flg;
1271: const char *algTypes[2] = {"merged", "overlapping"};
1272: const PetscInt *mappingindices;
1273: IS map;
1275: MatCheckProduct(Cmpi, 5);
1277: PetscObjectGetComm((PetscObject)A, &comm);
1279: /* Create symbolic parallel matrix Cmpi */
1280: MatGetLocalSize(P, NULL, &pn);
1281: pn *= dof;
1282: MatGetType(A, &mtype);
1283: MatSetType(Cmpi, mtype);
1284: MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1286: PetscNew(&ptap);
1287: ptap->reuse = MAT_INITIAL_MATRIX;
1288: ptap->algType = 3;
1290: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1291: MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth);
1292: PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map);
1294: /* This equals the number of off-diagonal columns in P */
1295: MatGetLocalSize(p->B, NULL, &pon);
1296: pon *= dof;
1297: /* offsets */
1298: PetscMalloc1(pon + 1, &ptap->c_rmti);
1299: /* The number of columns we will send to remote ranks */
1300: PetscMalloc1(pon, &c_rmtc);
1301: PetscMalloc1(pon, &hta);
1302: for (i = 0; i < pon; i++) PetscHSetICreate(&hta[i]);
1303: MatGetLocalSize(A, &am, NULL);
1304: MatGetOwnershipRange(A, &arstart, &arend);
1305: /* Create hash table to merge all columns for C(i, :) */
1306: PetscHSetICreate(&ht);
1307: PetscHSetICreate(&oht);
1308: PetscMalloc2(pn, &htd, pn, &hto);
1309: for (i = 0; i < pn; i++) {
1310: PetscHSetICreate(&htd[i]);
1311: PetscHSetICreate(&hto[i]);
1312: }
1314: ISGetIndices(map, &mappingindices);
1315: ptap->c_rmti[0] = 0;
1316: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1317: for (i = 0; i < am && (pon || pn); i++) {
1318: /* Form one row of AP */
1319: PetscHSetIClear(ht);
1320: PetscHSetIClear(oht);
1321: offset = i % dof;
1322: ii = i / dof;
1323: /* If the off diag is empty, we should not do any calculation */
1324: nzi = po->i[ii + 1] - po->i[ii];
1325: dnzi = pd->i[ii + 1] - pd->i[ii];
1326: if (!nzi && !dnzi) continue;
1328: MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht);
1329: PetscHSetIGetSize(ht, &htsize);
1330: PetscHSetIGetSize(oht, &htosize);
1331: /* If AP is empty, just continue */
1332: if (!(htsize + htosize)) continue;
1334: /* Form remote C(ii, :) */
1335: poj = po->j + po->i[ii];
1336: for (j = 0; j < nzi; j++) {
1337: PetscHSetIUpdate(hta[poj[j] * dof + offset], ht);
1338: PetscHSetIUpdate(hta[poj[j] * dof + offset], oht);
1339: }
1341: /* Form local C(ii, :) */
1342: pdj = pd->j + pd->i[ii];
1343: for (j = 0; j < dnzi; j++) {
1344: PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht);
1345: PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht);
1346: }
1347: }
1349: ISRestoreIndices(map, &mappingindices);
1351: PetscHSetIDestroy(&ht);
1352: PetscHSetIDestroy(&oht);
1354: for (i = 0; i < pon; i++) {
1355: PetscHSetIGetSize(hta[i], &htsize);
1356: ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1357: c_rmtc[i] = htsize;
1358: }
1360: PetscMalloc1(ptap->c_rmti[pon], &c_rmtj);
1362: for (i = 0; i < pon; i++) {
1363: off = 0;
1364: PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]);
1365: PetscHSetIDestroy(&hta[i]);
1366: }
1367: PetscFree(hta);
1369: PetscMalloc1(pon, &iremote);
1370: for (i = 0; i < pon; i++) {
1371: owner = 0;
1372: lidx = 0;
1373: offset = i % dof;
1374: ii = i / dof;
1375: PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx);
1376: iremote[i].index = lidx * dof + offset;
1377: iremote[i].rank = owner;
1378: }
1380: PetscSFCreate(comm, &sf);
1381: PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1382: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1383: PetscSFSetRankOrder(sf, PETSC_TRUE);
1384: PetscSFSetFromOptions(sf);
1385: PetscSFSetUp(sf);
1386: /* How many neighbors have contributions to my rows? */
1387: PetscSFComputeDegreeBegin(sf, &rootdegrees);
1388: PetscSFComputeDegreeEnd(sf, &rootdegrees);
1389: rootspacesize = 0;
1390: for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1391: PetscMalloc1(rootspacesize, &rootspace);
1392: PetscMalloc1(rootspacesize + 1, &rootspaceoffsets);
1393: /* Get information from leaves
1394: * Number of columns other people contribute to my rows
1395: * */
1396: PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace);
1397: PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace);
1398: PetscFree(c_rmtc);
1399: PetscMalloc1(pn + 1, &ptap->c_othi);
1400: /* The number of columns received for each row */
1401: ptap->c_othi[0] = 0;
1402: rootspacesize = 0;
1403: rootspaceoffsets[0] = 0;
1404: for (i = 0; i < pn; i++) {
1405: rcvncols = 0;
1406: for (j = 0; j < rootdegrees[i]; j++) {
1407: rcvncols += rootspace[rootspacesize];
1408: rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1409: rootspacesize++;
1410: }
1411: ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1412: }
1413: PetscFree(rootspace);
1415: PetscMalloc1(pon, &c_rmtoffsets);
1416: PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1417: PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets);
1418: PetscSFDestroy(&sf);
1419: PetscFree(rootspaceoffsets);
1421: PetscCalloc1(ptap->c_rmti[pon], &iremote);
1422: nleaves = 0;
1423: for (i = 0; i < pon; i++) {
1424: owner = 0;
1425: ii = i / dof;
1426: PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL);
1427: sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1428: for (j = 0; j < sendncols; j++) {
1429: iremote[nleaves].rank = owner;
1430: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1431: }
1432: }
1433: PetscFree(c_rmtoffsets);
1434: PetscCalloc1(ptap->c_othi[pn], &c_othj);
1436: PetscSFCreate(comm, &ptap->sf);
1437: PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
1438: PetscSFSetFromOptions(ptap->sf);
1439: /* One to one map */
1440: PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1441: /* Get remote data */
1442: PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE);
1443: PetscFree(c_rmtj);
1444: PetscMalloc2(pn, &dnz, pn, &onz);
1445: MatGetOwnershipRangeColumn(P, &pcstart, &pcend);
1446: pcstart *= dof;
1447: pcend *= dof;
1448: for (i = 0; i < pn; i++) {
1449: nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1450: rdj = c_othj + ptap->c_othi[i];
1451: for (j = 0; j < nzi; j++) {
1452: col = rdj[j];
1453: /* diag part */
1454: if (col >= pcstart && col < pcend) {
1455: PetscHSetIAdd(htd[i], col);
1456: } else { /* off diag */
1457: PetscHSetIAdd(hto[i], col);
1458: }
1459: }
1460: PetscHSetIGetSize(htd[i], &htsize);
1461: dnz[i] = htsize;
1462: PetscHSetIDestroy(&htd[i]);
1463: PetscHSetIGetSize(hto[i], &htsize);
1464: onz[i] = htsize;
1465: PetscHSetIDestroy(&hto[i]);
1466: }
1468: PetscFree2(htd, hto);
1469: PetscFree(c_othj);
1471: /* local sizes and preallocation */
1472: MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1473: MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs);
1474: MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
1475: PetscFree2(dnz, onz);
1477: /* attach the supporting struct to Cmpi for reuse */
1478: Cmpi->product->data = ptap;
1479: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1480: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1482: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1483: Cmpi->assembled = PETSC_FALSE;
1484: /* pick an algorithm */
1485: PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1486: alg = 0;
1487: PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg);
1488: PetscOptionsEnd();
1489: switch (alg) {
1490: case 0:
1491: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1492: break;
1493: case 1:
1494: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1495: break;
1496: default:
1497: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1498: }
1499: return 0;
1500: }
1502: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1503: {
1504: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C);
1505: return 0;
1506: }
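/* Nonscalable algorithm (algType 1): same overall structure as the scalable symbolic
   routine above, but the row merges use the dense variant of the condensed linked list
   (PetscLLCondensedCreate with a PetscBT bit array over all pN global columns), trading
   memory proportional to pN per rank for cheaper insertions. */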
1508: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1509: {
1510: Mat_APMPI *ptap;
1511: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1512: MPI_Comm comm;
1513: PetscMPIInt size, rank;
1514: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1515: PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1516: PetscInt *lnk, i, k, pnz, row, nsend;
1517: PetscBT lnkbt;
1518: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1519: PETSC_UNUSED PetscMPIInt icompleted = 0;
1520: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
1521: PetscInt len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1522: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1523: MPI_Request *swaits, *rwaits;
1524: MPI_Status *sstatus, rstatus;
1525: PetscLayout rowmap;
1526: PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1527: PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
1528: PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1529: Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1530: PetscScalar *apv;
1531: PetscTable ta;
1532: MatType mtype;
1533: const char *prefix;
1534: #if defined(PETSC_USE_INFO)
1535: PetscReal apfill;
1536: #endif
1538: MatCheckProduct(Cmpi, 4);
1540: PetscObjectGetComm((PetscObject)A, &comm);
1541: MPI_Comm_size(comm, &size);
1542: MPI_Comm_rank(comm, &rank);
1544: if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1546: /* create symbolic parallel matrix Cmpi */
1547: MatGetType(A, &mtype);
1548: MatSetType(Cmpi, mtype);
1550: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1551: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1553: /* create struct Mat_APMPI and attach it to C later */
1554: PetscNew(&ptap);
1555: ptap->reuse = MAT_INITIAL_MATRIX;
1556: ptap->algType = 1;
1558: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1559: MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth);
1560: /* get P_loc by taking all local rows of P */
1561: MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc);
1563: /* (0) compute Rd = Pd^T, Ro = Po^T */
1564: /* --------------------------------- */
1565: MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd);
1566: MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro);
1568: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1569: /* ----------------------------------------------------------------- */
1570: p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1571: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1573: /* create and initialize a linked list */
1574: PetscTableCreate(pn, pN, &ta); /* used to compute AP_loc and Cmpi */
1575: MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1576: MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1577: PetscTableGetCount(ta, &Crmax); /* Crmax = nnz(sum of Prows) */
1578: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1580: PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt);
1582: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1583: if (ao) {
1584: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space);
1585: } else {
1586: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space);
1587: }
1588: current_space = free_space;
1589: nspacedouble = 0;
1591: PetscMalloc1(am + 1, &api);
1592: api[0] = 0;
1593: for (i = 0; i < am; i++) {
1594: /* diagonal portion: Ad[i,:]*P */
1595: ai = ad->i;
1596: pi = p_loc->i;
1597: nzi = ai[i + 1] - ai[i];
1598: aj = ad->j + ai[i];
1599: for (j = 0; j < nzi; j++) {
1600: row = aj[j];
1601: pnz = pi[row + 1] - pi[row];
1602: Jptr = p_loc->j + pi[row];
1603: /* add non-zero cols of P into the sorted linked list lnk */
1604: PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt);
1605: }
1606: /* off-diagonal portion: Ao[i,:]*P */
1607: if (ao) {
1608: ai = ao->i;
1609: pi = p_oth->i;
1610: nzi = ai[i + 1] - ai[i];
1611: aj = ao->j + ai[i];
1612: for (j = 0; j < nzi; j++) {
1613: row = aj[j];
1614: pnz = pi[row + 1] - pi[row];
1615: Jptr = p_oth->j + pi[row];
1616: PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt);
1617: }
1618: }
1619: apnz = lnk[0];
1620: api[i + 1] = api[i] + apnz;
1621: if (ap_rmax < apnz) ap_rmax = apnz;
1623: /* if free space is not available, double the total space in the list */
1624: if (current_space->local_remaining < apnz) {
1625: PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space);
1626: nspacedouble++;
1627: }
1629: /* Copy data into free space, then initialize lnk */
1630: PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt);
1632: current_space->array += apnz;
1633: current_space->local_used += apnz;
1634: current_space->local_remaining -= apnz;
1635: }
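/*
   Note on the loop above (descriptive comment): this is a symbolic sparse row merge.
   For row i of A, the column patterns of the P_loc/P_oth rows selected by Ad[i,:] and
   Ao[i,:] are unioned into the sorted condensed linked list lnk, then flushed into the
   free-space buffer.  Small illustrative example: if Ad[i,:] selects P rows with column
   sets {0,3} and {3,7}, and Ao[i,:] selects a P_oth row with columns {2,3}, the merged
   pattern of AP[i,:] is {0,2,3,7}, so apnz = 4 and api[i+1] = api[i] + 4.
*/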
1636: /* Allocate space for apj and apv, initialize apj, and */
1637: /* destroy list of free space and other temporary array(s) */
1638: PetscMalloc2(api[am], &apj, api[am], &apv);
1639: PetscFreeSpaceContiguous(&free_space, apj);
1640: PetscLLDestroy(lnk, lnkbt);
1642: /* Create AP_loc for reuse */
1643: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc);
1644: MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name);
1645: #if defined(PETSC_USE_INFO)
1646: if (ao) {
1647: apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1648: } else {
1649: apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1650: }
1651: ptap->AP_loc->info.mallocs = nspacedouble;
1652: ptap->AP_loc->info.fill_ratio_given = fill;
1653: ptap->AP_loc->info.fill_ratio_needed = apfill;
1655: if (api[am]) {
1656: PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill);
1657: PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill);
1658: } else {
1659: PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n");
1660: }
1661: #endif
1663: /* (2-1) compute symbolic Co = Ro*AP_loc */
1664: /* ------------------------------------ */
1665: MatGetOptionsPrefix(A, &prefix);
1666: MatSetOptionsPrefix(ptap->Ro, prefix);
1667: MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_");
1668: MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth);
1669: MatGetOptionsPrefix(Cmpi, &prefix);
1670: MatSetOptionsPrefix(ptap->C_oth, prefix);
1671: MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_");
1672: MatProductSetType(ptap->C_oth, MATPRODUCT_AB);
1673: MatProductSetAlgorithm(ptap->C_oth, "default");
1674: MatProductSetFill(ptap->C_oth, fill);
1675: MatProductSetFromOptions(ptap->C_oth);
1676: MatProductSymbolic(ptap->C_oth);
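/*
   The inner product above is created through the generic MatProduct API and inherits
   option prefixes ("inner_offdiag_" on Ro, "inner_C_oth_" on C_oth), so its behavior
   can in principle be tuned from the command line with options carrying those prefixes
   (for example a prefixed -mat_product_algorithm); treat the exact option names as an
   assumption to be checked against the MatProduct documentation.
*/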
1678: /* (3) send coj of C_oth to other processors */
1679: /* ------------------------------------------ */
1680: /* determine row ownership */
1681: PetscLayoutCreate(comm, &rowmap);
1682: rowmap->n = pn;
1683: rowmap->bs = 1;
1684: PetscLayoutSetUp(rowmap);
1685: owners = rowmap->range;
1687: /* determine the number of messages to send, their lengths */
1688: PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co);
1689: PetscArrayzero(len_s, size);
1690: PetscArrayzero(len_si, size);
1692: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1693: coi = c_oth->i;
1694: coj = c_oth->j;
1695: con = ptap->C_oth->rmap->n;
1696: proc = 0;
1697: for (i = 0; i < con; i++) {
1698: while (prmap[i] >= owners[proc + 1]) proc++;
1699: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1700: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1701: }
1703: len = 0; /* max length of buf_si[], see (4) */
1704: owners_co[0] = 0;
1705: nsend = 0;
1706: for (proc = 0; proc < size; proc++) {
1707: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1708: if (len_s[proc]) {
1709: nsend++;
1710: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1711: len += len_si[proc];
1712: }
1713: }
1715: /* determine the number and length of messages to receive for coi and coj */
1716: PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv);
1717: PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri);
1719: /* post the Irecv and Isend of coj */
1720: PetscCommGetNewTag(comm, &tagj);
1721: PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits);
1722: PetscMalloc1(nsend + 1, &swaits);
1723: for (proc = 0, k = 0; proc < size; proc++) {
1724: if (!len_s[proc]) continue;
1725: i = owners_co[proc];
1726: MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k);
1727: k++;
1728: }
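/*
   Communication pattern (descriptive comment): len_si[] and len_s[] count, per
   destination rank, the number of Co rows and nonzeros to send.  The standard
   two-phase handshake (PetscGatherNumberOfMessages() followed by
   PetscGatherMessageLengths2()) tells each process how many messages it will receive
   and how long each one is, after which the coj payload is exchanged with nonblocking
   MPI_Isend/MPI_Irecv, overlapping the local C_loc symbolic product computed below.
*/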
1730: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1731: /* ---------------------------------------- */
1732: MatSetOptionsPrefix(ptap->Rd, prefix);
1733: MatAppendOptionsPrefix(ptap->Rd, "inner_diag_");
1734: MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc);
1735: MatGetOptionsPrefix(Cmpi, &prefix);
1736: MatSetOptionsPrefix(ptap->C_loc, prefix);
1737: MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_");
1738: MatProductSetType(ptap->C_loc, MATPRODUCT_AB);
1739: MatProductSetAlgorithm(ptap->C_loc, "default");
1740: MatProductSetFill(ptap->C_loc, fill);
1741: MatProductSetFromOptions(ptap->C_loc);
1742: MatProductSymbolic(ptap->C_loc);
1744: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1746: /* wait until the receives of coj are complete */
1747: for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
1748: PetscFree(rwaits);
1749: if (nsend) MPI_Waitall(nsend, swaits, sstatus);
1751: /* add received column indices into ta to update Crmax */
1752: for (k = 0; k < nrecv; k++) { /* k-th received message */
1753: Jptr = buf_rj[k];
1754: for (j = 0; j < len_r[k]; j++) PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES);
1755: }
1756: PetscTableGetCount(ta, &Crmax);
1757: PetscTableDestroy(&ta);
1759: /* (4) send and recv coi */
1760: /*-----------------------*/
1761: PetscCommGetNewTag(comm, &tagi);
1762: PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits);
1763: PetscMalloc1(len + 1, &buf_s);
1764: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1765: for (proc = 0, k = 0; proc < size; proc++) {
1766: if (!len_s[proc]) continue;
1767: /* form outgoing message for i-structure:
1768: buf_si[0]: nrows to be sent
1769: [1:nrows]: row index (local to the destination process)
1770: [nrows+1:2*nrows+1]: i-structure index
1771: */
1772: /*-------------------------------------------*/
1773: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1774: buf_si_i = buf_si + nrows + 1;
1775: buf_si[0] = nrows;
1776: buf_si_i[0] = 0;
1777: nrows = 0;
1778: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1779: nzi = coi[i + 1] - coi[i];
1780: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
1781: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
1782: nrows++;
1783: }
1784: MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k);
1785: k++;
1786: buf_si += len_si[proc];
1787: }
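/*
   Worked example of one buf_si message packed above (illustrative numbers): suppose
   two rows of Co go to [proc], with destination-local row indices 4 and 7 and with
   3 and 5 nonzeros respectively.  Then nrows = 2, len_si[proc] = 2*(2+1) = 6, and
     buf_si = { 2, 4, 7, 0, 3, 8 }
   i.e. the row count, the local row indices, and the running i-structure offsets
   0, 3, 3+5.
*/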
1788: for (i = 0; i < nrecv; i++) MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus);
1789: PetscFree(rwaits);
1790: if (nsend) MPI_Waitall(nsend, swaits, sstatus);
1792: PetscFree4(len_s, len_si, sstatus, owners_co);
1793: PetscFree(len_ri);
1794: PetscFree(swaits);
1795: PetscFree(buf_s);
1797: /* (5) compute the local portion of Cmpi */
1798: /* ------------------------------------------ */
1799: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1800: PetscFreeSpaceGet(Crmax, &free_space);
1801: current_space = free_space;
1803: PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci);
1804: for (k = 0; k < nrecv; k++) {
1805: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1806: nrows = *buf_ri_k[k];
1807: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1808: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1809: }
1811: MatPreallocateBegin(comm, pn, pn, dnz, onz);
1812: PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt);
1813: for (i = 0; i < pn; i++) {
1814: /* add C_loc into Cmpi */
1815: nzi = c_loc->i[i + 1] - c_loc->i[i];
1816: Jptr = c_loc->j + c_loc->i[i];
1817: PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt);
1819: /* add received col data into lnk */
1820: for (k = 0; k < nrecv; k++) { /* k-th received message */
1821: if (i == *nextrow[k]) { /* i-th row */
1822: nzi = *(nextci[k] + 1) - *nextci[k];
1823: Jptr = buf_rj[k] + *nextci[k];
1824: PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt);
1825: nextrow[k]++;
1826: nextci[k]++;
1827: }
1828: }
1829: nzi = lnk[0];
1831: /* copy data into free space, then initialize lnk */
1832: PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt);
1833: MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz);
1834: }
1835: PetscFree3(buf_ri_k, nextrow, nextci);
1836: PetscLLDestroy(lnk, lnkbt);
1837: PetscFreeSpaceDestroy(free_space);
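/*
   At this point dnz[i] and onz[i] hold, for local row i of Cmpi, the number of merged
   nonzeros falling in the diagonal and off-diagonal blocks respectively
   (MatPreallocateSet() classifies each column against the owning process's column
   range).  These counts drive the preallocation below so that the numeric phase can
   insert values without further allocation.
*/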
1839: /* local sizes and preallocation */
1840: MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE);
1841: if (P->cmap->bs > 0) {
1842: PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs);
1843: PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs);
1844: }
1845: MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz);
1846: MatPreallocateEnd(dnz, onz);
1848: /* members in merge */
1849: PetscFree(id_r);
1850: PetscFree(len_r);
1851: PetscFree(buf_ri[0]);
1852: PetscFree(buf_ri);
1853: PetscFree(buf_rj[0]);
1854: PetscFree(buf_rj);
1855: PetscLayoutDestroy(&rowmap);
1857: PetscCalloc1(pN, &ptap->apa);
1859: /* attach the supporting struct to Cmpi for reuse */
1860: Cmpi->product->data = ptap;
1861: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1862: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1864: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1865: Cmpi->assembled = PETSC_FALSE;
1866: return 0;
1867: }
1869: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1870: {
1871: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1872: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
1873: Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
1874: Mat_APMPI *ptap;
1875: Mat AP_loc, C_loc, C_oth;
1876: PetscInt i, rstart, rend, cm, ncols, row;
1877: PetscInt *api, *apj, am = A->rmap->n, j, col, apnz;
1878: PetscScalar *apa;
1879: const PetscInt *cols;
1880: const PetscScalar *vals;
1882: MatCheckProduct(C, 3);
1883: ptap = (Mat_APMPI *)C->product->data;
1887: MatZeroEntries(C);
1888: /* 1) get R = Pd^T,Ro = Po^T */
1889: if (ptap->reuse == MAT_REUSE_MATRIX) {
1890: MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd);
1891: MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro);
1892: }
1894: /* 2) get AP_loc */
1895: AP_loc = ptap->AP_loc;
1896: ap = (Mat_SeqAIJ *)AP_loc->data;
1898: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1899: /*-----------------------------------------------------*/
1900: if (ptap->reuse == MAT_REUSE_MATRIX) {
1901: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1902: MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth);
1903: MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc);
1904: }
1906: /* 2-2) compute numeric A_loc*P - dominating part */
1907: /* ---------------------------------------------- */
1908: /* get data from symbolic products */
1909: p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1910: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1911: apa = ptap->apa;
1912: api = ap->i;
1913: apj = ap->j;
1914: for (i = 0; i < am; i++) {
1915: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1916: AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1917: apnz = api[i + 1] - api[i];
1918: for (j = 0; j < apnz; j++) {
1919: col = apj[j + api[i]];
1920: ap->a[j + ap->i[i]] = apa[col];
1921: apa[col] = 0.0;
1922: }
1923: }
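/*
   Numeric kernel (descriptive comment): AProw_nonscalable() accumulates row i of A*P
   into the dense work array apa of length pN, indexed by global column.  The inner
   loop then gathers only the columns recorded in the symbolic pattern apj into ap->a
   and resets those apa entries to zero, leaving the accumulator clean for the next row.
*/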
1924: /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1925: PetscObjectStateIncrease((PetscObject)AP_loc);
1927: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1928: MatProductNumeric(ptap->C_loc);
1929: MatProductNumeric(ptap->C_oth);
1930: C_loc = ptap->C_loc;
1931: C_oth = ptap->C_oth;
1933: /* add C_loc and Co to C */
1934: MatGetOwnershipRange(C, &rstart, &rend);
1936: /* C_loc -> C */
1937: cm = C_loc->rmap->N;
1938: c_seq = (Mat_SeqAIJ *)C_loc->data;
1939: cols = c_seq->j;
1940: vals = c_seq->a;
1942: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1943: /* there are no off-processor parts. */
1944: /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1945: /* is no longer valid. In that case the more general function MatSetValues_MPIAIJ() must be used, which looks up the column index */
1946: /* in a table and performs additional bookkeeping. */
1947: if (C->assembled) {
1948: C->was_assembled = PETSC_TRUE;
1949: C->assembled = PETSC_FALSE;
1950: }
1951: if (C->was_assembled) {
1952: for (i = 0; i < cm; i++) {
1953: ncols = c_seq->i[i + 1] - c_seq->i[i];
1954: row = rstart + i;
1955: MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES);
1956: cols += ncols;
1957: vals += ncols;
1958: }
1959: } else {
1960: MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a);
1961: }
1963: /* Co -> C, off-processor part */
1964: cm = C_oth->rmap->N;
1965: c_seq = (Mat_SeqAIJ *)C_oth->data;
1966: cols = c_seq->j;
1967: vals = c_seq->a;
1968: for (i = 0; i < cm; i++) {
1969: ncols = c_seq->i[i + 1] - c_seq->i[i];
1970: row = p->garray[i];
1971: MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES);
1972: cols += ncols;
1973: vals += ncols;
1974: }
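/*
   For the off-processor part, p->garray[i] gives the global row of C that the i-th row
   of C_oth contributes to; that row is owned by another process, so MatSetValues() with
   ADD_VALUES stashes the entries and they are communicated during the
   MatAssemblyBegin/End below.
*/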
1976: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
1977: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
1979: ptap->reuse = MAT_REUSE_MATRIX;
1980: return 0;
1981: }
1983: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1984: {
1985: Mat_Product *product = C->product;
1986: Mat A = product->A, P = product->B;
1987: MatProductAlgorithm alg = product->alg;
1988: PetscReal fill = product->fill;
1989: PetscBool flg;
1991: /* scalable: do R=P^T locally, then C=R*A*P */
1992: PetscStrcmp(alg, "scalable", &flg);
1993: if (flg) {
1994: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C);
1995: C->ops->productnumeric = MatProductNumeric_PtAP;
1996: goto next;
1997: }
1999: /* nonscalable: do R=P^T locally, then C=R*A*P */
2000: PetscStrcmp(alg, "nonscalable", &flg);
2001: if (flg) {
2002: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C);
2003: goto next;
2004: }
2006: /* allatonce */
2007: PetscStrcmp(alg, "allatonce", &flg);
2008: if (flg) {
2009: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C);
2010: goto next;
2011: }
2013: /* allatonce_merged */
2014: PetscStrcmp(alg, "allatonce_merged", &flg);
2015: if (flg) {
2016: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C);
2017: goto next;
2018: }
2020: /* backend general code */
2021: PetscStrcmp(alg, "backend", &flg);
2022: if (flg) {
2023: MatProductSymbolic_MPIAIJBACKEND(C);
2024: return 0;
2025: }
2027: /* hypre */
2028: #if defined(PETSC_HAVE_HYPRE)
2029: PetscStrcmp(alg, "hypre", &flg);
2030: if (flg) {
2031: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C);
2032: C->ops->productnumeric = MatProductNumeric_PtAP;
2033: return 0;
2034: }
2035: #endif
2036: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2038: next:
2039: C->ops->productnumeric = MatProductNumeric_PtAP;
2040: return 0;
2041: }
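/*
   Usage sketch (illustrative, not part of the library source): selecting one of the
   algorithms dispatched above through the MatProduct interface.  Assumes A and P are
   MATMPIAIJ; the algorithm string must match one of the names tested above
   ("scalable", "nonscalable", "allatonce", "allatonce_merged", "backend", or "hypre"
   when PETSc is configured with hypre).

     Mat C;
     MatProductCreate(A, P, NULL, &C);
     MatProductSetType(C, MATPRODUCT_PtAP);
     MatProductSetAlgorithm(C, "allatonce");
     MatProductSetFill(C, PETSC_DEFAULT);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);   // dispatches to MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ()
     MatProductNumeric(C);
     MatDestroy(&C);
*/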