Actual source code: mmdense.c
/*
   Support for the parallel dense matrix vector multiply
*/
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petscblaslapack.h>
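
/*
   MatSetUpMultiply_MPIDense - builds the communication context used by the parallel
   matrix-vector product: a sequential work vector (lvec) sized to the local dense block,
   and a PetscSF with an ALLGATHER pattern so every process obtains the full input vector
   before applying its local block.
*/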
PetscErrorCode MatSetUpMultiply_MPIDense(Mat mat)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;

  if (!mdn->Mvctx) {
    /* Create local vector that is used to scatter into */
    VecDestroy(&mdn->lvec);
    if (mdn->A) { MatCreateVecs(mdn->A, &mdn->lvec, NULL); }
    PetscLayoutSetUp(mat->cmap);
    PetscSFCreate(PetscObjectComm((PetscObject)mat), &mdn->Mvctx);
    PetscSFSetGraphWithPattern(mdn->Mvctx, mat->cmap, PETSCSF_PATTERN_ALLGATHER);
  }
  return 0;
}

static PetscErrorCode MatCreateSubMatrices_MPIDense_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
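
/*
   MatCreateSubMatrices_MPIDense - extracts the requested sequential dense submatrices in
   stages so the work buffers (which scale with the global column count N) stay near the
   20 MB heuristic per stage; the reduction below makes every process run the same number
   of stages, keeping the communication inside the worker routine synchronized.
*/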
PetscErrorCode MatCreateSubMatrices_MPIDense(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt nmax, nstages_local, nstages, i, pos, max_no;

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) PetscCalloc1(ismax + 1, submat);
  /* Determine the number of stages through which submatrices are done */
  nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;
  nstages_local = ismax / nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  MPIU_Allreduce(&nstages_local, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));

  for (i = 0, pos = 0; i < nstages; i++) {
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else max_no = ismax - pos;
    MatCreateSubMatrices_MPIDense_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos);
    pos += max_no;
  }
  return 0;
}

/* -------------------------------------------------------------------------*/
PetscErrorCode MatCreateSubMatrices_MPIDense_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
{
  Mat_MPIDense    *c = (Mat_MPIDense *)C->data;
  Mat              A = c->A;
  Mat_SeqDense    *a = (Mat_SeqDense *)A->data, *mat;
  PetscMPIInt      rank, size, tag0, tag1, idex, end, i;
  PetscInt         N = C->cmap->N, rstart = C->rmap->rstart, count;
  const PetscInt **irow, **icol, *irow_i;
  PetscInt        *nrow, *ncol, *w1, *w3, *w4, *rtable, start;
  PetscInt       **sbuf1, m, j, k, l, ct1, **rbuf1, row, proc;
  PetscInt         nrqs, msz, **ptr, *ctr, *pa, *tmp, bsz, nrqr;
  PetscInt         is_no, jmax, **rmap, *rmap_i;
  PetscInt         ctr_j, *sbuf1_j, *rbuf1_i;
  MPI_Request     *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status      *r_status1, *r_status2, *s_status1, *s_status2;
  MPI_Comm         comm;
  PetscScalar    **rbuf2, **sbuf2;
  PetscBool        sorted;

  PetscObjectGetComm((PetscObject)C, &comm);
  tag0 = ((PetscObject)C)->tag;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  m = C->rmap->N;

  /* Get some new tags to keep the communication clean */
  PetscObjectGetNewTag((PetscObject)C, &tag1);
  /* Check that the row and column indices are sorted */
  for (i = 0; i < ismax; i++) {
    ISSorted(isrow[i], &sorted);
    if (!sorted) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "ISrow is not sorted");
    ISSorted(iscol[i], &sorted);
    if (!sorted) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "IScol is not sorted");
  }
  PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, m, &rtable);
  for (i = 0; i < ismax; i++) {
    ISGetIndices(isrow[i], &irow[i]);
    ISGetIndices(iscol[i], &icol[i]);
    ISGetLocalSize(isrow[i], &nrow[i]);
    ISGetLocalSize(iscol[i], &ncol[i]);
  }

  /* Create hash table for the mapping: row -> proc */
  for (i = 0, j = 0; i < size; i++) {
    jmax = C->rmap->range[i + 1];
    for (; j < jmax; j++) rtable[j] = i;
  }
  /* Evaluate communication - to whom to send messages, how long they are, and how much
     buffer space is required. Based on this, buffers are allocated and data copied into them */
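  /* w1[2*j] accumulates the number of rows this process needs from process j (over all
     index sets), w3[j] counts how many index sets need at least one row from process j,
     and w4 is per-index-set scratch */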
  PetscMalloc3(2 * size, &w1, size, &w3, size, &w4);
  PetscArrayzero(w1, size * 2); /* initialize work vector */
  PetscArrayzero(w3, size);     /* initialize work vector */
  for (i = 0; i < ismax; i++) {
    PetscArrayzero(w4, size); /* initialize work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j = 0; j < jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[2 * j] += w4[j];
        w3[j]++;
      }
    }
  }
  nrqs         = 0; /* no of outgoing messages */
  msz          = 0; /* total mesg length (for all procs) */
  w1[2 * rank] = 0; /* no mesg sent to self */
  w3[rank]     = 0;
  for (i = 0; i < size; i++) {
    if (w1[2 * i]) {
      w1[2 * i + 1] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }

  PetscMalloc1(nrqs + 1, &pa); /* (proc array) */
  for (i = 0, j = 0; i < size; i++) {
    if (w1[2 * i]) {
      pa[j] = i;
      j++;
    }
  }
  /* Each message would have a header = 1 + 2*(no of IS) + data */
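  /* Layout of a request message to process j:
       sbuf1[j][0]                     - number of index sets that need rows from j
       sbuf1[j][2*k-1], sbuf1[j][2*k]  - index-set number and row count for the k-th such set
       remaining entries               - the requested global row numbers, grouped by index set */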
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[2 * j] += w1[2 * j + 1] + 2 * w3[j];
    msz += w1[2 * j];
  }
  /* Do a global reduction to determine how many messages to expect */
  PetscMaxSum(comm, w1, &bsz, &nrqr);
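  /* bsz is now the length of the longest message this process will receive;
     nrqr is the number of processes that will send it a request */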
  /* Allocate memory for recv buffers. Make sure rbuf1[0] exists by adding 1 to the buffer length */
  PetscMalloc1(nrqr + 1, &rbuf1);
  PetscMalloc1(nrqr * bsz, &rbuf1[0]);
  for (i = 1; i < nrqr; ++i) rbuf1[i] = rbuf1[i - 1] + bsz;

  /* Post the receives */
  PetscMalloc1(nrqr + 1, &r_waits1);
  for (i = 0; i < nrqr; ++i) MPI_Irecv(rbuf1[i], bsz, MPIU_INT, MPI_ANY_SOURCE, tag0, comm, r_waits1 + i);
  /* Allocate memory for outgoing messages */
  PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
  PetscArrayzero(sbuf1, size);
  PetscArrayzero(ptr, size);
  {
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      sbuf1[j] = iptr;
      ict      = w1[2 * j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i = 0; i < nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]);
    ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
  }
  /* Parse the isrow and copy data into outbuf */
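  /* rows owned by this process are not sent anywhere; they are copied directly
     from the local block in the assembly loop further below */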
  for (i = 0; i < ismax; i++) {
    PetscArrayzero(ctr, size);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j = 0; j < size; j++) { /* Can optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j            = sbuf1[j];
        k                  = ++sbuf1_j[0];
        sbuf1_j[2 * k]     = ctr_j;
        sbuf1_j[2 * k - 1] = i;
      }
    }
  }

  /* Now post the sends */
  PetscMalloc1(nrqs + 1, &s_waits1);
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    MPI_Isend(sbuf1[j], w1[2 * j], MPIU_INT, j, tag0, comm, s_waits1 + i);
  }

  /* Post receives to capture the row data from other procs */
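  /* each reply contains N values (one full row of C) for every row this process
     requested from process j, hence count = (number of requested rows) * N below */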
  PetscMalloc1(nrqs + 1, &r_waits2);
  PetscMalloc1(nrqs + 1, &rbuf2);
  for (i = 0; i < nrqs; i++) {
    j     = pa[i];
    count = (w1[2 * j] - (2 * sbuf1[j][0] + 1)) * N;
    PetscMalloc1(count + 1, &rbuf2[i]);
    MPI_Irecv(rbuf2[i], count, MPIU_SCALAR, j, tag1, comm, r_waits2 + i);
  }

  /* Receive the messages (row numbers), then pack and send off the row values
     to the requesting processes */
  PetscMalloc1(nrqr + 1, &s_waits2);
  PetscMalloc1(nrqr + 1, &r_status1);
  PetscMalloc1(nrqr + 1, &sbuf2);
  {
    PetscScalar *sbuf2_i, *v_start;
    PetscInt     s_proc;
    for (i = 0; i < nrqr; ++i) {
      MPI_Waitany(nrqr, r_waits1, &idex, r_status1 + i);
      s_proc  = r_status1[i].MPI_SOURCE; /* the requesting processor, i.e. the destination of the reply */
      rbuf1_i = rbuf1[idex];             /* actual message from s_proc */
      /* no of rows = end - start; start is the offset of the first row index in the
         buffer (just past the header), end is the length of the received message */
      start = 2 * rbuf1_i[0] + 1;
      MPI_Get_count(r_status1 + i, MPIU_INT, &end);
      /* allocate memory sufficient to hold all the row values */
      PetscMalloc1((end - start) * N, &sbuf2[idex]);
      sbuf2_i = sbuf2[idex];
      /* Now pack the data */
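      /* the local block is stored column-major with leading dimension a->lda, so
         consecutive entries of one row are a->lda apart; each requested row is
         copied into a contiguous run of N scalars in the send buffer */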
      for (j = start; j < end; j++) {
        row     = rbuf1_i[j] - rstart;
        v_start = a->v + row;
        for (k = 0; k < N; k++) {
          sbuf2_i[0] = v_start[0];
          sbuf2_i++;
          v_start += a->lda;
        }
      }
      /* Now send off the data */
      MPI_Isend(sbuf2[idex], (end - start) * N, MPIU_SCALAR, s_proc, tag1, comm, s_waits2 + i);
    }
  }
  /* End Send-Recv of IS + row_numbers */
  PetscFree(r_status1);
  PetscFree(r_waits1);
  PetscMalloc1(nrqs + 1, &s_status1);
  if (nrqs) MPI_Waitall(nrqs, s_waits1, s_status1);
  PetscFree(s_status1);
  PetscFree(s_waits1);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    for (i = 0; i < ismax; i++) {
      mat = (Mat_SeqDense *)(submats[i]->data);
      PetscArrayzero(mat->v, submats[i]->rmap->n * submats[i]->cmap->n);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i = 0; i < ismax; i++) {
      MatCreate(PETSC_COMM_SELF, submats + i);
      MatSetSizes(submats[i], nrow[i], ncol[i], nrow[i], ncol[i]);
      MatSetType(submats[i], ((PetscObject)A)->type_name);
      MatSeqDenseSetPreallocation(submats[i], NULL);
    }
  }

  /* Assemble the matrices */
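  /* First fill in the locally owned rows straight from the local block; both the source
     and the submatrix are column-major, with leading dimensions a->lda and m = nrow[i],
     so entry (j,k) of the submatrix is imat_v[j + k*m] */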
  {
    PetscInt     col;
    PetscScalar *imat_v, *mat_v, *imat_vi, *mat_vi;

    for (i = 0; i < ismax; i++) {
      mat    = (Mat_SeqDense *)submats[i]->data;
      mat_v  = a->v;
      imat_v = mat->v;
      irow_i = irow[i];
      m      = nrow[i];
      for (j = 0; j < m; j++) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          row     = row - rstart;
          mat_vi  = mat_v + row;
          imat_vi = imat_v + j;
          for (k = 0; k < ncol[i]; k++) {
            col            = icol[i][k];
            imat_vi[k * m] = mat_vi[col * a->lda];
          }
        }
      }
    }
  }

  /* Create row map: this maps a global row of C to the corresponding row of each submat */
  /* this is a very expensive operation wrt memory usage */
  PetscMalloc1(ismax, &rmap);
  PetscCalloc1(ismax * C->rmap->N, &rmap[0]);
  for (i = 1; i < ismax; i++) rmap[i] = rmap[i - 1] + C->rmap->N;
  for (i = 0; i < ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
  }

  /* Now receive the row values and assemble the rest of the matrix */
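  /* the header of the matching request (sbuf1[pa[i]]) is replayed to decode each reply:
     for every index set it names, is_no and is_sz give the submatrix and the number of
     rows, the following entries give the original global row numbers, and rbuf2_i
     supplies N values per row */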
  PetscMalloc1(nrqs + 1, &r_status2);
  {
    PetscInt     is_max, tmp1, col, *sbuf1_i, is_sz;
    PetscScalar *rbuf2_i, *imat_v, *imat_vi;

    for (tmp1 = 0; tmp1 < nrqs; tmp1++) { /* For each message */
      MPI_Waitany(nrqs, r_waits2, &i, r_status2 + tmp1);
      /* Now dig out the corresponding sbuf1, which contains the IS data_structure */
      sbuf1_i = sbuf1[pa[i]];
      is_max  = sbuf1_i[0];
      ct1     = 2 * is_max + 1;
      rbuf2_i = rbuf2[i];
      for (j = 1; j <= is_max; j++) { /* For each IS belonging to the message */
        is_no  = sbuf1_i[2 * j - 1];
        is_sz  = sbuf1_i[2 * j];
        mat    = (Mat_SeqDense *)submats[is_no]->data;
        imat_v = mat->v;
        rmap_i = rmap[is_no];
        m      = nrow[is_no];
        for (k = 0; k < is_sz; k++, rbuf2_i += N) { /* For each row */
          row = sbuf1_i[ct1];
          ct1++;
          row     = rmap_i[row];
          imat_vi = imat_v + row;
          for (l = 0; l < ncol[is_no]; l++) { /* For each col */
            col            = icol[is_no][l];
            imat_vi[l * m] = rbuf2_i[col];
          }
        }
      }
    }
  }
  /* End Send-Recv of row_values */
  PetscFree(r_status2);
  PetscFree(r_waits2);
  PetscMalloc1(nrqr + 1, &s_status2);
  if (nrqr) MPI_Waitall(nrqr, s_waits2, s_status2);
  PetscFree(s_status2);
  PetscFree(s_waits2);

  /* Restore the indices */
  for (i = 0; i < ismax; i++) {
    ISRestoreIndices(isrow[i], irow + i);
    ISRestoreIndices(iscol[i], icol + i);
  }

  PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, rtable);
  PetscFree3(w1, w3, w4);
  PetscFree(pa);

  for (i = 0; i < nrqs; ++i) PetscFree(rbuf2[i]);
  PetscFree(rbuf2);
  PetscFree4(sbuf1, ptr, tmp, ctr);
  PetscFree(rbuf1[0]);
  PetscFree(rbuf1);

  for (i = 0; i < nrqr; ++i) PetscFree(sbuf2[i]);
  PetscFree(sbuf2);
  PetscFree(rmap[0]);
  PetscFree(rmap);

  for (i = 0; i < ismax; i++) {
    MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY);
  }
  return 0;
}
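
/* Scaling an MPIDENSE matrix is purely local: each process scales its own sequential
   dense block, so no communication is needed */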
PETSC_INTERN PetscErrorCode MatScale_MPIDense(Mat inA, PetscScalar alpha)
{
  Mat_MPIDense *A = (Mat_MPIDense *)inA->data;

  MatScale(A->A, alpha);
  return 0;
}
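
/*
   Example use of the submatrix extraction path from application code (a minimal sketch,
   not part of this file; C is assumed to be an assembled MPIDENSE matrix, and error
   checking and cleanup are omitted):

     IS   isrow, iscol;
     Mat *submat;

     ISCreateStride(PETSC_COMM_SELF, 5, 0, 1, &isrow);  // first 5 global rows
     ISCreateStride(PETSC_COMM_SELF, 5, 0, 1, &iscol);  // first 5 global columns (sorted, as required above)
     MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submat);  // submat[0] is a sequential dense matrix
*/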