Actual source code: aijperm.c
/*
  Defines basic operations for the MATSEQAIJPERM matrix class.
  This class is derived from the MATSEQAIJ class and retains the
  compressed row storage (aka Yale sparse matrix format) but augments
  it with some permutation information that enables some operations
  to be more vectorizable. A physically rearranged copy of the matrix
  may be stored if the user desires.

  Eventually a variety of permutations may be supported.
*/

#include <../src/mat/impls/aij/seq/aij.h>

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
#include <immintrin.h>

#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8
#endif
#if !defined(_MM_SCALE_4)
#define _MM_SCALE_4 4
#endif
#endif

#define NDIM 512
/* NDIM specifies how many rows at a time we should work with when
 * performing the vectorized mat-vec. This depends on various factors
 * such as vector register length, etc., and I really need to add a
 * way for the user (or the library) to tune this. I'm setting it to
 * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 * routines. */
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt  ngroup;
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins. */

  PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet. */
  PetscInt   blocksize;
  PetscInt   nstep;
  PetscInt  *jstart_list;
  PetscInt  *jend_list;
  PetscInt  *action_list;
  PetscInt  *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;
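
/* Illustrative example (added for exposition, not part of the original source):
 * for a hypothetical 4-row matrix whose rows contain {2, 1, 2, 1} nonzeros,
 * the bucketing in MatSeqAIJPERM_create_perm() below would produce
 *
 *   ngroup  = 2
 *   nzgroup = {1, 2}        (group 0 holds the 1-nonzero rows, group 1 the 2-nonzero rows)
 *   xgroup  = {0, 2, 4}     (group i occupies iperm[xgroup[i] .. xgroup[i+1]-1])
 *   iperm   = {1, 3, 0, 2}  (original row indices, sorted by nonzero count)
 *
 * so rows 1 and 3 form the first group and rows 0 and 2 the second. */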
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
{
  /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
  /* so we will ignore 'MatType type'. */
  Mat             B       = *newmat;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(A, MAT_COPY_VALUES, &B);
    aijperm = (Mat_SeqAIJPERM *)B->spptr;
  }

  /* Reset the original function pointers. */
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy     = MatDestroy_SeqAIJ;
  B->ops->duplicate   = MatDuplicate_SeqAIJ;
  B->ops->mult        = MatMult_SeqAIJ;
  B->ops->multadd     = MatMultAdd_SeqAIJ;

  PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL);

  /* Free everything in the Mat_SeqAIJPERM data structure. */
  PetscFree(aijperm->xgroup);
  PetscFree(aijperm->nzgroup);
  PetscFree(aijperm->iperm);
  PetscFree(B->spptr);

  /* Change the type of B to MATSEQAIJ. */
  PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);

  *newmat = B;
  return 0;
}
PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  if (aijperm) {
    /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spptr. */
    PetscFree(aijperm->xgroup);
    PetscFree(aijperm->nzgroup);
    PetscFree(aijperm->iperm);
    PetscFree(A->spptr);
  }
  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
  /* Note that I don't call MatSetType(). I believe this is because that
   * is only to be called when *building* a matrix. I could be wrong, but
   * that is how things work for the SuperLU matrix class. */
  PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL);
  MatDestroy_SeqAIJ(A);
  return 0;
}
PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  Mat_SeqAIJPERM *aijperm_dest;
  PetscBool       perm;

  MatDuplicate_SeqAIJ(A, op, M);
  PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm);
  if (perm) {
    aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
    PetscFree(aijperm_dest->xgroup);
    PetscFree(aijperm_dest->nzgroup);
    PetscFree(aijperm_dest->iperm);
  } else {
    PetscNew(&aijperm_dest);
    (*M)->spptr = (void *)aijperm_dest;
    PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM);
    PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ);
  }
  PetscArraycpy(aijperm_dest, aijperm, 1);
  /* Allocate space for, and copy the grouping and permutation info.
   * I note that when the groups are initially determined in
   * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
   * necessary. But at this point, we know how large they need to be, and
   * allocate only the necessary amount of memory. So the duplicated matrix
   * may actually use slightly less storage than the original! */
  PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
  PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup);
  PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
  PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n);
  PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1);
  PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup);
  return 0;
}
PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
{
  Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt        m;     /* Number of rows in the matrix. */
  PetscInt       *ia;    /* From the CSR representation; points to the beginning of each row. */
  PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt       *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt  nz;
  PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed. */
  PetscInt i, ngroup, istart, ipos;

  if (aijperm->nonzerostate == A->nonzerostate) return 0; /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscFree(aijperm->xgroup);
  PetscFree(aijperm->nzgroup);
  PetscFree(aijperm->iperm);

  m  = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscMalloc1(m, &aijperm->iperm);

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscMalloc1(m, &nz_in_row);

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i = 0; i < m; i++) {
    nz_in_row[i] = ia[i + 1] - ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket);
  PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz);

  for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info. There will be at most (maxnz + 1)
   * groups. (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.) If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscMalloc1(maxnz + 2, &aijperm->xgroup);
  PetscMalloc1(maxnz + 1, &aijperm->nzgroup);

  /* Second pass. Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i = 0; i <= maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup]  = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup         = ngroup;

  /* Now fill in the permutation vector iperm. */
  ipnz[0] = 0;
  for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

  for (i = 0; i < m; i++) {
    nz   = nz_in_row[i];
    ipos = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscFree(rows_in_bucket);
  PetscFree(ipnz);
  PetscFree(nz_in_row);
  return 0;
}
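
/* Continuing the hypothetical 4-row example from above (rows with {2, 1, 2, 1}
 * nonzeros), added for exposition: the first pass gives maxnz = 2 and
 * rows_in_bucket = {0, 2, 2}; the prefix sum yields ipnz = {0, 0, 2}, so
 * 1-nonzero rows are written starting at iperm[0] and 2-nonzero rows starting
 * at iperm[2]. Scanning the rows in order then fills iperm = {1, 3, 0, 2},
 * i.e. the permutation is a stable counting sort of the rows by nonzero count. */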
PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  if (mode == MAT_FLUSH_ASSEMBLY) return 0;

  /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
   * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
   * I'm not sure if this is the best way to do this, but it avoids
   * a lot of code duplication.
   * I also note that currently MATSEQAIJPERM doesn't know anything about
   * the Mat_CompressedRow data structure that SeqAIJ now uses when there
   * are many zero rows. If the SeqAIJ assembly end routine decides to use
   * this, this may break things. (Don't know... haven't looked at it.) */
  a->inode.use = PETSC_FALSE;
  MatAssemblyEnd_SeqAIJ(A, mode);

  /* Now calculate the permutation and grouping information. */
  MatSeqAIJPERM_create_perm(A);
  return 0;
}
PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt i, j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  VecGetArrayRead(xx, &x);
  VecGetArray(yy, &y);
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  fortranmultaijperm_(&m, x, ii, aj, aa, y);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }
        /* If the number of nonzeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            if ((nz & 0x07) > 2) {
              mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */
    }   /* End handling matvec for chunk with nz > 1. */
  }     /* End loop over igroup. */
#endif
  PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0));
  VecRestoreArrayRead(xx, &x);
  VecRestoreArray(yy, &y);
  return 0;
}
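
/* In scalar terms (sketch added for exposition; it mirrors the non-AVX-512
 * fallback above), the "vectorize across rows" branch computes, for a chunk of
 * isize permuted rows with nz nonzeros each,
 *
 *   for (j = 0; j < nz; j++)
 *     for (i = 0; i < isize; i++) yp[i] += aa[ip[i] + j] * x[aj[ip[i] + j]];
 *
 * i.e. an ELLPACK-style traversal in which the outer j-loop walks "columns" of
 * the chunk and the inner i-loop is the vectorizable one: yp[] and ip[] are
 * accessed contiguously, while aa[], aj[] and x[] are accessed via gathers. */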
/* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 * Note that the names I used to designate the vectors differ from those
 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
/*
    I hate having virtually identical code for the mult and the multadd!!!
*/
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *w;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i, j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  VecGetArrayRead(xx, &x);
  VecGetArrayPair(yy, ww, &y, &w);

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }
        /* If the number of nonzeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  }   /* End loop over igroup. */

#endif
  PetscLogFlops(2.0 * a->nz);
  VecRestoreArrayRead(xx, &x);
  VecRestoreArrayPair(yy, ww, &y, &w);
  return 0;
}
/* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
 * routine, but can also be used to convert an assembled SeqAIJ matrix
 * into a SeqAIJPERM one. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
{
  Mat             B = *newmat;
  Mat_SeqAIJPERM *aijperm;
  PetscBool       sametype;

  if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(A, MAT_COPY_VALUES, &B);
  PetscObjectTypeCompare((PetscObject)A, type, &sametype);
  if (sametype) return 0;

  PetscNew(&aijperm);
  B->spptr = (void *)aijperm;

  /* Set function pointers for methods that we inherit from AIJ but override. */
  B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
  B->ops->destroy     = MatDestroy_SeqAIJPERM;
  B->ops->mult        = MatMult_SeqAIJPERM;
  B->ops->multadd     = MatMultAdd_SeqAIJPERM;

  aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly() */
  /* If A has already been assembled, compute the permutation. */
  if (A->assembled) MatSeqAIJPERM_create_perm(B);

  PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ);

  PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM);
  *newmat = B;
  return 0;
}
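
/* Typical use of the conversion (a minimal sketch added for exposition; error
 * checking omitted, and the vectors x and y are placeholders):
 *
 *   Mat A;                                   // an existing, assembled MATSEQAIJ matrix
 *   MatConvert(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A);
 *   MatMult(A, x, y);                        // now dispatches to MatMult_SeqAIJPERM()
 */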
/*@C
  MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
  This type inherits from `MATSEQAIJ`, but calculates some additional permutation
  information that is used to allow better vectorization of some
  operations. At the cost of increased storage, the `MATSEQAIJ` formatted
  matrix can be copied to a format in which pieces of the matrix are
  stored in ELLPACK format, allowing the vectorized matrix multiply
  routine to use stride-1 memory accesses. As with the `MATSEQAIJ` type, it is
  important to preallocate matrix storage in order to get good assembly
  performance.

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows)
- nnz  - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

  Output Parameter:
. A - the matrix

  Note:
  If `nnz` is given then `nz` is ignored

  Level: intermediate

.seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
@*/
PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  MatCreate(comm, A);
  MatSetSizes(*A, m, n, m, n);
  MatSetType(*A, MATSEQAIJPERM);
  MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz);
  return 0;
}
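
/* Example usage (a minimal sketch added for exposition; error checking omitted,
 * and the size n, the preallocation value 5, and the vectors x and y are
 * placeholders):
 *
 *   Mat A;
 *   MatCreateSeqAIJPERM(PETSC_COMM_SELF, n, n, 5, NULL, &A);
 *   // ... MatSetValues() calls ...
 *   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
 *   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);   // triggers MatSeqAIJPERM_create_perm()
 *   MatMult(A, x, y);
 */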
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
{
  MatSetType(A, MATSEQAIJ);
  MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A);
  return 0;
}
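
/* Note (added for exposition): since MatCreate_SeqAIJPERM() is the registered
 * constructor for MATSEQAIJPERM, the type can also be selected at runtime with
 * MatSetType(A, MATSEQAIJPERM) or, when MatSetFromOptions() is used, via the
 * options database key -mat_type seqaijperm. */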