Actual source code: aijperm.c


  2: /*
  3:   Defines basic operations for the MATSEQAIJPERM matrix class.
  4:   This class is derived from the MATSEQAIJ class and retains the
  5:   compressed row storage (aka Yale sparse matrix format) but augments
  6:   it with some permutation information that enables some operations
  7:   to be more vectorizable.  A physically rearranged copy of the matrix
  8:   may be stored if the user desires.

 10:   Eventually a variety of permutations may be supported.
 11: */

 13: #include <../src/mat/impls/aij/seq/aij.h>

 15: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
 16:   #include <immintrin.h>

 18:   #if !defined(_MM_SCALE_8)
 19:     #define _MM_SCALE_8 8
 20:   #endif
 21:   #if !defined(_MM_SCALE_4)
 22:     #define _MM_SCALE_4 4
 23:   #endif
 24: #endif

 26: #define NDIM 512
 27: /* NDIM specifies how many rows at a time we should work with when
 28:  * performing the vectorized mat-vec.  This depends on various factors
 29:  * such as vector register length, etc., and I really need to add a
 30:  * way for the user (or the library) to tune this.  I'm setting it to
 31:  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 32:  * routines. */

 34: typedef struct {
 35:   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

 37:   PetscInt  ngroup;
 38:   PetscInt *xgroup;
 39:   /* Denotes where groups of rows with same number of nonzeros
 40:    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
 41:    * where the ith group begins. */

 43:   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
 44:   PetscInt *iperm;   /* The permutation vector. */

 46:   /* Some of this stuff is for Ed's recursive triangular solve.
 47:    * I'm not sure what I need yet. */
 48:   PetscInt   blocksize;
 49:   PetscInt   nstep;
 50:   PetscInt  *jstart_list;
 51:   PetscInt  *jend_list;
 52:   PetscInt  *action_list;
 53:   PetscInt  *ngroup_list;
 54:   PetscInt **ipointer_list;
 55:   PetscInt **xgroup_list;
 56:   PetscInt **nzgroup_list;
 57:   PetscInt **iperm_list;
 58: } Mat_SeqAIJPERM;

 60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
 61: {
 62:   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
 63:   /* so we will ignore 'MatType type'. */
 64:   Mat             B       = *newmat;
 65:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

 67:   if (reuse == MAT_INITIAL_MATRIX) {
 68:     MatDuplicate(A, MAT_COPY_VALUES, &B);
 69:     aijperm = (Mat_SeqAIJPERM *)B->spptr;
 70:   }

 72:   /* Reset the original function pointers. */
 73:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
 74:   B->ops->destroy     = MatDestroy_SeqAIJ;
 75:   B->ops->duplicate   = MatDuplicate_SeqAIJ;
 76:   B->ops->mult        = MatMult_SeqAIJ;
 77:   B->ops->multadd     = MatMultAdd_SeqAIJ;

 79:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL);

 81:   /* Free everything in the Mat_SeqAIJPERM data structure.*/
 82:   PetscFree(aijperm->xgroup);
 83:   PetscFree(aijperm->nzgroup);
 84:   PetscFree(aijperm->iperm);
 85:   PetscFree(B->spptr);

 87:   /* Change the type of B to MATSEQAIJ. */
 88:   PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);

 90:   *newmat = B;
 91:   return 0;
 92: }

 94: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
 95: {
 96:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

 98:   if (aijperm) {
 99:     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
100:     PetscFree(aijperm->xgroup);
101:     PetscFree(aijperm->nzgroup);
102:     PetscFree(aijperm->iperm);
103:     PetscFree(A->spptr);
104:   }
105:   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
106:    * to destroy everything that remains. */
107:   PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
108:   /* Note that I don't call MatSetType().  I believe this is because that
109:    * is only to be called when *building* a matrix.  I could be wrong, but
110:    * that is how things work for the SuperLU matrix class. */
111:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL);
112:   MatDestroy_SeqAIJ(A);
113:   return 0;
114: }

116: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
117: {
118:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
119:   Mat_SeqAIJPERM *aijperm_dest;
120:   PetscBool       perm;

122:   MatDuplicate_SeqAIJ(A, op, M);
123:   PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm);
124:   if (perm) {
125:     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
126:     PetscFree(aijperm_dest->xgroup);
127:     PetscFree(aijperm_dest->nzgroup);
128:     PetscFree(aijperm_dest->iperm);
129:   } else {
130:     PetscNew(&aijperm_dest);
131:     (*M)->spptr = (void *)aijperm_dest;
132:     PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM);
133:     PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ);
134:   }
135:   PetscArraycpy(aijperm_dest, aijperm, 1);
136:   /* Allocate space for, and copy the grouping and permutation info.
137:    * I note that when the groups are initially determined in
138:    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
139:    * necessary.  But at this point, we know how large they need to be, and
140:    * allocate only the necessary amount of memory.  So the duplicated matrix
141:    * may actually use slightly less storage than the original! */
142:   PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
143:   PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup);
144:   PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
145:   PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n);
146:   PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1);
147:   PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup);
148:   return 0;
149: }

151: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
152: {
153:   Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
154:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
155:   PetscInt        m;     /* Number of rows in the matrix. */
156:   PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
157:   PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
158:   PetscInt       *rows_in_bucket;
159:   /* To construct the permutation, we sort each row into one of maxnz
160:    * buckets based on how many nonzeros are in the row. */
161:   PetscInt  nz;
162:   PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
163:   PetscInt *ipnz;
164:   /* When constructing the iperm permutation vector,
165:    * ipnz[nz] is used to point to the next place in the permutation vector
166:    * that a row with nz nonzero elements should be placed.*/
167:   PetscInt i, ngroup, istart, ipos;

169:   if (aijperm->nonzerostate == A->nonzerostate) return 0; /* permutation exists and matches current nonzero structure */
170:   aijperm->nonzerostate = A->nonzerostate;
171:   /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
172:   PetscFree(aijperm->xgroup);
173:   PetscFree(aijperm->nzgroup);
174:   PetscFree(aijperm->iperm);

176:   m  = A->rmap->n;
177:   ia = a->i;

179:   /* Allocate the arrays that will hold the permutation vector. */
180:   PetscMalloc1(m, &aijperm->iperm);

182:   /* Allocate some temporary work arrays that will be used in
183:    * calculating the permutation vector and groupings. */
184:   PetscMalloc1(m, &nz_in_row);

186:   /* Now actually figure out the permutation and grouping. */

188:   /* First pass: Determine number of nonzeros in each row, maximum
189:    * number of nonzeros in any row, and how many rows fall into each
190:    * "bucket" of rows with same number of nonzeros. */
191:   maxnz = 0;
192:   for (i = 0; i < m; i++) {
193:     nz_in_row[i] = ia[i + 1] - ia[i];
194:     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
195:   }
196:   PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket);
197:   PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz);

199:   for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
200:   for (i = 0; i < m; i++) {
201:     nz = nz_in_row[i];
202:     rows_in_bucket[nz]++;
203:   }

205:   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
206:    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
207:    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
208:    * then xgroup[] must consist of (maxnz + 2) elements, since the last
209:    * element of xgroup will tell us where the (maxnz + 1)th group ends.
210:    * We allocate space for the maximum number of groups;
211:    * that is potentially a little wasteful, but not too much so.
212:    * Perhaps I should fix it later. */
213:   PetscMalloc1(maxnz + 2, &aijperm->xgroup);
214:   PetscMalloc1(maxnz + 1, &aijperm->nzgroup);

216:   /* Second pass.  Look at what is in the buckets and create the groupings.
217:    * Note that it is OK to have a group of rows with no non-zero values. */
218:   ngroup = 0;
219:   istart = 0;
220:   for (i = 0; i <= maxnz; i++) {
221:     if (rows_in_bucket[i] > 0) {
222:       aijperm->nzgroup[ngroup] = i;
223:       aijperm->xgroup[ngroup]  = istart;
224:       ngroup++;
225:       istart += rows_in_bucket[i];
226:     }
227:   }

229:   aijperm->xgroup[ngroup] = istart;
230:   aijperm->ngroup         = ngroup;

232:   /* Now fill in the permutation vector iperm. */
233:   ipnz[0] = 0;
234:   for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

236:   for (i = 0; i < m; i++) {
237:     nz                   = nz_in_row[i];
238:     ipos                 = ipnz[nz];
239:     aijperm->iperm[ipos] = i;
240:     ipnz[nz]++;
241:   }

243:   /* Clean up temporary work arrays. */
244:   PetscFree(rows_in_bucket);
245:   PetscFree(ipnz);
246:   PetscFree(nz_in_row);
247:   return 0;
248: }

250: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
251: {
252:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

254:   if (mode == MAT_FLUSH_ASSEMBLY) return 0;

256:   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
257:    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
258:    * I'm not sure if this is the best way to do this, but it avoids
259:    * a lot of code duplication.
260:    * I also note that currently MATSEQAIJPERM doesn't know anything about
261:    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
262:    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
263:    * this, this may break things.  (Don't know... haven't looked at it.) */
264:   a->inode.use = PETSC_FALSE;
265:   MatAssemblyEnd_SeqAIJ(A, mode);

267:   /* Now calculate the permutation and grouping information. */
268:   MatSeqAIJPERM_create_perm(A);
269:   return 0;
270: }

272: PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
273: {
274:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
275:   const PetscScalar *x;
276:   PetscScalar       *y;
277:   const MatScalar   *aa;
278:   const PetscInt    *aj, *ai;
279: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
280:   PetscInt i, j;
281: #endif
282: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
283:   __m512d  vec_x, vec_y, vec_vals;
284:   __m256i  vec_idx, vec_ipos, vec_j;
285:   __mmask8 mask;
286: #endif

288:   /* Variables that don't appear in MatMult_SeqAIJ. */
289:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
290:   PetscInt       *iperm; /* Points to the permutation vector. */
291:   PetscInt       *xgroup;
292:   /* Denotes where groups of rows with same number of nonzeros
293:    * begin and end in iperm. */
294:   PetscInt *nzgroup;
295:   PetscInt  ngroup;
296:   PetscInt  igroup;
297:   PetscInt  jstart, jend;
298:   /* jstart is used in loops to denote the position in iperm where a
299:    * group starts; jend denotes the position where it ends.
300:    * (jend + 1 is where the next group starts.) */
301:   PetscInt    iold, nz;
302:   PetscInt    istart, iend, isize;
303:   PetscInt    ipos;
304:   PetscScalar yp[NDIM];
305:   PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

307: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
308:   #pragma disjoint(*x, *y, *aa)
309: #endif

311:   VecGetArrayRead(xx, &x);
312:   VecGetArray(yy, &y);
313:   aj = a->j; /* aj[k] gives column index for element aa[k]. */
314:   aa = a->a; /* Nonzero elements stored row-by-row. */
315:   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

317:   /* Get the info we need about the permutations and groupings. */
318:   iperm   = aijperm->iperm;
319:   ngroup  = aijperm->ngroup;
320:   xgroup  = aijperm->xgroup;
321:   nzgroup = aijperm->nzgroup;

323: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
324:   fortranmultaijperm_(&m, x, ii, aj, aa, y);
325: #else

327:   for (igroup = 0; igroup < ngroup; igroup++) {
328:     jstart = xgroup[igroup];
329:     jend   = xgroup[igroup + 1] - 1;
330:     nz     = nzgroup[igroup];

332:     /* Handle the special cases where the number of nonzeros per row
333:      * in the group is either 0 or 1. */
334:     if (nz == 0) {
335:       for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
336:     } else if (nz == 1) {
337:       for (i = jstart; i <= jend; i++) {
338:         iold    = iperm[i];
339:         ipos    = ai[iold];
340:         y[iold] = aa[ipos] * x[aj[ipos]];
341:       }
342:     } else {
343:       /* We work our way through the current group in chunks of NDIM rows
344:        * at a time. */

346:       for (istart = jstart; istart <= jend; istart += NDIM) {
347:         /* Figure out where the chunk of 'isize' rows ends in iperm.
348:          * 'isize may of course be less than NDIM for the last chunk. */
349:         iend = istart + (NDIM - 1);

351:         if (iend > jend) iend = jend;

353:         isize = iend - istart + 1;

355:         /* Initialize the yp[] array that will be used to hold part of
356:          * the permuted results vector, and figure out where in aa each
357:          * row of the chunk will begin. */
358:         for (i = 0; i < isize; i++) {
359:           iold = iperm[istart + i];
360:           /* iold is a row number from the matrix A *before* reordering. */
361:           ip[i] = ai[iold];
362:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
363:           yp[i] = (PetscScalar)0.0;
364:         }

366:         /* If the number of zeros per row exceeds the number of rows in
367:          * the chunk, we should vectorize along nz, that is, perform the
368:          * mat-vec one row at a time as in the usual CSR case. */
369:         if (nz > isize) {
370:   #if defined(PETSC_HAVE_CRAY_VECTOR)
371:     #pragma _CRI preferstream
372:   #endif
373:           for (i = 0; i < isize; i++) {
374:   #if defined(PETSC_HAVE_CRAY_VECTOR)
375:     #pragma _CRI prefervector
376:   #endif

378:   #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
379:             vec_y = _mm512_setzero_pd();
380:             ipos  = ip[i];
381:             for (j = 0; j < (nz >> 3); j++) {
382:               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
383:               vec_vals = _mm512_loadu_pd(&aa[ipos]);
384:               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
385:               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
386:               ipos += 8;
387:             }
388:             if ((nz & 0x07) > 2) {
389:               mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
390:               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
391:               vec_vals = _mm512_loadu_pd(&aa[ipos]);
392:               vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
393:               vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
394:             } else if ((nz & 0x07) == 2) {
395:               yp[i] += aa[ipos] * x[aj[ipos]];
396:               yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
397:             } else if ((nz & 0x07) == 1) {
398:               yp[i] += aa[ipos] * x[aj[ipos]];
399:             }
400:             yp[i] += _mm512_reduce_add_pd(vec_y);
401:   #else
402:             for (j = 0; j < nz; j++) {
403:               ipos = ip[i] + j;
404:               yp[i] += aa[ipos] * x[aj[ipos]];
405:             }
406:   #endif
407:           }
408:         } else {
409:           /* Otherwise, there are enough rows in the chunk to make it
410:            * worthwhile to vectorize across the rows, that is, to do the
411:            * matvec by operating with "columns" of the chunk. */
412:           for (j = 0; j < nz; j++) {
413:   #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
414:             vec_j = _mm256_set1_epi32(j);
415:             for (i = 0; i < ((isize >> 3) << 3); i += 8) {
416:               vec_y    = _mm512_loadu_pd(&yp[i]);
417:               vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
418:               vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
419:               vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
420:               vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
421:               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
422:               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
423:               _mm512_storeu_pd(&yp[i], vec_y);
424:             }
425:             for (i = isize - (isize & 0x07); i < isize; i++) {
426:               ipos = ip[i] + j;
427:               yp[i] += aa[ipos] * x[aj[ipos]];
428:             }
429:   #else
430:             for (i = 0; i < isize; i++) {
431:               ipos = ip[i] + j;
432:               yp[i] += aa[ipos] * x[aj[ipos]];
433:             }
434:   #endif
435:           }
436:         }

438:   #if defined(PETSC_HAVE_CRAY_VECTOR)
439:     #pragma _CRI ivdep
440:   #endif
441:         /* Put results from yp[] into non-permuted result vector y. */
442:         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
443:       } /* End processing chunk of isize rows of a group. */
444:     }   /* End handling matvec for chunk with nz > 1. */
445:   }     /* End loop over igroup. */
446: #endif
447:   PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0));
448:   VecRestoreArrayRead(xx, &x);
449:   VecRestoreArray(yy, &y);
450:   return 0;
451: }

453: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
454:  * Note that the names I used to designate the vectors differs from that
455:  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
456:  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
457: /*
458:     I hate having virtually identical code for the mult and the multadd!!!
459: */
460: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
461: {
462:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
463:   const PetscScalar *x;
464:   PetscScalar       *y, *w;
465:   const MatScalar   *aa;
466:   const PetscInt    *aj, *ai;
467: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
468:   PetscInt i, j;
469: #endif

471:   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
472:   Mat_SeqAIJPERM *aijperm;
473:   PetscInt       *iperm; /* Points to the permutation vector. */
474:   PetscInt       *xgroup;
475:   /* Denotes where groups of rows with same number of nonzeros
476:    * begin and end in iperm. */
477:   PetscInt *nzgroup;
478:   PetscInt  ngroup;
479:   PetscInt  igroup;
480:   PetscInt  jstart, jend;
481:   /* jstart is used in loops to denote the position in iperm where a
482:    * group starts; jend denotes the position where it ends.
483:    * (jend + 1 is where the next group starts.) */
484:   PetscInt    iold, nz;
485:   PetscInt    istart, iend, isize;
486:   PetscInt    ipos;
487:   PetscScalar yp[NDIM];
488:   PetscInt    ip[NDIM];
489:   /* yp[] and ip[] are treated as vector "registers" for performing
490:    * the mat-vec. */

492: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
493:   #pragma disjoint(*x, *y, *aa)
494: #endif

496:   VecGetArrayRead(xx, &x);
497:   VecGetArrayPair(yy, ww, &y, &w);

499:   aj = a->j; /* aj[k] gives column index for element aa[k]. */
500:   aa = a->a; /* Nonzero elements stored row-by-row. */
501:   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

503:   /* Get the info we need about the permutations and groupings. */
504:   aijperm = (Mat_SeqAIJPERM *)A->spptr;
505:   iperm   = aijperm->iperm;
506:   ngroup  = aijperm->ngroup;
507:   xgroup  = aijperm->xgroup;
508:   nzgroup = aijperm->nzgroup;

510: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
511:   fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
512: #else

514:   for (igroup = 0; igroup < ngroup; igroup++) {
515:     jstart = xgroup[igroup];
516:     jend   = xgroup[igroup + 1] - 1;

518:     nz = nzgroup[igroup];

520:     /* Handle the special cases where the number of nonzeros per row
521:      * in the group is either 0 or 1. */
522:     if (nz == 0) {
523:       for (i = jstart; i <= jend; i++) {
524:         iold    = iperm[i];
525:         y[iold] = w[iold];
526:       }
527:     } else if (nz == 1) {
528:       for (i = jstart; i <= jend; i++) {
529:         iold    = iperm[i];
530:         ipos    = ai[iold];
531:         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
532:       }
533:     }
534:     /* For the general case: */
535:     else {
536:       /* We work our way through the current group in chunks of NDIM rows
537:        * at a time. */

539:       for (istart = jstart; istart <= jend; istart += NDIM) {
540:         /* Figure out where the chunk of 'isize' rows ends in iperm.
541:          * 'isize may of course be less than NDIM for the last chunk. */
542:         iend = istart + (NDIM - 1);
543:         if (iend > jend) iend = jend;
544:         isize = iend - istart + 1;

546:         /* Initialize the yp[] array that will be used to hold part of
547:          * the permuted results vector, and figure out where in aa each
548:          * row of the chunk will begin. */
549:         for (i = 0; i < isize; i++) {
550:           iold = iperm[istart + i];
551:           /* iold is a row number from the matrix A *before* reordering. */
552:           ip[i] = ai[iold];
553:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
554:           yp[i] = w[iold];
555:         }

557:         /* If the number of zeros per row exceeds the number of rows in
558:          * the chunk, we should vectorize along nz, that is, perform the
559:          * mat-vec one row at a time as in the usual CSR case. */
560:         if (nz > isize) {
561:   #if defined(PETSC_HAVE_CRAY_VECTOR)
562:     #pragma _CRI preferstream
563:   #endif
564:           for (i = 0; i < isize; i++) {
565:   #if defined(PETSC_HAVE_CRAY_VECTOR)
566:     #pragma _CRI prefervector
567:   #endif
568:             for (j = 0; j < nz; j++) {
569:               ipos = ip[i] + j;
570:               yp[i] += aa[ipos] * x[aj[ipos]];
571:             }
572:           }
573:         }
574:         /* Otherwise, there are enough rows in the chunk to make it
575:          * worthwhile to vectorize across the rows, that is, to do the
576:          * matvec by operating with "columns" of the chunk. */
577:         else {
578:           for (j = 0; j < nz; j++) {
579:             for (i = 0; i < isize; i++) {
580:               ipos = ip[i] + j;
581:               yp[i] += aa[ipos] * x[aj[ipos]];
582:             }
583:           }
584:         }

586:   #if defined(PETSC_HAVE_CRAY_VECTOR)
587:     #pragma _CRI ivdep
588:   #endif
589:         /* Put results from yp[] into non-permuted result vector y. */
590:         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
591:       } /* End processing chunk of isize rows of a group. */

593:     } /* End handling matvec for chunk with nz > 1. */
594:   }   /* End loop over igroup. */

596: #endif
597:   PetscLogFlops(2.0 * a->nz);
598:   VecRestoreArrayRead(xx, &x);
599:   VecRestoreArrayPair(yy, ww, &y, &w);
600:   return 0;
601: }

603: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
604:  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
605:  * routine, but can also be used to convert an assembled SeqAIJ matrix
606:  * into a SeqAIJPERM one. */
607: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
608: {
609:   Mat             B = *newmat;
610:   Mat_SeqAIJPERM *aijperm;
611:   PetscBool       sametype;

613:   if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(A, MAT_COPY_VALUES, &B);
614:   PetscObjectTypeCompare((PetscObject)A, type, &sametype);
615:   if (sametype) return 0;

617:   PetscNew(&aijperm);
618:   B->spptr = (void *)aijperm;

620:   /* Set function pointers for methods that we inherit from AIJ but override. */
621:   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
622:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
623:   B->ops->destroy     = MatDestroy_SeqAIJPERM;
624:   B->ops->mult        = MatMult_SeqAIJPERM;
625:   B->ops->multadd     = MatMultAdd_SeqAIJPERM;

627:   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
628:   /* If A has already been assembled, compute the permutation. */
629:   if (A->assembled) MatSeqAIJPERM_create_perm(B);

631:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ);

633:   PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM);
634:   *newmat = B;
635:   return 0;
636: }

638: /*@C
639:    MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
640:    This type inherits from `MATSEQAIJ`, but calculates some additional permutation
641:    information that is used to allow better vectorization of some
642:    operations.  At the cost of increased storage, the `MATSEQAIJ` formatted
643:    matrix can be copied to a format in which pieces of the matrix are
644:    stored in ELLPACK format, allowing the vectorized matrix multiply
645:    routine to use stride-1 memory accesses.  As with the `MATSEQAIJ` type, it is
646:    important to preallocate matrix storage in order to get good assembly
647:    performance.

649:    Collective

651:    Input Parameters:
652: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
653: .  m - number of rows
654: .  n - number of columns
655: .  nz - number of nonzeros per row (same for all rows)
656: -  nnz - array containing the number of nonzeros in the various rows
657:          (possibly different for each row) or NULL

659:    Output Parameter:
660: .  A - the matrix

662:    Note:
663:    If nnz is given then nz is ignored

665:    Level: intermediate

667: .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
668: @*/
669: PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
670: {
671:   MatCreate(comm, A);
672:   MatSetSizes(*A, m, n, m, n);
673:   MatSetType(*A, MATSEQAIJPERM);
674:   MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz);
675:   return 0;
676: }

678: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
679: {
680:   MatSetType(A, MATSEQAIJ);
681:   MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A);
682:   return 0;
683: }