Actual source code: sell.c


  2: /*
  3:   Defines the basic matrix operations for the SELL matrix storage format.
  4: */
  5: #include <../src/mat/impls/sell/seq/sell.h>
  6: #include <petscblaslapack.h>
  7: #include <petsc/private/kernels/blocktranspose.h>

  9: static PetscBool  cited      = PETSC_FALSE;
 10: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
 11:                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
 12:                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
 13:                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
 14:                                " year = 2018\n"
 15:                                "}\n";

 17: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

 19:   #include <immintrin.h>

 21:   #if !defined(_MM_SCALE_8)
 22:     #define _MM_SCALE_8 8
 23:   #endif

 25:   #if defined(__AVX512F__)
 26:     /* these do not work
 27:    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
 28:    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
 29:   */
 30:     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
 31:       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
 32:       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
 33:       vec_vals = _mm512_loadu_pd(aval); \
 34:       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
 35:       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
 36:   #elif defined(__AVX2__) && defined(__FMA__)
 37:     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
 38:       vec_vals = _mm256_loadu_pd(aval); \
 39:       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
 40:       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
 41:       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
 42:   #endif
 43: #endif /* PETSC_HAVE_IMMINTRIN_H */

 45: /*@C
 46:  MatSeqSELLSetPreallocation - For good matrix assembly performance
 47:  the user should preallocate the matrix storage by setting the parameter nz
 48:  (or the array nnz).  By setting these parameters accurately, performance
 49:  during matrix assembly can be increased significantly.

 51:  Collective

 53:  Input Parameters:
 54:  +  B - The `MATSEQSELL` matrix
 55:  .  rlenmax - number of nonzeros per row (same for all rows)
 56:  -  rlen - array containing the number of nonzeros in the various rows
 57:  (possibly different for each row) or `NULL`

 59:  Level: intermediate

 61:  Notes:
 62:  If `rlen` is given then `rlenmax` is ignored.

 64:  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
 65:  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
 66:  allocation.  For large problems you MUST preallocate memory or you
 67:  will get TERRIBLE performance, see the users' manual chapter on matrices.

 69:  You can call `MatGetInfo()` to get information on how effective the preallocation was;
 70:  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
 71:  You can also run with the option `-info` and look for messages with the string
 72:  malloc in them to see if additional memory allocation was needed.

 74:  Developers Note:
 75:  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
 76:  entries or columns indices.

 78:  The maximum number of nonzeos in any row should be as accurate as possible.
 79:  If it is underestimated, you will get bad performance due to reallocation
 80:  (`MatSeqXSELLReallocateSELL()`).

 82:  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
 83:  @*/
 84: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
 85: {
 88:   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
 89:   return 0;
 90: }

 92: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
 93: {
 94:   Mat_SeqSELL *b;
 95:   PetscInt     i, j, totalslices;
 96:   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

 98:   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
 99:   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100:     skipallocation = PETSC_TRUE;
101:     maxallocrow    = 0;
102:   }

104:   PetscLayoutSetUp(B->rmap);
105:   PetscLayoutSetUp(B->cmap);

107:   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108:   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
110:   if (rlen) {
111:     for (i = 0; i < B->rmap->n; i++) {
114:     }
115:   }

117:   B->preallocated = PETSC_TRUE;

119:   b = (Mat_SeqSELL *)B->data;

121:   totalslices    = PetscCeilInt(B->rmap->n, 8);
122:   b->totalslices = totalslices;
123:   if (!skipallocation) {
124:     if (B->rmap->n & 0x07) PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n);

126:     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
127:       PetscMalloc1(totalslices + 1, &b->sliidx);
128:     }
129:     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
130:       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
131:       else if (maxallocrow < 0) maxallocrow = 1;
132:       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
133:     } else {
134:       maxallocrow  = 0;
135:       b->sliidx[0] = 0;
136:       for (i = 1; i < totalslices; i++) {
137:         b->sliidx[i] = 0;
138:         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
139:         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
140:         PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]);
141:       }
142:       /* last slice */
143:       b->sliidx[totalslices] = 0;
144:       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
145:       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
146:       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
147:     }

149:     /* allocate space for val, colidx, rlen */
150:     /* FIXME: should B's old memory be unlogged? */
151:     MatSeqXSELLFreeSELL(B, &b->val, &b->colidx);
152:     /* FIXME: assuming an element of the bit array takes 8 bits */
153:     PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx);
154:     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155:     PetscCalloc1(8 * totalslices, &b->rlen);

157:     b->singlemalloc = PETSC_TRUE;
158:     b->free_val     = PETSC_TRUE;
159:     b->free_colidx  = PETSC_TRUE;
160:   } else {
161:     b->free_val    = PETSC_FALSE;
162:     b->free_colidx = PETSC_FALSE;
163:   }

165:   b->nz               = 0;
166:   b->maxallocrow      = maxallocrow;
167:   b->rlenmax          = maxallocrow;
168:   b->maxallocmat      = b->sliidx[totalslices];
169:   B->info.nz_unneeded = (double)b->maxallocmat;
170:   if (realalloc) MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
171:   return 0;
172: }

174: PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
175: {
176:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
177:   PetscInt     shift;

180:   if (nz) *nz = a->rlen[row];
181:   shift = a->sliidx[row >> 3] + (row & 0x07);
182:   if (!a->getrowcols) PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals);
183:   if (idx) {
184:     PetscInt j;
185:     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
186:     *idx = a->getrowcols;
187:   }
188:   if (v) {
189:     PetscInt j;
190:     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
191:     *v = a->getrowvals;
192:   }
193:   return 0;
194: }

196: PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
197: {
198:   return 0;
199: }

201: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
202: {
203:   Mat          B;
204:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205:   PetscInt     i;

207:   if (reuse == MAT_REUSE_MATRIX) {
208:     B = *newmat;
209:     MatZeroEntries(B);
210:   } else {
211:     MatCreate(PetscObjectComm((PetscObject)A), &B);
212:     MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N);
213:     MatSetType(B, MATSEQAIJ);
214:     MatSeqAIJSetPreallocation(B, 0, a->rlen);
215:   }

217:   for (i = 0; i < A->rmap->n; i++) {
218:     PetscInt     nz = 0, *cols = NULL;
219:     PetscScalar *vals = NULL;

221:     MatGetRow_SeqSELL(A, i, &nz, &cols, &vals);
222:     MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES);
223:     MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals);
224:   }

226:   MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
227:   MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
228:   B->rmap->bs = A->rmap->bs;

230:   if (reuse == MAT_INPLACE_MATRIX) {
231:     MatHeaderReplace(A, &B);
232:   } else {
233:     *newmat = B;
234:   }
235:   return 0;
236: }

238: #include <../src/mat/impls/aij/seq/aij.h>

240: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
241: {
242:   Mat                B;
243:   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
244:   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
245:   const PetscInt    *cols;
246:   const PetscScalar *vals;


249:   if (reuse == MAT_REUSE_MATRIX) {
250:     B = *newmat;
251:   } else {
252:     if (PetscDefined(USE_DEBUG) || !a->ilen) {
253:       PetscMalloc1(m, &rowlengths);
254:       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
255:     }
256:     if (PetscDefined(USE_DEBUG) && a->ilen) {
257:       PetscBool eq;
258:       PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq);
260:       PetscFree(rowlengths);
261:       rowlengths = a->ilen;
262:     } else if (a->ilen) rowlengths = a->ilen;
263:     MatCreate(PetscObjectComm((PetscObject)A), &B);
264:     MatSetSizes(B, m, n, m, n);
265:     MatSetType(B, MATSEQSELL);
266:     MatSeqSELLSetPreallocation(B, 0, rowlengths);
267:     if (rowlengths != a->ilen) PetscFree(rowlengths);
268:   }

270:   for (row = 0; row < m; row++) {
271:     MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals);
272:     MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES);
273:     MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals);
274:   }
275:   MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
276:   MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
277:   B->rmap->bs = A->rmap->bs;

279:   if (reuse == MAT_INPLACE_MATRIX) {
280:     MatHeaderReplace(A, &B);
281:   } else {
282:     *newmat = B;
283:   }
284:   return 0;
285: }

287: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
288: {
289:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
290:   PetscScalar       *y;
291:   const PetscScalar *x;
292:   const MatScalar   *aval        = a->val;
293:   PetscInt           totalslices = a->totalslices;
294:   const PetscInt    *acolidx     = a->colidx;
295:   PetscInt           i, j;
296: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
297:   __m512d  vec_x, vec_y, vec_vals;
298:   __m256i  vec_idx;
299:   __mmask8 mask;
300:   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
301:   __m256i  vec_idx2, vec_idx3, vec_idx4;
302: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
303:   __m128i   vec_idx;
304:   __m256d   vec_x, vec_y, vec_y2, vec_vals;
305:   MatScalar yval;
306:   PetscInt  r, rows_left, row, nnz_in_row;
307: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
308:   __m128d   vec_x_tmp;
309:   __m256d   vec_x, vec_y, vec_y2, vec_vals;
310:   MatScalar yval;
311:   PetscInt  r, rows_left, row, nnz_in_row;
312: #else
313:   PetscScalar sum[8];
314: #endif

316: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
317:   #pragma disjoint(*x, *y, *aval)
318: #endif

320:   VecGetArrayRead(xx, &x);
321:   VecGetArray(yy, &y);
322: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
323:   for (i = 0; i < totalslices; i++) { /* loop over slices */
324:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
325:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

327:     vec_y  = _mm512_setzero_pd();
328:     vec_y2 = _mm512_setzero_pd();
329:     vec_y3 = _mm512_setzero_pd();
330:     vec_y4 = _mm512_setzero_pd();

332:     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
333:     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
334:     case 3:
335:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
336:       acolidx += 8;
337:       aval += 8;
338:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
339:       acolidx += 8;
340:       aval += 8;
341:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
342:       acolidx += 8;
343:       aval += 8;
344:       j += 3;
345:       break;
346:     case 2:
347:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
348:       acolidx += 8;
349:       aval += 8;
350:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
351:       acolidx += 8;
352:       aval += 8;
353:       j += 2;
354:       break;
355:     case 1:
356:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
357:       acolidx += 8;
358:       aval += 8;
359:       j += 1;
360:       break;
361:     }
362:   #pragma novector
363:     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
364:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
365:       acolidx += 8;
366:       aval += 8;
367:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
368:       acolidx += 8;
369:       aval += 8;
370:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
371:       acolidx += 8;
372:       aval += 8;
373:       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
374:       acolidx += 8;
375:       aval += 8;
376:     }

378:     vec_y = _mm512_add_pd(vec_y, vec_y2);
379:     vec_y = _mm512_add_pd(vec_y, vec_y3);
380:     vec_y = _mm512_add_pd(vec_y, vec_y4);
381:     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382:       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
383:       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
384:     } else {
385:       _mm512_storeu_pd(&y[8 * i], vec_y);
386:     }
387:   }
388: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389:   for (i = 0; i < totalslices; i++) { /* loop over full slices */
390:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
391:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

393:     /* last slice may have padding rows. Don't use vectorization. */
394:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
395:       rows_left = A->rmap->n - 8 * i;
396:       for (r = 0; r < rows_left; ++r) {
397:         yval       = (MatScalar)0;
398:         row        = 8 * i + r;
399:         nnz_in_row = a->rlen[row];
400:         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
401:         y[row] = yval;
402:       }
403:       break;
404:     }

406:     vec_y  = _mm256_setzero_pd();
407:     vec_y2 = _mm256_setzero_pd();

409:   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410:   #pragma novector
411:   #pragma unroll(2)
412:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
413:       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
414:       aval += 4;
415:       acolidx += 4;
416:       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
417:       aval += 4;
418:       acolidx += 4;
419:     }

421:     _mm256_storeu_pd(y + i * 8, vec_y);
422:     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
423:   }
424: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425:   for (i = 0; i < totalslices; i++) { /* loop over full slices */
426:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

429:     vec_y  = _mm256_setzero_pd();
430:     vec_y2 = _mm256_setzero_pd();

432:     /* last slice may have padding rows. Don't use vectorization. */
433:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
434:       rows_left = A->rmap->n - 8 * i;
435:       for (r = 0; r < rows_left; ++r) {
436:         yval       = (MatScalar)0;
437:         row        = 8 * i + r;
438:         nnz_in_row = a->rlen[row];
439:         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
440:         y[row] = yval;
441:       }
442:       break;
443:     }

445:   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446:   #pragma novector
447:   #pragma unroll(2)
448:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449:       vec_vals  = _mm256_loadu_pd(aval);
450:       vec_x_tmp = _mm_setzero_pd();
451:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
454:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
457:       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
458:       aval += 4;

460:       vec_vals  = _mm256_loadu_pd(aval);
461:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
464:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
467:       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
468:       aval += 4;
469:     }

471:     _mm256_storeu_pd(y + i * 8, vec_y);
472:     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
473:   }
474: #else
475:   for (i = 0; i < totalslices; i++) { /* loop over slices */
476:     for (j = 0; j < 8; j++) sum[j] = 0.0;
477:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
478:       sum[0] += aval[j] * x[acolidx[j]];
479:       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
480:       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
481:       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
482:       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
483:       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
484:       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
485:       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
486:     }
487:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488:       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
489:     } else {
490:       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
491:     }
492:   }
493: #endif

495:   PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt); /* theoretical minimal FLOPs */
496:   VecRestoreArrayRead(xx, &x);
497:   VecRestoreArray(yy, &y);
498:   return 0;
499: }

501: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
503: {
504:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
505:   PetscScalar       *y, *z;
506:   const PetscScalar *x;
507:   const MatScalar   *aval        = a->val;
508:   PetscInt           totalslices = a->totalslices;
509:   const PetscInt    *acolidx     = a->colidx;
510:   PetscInt           i, j;
511: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
512:   __m512d  vec_x, vec_y, vec_vals;
513:   __m256i  vec_idx;
514:   __mmask8 mask;
515:   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
516:   __m256i  vec_idx2, vec_idx3, vec_idx4;
517: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518:   __m128d   vec_x_tmp;
519:   __m256d   vec_x, vec_y, vec_y2, vec_vals;
520:   MatScalar yval;
521:   PetscInt  r, row, nnz_in_row;
522: #else
523:   PetscScalar sum[8];
524: #endif

526: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
527:   #pragma disjoint(*x, *y, *aval)
528: #endif

530:   VecGetArrayRead(xx, &x);
531:   VecGetArrayPair(yy, zz, &y, &z);
532: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
533:   for (i = 0; i < totalslices; i++) { /* loop over slices */
534:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
535:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

537:     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
538:       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
539:       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
540:     } else {
541:       vec_y = _mm512_loadu_pd(&y[8 * i]);
542:     }
543:     vec_y2 = _mm512_setzero_pd();
544:     vec_y3 = _mm512_setzero_pd();
545:     vec_y4 = _mm512_setzero_pd();

547:     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
548:     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
549:     case 3:
550:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
551:       acolidx += 8;
552:       aval += 8;
553:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
554:       acolidx += 8;
555:       aval += 8;
556:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
557:       acolidx += 8;
558:       aval += 8;
559:       j += 3;
560:       break;
561:     case 2:
562:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
563:       acolidx += 8;
564:       aval += 8;
565:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
566:       acolidx += 8;
567:       aval += 8;
568:       j += 2;
569:       break;
570:     case 1:
571:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
572:       acolidx += 8;
573:       aval += 8;
574:       j += 1;
575:       break;
576:     }
577:   #pragma novector
578:     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
579:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
580:       acolidx += 8;
581:       aval += 8;
582:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
583:       acolidx += 8;
584:       aval += 8;
585:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
586:       acolidx += 8;
587:       aval += 8;
588:       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
589:       acolidx += 8;
590:       aval += 8;
591:     }

593:     vec_y = _mm512_add_pd(vec_y, vec_y2);
594:     vec_y = _mm512_add_pd(vec_y, vec_y3);
595:     vec_y = _mm512_add_pd(vec_y, vec_y4);
596:     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
597:       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
598:     } else {
599:       _mm512_storeu_pd(&z[8 * i], vec_y);
600:     }
601:   }
602: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
603:   for (i = 0; i < totalslices; i++) { /* loop over full slices */
604:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
605:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

607:     /* last slice may have padding rows. Don't use vectorization. */
608:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
609:       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
610:         row        = 8 * i + r;
611:         yval       = (MatScalar)0.0;
612:         nnz_in_row = a->rlen[row];
613:         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
614:         z[row] = y[row] + yval;
615:       }
616:       break;
617:     }

619:     vec_y  = _mm256_loadu_pd(y + 8 * i);
620:     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);

622:     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
623:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
624:       vec_vals  = _mm256_loadu_pd(aval);
625:       vec_x_tmp = _mm_setzero_pd();
626:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
627:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
628:       vec_x     = _mm256_setzero_pd();
629:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
630:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
631:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
632:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
633:       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
634:       aval += 4;

636:       vec_vals  = _mm256_loadu_pd(aval);
637:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
638:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
639:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
640:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
641:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
642:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
643:       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
644:       aval += 4;
645:     }

647:     _mm256_storeu_pd(z + i * 8, vec_y);
648:     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
649:   }
650: #else
651:   for (i = 0; i < totalslices; i++) { /* loop over slices */
652:     for (j = 0; j < 8; j++) sum[j] = 0.0;
653:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
654:       sum[0] += aval[j] * x[acolidx[j]];
655:       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
656:       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
657:       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
658:       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
659:       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
660:       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
661:       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
662:     }
663:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
664:       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
665:     } else {
666:       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
667:     }
668:   }
669: #endif

671:   PetscLogFlops(2.0 * a->nz);
672:   VecRestoreArrayRead(xx, &x);
673:   VecRestoreArrayPair(yy, zz, &y, &z);
674:   return 0;
675: }

677: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
678: {
679:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
680:   PetscScalar       *y;
681:   const PetscScalar *x;
682:   const MatScalar   *aval    = a->val;
683:   const PetscInt    *acolidx = a->colidx;
684:   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;

686: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
687:   #pragma disjoint(*x, *y, *aval)
688: #endif

690:   if (A->symmetric == PETSC_BOOL3_TRUE) {
691:     MatMultAdd_SeqSELL(A, xx, zz, yy);
692:     return 0;
693:   }
694:   if (zz != yy) VecCopy(zz, yy);
695:   VecGetArrayRead(xx, &x);
696:   VecGetArray(yy, &y);
697:   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
698:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
699:       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
700:         row        = 8 * i + r;
701:         nnz_in_row = a->rlen[row];
702:         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
703:       }
704:       break;
705:     }
706:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
707:       y[acolidx[j]] += aval[j] * x[8 * i];
708:       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
709:       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
710:       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
711:       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
712:       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
713:       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
714:       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
715:     }
716:   }
717:   PetscLogFlops(2.0 * a->sliidx[a->totalslices]);
718:   VecRestoreArrayRead(xx, &x);
719:   VecRestoreArray(yy, &y);
720:   return 0;
721: }

723: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
724: {
725:   if (A->symmetric == PETSC_BOOL3_TRUE) {
726:     MatMult_SeqSELL(A, xx, yy);
727:   } else {
728:     VecSet(yy, 0.0);
729:     MatMultTransposeAdd_SeqSELL(A, xx, yy, yy);
730:   }
731:   return 0;
732: }

734: /*
735:      Checks for missing diagonals
736: */
737: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
738: {
739:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
740:   PetscInt    *diag, i;

742:   *missing = PETSC_FALSE;
743:   if (A->rmap->n > 0 && !(a->colidx)) {
744:     *missing = PETSC_TRUE;
745:     if (d) *d = 0;
746:     PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n");
747:   } else {
748:     diag = a->diag;
749:     for (i = 0; i < A->rmap->n; i++) {
750:       if (diag[i] == -1) {
751:         *missing = PETSC_TRUE;
752:         if (d) *d = i;
753:         PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i);
754:         break;
755:       }
756:     }
757:   }
758:   return 0;
759: }

761: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
762: {
763:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
764:   PetscInt     i, j, m = A->rmap->n, shift;

766:   if (!a->diag) {
767:     PetscMalloc1(m, &a->diag);
768:     a->free_diag = PETSC_TRUE;
769:   }
770:   for (i = 0; i < m; i++) {                      /* loop over rows */
771:     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
772:     a->diag[i] = -1;
773:     for (j = 0; j < a->rlen[i]; j++) {
774:       if (a->colidx[shift + j * 8] == i) {
775:         a->diag[i] = shift + j * 8;
776:         break;
777:       }
778:     }
779:   }
780:   return 0;
781: }

783: /*
784:   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
785: */
786: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
787: {
788:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
789:   PetscInt     i, *diag, m = A->rmap->n;
790:   MatScalar   *val = a->val;
791:   PetscScalar *idiag, *mdiag;

793:   if (a->idiagvalid) return 0;
794:   MatMarkDiagonal_SeqSELL(A);
795:   diag = a->diag;
796:   if (!a->idiag) {
797:     PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work);
798:     val = a->val;
799:   }
800:   mdiag = a->mdiag;
801:   idiag = a->idiag;

803:   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
804:     for (i = 0; i < m; i++) {
805:       mdiag[i] = val[diag[i]];
806:       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
807:         if (PetscRealPart(fshift)) {
808:           PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i);
809:           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
810:           A->factorerror_zeropivot_value = 0.0;
811:           A->factorerror_zeropivot_row   = i;
812:         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
813:       }
814:       idiag[i] = 1.0 / val[diag[i]];
815:     }
816:     PetscLogFlops(m);
817:   } else {
818:     for (i = 0; i < m; i++) {
819:       mdiag[i] = val[diag[i]];
820:       idiag[i] = omega / (fshift + val[diag[i]]);
821:     }
822:     PetscLogFlops(2.0 * m);
823:   }
824:   a->idiagvalid = PETSC_TRUE;
825:   return 0;
826: }

828: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
829: {
830:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

832:   PetscArrayzero(a->val, a->sliidx[a->totalslices]);
833:   MatSeqSELLInvalidateDiagonal(A);
834:   return 0;
835: }

837: PetscErrorCode MatDestroy_SeqSELL(Mat A)
838: {
839:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

841: #if defined(PETSC_USE_LOG)
842:   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
843: #endif
844:   MatSeqXSELLFreeSELL(A, &a->val, &a->colidx);
845:   ISDestroy(&a->row);
846:   ISDestroy(&a->col);
847:   PetscFree(a->diag);
848:   PetscFree(a->rlen);
849:   PetscFree(a->sliidx);
850:   PetscFree3(a->idiag, a->mdiag, a->ssor_work);
851:   PetscFree(a->solve_work);
852:   ISDestroy(&a->icol);
853:   PetscFree(a->saved_values);
854:   PetscFree2(a->getrowcols, a->getrowvals);

856:   PetscFree(A->data);

858:   PetscObjectChangeTypeName((PetscObject)A, NULL);
859:   PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL);
860:   PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL);
861:   PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL);
862:   PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL);
863:   PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL);
864:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL);
865:   return 0;
866: }

868: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
869: {
870:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

872:   switch (op) {
873:   case MAT_ROW_ORIENTED:
874:     a->roworiented = flg;
875:     break;
876:   case MAT_KEEP_NONZERO_PATTERN:
877:     a->keepnonzeropattern = flg;
878:     break;
879:   case MAT_NEW_NONZERO_LOCATIONS:
880:     a->nonew = (flg ? 0 : 1);
881:     break;
882:   case MAT_NEW_NONZERO_LOCATION_ERR:
883:     a->nonew = (flg ? -1 : 0);
884:     break;
885:   case MAT_NEW_NONZERO_ALLOCATION_ERR:
886:     a->nonew = (flg ? -2 : 0);
887:     break;
888:   case MAT_UNUSED_NONZERO_LOCATION_ERR:
889:     a->nounused = (flg ? -1 : 0);
890:     break;
891:   case MAT_FORCE_DIAGONAL_ENTRIES:
892:   case MAT_IGNORE_OFF_PROC_ENTRIES:
893:   case MAT_USE_HASH_TABLE:
894:   case MAT_SORTED_FULL:
895:     PetscInfo(A, "Option %s ignored\n", MatOptions[op]);
896:     break;
897:   case MAT_SPD:
898:   case MAT_SYMMETRIC:
899:   case MAT_STRUCTURALLY_SYMMETRIC:
900:   case MAT_HERMITIAN:
901:   case MAT_SYMMETRY_ETERNAL:
902:   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
903:   case MAT_SPD_ETERNAL:
904:     /* These options are handled directly by MatSetOption() */
905:     break;
906:   default:
907:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
908:   }
909:   return 0;
910: }

912: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
913: {
914:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
915:   PetscInt     i, j, n, shift;
916:   PetscScalar *x, zero = 0.0;

918:   VecGetLocalSize(v, &n);

921:   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
922:     PetscInt *diag = a->diag;
923:     VecGetArray(v, &x);
924:     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
925:     VecRestoreArray(v, &x);
926:     return 0;
927:   }

929:   VecSet(v, zero);
930:   VecGetArray(v, &x);
931:   for (i = 0; i < n; i++) {                 /* loop over rows */
932:     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
933:     x[i]  = 0;
934:     for (j = 0; j < a->rlen[i]; j++) {
935:       if (a->colidx[shift + j * 8] == i) {
936:         x[i] = a->val[shift + j * 8];
937:         break;
938:       }
939:     }
940:   }
941:   VecRestoreArray(v, &x);
942:   return 0;
943: }

945: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
946: {
947:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
948:   const PetscScalar *l, *r;
949:   PetscInt           i, j, m, n, row;

951:   if (ll) {
952:     /* The local size is used so that VecMPI can be passed to this routine
953:        by MatDiagonalScale_MPISELL */
954:     VecGetLocalSize(ll, &m);
956:     VecGetArrayRead(ll, &l);
957:     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
958:       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
959:         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
960:           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
961:         }
962:       } else {
963:         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
964:       }
965:     }
966:     VecRestoreArrayRead(ll, &l);
967:     PetscLogFlops(a->nz);
968:   }
969:   if (rr) {
970:     VecGetLocalSize(rr, &n);
972:     VecGetArrayRead(rr, &r);
973:     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
974:       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
975:         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
976:           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
977:         }
978:       } else {
979:         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
980:       }
981:     }
982:     VecRestoreArrayRead(rr, &r);
983:     PetscLogFlops(a->nz);
984:   }
985:   MatSeqSELLInvalidateDiagonal(A);
986:   return 0;
987: }

989: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
990: {
991:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
992:   PetscInt    *cp, i, k, low, high, t, row, col, l;
993:   PetscInt     shift;
994:   MatScalar   *vp;

996:   for (k = 0; k < m; k++) { /* loop over requested rows */
997:     row = im[k];
998:     if (row < 0) continue;
1000:     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1001:     cp    = a->colidx + shift;                  /* pointer to the row */
1002:     vp    = a->val + shift;                     /* pointer to the row */
1003:     for (l = 0; l < n; l++) {                   /* loop over requested columns */
1004:       col = in[l];
1005:       if (col < 0) continue;
1007:       high = a->rlen[row];
1008:       low  = 0; /* assume unsorted */
1009:       while (high - low > 5) {
1010:         t = (low + high) / 2;
1011:         if (*(cp + t * 8) > col) high = t;
1012:         else low = t;
1013:       }
1014:       for (i = low; i < high; i++) {
1015:         if (*(cp + 8 * i) > col) break;
1016:         if (*(cp + 8 * i) == col) {
1017:           *v++ = *(vp + 8 * i);
1018:           goto finished;
1019:         }
1020:       }
1021:       *v++ = 0.0;
1022:     finished:;
1023:     }
1024:   }
1025:   return 0;
1026: }

1028: PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1029: {
1030:   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1031:   PetscInt          i, j, m = A->rmap->n, shift;
1032:   const char       *name;
1033:   PetscViewerFormat format;

1035:   PetscViewerGetFormat(viewer, &format);
1036:   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1037:     PetscInt nofinalvalue = 0;
1038:     /*
1039:     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1040:       nofinalvalue = 1;
1041:     }
1042:     */
1043:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1044:     PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n);
1045:     PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz);
1046: #if defined(PETSC_USE_COMPLEX)
1047:     PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue);
1048: #else
1049:     PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue);
1050: #endif
1051:     PetscViewerASCIIPrintf(viewer, "zzz = [\n");

1053:     for (i = 0; i < m; i++) {
1054:       shift = a->sliidx[i >> 3] + (i & 0x07);
1055:       for (j = 0; j < a->rlen[i]; j++) {
1056: #if defined(PETSC_USE_COMPLEX)
1057:         PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1058: #else
1059:         PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]);
1060: #endif
1061:       }
1062:     }
1063:     /*
1064:     if (nofinalvalue) {
1065: #if defined(PETSC_USE_COMPLEX)
1066:       PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);
1067: #else
1068:       PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0);
1069: #endif
1070:     }
1071:     */
1072:     PetscObjectGetName((PetscObject)A, &name);
1073:     PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name);
1074:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1075:   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1076:     return 0;
1077:   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1078:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1079:     for (i = 0; i < m; i++) {
1080:       PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1081:       shift = a->sliidx[i >> 3] + (i & 0x07);
1082:       for (j = 0; j < a->rlen[i]; j++) {
1083: #if defined(PETSC_USE_COMPLEX)
1084:         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1085:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1086:         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1087:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]));
1088:         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1089:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]));
1090:         }
1091: #else
1092:         if (a->val[shift + 8 * j] != 0.0) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]);
1093: #endif
1094:       }
1095:       PetscViewerASCIIPrintf(viewer, "\n");
1096:     }
1097:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1098:   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1099:     PetscInt    cnt = 0, jcnt;
1100:     PetscScalar value;
1101: #if defined(PETSC_USE_COMPLEX)
1102:     PetscBool realonly = PETSC_TRUE;
1103:     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1104:       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1105:         realonly = PETSC_FALSE;
1106:         break;
1107:       }
1108:     }
1109: #endif

1111:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1112:     for (i = 0; i < m; i++) {
1113:       jcnt  = 0;
1114:       shift = a->sliidx[i >> 3] + (i & 0x07);
1115:       for (j = 0; j < A->cmap->n; j++) {
1116:         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1117:           value = a->val[cnt++];
1118:           jcnt++;
1119:         } else {
1120:           value = 0.0;
1121:         }
1122: #if defined(PETSC_USE_COMPLEX)
1123:         if (realonly) {
1124:           PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value));
1125:         } else {
1126:           PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value));
1127:         }
1128: #else
1129:         PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value);
1130: #endif
1131:       }
1132:       PetscViewerASCIIPrintf(viewer, "\n");
1133:     }
1134:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1135:   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1136:     PetscInt fshift = 1;
1137:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1138: #if defined(PETSC_USE_COMPLEX)
1139:     PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n");
1140: #else
1141:     PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n");
1142: #endif
1143:     PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz);
1144:     for (i = 0; i < m; i++) {
1145:       shift = a->sliidx[i >> 3] + (i & 0x07);
1146:       for (j = 0; j < a->rlen[i]; j++) {
1147: #if defined(PETSC_USE_COMPLEX)
1148:         PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1149: #else
1150:         PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]);
1151: #endif
1152:       }
1153:     }
1154:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1155:   } else if (format == PETSC_VIEWER_NATIVE) {
1156:     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1157:       PetscInt row;
1158:       PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]);
1159:       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1160: #if defined(PETSC_USE_COMPLEX)
1161:         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1162:           PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1163:         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1164:           PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]));
1165:         } else {
1166:           PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]));
1167:         }
1168: #else
1169:         PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]);
1170: #endif
1171:       }
1172:     }
1173:   } else {
1174:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1175:     if (A->factortype) {
1176:       for (i = 0; i < m; i++) {
1177:         shift = a->sliidx[i >> 3] + (i & 0x07);
1178:         PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1179:         /* L part */
1180:         for (j = shift; j < a->diag[i]; j += 8) {
1181: #if defined(PETSC_USE_COMPLEX)
1182:           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1183:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1184:           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1185:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])));
1186:           } else {
1187:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]));
1188:           }
1189: #else
1190:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]);
1191: #endif
1192:         }
1193:         /* diagonal */
1194:         j = a->diag[i];
1195: #if defined(PETSC_USE_COMPLEX)
1196:         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1197:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]));
1198:         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1199:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])));
1200:         } else {
1201:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]));
1202:         }
1203: #else
1204:         PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]));
1205: #endif

1207:         /* U part */
1208:         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1209: #if defined(PETSC_USE_COMPLEX)
1210:           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1211:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1212:           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1213:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])));
1214:           } else {
1215:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]));
1216:           }
1217: #else
1218:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]);
1219: #endif
1220:         }
1221:         PetscViewerASCIIPrintf(viewer, "\n");
1222:       }
1223:     } else {
1224:       for (i = 0; i < m; i++) {
1225:         shift = a->sliidx[i >> 3] + (i & 0x07);
1226:         PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1227:         for (j = 0; j < a->rlen[i]; j++) {
1228: #if defined(PETSC_USE_COMPLEX)
1229:           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1230:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1231:           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1232:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]));
1233:           } else {
1234:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]));
1235:           }
1236: #else
1237:           PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]);
1238: #endif
1239:         }
1240:         PetscViewerASCIIPrintf(viewer, "\n");
1241:       }
1242:     }
1243:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1244:   }
1245:   PetscViewerFlush(viewer);
1246:   return 0;
1247: }

1249: #include <petscdraw.h>
1250: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1251: {
1252:   Mat               A = (Mat)Aa;
1253:   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1254:   PetscInt          i, j, m = A->rmap->n, shift;
1255:   int               color;
1256:   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1257:   PetscViewer       viewer;
1258:   PetscViewerFormat format;

1260:   PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer);
1261:   PetscViewerGetFormat(viewer, &format);
1262:   PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr);

1264:   /* loop over matrix elements drawing boxes */

1266:   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1267:     PetscDrawCollectiveBegin(draw);
1268:     /* Blue for negative, Cyan for zero and  Red for positive */
1269:     color = PETSC_DRAW_BLUE;
1270:     for (i = 0; i < m; i++) {
1271:       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1272:       y_l   = m - i - 1.0;
1273:       y_r   = y_l + 1.0;
1274:       for (j = 0; j < a->rlen[i]; j++) {
1275:         x_l = a->colidx[shift + j * 8];
1276:         x_r = x_l + 1.0;
1277:         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1278:         PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1279:       }
1280:     }
1281:     color = PETSC_DRAW_CYAN;
1282:     for (i = 0; i < m; i++) {
1283:       shift = a->sliidx[i >> 3] + (i & 0x07);
1284:       y_l   = m - i - 1.0;
1285:       y_r   = y_l + 1.0;
1286:       for (j = 0; j < a->rlen[i]; j++) {
1287:         x_l = a->colidx[shift + j * 8];
1288:         x_r = x_l + 1.0;
1289:         if (a->val[shift + 8 * j] != 0.) continue;
1290:         PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1291:       }
1292:     }
1293:     color = PETSC_DRAW_RED;
1294:     for (i = 0; i < m; i++) {
1295:       shift = a->sliidx[i >> 3] + (i & 0x07);
1296:       y_l   = m - i - 1.0;
1297:       y_r   = y_l + 1.0;
1298:       for (j = 0; j < a->rlen[i]; j++) {
1299:         x_l = a->colidx[shift + j * 8];
1300:         x_r = x_l + 1.0;
1301:         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1302:         PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1303:       }
1304:     }
1305:     PetscDrawCollectiveEnd(draw);
1306:   } else {
1307:     /* use contour shading to indicate magnitude of values */
1308:     /* first determine max of all nonzero values */
1309:     PetscReal minv = 0.0, maxv = 0.0;
1310:     PetscInt  count = 0;
1311:     PetscDraw popup;
1312:     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1313:       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1314:     }
1315:     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1316:     PetscDrawGetPopup(draw, &popup);
1317:     PetscDrawScalePopup(popup, minv, maxv);

1319:     PetscDrawCollectiveBegin(draw);
1320:     for (i = 0; i < m; i++) {
1321:       shift = a->sliidx[i >> 3] + (i & 0x07);
1322:       y_l   = m - i - 1.0;
1323:       y_r   = y_l + 1.0;
1324:       for (j = 0; j < a->rlen[i]; j++) {
1325:         x_l   = a->colidx[shift + j * 8];
1326:         x_r   = x_l + 1.0;
1327:         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1328:         PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1329:         count++;
1330:       }
1331:     }
1332:     PetscDrawCollectiveEnd(draw);
1333:   }
1334:   return 0;
1335: }

1337: #include <petscdraw.h>
1338: PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1339: {
1340:   PetscDraw draw;
1341:   PetscReal xr, yr, xl, yl, h, w;
1342:   PetscBool isnull;

1344:   PetscViewerDrawGetDraw(viewer, 0, &draw);
1345:   PetscDrawIsNull(draw, &isnull);
1346:   if (isnull) return 0;

1348:   xr = A->cmap->n;
1349:   yr = A->rmap->n;
1350:   h  = yr / 10.0;
1351:   w  = xr / 10.0;
1352:   xr += w;
1353:   yr += h;
1354:   xl = -w;
1355:   yl = -h;
1356:   PetscDrawSetCoordinates(draw, xl, yl, xr, yr);
1357:   PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer);
1358:   PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A);
1359:   PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL);
1360:   PetscDrawSave(draw);
1361:   return 0;
1362: }

1364: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1365: {
1366:   PetscBool iascii, isbinary, isdraw;

1368:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
1369:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
1370:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw);
1371:   if (iascii) {
1372:     MatView_SeqSELL_ASCII(A, viewer);
1373:   } else if (isbinary) {
1374:     /* MatView_SeqSELL_Binary(A,viewer); */
1375:   } else if (isdraw) MatView_SeqSELL_Draw(A, viewer);
1376:   return 0;
1377: }

1379: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1380: {
1381:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1382:   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1383:   MatScalar   *vp;

1385:   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
1386:   /* To do: compress out the unused elements */
1387:   MatMarkDiagonal_SeqSELL(A);
1388:   PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz);
1389:   PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs);
1390:   PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax);
1391:   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1392:   for (i = 0; i < a->totalslices; ++i) {
1393:     shift = a->sliidx[i];                                      /* starting index of the slice */
1394:     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1395:     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1396:     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1397:       row  = 8 * i + row_in_slice;
1398:       nrow = a->rlen[row]; /* number of nonzeros in row */
1399:       /*
1400:         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1401:         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1402:       */
1403:       lastcol = 0;
1404:       if (nrow > 0) {                                /* nonempty row */
1405:         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1406:       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1407:         for (j = 1; j < 8; j++) {
1408:           if (a->rlen[8 * i + j]) {
1409:             lastcol = cp[j];
1410:             break;
1411:           }
1412:         }
1413:       } else {
1414:         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1415:       }

1417:       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1418:         cp[8 * k + row_in_slice] = lastcol;
1419:         vp[8 * k + row_in_slice] = (MatScalar)0;
1420:       }
1421:     }
1422:   }

1424:   A->info.mallocs += a->reallocs;
1425:   a->reallocs = 0;

1427:   MatSeqSELLInvalidateDiagonal(A);
1428:   return 0;
1429: }

1431: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1432: {
1433:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

1435:   info->block_size   = 1.0;
1436:   info->nz_allocated = a->maxallocmat;
1437:   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1438:   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1439:   info->assemblies   = A->num_ass;
1440:   info->mallocs      = A->info.mallocs;
1441:   info->memory       = 0; /* REVIEW ME */
1442:   if (A->factortype) {
1443:     info->fill_ratio_given  = A->info.fill_ratio_given;
1444:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1445:     info->factor_mallocs    = A->info.factor_mallocs;
1446:   } else {
1447:     info->fill_ratio_given  = 0;
1448:     info->fill_ratio_needed = 0;
1449:     info->factor_mallocs    = 0;
1450:   }
1451:   return 0;
1452: }

1454: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1455: {
1456:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1457:   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1458:   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1459:   MatScalar   *vp, value;

1461:   for (k = 0; k < m; k++) { /* loop over added rows */
1462:     row = im[k];
1463:     if (row < 0) continue;
1465:     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1466:     cp    = a->colidx + shift;                  /* pointer to the row */
1467:     vp    = a->val + shift;                     /* pointer to the row */
1468:     nrow  = a->rlen[row];
1469:     low   = 0;
1470:     high  = nrow;

1472:     for (l = 0; l < n; l++) { /* loop over added columns */
1473:       col = in[l];
1474:       if (col < 0) continue;
1476:       if (a->roworiented) {
1477:         value = v[l + k * n];
1478:       } else {
1479:         value = v[k + l * m];
1480:       }
1481:       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;

1483:       /* search in this row for the specified column, i indicates the column to be set */
1484:       if (col <= lastcol) low = 0;
1485:       else high = nrow;
1486:       lastcol = col;
1487:       while (high - low > 5) {
1488:         t = (low + high) / 2;
1489:         if (*(cp + t * 8) > col) high = t;
1490:         else low = t;
1491:       }
1492:       for (i = low; i < high; i++) {
1493:         if (*(cp + i * 8) > col) break;
1494:         if (*(cp + i * 8) == col) {
1495:           if (is == ADD_VALUES) *(vp + i * 8) += value;
1496:           else *(vp + i * 8) = value;
1497:           low = i + 1;
1498:           goto noinsert;
1499:         }
1500:       }
1501:       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1502:       if (nonew == 1) goto noinsert;
1504:       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1505:       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1506:       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1507:       for (ii = nrow - 1; ii >= i; ii--) {
1508:         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1509:         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1510:       }
1511:       a->rlen[row]++;
1512:       *(cp + i * 8) = col;
1513:       *(vp + i * 8) = value;
1514:       a->nz++;
1515:       A->nonzerostate++;
1516:       low = i + 1;
1517:       high++;
1518:       nrow++;
1519:     noinsert:;
1520:     }
1521:     a->rlen[row] = nrow;
1522:   }
1523:   return 0;
1524: }

1526: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1527: {
1528:   /* If the two matrices have the same copy implementation, use fast copy. */
1529:   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1530:     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1531:     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;

1534:     PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]);
1535:   } else {
1536:     MatCopy_Basic(A, B, str);
1537:   }
1538:   return 0;
1539: }

1541: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1542: {
1543:   MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL);
1544:   return 0;
1545: }

1547: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1548: {
1549:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

1551:   *array = a->val;
1552:   return 0;
1553: }

1555: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1556: {
1557:   return 0;
1558: }

1560: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1561: {
1562:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1563:   PetscInt     i;
1564:   MatScalar   *aval = a->val;

1566:   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1567:   return 0;
1568: }

1570: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1571: {
1572:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1573:   PetscInt     i;
1574:   MatScalar   *aval = a->val;

1576:   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1577:   MatSeqSELLInvalidateDiagonal(A);
1578:   return 0;
1579: }

1581: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1582: {
1583:   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1584:   MatScalar   *aval   = a->val;
1585:   PetscScalar  oalpha = alpha;
1586:   PetscBLASInt one    = 1, size;

1588:   PetscBLASIntCast(a->sliidx[a->totalslices], &size);
1589:   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1590:   PetscLogFlops(a->nz);
1591:   MatSeqSELLInvalidateDiagonal(inA);
1592:   return 0;
1593: }

1595: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1596: {
1597:   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;

1599:   if (!Y->preallocated || !y->nz) MatSeqSELLSetPreallocation(Y, 1, NULL);
1600:   MatShift_Basic(Y, a);
1601:   return 0;
1602: }

1604: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1605: {
1606:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1607:   PetscScalar       *x, sum, *t;
1608:   const MatScalar   *idiag = NULL, *mdiag;
1609:   const PetscScalar *b, *xb;
1610:   PetscInt           n, m = A->rmap->n, i, j, shift;
1611:   const PetscInt    *diag;

1613:   its = its * lits;

1615:   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1616:   if (!a->idiagvalid) MatInvertDiagonal_SeqSELL(A, omega, fshift);
1617:   a->fshift = fshift;
1618:   a->omega  = omega;

1620:   diag  = a->diag;
1621:   t     = a->ssor_work;
1622:   idiag = a->idiag;
1623:   mdiag = a->mdiag;

1625:   VecGetArray(xx, &x);
1626:   VecGetArrayRead(bb, &b);
1627:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */

1632:   if (flag & SOR_ZERO_INITIAL_GUESS) {
1633:     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1634:       for (i = 0; i < m; i++) {
1635:         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1636:         sum   = b[i];
1637:         n     = (diag[i] - shift) / 8;
1638:         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1639:         t[i] = sum;
1640:         x[i] = sum * idiag[i];
1641:       }
1642:       xb = t;
1643:       PetscLogFlops(a->nz);
1644:     } else xb = b;
1645:     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1646:       for (i = m - 1; i >= 0; i--) {
1647:         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1648:         sum   = xb[i];
1649:         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1650:         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1651:         if (xb == b) {
1652:           x[i] = sum * idiag[i];
1653:         } else {
1654:           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1655:         }
1656:       }
1657:       PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1658:     }
1659:     its--;
1660:   }
1661:   while (its--) {
1662:     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1663:       for (i = 0; i < m; i++) {
1664:         /* lower */
1665:         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1666:         sum   = b[i];
1667:         n     = (diag[i] - shift) / 8;
1668:         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1669:         t[i] = sum; /* save application of the lower-triangular part */
1670:         /* upper */
1671:         n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1672:         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1673:         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1674:       }
1675:       xb = t;
1676:       PetscLogFlops(2.0 * a->nz);
1677:     } else xb = b;
1678:     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1679:       for (i = m - 1; i >= 0; i--) {
1680:         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1681:         sum   = xb[i];
1682:         if (xb == b) {
1683:           /* whole matrix (no checkpointing available) */
1684:           n = a->rlen[i];
1685:           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1686:           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1687:         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1688:           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1689:           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1690:           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1691:         }
1692:       }
1693:       if (xb == b) {
1694:         PetscLogFlops(2.0 * a->nz);
1695:       } else {
1696:         PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1697:       }
1698:     }
1699:   }
1700:   VecRestoreArray(xx, &x);
1701:   VecRestoreArrayRead(bb, &b);
1702:   return 0;
1703: }

1705: /* -------------------------------------------------------------------*/
1706: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1707:                                        MatGetRow_SeqSELL,
1708:                                        MatRestoreRow_SeqSELL,
1709:                                        MatMult_SeqSELL,
1710:                                        /* 4*/ MatMultAdd_SeqSELL,
1711:                                        MatMultTranspose_SeqSELL,
1712:                                        MatMultTransposeAdd_SeqSELL,
1713:                                        NULL,
1714:                                        NULL,
1715:                                        NULL,
1716:                                        /* 10*/ NULL,
1717:                                        NULL,
1718:                                        NULL,
1719:                                        MatSOR_SeqSELL,
1720:                                        NULL,
1721:                                        /* 15*/ MatGetInfo_SeqSELL,
1722:                                        MatEqual_SeqSELL,
1723:                                        MatGetDiagonal_SeqSELL,
1724:                                        MatDiagonalScale_SeqSELL,
1725:                                        NULL,
1726:                                        /* 20*/ NULL,
1727:                                        MatAssemblyEnd_SeqSELL,
1728:                                        MatSetOption_SeqSELL,
1729:                                        MatZeroEntries_SeqSELL,
1730:                                        /* 24*/ NULL,
1731:                                        NULL,
1732:                                        NULL,
1733:                                        NULL,
1734:                                        NULL,
1735:                                        /* 29*/ MatSetUp_SeqSELL,
1736:                                        NULL,
1737:                                        NULL,
1738:                                        NULL,
1739:                                        NULL,
1740:                                        /* 34*/ MatDuplicate_SeqSELL,
1741:                                        NULL,
1742:                                        NULL,
1743:                                        NULL,
1744:                                        NULL,
1745:                                        /* 39*/ NULL,
1746:                                        NULL,
1747:                                        NULL,
1748:                                        MatGetValues_SeqSELL,
1749:                                        MatCopy_SeqSELL,
1750:                                        /* 44*/ NULL,
1751:                                        MatScale_SeqSELL,
1752:                                        MatShift_SeqSELL,
1753:                                        NULL,
1754:                                        NULL,
1755:                                        /* 49*/ NULL,
1756:                                        NULL,
1757:                                        NULL,
1758:                                        NULL,
1759:                                        NULL,
1760:                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1761:                                        NULL,
1762:                                        NULL,
1763:                                        NULL,
1764:                                        NULL,
1765:                                        /* 59*/ NULL,
1766:                                        MatDestroy_SeqSELL,
1767:                                        MatView_SeqSELL,
1768:                                        NULL,
1769:                                        NULL,
1770:                                        /* 64*/ NULL,
1771:                                        NULL,
1772:                                        NULL,
1773:                                        NULL,
1774:                                        NULL,
1775:                                        /* 69*/ NULL,
1776:                                        NULL,
1777:                                        NULL,
1778:                                        NULL,
1779:                                        NULL,
1780:                                        /* 74*/ NULL,
1781:                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1782:                                        NULL,
1783:                                        NULL,
1784:                                        NULL,
1785:                                        /* 79*/ NULL,
1786:                                        NULL,
1787:                                        NULL,
1788:                                        NULL,
1789:                                        NULL,
1790:                                        /* 84*/ NULL,
1791:                                        NULL,
1792:                                        NULL,
1793:                                        NULL,
1794:                                        NULL,
1795:                                        /* 89*/ NULL,
1796:                                        NULL,
1797:                                        NULL,
1798:                                        NULL,
1799:                                        NULL,
1800:                                        /* 94*/ NULL,
1801:                                        NULL,
1802:                                        NULL,
1803:                                        NULL,
1804:                                        NULL,
1805:                                        /* 99*/ NULL,
1806:                                        NULL,
1807:                                        NULL,
1808:                                        MatConjugate_SeqSELL,
1809:                                        NULL,
1810:                                        /*104*/ NULL,
1811:                                        NULL,
1812:                                        NULL,
1813:                                        NULL,
1814:                                        NULL,
1815:                                        /*109*/ NULL,
1816:                                        NULL,
1817:                                        NULL,
1818:                                        NULL,
1819:                                        MatMissingDiagonal_SeqSELL,
1820:                                        /*114*/ NULL,
1821:                                        NULL,
1822:                                        NULL,
1823:                                        NULL,
1824:                                        NULL,
1825:                                        /*119*/ NULL,
1826:                                        NULL,
1827:                                        NULL,
1828:                                        NULL,
1829:                                        NULL,
1830:                                        /*124*/ NULL,
1831:                                        NULL,
1832:                                        NULL,
1833:                                        NULL,
1834:                                        NULL,
1835:                                        /*129*/ NULL,
1836:                                        NULL,
1837:                                        NULL,
1838:                                        NULL,
1839:                                        NULL,
1840:                                        /*134*/ NULL,
1841:                                        NULL,
1842:                                        NULL,
1843:                                        NULL,
1844:                                        NULL,
1845:                                        /*139*/ NULL,
1846:                                        NULL,
1847:                                        NULL,
1848:                                        MatFDColoringSetUp_SeqXAIJ,
1849:                                        NULL,
1850:                                        /*144*/ NULL,
1851:                                        NULL,
1852:                                        NULL,
1853:                                        NULL,
1854:                                        NULL,
1855:                                        NULL,
1856:                                        /*150*/ NULL};

1858: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1859: {
1860:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;


1864:   /* allocate space for values if not already there */
1865:   if (!a->saved_values) { PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values); }

1867:   /* copy values over */
1868:   PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]);
1869:   return 0;
1870: }

1872: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1873: {
1874:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

1878:   PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]);
1879:   return 0;
1880: }

1882: /*@C
1883:  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`

1885:  Not Collective

1887:  Input Parameters:
1888:  .  mat - a `MATSEQSELL` matrix
1889:  .  array - pointer to the data

1891:  Level: intermediate

1893:  .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1894:  @*/
1895: PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1896: {
1897:   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1898:   return 0;
1899: }

1901: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1902: {
1903:   Mat_SeqSELL *b;
1904:   PetscMPIInt  size;

1906:   PetscCitationsRegister(citation, &cited);
1907:   MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);

1910:   PetscNew(&b);

1912:   B->data = (void *)b;

1914:   PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps));

1916:   b->row                = NULL;
1917:   b->col                = NULL;
1918:   b->icol               = NULL;
1919:   b->reallocs           = 0;
1920:   b->ignorezeroentries  = PETSC_FALSE;
1921:   b->roworiented        = PETSC_TRUE;
1922:   b->nonew              = 0;
1923:   b->diag               = NULL;
1924:   b->solve_work         = NULL;
1925:   B->spptr              = NULL;
1926:   b->saved_values       = NULL;
1927:   b->idiag              = NULL;
1928:   b->mdiag              = NULL;
1929:   b->ssor_work          = NULL;
1930:   b->omega              = 1.0;
1931:   b->fshift             = 0.0;
1932:   b->idiagvalid         = PETSC_FALSE;
1933:   b->keepnonzeropattern = PETSC_FALSE;

1935:   PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL);
1936:   PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL);
1937:   PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL);
1938:   PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL);
1939:   PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL);
1940:   PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL);
1941:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ);
1942:   return 0;
1943: }

1945: /*
1946:  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1947:  */
1948: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1949: {
1950:   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1951:   PetscInt     i, m                           = A->rmap->n;
1952:   PetscInt     totalslices = a->totalslices;

1954:   C->factortype = A->factortype;
1955:   c->row        = NULL;
1956:   c->col        = NULL;
1957:   c->icol       = NULL;
1958:   c->reallocs   = 0;
1959:   C->assembled  = PETSC_TRUE;

1961:   PetscLayoutReference(A->rmap, &C->rmap);
1962:   PetscLayoutReference(A->cmap, &C->cmap);

1964:   PetscMalloc1(8 * totalslices, &c->rlen);
1965:   PetscMalloc1(totalslices + 1, &c->sliidx);

1967:   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
1968:   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];

1970:   /* allocate the matrix space */
1971:   if (mallocmatspace) {
1972:     PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx);

1974:     c->singlemalloc = PETSC_TRUE;

1976:     if (m > 0) {
1977:       PetscArraycpy(c->colidx, a->colidx, a->maxallocmat);
1978:       if (cpvalues == MAT_COPY_VALUES) {
1979:         PetscArraycpy(c->val, a->val, a->maxallocmat);
1980:       } else {
1981:         PetscArrayzero(c->val, a->maxallocmat);
1982:       }
1983:     }
1984:   }

1986:   c->ignorezeroentries = a->ignorezeroentries;
1987:   c->roworiented       = a->roworiented;
1988:   c->nonew             = a->nonew;
1989:   if (a->diag) {
1990:     PetscMalloc1(m, &c->diag);
1991:     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
1992:   } else c->diag = NULL;

1994:   c->solve_work         = NULL;
1995:   c->saved_values       = NULL;
1996:   c->idiag              = NULL;
1997:   c->ssor_work          = NULL;
1998:   c->keepnonzeropattern = a->keepnonzeropattern;
1999:   c->free_val           = PETSC_TRUE;
2000:   c->free_colidx        = PETSC_TRUE;

2002:   c->maxallocmat  = a->maxallocmat;
2003:   c->maxallocrow  = a->maxallocrow;
2004:   c->rlenmax      = a->rlenmax;
2005:   c->nz           = a->nz;
2006:   C->preallocated = PETSC_TRUE;

2008:   c->nonzerorowcnt = a->nonzerorowcnt;
2009:   C->nonzerostate  = A->nonzerostate;

2011:   PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist);
2012:   return 0;
2013: }

2015: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2016: {
2017:   MatCreate(PetscObjectComm((PetscObject)A), B);
2018:   MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
2019:   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) MatSetBlockSizesFromMats(*B, A, A);
2020:   MatSetType(*B, ((PetscObject)A)->type_name);
2021:   MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE);
2022:   return 0;
2023: }

2025: /*MC
2026:    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2027:    based on the sliced Ellpack format

2029:    Options Database Keys:
2030: . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`

2032:    Level: beginner

2034: .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2035: M*/

2037: /*MC
2038:    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

2040:    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2041:    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2042:   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2043:   for communicators controlling multiple processes.  It is recommended that you call both of
2044:   the above preallocation routines for simplicity.

2046:    Options Database Keys:
2047: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()

2049:   Level: beginner

2051:   Notes:
2052:    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).

2054:    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2055:    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.

2057:   Developer Notes:
2058:    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

2060:    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2061: .vb
2062:                             (2 0  3 4)
2063:    Consider the matrix A =  (5 0  6 0)
2064:                             (0 0  7 8)
2065:                             (0 0  9 9)

2067:    symbolically the Ellpack format can be written as

2069:         (2 3 4 |)           (0 2 3 |)
2070:    v =  (5 6 0 |)  colidx = (0 2 2 |)
2071:         --------            ---------
2072:         (7 8 |)             (2 3 |)
2073:         (9 9 |)             (2 3 |)

2075:     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2076:     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2077:     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.

2079:     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)

2081: .ve

2083:       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.

2085:  References:
2086: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2087:    Proceedings of the 47th International Conference on Parallel Processing, 2018.

2089: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2090: M*/

2092: /*@C
2093:        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.

2095:  Collective on comm

2097:  Input Parameters:
2098: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2099: .  m - number of rows
2100: .  n - number of columns
2101: .  rlenmax - maximum number of nonzeros in a row
2102: -  rlen - array containing the number of nonzeros in the various rows
2103:  (possibly different for each row) or NULL

2105:  Output Parameter:
2106: .  A - the matrix

2108:  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2109:  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2110:  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]

2112:  Notes:
2113:  If nnz is given then nz is ignored

2115:  Specify the preallocated storage with either rlenmax or rlen (not both).
2116:  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2117:  allocation.  For large problems you MUST preallocate memory or you
2118:  will get TERRIBLE performance, see the users' manual chapter on matrices.

2120:  Level: intermediate

2122:  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2123:  @*/
2124: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2125: {
2126:   MatCreate(comm, A);
2127:   MatSetSizes(*A, m, n, m, n);
2128:   MatSetType(*A, MATSEQSELL);
2129:   MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen);
2130:   return 0;
2131: }

2133: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2134: {
2135:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2136:   PetscInt     totalslices = a->totalslices;

2138:   /* If the  matrix dimensions are not equal,or no of nonzeros */
2139:   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2140:     *flg = PETSC_FALSE;
2141:     return 0;
2142:   }
2143:   /* if the a->colidx are the same */
2144:   PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg);
2145:   if (!*flg) return 0;
2146:   /* if a->val are the same */
2147:   PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg);
2148:   return 0;
2149: }

2151: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2152: {
2153:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

2155:   a->idiagvalid = PETSC_FALSE;
2156:   return 0;
2157: }

2159: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2160: {
2161: #if defined(PETSC_USE_COMPLEX)
2162:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2163:   PetscInt     i;
2164:   PetscScalar *val = a->val;

2166:   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2167: #else
2168: #endif
2169:   return 0;
2170: }