Actual source code: sell.c
2: /*
3: Defines the basic matrix operations for the SELL matrix storage format.
4: */
5: #include <../src/mat/impls/sell/seq/sell.h>
6: #include <petscblaslapack.h>
7: #include <petsc/private/kernels/blocktranspose.h>
9: static PetscBool cited = PETSC_FALSE;
10: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14: " year = 2018\n"
15: "}\n";
17: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19: #include <immintrin.h>
21: #if !defined(_MM_SCALE_8)
22: #define _MM_SCALE_8 8
23: #endif
25: #if defined(__AVX512F__)
26: /* these do not work
27: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29: */
30: #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32: vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
33: vec_vals = _mm512_loadu_pd(aval); \
34: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36: #elif defined(__AVX2__) && defined(__FMA__)
37: #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38: vec_vals = _mm256_loadu_pd(aval); \
39: vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40: vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41: vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42: #endif
43: #endif /* PETSC_HAVE_IMMINTRIN_H */
45: /*@C
46: MatSeqSELLSetPreallocation - For good matrix assembly performance
47: the user should preallocate the matrix storage by setting the parameter nz
48: (or the array nnz). By setting these parameters accurately, performance
49: during matrix assembly can be increased significantly.
51: Collective
53: Input Parameters:
54: + B - The `MATSEQSELL` matrix
55: . rlenmax - number of nonzeros per row (same for all rows)
56: - rlen - array containing the number of nonzeros in the various rows
57: (possibly different for each row) or `NULL`
59: Level: intermediate
61: Notes:
62: If `rlen` is given then `rlenmax` is ignored.
64: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
65: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
66: allocation. For large problems you MUST preallocate memory or you
67: will get TERRIBLE performance, see the users' manual chapter on matrices.
69: You can call `MatGetInfo()` to get information on how effective the preallocation was;
70: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
71: You can also run with the option `-info` and look for messages with the string
72: malloc in them to see if additional memory allocation was needed.
74: Developers Note:
75: Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
76: entries or columns indices.
78: The maximum number of nonzeos in any row should be as accurate as possible.
79: If it is underestimated, you will get bad performance due to reallocation
80: (`MatSeqXSELLReallocateSELL()`).
82: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83: @*/
84: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
85: {
88: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
89: return 0;
90: }
92: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
93: {
94: Mat_SeqSELL *b;
95: PetscInt i, j, totalslices;
96: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
98: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99: if (maxallocrow == MAT_SKIP_ALLOCATION) {
100: skipallocation = PETSC_TRUE;
101: maxallocrow = 0;
102: }
104: PetscLayoutSetUp(B->rmap);
105: PetscLayoutSetUp(B->cmap);
107: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
110: if (rlen) {
111: for (i = 0; i < B->rmap->n; i++) {
114: }
115: }
117: B->preallocated = PETSC_TRUE;
119: b = (Mat_SeqSELL *)B->data;
121: totalslices = PetscCeilInt(B->rmap->n, 8);
122: b->totalslices = totalslices;
123: if (!skipallocation) {
124: if (B->rmap->n & 0x07) PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n);
126: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
127: PetscMalloc1(totalslices + 1, &b->sliidx);
128: }
129: if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
130: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
131: else if (maxallocrow < 0) maxallocrow = 1;
132: for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
133: } else {
134: maxallocrow = 0;
135: b->sliidx[0] = 0;
136: for (i = 1; i < totalslices; i++) {
137: b->sliidx[i] = 0;
138: for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
139: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
140: PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]);
141: }
142: /* last slice */
143: b->sliidx[totalslices] = 0;
144: for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
145: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
146: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
147: }
149: /* allocate space for val, colidx, rlen */
150: /* FIXME: should B's old memory be unlogged? */
151: MatSeqXSELLFreeSELL(B, &b->val, &b->colidx);
152: /* FIXME: assuming an element of the bit array takes 8 bits */
153: PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx);
154: /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155: PetscCalloc1(8 * totalslices, &b->rlen);
157: b->singlemalloc = PETSC_TRUE;
158: b->free_val = PETSC_TRUE;
159: b->free_colidx = PETSC_TRUE;
160: } else {
161: b->free_val = PETSC_FALSE;
162: b->free_colidx = PETSC_FALSE;
163: }
165: b->nz = 0;
166: b->maxallocrow = maxallocrow;
167: b->rlenmax = maxallocrow;
168: b->maxallocmat = b->sliidx[totalslices];
169: B->info.nz_unneeded = (double)b->maxallocmat;
170: if (realalloc) MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
171: return 0;
172: }
174: PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
175: {
176: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
177: PetscInt shift;
180: if (nz) *nz = a->rlen[row];
181: shift = a->sliidx[row >> 3] + (row & 0x07);
182: if (!a->getrowcols) PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals);
183: if (idx) {
184: PetscInt j;
185: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
186: *idx = a->getrowcols;
187: }
188: if (v) {
189: PetscInt j;
190: for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
191: *v = a->getrowvals;
192: }
193: return 0;
194: }
196: PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
197: {
198: return 0;
199: }
201: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
202: {
203: Mat B;
204: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205: PetscInt i;
207: if (reuse == MAT_REUSE_MATRIX) {
208: B = *newmat;
209: MatZeroEntries(B);
210: } else {
211: MatCreate(PetscObjectComm((PetscObject)A), &B);
212: MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N);
213: MatSetType(B, MATSEQAIJ);
214: MatSeqAIJSetPreallocation(B, 0, a->rlen);
215: }
217: for (i = 0; i < A->rmap->n; i++) {
218: PetscInt nz = 0, *cols = NULL;
219: PetscScalar *vals = NULL;
221: MatGetRow_SeqSELL(A, i, &nz, &cols, &vals);
222: MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES);
223: MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals);
224: }
226: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
227: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
228: B->rmap->bs = A->rmap->bs;
230: if (reuse == MAT_INPLACE_MATRIX) {
231: MatHeaderReplace(A, &B);
232: } else {
233: *newmat = B;
234: }
235: return 0;
236: }
238: #include <../src/mat/impls/aij/seq/aij.h>
240: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
241: {
242: Mat B;
243: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
244: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
245: const PetscInt *cols;
246: const PetscScalar *vals;
249: if (reuse == MAT_REUSE_MATRIX) {
250: B = *newmat;
251: } else {
252: if (PetscDefined(USE_DEBUG) || !a->ilen) {
253: PetscMalloc1(m, &rowlengths);
254: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
255: }
256: if (PetscDefined(USE_DEBUG) && a->ilen) {
257: PetscBool eq;
258: PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq);
260: PetscFree(rowlengths);
261: rowlengths = a->ilen;
262: } else if (a->ilen) rowlengths = a->ilen;
263: MatCreate(PetscObjectComm((PetscObject)A), &B);
264: MatSetSizes(B, m, n, m, n);
265: MatSetType(B, MATSEQSELL);
266: MatSeqSELLSetPreallocation(B, 0, rowlengths);
267: if (rowlengths != a->ilen) PetscFree(rowlengths);
268: }
270: for (row = 0; row < m; row++) {
271: MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals);
272: MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES);
273: MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals);
274: }
275: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
276: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
277: B->rmap->bs = A->rmap->bs;
279: if (reuse == MAT_INPLACE_MATRIX) {
280: MatHeaderReplace(A, &B);
281: } else {
282: *newmat = B;
283: }
284: return 0;
285: }
287: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
288: {
289: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
290: PetscScalar *y;
291: const PetscScalar *x;
292: const MatScalar *aval = a->val;
293: PetscInt totalslices = a->totalslices;
294: const PetscInt *acolidx = a->colidx;
295: PetscInt i, j;
296: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
297: __m512d vec_x, vec_y, vec_vals;
298: __m256i vec_idx;
299: __mmask8 mask;
300: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
301: __m256i vec_idx2, vec_idx3, vec_idx4;
302: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
303: __m128i vec_idx;
304: __m256d vec_x, vec_y, vec_y2, vec_vals;
305: MatScalar yval;
306: PetscInt r, rows_left, row, nnz_in_row;
307: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
308: __m128d vec_x_tmp;
309: __m256d vec_x, vec_y, vec_y2, vec_vals;
310: MatScalar yval;
311: PetscInt r, rows_left, row, nnz_in_row;
312: #else
313: PetscScalar sum[8];
314: #endif
316: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
317: #pragma disjoint(*x, *y, *aval)
318: #endif
320: VecGetArrayRead(xx, &x);
321: VecGetArray(yy, &y);
322: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
323: for (i = 0; i < totalslices; i++) { /* loop over slices */
324: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
325: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
327: vec_y = _mm512_setzero_pd();
328: vec_y2 = _mm512_setzero_pd();
329: vec_y3 = _mm512_setzero_pd();
330: vec_y4 = _mm512_setzero_pd();
332: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
333: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
334: case 3:
335: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
336: acolidx += 8;
337: aval += 8;
338: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
339: acolidx += 8;
340: aval += 8;
341: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
342: acolidx += 8;
343: aval += 8;
344: j += 3;
345: break;
346: case 2:
347: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
348: acolidx += 8;
349: aval += 8;
350: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
351: acolidx += 8;
352: aval += 8;
353: j += 2;
354: break;
355: case 1:
356: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
357: acolidx += 8;
358: aval += 8;
359: j += 1;
360: break;
361: }
362: #pragma novector
363: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
364: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
365: acolidx += 8;
366: aval += 8;
367: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
368: acolidx += 8;
369: aval += 8;
370: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
371: acolidx += 8;
372: aval += 8;
373: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
374: acolidx += 8;
375: aval += 8;
376: }
378: vec_y = _mm512_add_pd(vec_y, vec_y2);
379: vec_y = _mm512_add_pd(vec_y, vec_y3);
380: vec_y = _mm512_add_pd(vec_y, vec_y4);
381: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
383: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
384: } else {
385: _mm512_storeu_pd(&y[8 * i], vec_y);
386: }
387: }
388: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389: for (i = 0; i < totalslices; i++) { /* loop over full slices */
390: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
391: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
393: /* last slice may have padding rows. Don't use vectorization. */
394: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
395: rows_left = A->rmap->n - 8 * i;
396: for (r = 0; r < rows_left; ++r) {
397: yval = (MatScalar)0;
398: row = 8 * i + r;
399: nnz_in_row = a->rlen[row];
400: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
401: y[row] = yval;
402: }
403: break;
404: }
406: vec_y = _mm256_setzero_pd();
407: vec_y2 = _mm256_setzero_pd();
409: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410: #pragma novector
411: #pragma unroll(2)
412: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
413: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
414: aval += 4;
415: acolidx += 4;
416: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
417: aval += 4;
418: acolidx += 4;
419: }
421: _mm256_storeu_pd(y + i * 8, vec_y);
422: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
423: }
424: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425: for (i = 0; i < totalslices; i++) { /* loop over full slices */
426: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429: vec_y = _mm256_setzero_pd();
430: vec_y2 = _mm256_setzero_pd();
432: /* last slice may have padding rows. Don't use vectorization. */
433: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
434: rows_left = A->rmap->n - 8 * i;
435: for (r = 0; r < rows_left; ++r) {
436: yval = (MatScalar)0;
437: row = 8 * i + r;
438: nnz_in_row = a->rlen[row];
439: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
440: y[row] = yval;
441: }
442: break;
443: }
445: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446: #pragma novector
447: #pragma unroll(2)
448: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449: vec_vals = _mm256_loadu_pd(aval);
450: vec_x_tmp = _mm_setzero_pd();
451: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
454: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
457: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
458: aval += 4;
460: vec_vals = _mm256_loadu_pd(aval);
461: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
464: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
467: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
468: aval += 4;
469: }
471: _mm256_storeu_pd(y + i * 8, vec_y);
472: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
473: }
474: #else
475: for (i = 0; i < totalslices; i++) { /* loop over slices */
476: for (j = 0; j < 8; j++) sum[j] = 0.0;
477: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
478: sum[0] += aval[j] * x[acolidx[j]];
479: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
480: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
481: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
482: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
483: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
484: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
485: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
486: }
487: if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488: for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
489: } else {
490: for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
491: }
492: }
493: #endif
495: PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt); /* theoretical minimal FLOPs */
496: VecRestoreArrayRead(xx, &x);
497: VecRestoreArray(yy, &y);
498: return 0;
499: }
501: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
503: {
504: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
505: PetscScalar *y, *z;
506: const PetscScalar *x;
507: const MatScalar *aval = a->val;
508: PetscInt totalslices = a->totalslices;
509: const PetscInt *acolidx = a->colidx;
510: PetscInt i, j;
511: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
512: __m512d vec_x, vec_y, vec_vals;
513: __m256i vec_idx;
514: __mmask8 mask;
515: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
516: __m256i vec_idx2, vec_idx3, vec_idx4;
517: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518: __m128d vec_x_tmp;
519: __m256d vec_x, vec_y, vec_y2, vec_vals;
520: MatScalar yval;
521: PetscInt r, row, nnz_in_row;
522: #else
523: PetscScalar sum[8];
524: #endif
526: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
527: #pragma disjoint(*x, *y, *aval)
528: #endif
530: VecGetArrayRead(xx, &x);
531: VecGetArrayPair(yy, zz, &y, &z);
532: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
533: for (i = 0; i < totalslices; i++) { /* loop over slices */
534: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
535: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
537: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
538: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
539: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
540: } else {
541: vec_y = _mm512_loadu_pd(&y[8 * i]);
542: }
543: vec_y2 = _mm512_setzero_pd();
544: vec_y3 = _mm512_setzero_pd();
545: vec_y4 = _mm512_setzero_pd();
547: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
548: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
549: case 3:
550: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
551: acolidx += 8;
552: aval += 8;
553: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
554: acolidx += 8;
555: aval += 8;
556: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
557: acolidx += 8;
558: aval += 8;
559: j += 3;
560: break;
561: case 2:
562: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
563: acolidx += 8;
564: aval += 8;
565: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
566: acolidx += 8;
567: aval += 8;
568: j += 2;
569: break;
570: case 1:
571: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
572: acolidx += 8;
573: aval += 8;
574: j += 1;
575: break;
576: }
577: #pragma novector
578: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
579: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
580: acolidx += 8;
581: aval += 8;
582: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
583: acolidx += 8;
584: aval += 8;
585: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
586: acolidx += 8;
587: aval += 8;
588: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
589: acolidx += 8;
590: aval += 8;
591: }
593: vec_y = _mm512_add_pd(vec_y, vec_y2);
594: vec_y = _mm512_add_pd(vec_y, vec_y3);
595: vec_y = _mm512_add_pd(vec_y, vec_y4);
596: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
597: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
598: } else {
599: _mm512_storeu_pd(&z[8 * i], vec_y);
600: }
601: }
602: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
603: for (i = 0; i < totalslices; i++) { /* loop over full slices */
604: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
605: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
607: /* last slice may have padding rows. Don't use vectorization. */
608: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
609: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
610: row = 8 * i + r;
611: yval = (MatScalar)0.0;
612: nnz_in_row = a->rlen[row];
613: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
614: z[row] = y[row] + yval;
615: }
616: break;
617: }
619: vec_y = _mm256_loadu_pd(y + 8 * i);
620: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
622: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
623: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
624: vec_vals = _mm256_loadu_pd(aval);
625: vec_x_tmp = _mm_setzero_pd();
626: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
627: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
628: vec_x = _mm256_setzero_pd();
629: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
630: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
631: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
632: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
633: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
634: aval += 4;
636: vec_vals = _mm256_loadu_pd(aval);
637: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
638: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
639: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
640: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
641: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
642: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
643: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
644: aval += 4;
645: }
647: _mm256_storeu_pd(z + i * 8, vec_y);
648: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
649: }
650: #else
651: for (i = 0; i < totalslices; i++) { /* loop over slices */
652: for (j = 0; j < 8; j++) sum[j] = 0.0;
653: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
654: sum[0] += aval[j] * x[acolidx[j]];
655: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
656: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
657: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
658: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
659: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
660: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
661: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
662: }
663: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
664: for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
665: } else {
666: for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
667: }
668: }
669: #endif
671: PetscLogFlops(2.0 * a->nz);
672: VecRestoreArrayRead(xx, &x);
673: VecRestoreArrayPair(yy, zz, &y, &z);
674: return 0;
675: }
677: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
678: {
679: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
680: PetscScalar *y;
681: const PetscScalar *x;
682: const MatScalar *aval = a->val;
683: const PetscInt *acolidx = a->colidx;
684: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices;
686: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
687: #pragma disjoint(*x, *y, *aval)
688: #endif
690: if (A->symmetric == PETSC_BOOL3_TRUE) {
691: MatMultAdd_SeqSELL(A, xx, zz, yy);
692: return 0;
693: }
694: if (zz != yy) VecCopy(zz, yy);
695: VecGetArrayRead(xx, &x);
696: VecGetArray(yy, &y);
697: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
698: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
699: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
700: row = 8 * i + r;
701: nnz_in_row = a->rlen[row];
702: for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
703: }
704: break;
705: }
706: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
707: y[acolidx[j]] += aval[j] * x[8 * i];
708: y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
709: y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
710: y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
711: y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
712: y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
713: y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
714: y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
715: }
716: }
717: PetscLogFlops(2.0 * a->sliidx[a->totalslices]);
718: VecRestoreArrayRead(xx, &x);
719: VecRestoreArray(yy, &y);
720: return 0;
721: }
723: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
724: {
725: if (A->symmetric == PETSC_BOOL3_TRUE) {
726: MatMult_SeqSELL(A, xx, yy);
727: } else {
728: VecSet(yy, 0.0);
729: MatMultTransposeAdd_SeqSELL(A, xx, yy, yy);
730: }
731: return 0;
732: }
734: /*
735: Checks for missing diagonals
736: */
737: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
738: {
739: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
740: PetscInt *diag, i;
742: *missing = PETSC_FALSE;
743: if (A->rmap->n > 0 && !(a->colidx)) {
744: *missing = PETSC_TRUE;
745: if (d) *d = 0;
746: PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n");
747: } else {
748: diag = a->diag;
749: for (i = 0; i < A->rmap->n; i++) {
750: if (diag[i] == -1) {
751: *missing = PETSC_TRUE;
752: if (d) *d = i;
753: PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i);
754: break;
755: }
756: }
757: }
758: return 0;
759: }
761: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
762: {
763: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
764: PetscInt i, j, m = A->rmap->n, shift;
766: if (!a->diag) {
767: PetscMalloc1(m, &a->diag);
768: a->free_diag = PETSC_TRUE;
769: }
770: for (i = 0; i < m; i++) { /* loop over rows */
771: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
772: a->diag[i] = -1;
773: for (j = 0; j < a->rlen[i]; j++) {
774: if (a->colidx[shift + j * 8] == i) {
775: a->diag[i] = shift + j * 8;
776: break;
777: }
778: }
779: }
780: return 0;
781: }
783: /*
784: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
785: */
786: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
787: {
788: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
789: PetscInt i, *diag, m = A->rmap->n;
790: MatScalar *val = a->val;
791: PetscScalar *idiag, *mdiag;
793: if (a->idiagvalid) return 0;
794: MatMarkDiagonal_SeqSELL(A);
795: diag = a->diag;
796: if (!a->idiag) {
797: PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work);
798: val = a->val;
799: }
800: mdiag = a->mdiag;
801: idiag = a->idiag;
803: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
804: for (i = 0; i < m; i++) {
805: mdiag[i] = val[diag[i]];
806: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
807: if (PetscRealPart(fshift)) {
808: PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i);
809: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
810: A->factorerror_zeropivot_value = 0.0;
811: A->factorerror_zeropivot_row = i;
812: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
813: }
814: idiag[i] = 1.0 / val[diag[i]];
815: }
816: PetscLogFlops(m);
817: } else {
818: for (i = 0; i < m; i++) {
819: mdiag[i] = val[diag[i]];
820: idiag[i] = omega / (fshift + val[diag[i]]);
821: }
822: PetscLogFlops(2.0 * m);
823: }
824: a->idiagvalid = PETSC_TRUE;
825: return 0;
826: }
828: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
829: {
830: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
832: PetscArrayzero(a->val, a->sliidx[a->totalslices]);
833: MatSeqSELLInvalidateDiagonal(A);
834: return 0;
835: }
837: PetscErrorCode MatDestroy_SeqSELL(Mat A)
838: {
839: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
841: #if defined(PETSC_USE_LOG)
842: PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
843: #endif
844: MatSeqXSELLFreeSELL(A, &a->val, &a->colidx);
845: ISDestroy(&a->row);
846: ISDestroy(&a->col);
847: PetscFree(a->diag);
848: PetscFree(a->rlen);
849: PetscFree(a->sliidx);
850: PetscFree3(a->idiag, a->mdiag, a->ssor_work);
851: PetscFree(a->solve_work);
852: ISDestroy(&a->icol);
853: PetscFree(a->saved_values);
854: PetscFree2(a->getrowcols, a->getrowvals);
856: PetscFree(A->data);
858: PetscObjectChangeTypeName((PetscObject)A, NULL);
859: PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL);
860: PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL);
861: PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL);
862: PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL);
863: PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL);
864: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL);
865: return 0;
866: }
868: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
869: {
870: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
872: switch (op) {
873: case MAT_ROW_ORIENTED:
874: a->roworiented = flg;
875: break;
876: case MAT_KEEP_NONZERO_PATTERN:
877: a->keepnonzeropattern = flg;
878: break;
879: case MAT_NEW_NONZERO_LOCATIONS:
880: a->nonew = (flg ? 0 : 1);
881: break;
882: case MAT_NEW_NONZERO_LOCATION_ERR:
883: a->nonew = (flg ? -1 : 0);
884: break;
885: case MAT_NEW_NONZERO_ALLOCATION_ERR:
886: a->nonew = (flg ? -2 : 0);
887: break;
888: case MAT_UNUSED_NONZERO_LOCATION_ERR:
889: a->nounused = (flg ? -1 : 0);
890: break;
891: case MAT_FORCE_DIAGONAL_ENTRIES:
892: case MAT_IGNORE_OFF_PROC_ENTRIES:
893: case MAT_USE_HASH_TABLE:
894: case MAT_SORTED_FULL:
895: PetscInfo(A, "Option %s ignored\n", MatOptions[op]);
896: break;
897: case MAT_SPD:
898: case MAT_SYMMETRIC:
899: case MAT_STRUCTURALLY_SYMMETRIC:
900: case MAT_HERMITIAN:
901: case MAT_SYMMETRY_ETERNAL:
902: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
903: case MAT_SPD_ETERNAL:
904: /* These options are handled directly by MatSetOption() */
905: break;
906: default:
907: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
908: }
909: return 0;
910: }
912: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
913: {
914: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
915: PetscInt i, j, n, shift;
916: PetscScalar *x, zero = 0.0;
918: VecGetLocalSize(v, &n);
921: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
922: PetscInt *diag = a->diag;
923: VecGetArray(v, &x);
924: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
925: VecRestoreArray(v, &x);
926: return 0;
927: }
929: VecSet(v, zero);
930: VecGetArray(v, &x);
931: for (i = 0; i < n; i++) { /* loop over rows */
932: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
933: x[i] = 0;
934: for (j = 0; j < a->rlen[i]; j++) {
935: if (a->colidx[shift + j * 8] == i) {
936: x[i] = a->val[shift + j * 8];
937: break;
938: }
939: }
940: }
941: VecRestoreArray(v, &x);
942: return 0;
943: }
945: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
946: {
947: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
948: const PetscScalar *l, *r;
949: PetscInt i, j, m, n, row;
951: if (ll) {
952: /* The local size is used so that VecMPI can be passed to this routine
953: by MatDiagonalScale_MPISELL */
954: VecGetLocalSize(ll, &m);
956: VecGetArrayRead(ll, &l);
957: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
958: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
959: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
960: if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
961: }
962: } else {
963: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
964: }
965: }
966: VecRestoreArrayRead(ll, &l);
967: PetscLogFlops(a->nz);
968: }
969: if (rr) {
970: VecGetLocalSize(rr, &n);
972: VecGetArrayRead(rr, &r);
973: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
974: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
975: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
976: if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
977: }
978: } else {
979: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
980: }
981: }
982: VecRestoreArrayRead(rr, &r);
983: PetscLogFlops(a->nz);
984: }
985: MatSeqSELLInvalidateDiagonal(A);
986: return 0;
987: }
989: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
990: {
991: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
992: PetscInt *cp, i, k, low, high, t, row, col, l;
993: PetscInt shift;
994: MatScalar *vp;
996: for (k = 0; k < m; k++) { /* loop over requested rows */
997: row = im[k];
998: if (row < 0) continue;
1000: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1001: cp = a->colidx + shift; /* pointer to the row */
1002: vp = a->val + shift; /* pointer to the row */
1003: for (l = 0; l < n; l++) { /* loop over requested columns */
1004: col = in[l];
1005: if (col < 0) continue;
1007: high = a->rlen[row];
1008: low = 0; /* assume unsorted */
1009: while (high - low > 5) {
1010: t = (low + high) / 2;
1011: if (*(cp + t * 8) > col) high = t;
1012: else low = t;
1013: }
1014: for (i = low; i < high; i++) {
1015: if (*(cp + 8 * i) > col) break;
1016: if (*(cp + 8 * i) == col) {
1017: *v++ = *(vp + 8 * i);
1018: goto finished;
1019: }
1020: }
1021: *v++ = 0.0;
1022: finished:;
1023: }
1024: }
1025: return 0;
1026: }
1028: PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1029: {
1030: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1031: PetscInt i, j, m = A->rmap->n, shift;
1032: const char *name;
1033: PetscViewerFormat format;
1035: PetscViewerGetFormat(viewer, &format);
1036: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1037: PetscInt nofinalvalue = 0;
1038: /*
1039: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1040: nofinalvalue = 1;
1041: }
1042: */
1043: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1044: PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n);
1045: PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz);
1046: #if defined(PETSC_USE_COMPLEX)
1047: PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue);
1048: #else
1049: PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue);
1050: #endif
1051: PetscViewerASCIIPrintf(viewer, "zzz = [\n");
1053: for (i = 0; i < m; i++) {
1054: shift = a->sliidx[i >> 3] + (i & 0x07);
1055: for (j = 0; j < a->rlen[i]; j++) {
1056: #if defined(PETSC_USE_COMPLEX)
1057: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1058: #else
1059: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]);
1060: #endif
1061: }
1062: }
1063: /*
1064: if (nofinalvalue) {
1065: #if defined(PETSC_USE_COMPLEX)
1066: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.);
1067: #else
1068: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0);
1069: #endif
1070: }
1071: */
1072: PetscObjectGetName((PetscObject)A, &name);
1073: PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name);
1074: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1075: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1076: return 0;
1077: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1078: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1079: for (i = 0; i < m; i++) {
1080: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1081: shift = a->sliidx[i >> 3] + (i & 0x07);
1082: for (j = 0; j < a->rlen[i]; j++) {
1083: #if defined(PETSC_USE_COMPLEX)
1084: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1085: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1086: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1087: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]));
1088: } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1089: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]));
1090: }
1091: #else
1092: if (a->val[shift + 8 * j] != 0.0) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]);
1093: #endif
1094: }
1095: PetscViewerASCIIPrintf(viewer, "\n");
1096: }
1097: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1098: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1099: PetscInt cnt = 0, jcnt;
1100: PetscScalar value;
1101: #if defined(PETSC_USE_COMPLEX)
1102: PetscBool realonly = PETSC_TRUE;
1103: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1104: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1105: realonly = PETSC_FALSE;
1106: break;
1107: }
1108: }
1109: #endif
1111: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1112: for (i = 0; i < m; i++) {
1113: jcnt = 0;
1114: shift = a->sliidx[i >> 3] + (i & 0x07);
1115: for (j = 0; j < A->cmap->n; j++) {
1116: if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1117: value = a->val[cnt++];
1118: jcnt++;
1119: } else {
1120: value = 0.0;
1121: }
1122: #if defined(PETSC_USE_COMPLEX)
1123: if (realonly) {
1124: PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value));
1125: } else {
1126: PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value));
1127: }
1128: #else
1129: PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value);
1130: #endif
1131: }
1132: PetscViewerASCIIPrintf(viewer, "\n");
1133: }
1134: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1135: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1136: PetscInt fshift = 1;
1137: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1138: #if defined(PETSC_USE_COMPLEX)
1139: PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n");
1140: #else
1141: PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n");
1142: #endif
1143: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz);
1144: for (i = 0; i < m; i++) {
1145: shift = a->sliidx[i >> 3] + (i & 0x07);
1146: for (j = 0; j < a->rlen[i]; j++) {
1147: #if defined(PETSC_USE_COMPLEX)
1148: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1149: #else
1150: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]);
1151: #endif
1152: }
1153: }
1154: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1155: } else if (format == PETSC_VIEWER_NATIVE) {
1156: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1157: PetscInt row;
1158: PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]);
1159: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1160: #if defined(PETSC_USE_COMPLEX)
1161: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1162: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1163: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1164: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]));
1165: } else {
1166: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]));
1167: }
1168: #else
1169: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]);
1170: #endif
1171: }
1172: }
1173: } else {
1174: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1175: if (A->factortype) {
1176: for (i = 0; i < m; i++) {
1177: shift = a->sliidx[i >> 3] + (i & 0x07);
1178: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1179: /* L part */
1180: for (j = shift; j < a->diag[i]; j += 8) {
1181: #if defined(PETSC_USE_COMPLEX)
1182: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1183: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1184: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1185: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])));
1186: } else {
1187: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]));
1188: }
1189: #else
1190: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]);
1191: #endif
1192: }
1193: /* diagonal */
1194: j = a->diag[i];
1195: #if defined(PETSC_USE_COMPLEX)
1196: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1197: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]));
1198: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1199: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])));
1200: } else {
1201: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]));
1202: }
1203: #else
1204: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]));
1205: #endif
1207: /* U part */
1208: for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1209: #if defined(PETSC_USE_COMPLEX)
1210: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1211: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1212: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1213: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])));
1214: } else {
1215: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]));
1216: }
1217: #else
1218: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]);
1219: #endif
1220: }
1221: PetscViewerASCIIPrintf(viewer, "\n");
1222: }
1223: } else {
1224: for (i = 0; i < m; i++) {
1225: shift = a->sliidx[i >> 3] + (i & 0x07);
1226: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1227: for (j = 0; j < a->rlen[i]; j++) {
1228: #if defined(PETSC_USE_COMPLEX)
1229: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1230: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1231: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1232: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]));
1233: } else {
1234: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]));
1235: }
1236: #else
1237: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]);
1238: #endif
1239: }
1240: PetscViewerASCIIPrintf(viewer, "\n");
1241: }
1242: }
1243: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1244: }
1245: PetscViewerFlush(viewer);
1246: return 0;
1247: }
1249: #include <petscdraw.h>
1250: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1251: {
1252: Mat A = (Mat)Aa;
1253: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1254: PetscInt i, j, m = A->rmap->n, shift;
1255: int color;
1256: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1257: PetscViewer viewer;
1258: PetscViewerFormat format;
1260: PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer);
1261: PetscViewerGetFormat(viewer, &format);
1262: PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr);
1264: /* loop over matrix elements drawing boxes */
1266: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1267: PetscDrawCollectiveBegin(draw);
1268: /* Blue for negative, Cyan for zero and Red for positive */
1269: color = PETSC_DRAW_BLUE;
1270: for (i = 0; i < m; i++) {
1271: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1272: y_l = m - i - 1.0;
1273: y_r = y_l + 1.0;
1274: for (j = 0; j < a->rlen[i]; j++) {
1275: x_l = a->colidx[shift + j * 8];
1276: x_r = x_l + 1.0;
1277: if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1278: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1279: }
1280: }
1281: color = PETSC_DRAW_CYAN;
1282: for (i = 0; i < m; i++) {
1283: shift = a->sliidx[i >> 3] + (i & 0x07);
1284: y_l = m - i - 1.0;
1285: y_r = y_l + 1.0;
1286: for (j = 0; j < a->rlen[i]; j++) {
1287: x_l = a->colidx[shift + j * 8];
1288: x_r = x_l + 1.0;
1289: if (a->val[shift + 8 * j] != 0.) continue;
1290: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1291: }
1292: }
1293: color = PETSC_DRAW_RED;
1294: for (i = 0; i < m; i++) {
1295: shift = a->sliidx[i >> 3] + (i & 0x07);
1296: y_l = m - i - 1.0;
1297: y_r = y_l + 1.0;
1298: for (j = 0; j < a->rlen[i]; j++) {
1299: x_l = a->colidx[shift + j * 8];
1300: x_r = x_l + 1.0;
1301: if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1302: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1303: }
1304: }
1305: PetscDrawCollectiveEnd(draw);
1306: } else {
1307: /* use contour shading to indicate magnitude of values */
1308: /* first determine max of all nonzero values */
1309: PetscReal minv = 0.0, maxv = 0.0;
1310: PetscInt count = 0;
1311: PetscDraw popup;
1312: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1313: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1314: }
1315: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1316: PetscDrawGetPopup(draw, &popup);
1317: PetscDrawScalePopup(popup, minv, maxv);
1319: PetscDrawCollectiveBegin(draw);
1320: for (i = 0; i < m; i++) {
1321: shift = a->sliidx[i >> 3] + (i & 0x07);
1322: y_l = m - i - 1.0;
1323: y_r = y_l + 1.0;
1324: for (j = 0; j < a->rlen[i]; j++) {
1325: x_l = a->colidx[shift + j * 8];
1326: x_r = x_l + 1.0;
1327: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1328: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1329: count++;
1330: }
1331: }
1332: PetscDrawCollectiveEnd(draw);
1333: }
1334: return 0;
1335: }
1337: #include <petscdraw.h>
1338: PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1339: {
1340: PetscDraw draw;
1341: PetscReal xr, yr, xl, yl, h, w;
1342: PetscBool isnull;
1344: PetscViewerDrawGetDraw(viewer, 0, &draw);
1345: PetscDrawIsNull(draw, &isnull);
1346: if (isnull) return 0;
1348: xr = A->cmap->n;
1349: yr = A->rmap->n;
1350: h = yr / 10.0;
1351: w = xr / 10.0;
1352: xr += w;
1353: yr += h;
1354: xl = -w;
1355: yl = -h;
1356: PetscDrawSetCoordinates(draw, xl, yl, xr, yr);
1357: PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer);
1358: PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A);
1359: PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL);
1360: PetscDrawSave(draw);
1361: return 0;
1362: }
1364: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1365: {
1366: PetscBool iascii, isbinary, isdraw;
1368: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
1369: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
1370: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw);
1371: if (iascii) {
1372: MatView_SeqSELL_ASCII(A, viewer);
1373: } else if (isbinary) {
1374: /* MatView_SeqSELL_Binary(A,viewer); */
1375: } else if (isdraw) MatView_SeqSELL_Draw(A, viewer);
1376: return 0;
1377: }
1379: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1380: {
1381: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1382: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1383: MatScalar *vp;
1385: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
1386: /* To do: compress out the unused elements */
1387: MatMarkDiagonal_SeqSELL(A);
1388: PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz);
1389: PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs);
1390: PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax);
1391: /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1392: for (i = 0; i < a->totalslices; ++i) {
1393: shift = a->sliidx[i]; /* starting index of the slice */
1394: cp = a->colidx + shift; /* pointer to the column indices of the slice */
1395: vp = a->val + shift; /* pointer to the nonzero values of the slice */
1396: for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1397: row = 8 * i + row_in_slice;
1398: nrow = a->rlen[row]; /* number of nonzeros in row */
1399: /*
1400: Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1401: But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1402: */
1403: lastcol = 0;
1404: if (nrow > 0) { /* nonempty row */
1405: lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1406: } else if (!row_in_slice) { /* first row of the currect slice is empty */
1407: for (j = 1; j < 8; j++) {
1408: if (a->rlen[8 * i + j]) {
1409: lastcol = cp[j];
1410: break;
1411: }
1412: }
1413: } else {
1414: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1415: }
1417: for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1418: cp[8 * k + row_in_slice] = lastcol;
1419: vp[8 * k + row_in_slice] = (MatScalar)0;
1420: }
1421: }
1422: }
1424: A->info.mallocs += a->reallocs;
1425: a->reallocs = 0;
1427: MatSeqSELLInvalidateDiagonal(A);
1428: return 0;
1429: }
1431: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1432: {
1433: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1435: info->block_size = 1.0;
1436: info->nz_allocated = a->maxallocmat;
1437: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1438: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1439: info->assemblies = A->num_ass;
1440: info->mallocs = A->info.mallocs;
1441: info->memory = 0; /* REVIEW ME */
1442: if (A->factortype) {
1443: info->fill_ratio_given = A->info.fill_ratio_given;
1444: info->fill_ratio_needed = A->info.fill_ratio_needed;
1445: info->factor_mallocs = A->info.factor_mallocs;
1446: } else {
1447: info->fill_ratio_given = 0;
1448: info->fill_ratio_needed = 0;
1449: info->factor_mallocs = 0;
1450: }
1451: return 0;
1452: }
1454: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1455: {
1456: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1457: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1458: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1459: MatScalar *vp, value;
1461: for (k = 0; k < m; k++) { /* loop over added rows */
1462: row = im[k];
1463: if (row < 0) continue;
1465: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1466: cp = a->colidx + shift; /* pointer to the row */
1467: vp = a->val + shift; /* pointer to the row */
1468: nrow = a->rlen[row];
1469: low = 0;
1470: high = nrow;
1472: for (l = 0; l < n; l++) { /* loop over added columns */
1473: col = in[l];
1474: if (col < 0) continue;
1476: if (a->roworiented) {
1477: value = v[l + k * n];
1478: } else {
1479: value = v[k + l * m];
1480: }
1481: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1483: /* search in this row for the specified column, i indicates the column to be set */
1484: if (col <= lastcol) low = 0;
1485: else high = nrow;
1486: lastcol = col;
1487: while (high - low > 5) {
1488: t = (low + high) / 2;
1489: if (*(cp + t * 8) > col) high = t;
1490: else low = t;
1491: }
1492: for (i = low; i < high; i++) {
1493: if (*(cp + i * 8) > col) break;
1494: if (*(cp + i * 8) == col) {
1495: if (is == ADD_VALUES) *(vp + i * 8) += value;
1496: else *(vp + i * 8) = value;
1497: low = i + 1;
1498: goto noinsert;
1499: }
1500: }
1501: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1502: if (nonew == 1) goto noinsert;
1504: /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1505: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1506: /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1507: for (ii = nrow - 1; ii >= i; ii--) {
1508: *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1509: *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1510: }
1511: a->rlen[row]++;
1512: *(cp + i * 8) = col;
1513: *(vp + i * 8) = value;
1514: a->nz++;
1515: A->nonzerostate++;
1516: low = i + 1;
1517: high++;
1518: nrow++;
1519: noinsert:;
1520: }
1521: a->rlen[row] = nrow;
1522: }
1523: return 0;
1524: }
1526: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1527: {
1528: /* If the two matrices have the same copy implementation, use fast copy. */
1529: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1530: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1531: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1534: PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]);
1535: } else {
1536: MatCopy_Basic(A, B, str);
1537: }
1538: return 0;
1539: }
1541: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1542: {
1543: MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL);
1544: return 0;
1545: }
1547: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1548: {
1549: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1551: *array = a->val;
1552: return 0;
1553: }
1555: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1556: {
1557: return 0;
1558: }
1560: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1561: {
1562: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1563: PetscInt i;
1564: MatScalar *aval = a->val;
1566: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1567: return 0;
1568: }
1570: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1571: {
1572: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1573: PetscInt i;
1574: MatScalar *aval = a->val;
1576: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1577: MatSeqSELLInvalidateDiagonal(A);
1578: return 0;
1579: }
1581: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1582: {
1583: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1584: MatScalar *aval = a->val;
1585: PetscScalar oalpha = alpha;
1586: PetscBLASInt one = 1, size;
1588: PetscBLASIntCast(a->sliidx[a->totalslices], &size);
1589: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1590: PetscLogFlops(a->nz);
1591: MatSeqSELLInvalidateDiagonal(inA);
1592: return 0;
1593: }
1595: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1596: {
1597: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1599: if (!Y->preallocated || !y->nz) MatSeqSELLSetPreallocation(Y, 1, NULL);
1600: MatShift_Basic(Y, a);
1601: return 0;
1602: }
1604: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1605: {
1606: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1607: PetscScalar *x, sum, *t;
1608: const MatScalar *idiag = NULL, *mdiag;
1609: const PetscScalar *b, *xb;
1610: PetscInt n, m = A->rmap->n, i, j, shift;
1611: const PetscInt *diag;
1613: its = its * lits;
1615: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1616: if (!a->idiagvalid) MatInvertDiagonal_SeqSELL(A, omega, fshift);
1617: a->fshift = fshift;
1618: a->omega = omega;
1620: diag = a->diag;
1621: t = a->ssor_work;
1622: idiag = a->idiag;
1623: mdiag = a->mdiag;
1625: VecGetArray(xx, &x);
1626: VecGetArrayRead(bb, &b);
1627: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1632: if (flag & SOR_ZERO_INITIAL_GUESS) {
1633: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1634: for (i = 0; i < m; i++) {
1635: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1636: sum = b[i];
1637: n = (diag[i] - shift) / 8;
1638: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1639: t[i] = sum;
1640: x[i] = sum * idiag[i];
1641: }
1642: xb = t;
1643: PetscLogFlops(a->nz);
1644: } else xb = b;
1645: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1646: for (i = m - 1; i >= 0; i--) {
1647: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1648: sum = xb[i];
1649: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1650: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1651: if (xb == b) {
1652: x[i] = sum * idiag[i];
1653: } else {
1654: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1655: }
1656: }
1657: PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1658: }
1659: its--;
1660: }
1661: while (its--) {
1662: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1663: for (i = 0; i < m; i++) {
1664: /* lower */
1665: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1666: sum = b[i];
1667: n = (diag[i] - shift) / 8;
1668: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1669: t[i] = sum; /* save application of the lower-triangular part */
1670: /* upper */
1671: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1672: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1673: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1674: }
1675: xb = t;
1676: PetscLogFlops(2.0 * a->nz);
1677: } else xb = b;
1678: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1679: for (i = m - 1; i >= 0; i--) {
1680: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1681: sum = xb[i];
1682: if (xb == b) {
1683: /* whole matrix (no checkpointing available) */
1684: n = a->rlen[i];
1685: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1686: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1687: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1688: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1689: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1690: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1691: }
1692: }
1693: if (xb == b) {
1694: PetscLogFlops(2.0 * a->nz);
1695: } else {
1696: PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1697: }
1698: }
1699: }
1700: VecRestoreArray(xx, &x);
1701: VecRestoreArrayRead(bb, &b);
1702: return 0;
1703: }
1705: /* -------------------------------------------------------------------*/
1706: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1707: MatGetRow_SeqSELL,
1708: MatRestoreRow_SeqSELL,
1709: MatMult_SeqSELL,
1710: /* 4*/ MatMultAdd_SeqSELL,
1711: MatMultTranspose_SeqSELL,
1712: MatMultTransposeAdd_SeqSELL,
1713: NULL,
1714: NULL,
1715: NULL,
1716: /* 10*/ NULL,
1717: NULL,
1718: NULL,
1719: MatSOR_SeqSELL,
1720: NULL,
1721: /* 15*/ MatGetInfo_SeqSELL,
1722: MatEqual_SeqSELL,
1723: MatGetDiagonal_SeqSELL,
1724: MatDiagonalScale_SeqSELL,
1725: NULL,
1726: /* 20*/ NULL,
1727: MatAssemblyEnd_SeqSELL,
1728: MatSetOption_SeqSELL,
1729: MatZeroEntries_SeqSELL,
1730: /* 24*/ NULL,
1731: NULL,
1732: NULL,
1733: NULL,
1734: NULL,
1735: /* 29*/ MatSetUp_SeqSELL,
1736: NULL,
1737: NULL,
1738: NULL,
1739: NULL,
1740: /* 34*/ MatDuplicate_SeqSELL,
1741: NULL,
1742: NULL,
1743: NULL,
1744: NULL,
1745: /* 39*/ NULL,
1746: NULL,
1747: NULL,
1748: MatGetValues_SeqSELL,
1749: MatCopy_SeqSELL,
1750: /* 44*/ NULL,
1751: MatScale_SeqSELL,
1752: MatShift_SeqSELL,
1753: NULL,
1754: NULL,
1755: /* 49*/ NULL,
1756: NULL,
1757: NULL,
1758: NULL,
1759: NULL,
1760: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1761: NULL,
1762: NULL,
1763: NULL,
1764: NULL,
1765: /* 59*/ NULL,
1766: MatDestroy_SeqSELL,
1767: MatView_SeqSELL,
1768: NULL,
1769: NULL,
1770: /* 64*/ NULL,
1771: NULL,
1772: NULL,
1773: NULL,
1774: NULL,
1775: /* 69*/ NULL,
1776: NULL,
1777: NULL,
1778: NULL,
1779: NULL,
1780: /* 74*/ NULL,
1781: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1782: NULL,
1783: NULL,
1784: NULL,
1785: /* 79*/ NULL,
1786: NULL,
1787: NULL,
1788: NULL,
1789: NULL,
1790: /* 84*/ NULL,
1791: NULL,
1792: NULL,
1793: NULL,
1794: NULL,
1795: /* 89*/ NULL,
1796: NULL,
1797: NULL,
1798: NULL,
1799: NULL,
1800: /* 94*/ NULL,
1801: NULL,
1802: NULL,
1803: NULL,
1804: NULL,
1805: /* 99*/ NULL,
1806: NULL,
1807: NULL,
1808: MatConjugate_SeqSELL,
1809: NULL,
1810: /*104*/ NULL,
1811: NULL,
1812: NULL,
1813: NULL,
1814: NULL,
1815: /*109*/ NULL,
1816: NULL,
1817: NULL,
1818: NULL,
1819: MatMissingDiagonal_SeqSELL,
1820: /*114*/ NULL,
1821: NULL,
1822: NULL,
1823: NULL,
1824: NULL,
1825: /*119*/ NULL,
1826: NULL,
1827: NULL,
1828: NULL,
1829: NULL,
1830: /*124*/ NULL,
1831: NULL,
1832: NULL,
1833: NULL,
1834: NULL,
1835: /*129*/ NULL,
1836: NULL,
1837: NULL,
1838: NULL,
1839: NULL,
1840: /*134*/ NULL,
1841: NULL,
1842: NULL,
1843: NULL,
1844: NULL,
1845: /*139*/ NULL,
1846: NULL,
1847: NULL,
1848: MatFDColoringSetUp_SeqXAIJ,
1849: NULL,
1850: /*144*/ NULL,
1851: NULL,
1852: NULL,
1853: NULL,
1854: NULL,
1855: NULL,
1856: /*150*/ NULL};
1858: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1859: {
1860: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1864: /* allocate space for values if not already there */
1865: if (!a->saved_values) { PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values); }
1867: /* copy values over */
1868: PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]);
1869: return 0;
1870: }
1872: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1873: {
1874: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1878: PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]);
1879: return 0;
1880: }
1882: /*@C
1883: MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1885: Not Collective
1887: Input Parameters:
1888: . mat - a `MATSEQSELL` matrix
1889: . array - pointer to the data
1891: Level: intermediate
1893: .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1894: @*/
1895: PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1896: {
1897: PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1898: return 0;
1899: }
1901: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1902: {
1903: Mat_SeqSELL *b;
1904: PetscMPIInt size;
1906: PetscCitationsRegister(citation, &cited);
1907: MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
1910: PetscNew(&b);
1912: B->data = (void *)b;
1914: PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps));
1916: b->row = NULL;
1917: b->col = NULL;
1918: b->icol = NULL;
1919: b->reallocs = 0;
1920: b->ignorezeroentries = PETSC_FALSE;
1921: b->roworiented = PETSC_TRUE;
1922: b->nonew = 0;
1923: b->diag = NULL;
1924: b->solve_work = NULL;
1925: B->spptr = NULL;
1926: b->saved_values = NULL;
1927: b->idiag = NULL;
1928: b->mdiag = NULL;
1929: b->ssor_work = NULL;
1930: b->omega = 1.0;
1931: b->fshift = 0.0;
1932: b->idiagvalid = PETSC_FALSE;
1933: b->keepnonzeropattern = PETSC_FALSE;
1935: PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL);
1936: PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL);
1937: PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL);
1938: PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL);
1939: PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL);
1940: PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL);
1941: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ);
1942: return 0;
1943: }
1945: /*
1946: Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1947: */
1948: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1949: {
1950: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1951: PetscInt i, m = A->rmap->n;
1952: PetscInt totalslices = a->totalslices;
1954: C->factortype = A->factortype;
1955: c->row = NULL;
1956: c->col = NULL;
1957: c->icol = NULL;
1958: c->reallocs = 0;
1959: C->assembled = PETSC_TRUE;
1961: PetscLayoutReference(A->rmap, &C->rmap);
1962: PetscLayoutReference(A->cmap, &C->cmap);
1964: PetscMalloc1(8 * totalslices, &c->rlen);
1965: PetscMalloc1(totalslices + 1, &c->sliidx);
1967: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
1968: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
1970: /* allocate the matrix space */
1971: if (mallocmatspace) {
1972: PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx);
1974: c->singlemalloc = PETSC_TRUE;
1976: if (m > 0) {
1977: PetscArraycpy(c->colidx, a->colidx, a->maxallocmat);
1978: if (cpvalues == MAT_COPY_VALUES) {
1979: PetscArraycpy(c->val, a->val, a->maxallocmat);
1980: } else {
1981: PetscArrayzero(c->val, a->maxallocmat);
1982: }
1983: }
1984: }
1986: c->ignorezeroentries = a->ignorezeroentries;
1987: c->roworiented = a->roworiented;
1988: c->nonew = a->nonew;
1989: if (a->diag) {
1990: PetscMalloc1(m, &c->diag);
1991: for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
1992: } else c->diag = NULL;
1994: c->solve_work = NULL;
1995: c->saved_values = NULL;
1996: c->idiag = NULL;
1997: c->ssor_work = NULL;
1998: c->keepnonzeropattern = a->keepnonzeropattern;
1999: c->free_val = PETSC_TRUE;
2000: c->free_colidx = PETSC_TRUE;
2002: c->maxallocmat = a->maxallocmat;
2003: c->maxallocrow = a->maxallocrow;
2004: c->rlenmax = a->rlenmax;
2005: c->nz = a->nz;
2006: C->preallocated = PETSC_TRUE;
2008: c->nonzerorowcnt = a->nonzerorowcnt;
2009: C->nonzerostate = A->nonzerostate;
2011: PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist);
2012: return 0;
2013: }
2015: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2016: {
2017: MatCreate(PetscObjectComm((PetscObject)A), B);
2018: MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
2019: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) MatSetBlockSizesFromMats(*B, A, A);
2020: MatSetType(*B, ((PetscObject)A)->type_name);
2021: MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE);
2022: return 0;
2023: }
2025: /*MC
2026: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2027: based on the sliced Ellpack format
2029: Options Database Keys:
2030: . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2032: Level: beginner
2034: .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2035: M*/
2037: /*MC
2038: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2040: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2041: and `MATMPISELL` otherwise. As a result, for single process communicators,
2042: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2043: for communicators controlling multiple processes. It is recommended that you call both of
2044: the above preallocation routines for simplicity.
2046: Options Database Keys:
2047: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2049: Level: beginner
2051: Notes:
2052: This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2054: It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2055: non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2057: Developer Notes:
2058: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2060: The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2061: .vb
2062: (2 0 3 4)
2063: Consider the matrix A = (5 0 6 0)
2064: (0 0 7 8)
2065: (0 0 9 9)
2067: symbolically the Ellpack format can be written as
2069: (2 3 4 |) (0 2 3 |)
2070: v = (5 6 0 |) colidx = (0 2 2 |)
2071: -------- ---------
2072: (7 8 |) (2 3 |)
2073: (9 9 |) (2 3 |)
2075: The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2076: Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2077: zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2079: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2081: .ve
2083: See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2085: References:
2086: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2087: Proceedings of the 47th International Conference on Parallel Processing, 2018.
2089: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2090: M*/
2092: /*@C
2093: MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2095: Collective on comm
2097: Input Parameters:
2098: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2099: . m - number of rows
2100: . n - number of columns
2101: . rlenmax - maximum number of nonzeros in a row
2102: - rlen - array containing the number of nonzeros in the various rows
2103: (possibly different for each row) or NULL
2105: Output Parameter:
2106: . A - the matrix
2108: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2109: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2110: [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2112: Notes:
2113: If nnz is given then nz is ignored
2115: Specify the preallocated storage with either rlenmax or rlen (not both).
2116: Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2117: allocation. For large problems you MUST preallocate memory or you
2118: will get TERRIBLE performance, see the users' manual chapter on matrices.
2120: Level: intermediate
2122: .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2123: @*/
2124: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2125: {
2126: MatCreate(comm, A);
2127: MatSetSizes(*A, m, n, m, n);
2128: MatSetType(*A, MATSEQSELL);
2129: MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen);
2130: return 0;
2131: }
2133: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2134: {
2135: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2136: PetscInt totalslices = a->totalslices;
2138: /* If the matrix dimensions are not equal,or no of nonzeros */
2139: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2140: *flg = PETSC_FALSE;
2141: return 0;
2142: }
2143: /* if the a->colidx are the same */
2144: PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg);
2145: if (!*flg) return 0;
2146: /* if a->val are the same */
2147: PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg);
2148: return 0;
2149: }
2151: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2152: {
2153: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2155: a->idiagvalid = PETSC_FALSE;
2156: return 0;
2157: }
2159: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2160: {
2161: #if defined(PETSC_USE_COMPLEX)
2162: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2163: PetscInt i;
2164: PetscScalar *val = a->val;
2166: for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2167: #else
2168: #endif
2169: return 0;
2170: }