Actual source code: veccuda.c
1: /*
2: Implementation of the sequential cuda vectors.
4: This file contains the code that can be compiled with a C
5: compiler. The companion file veccuda2.cu contains the code that
6: must be compiled with nvcc or a C++ compiler.
7: */
9: #define PETSC_SKIP_SPINLOCK
11: #include <petscconf.h>
12: #include <petsc/private/vecimpl.h>
13: #include <../src/vec/vec/impls/dvecimpl.h>
14: #include <petsc/private/cudavecimpl.h>
16: PetscErrorCode VecCUDAGetArrays_Private(Vec v, const PetscScalar **x, const PetscScalar **x_d, PetscOffloadMask *flg)
17: {
19: if (x) {
20: Vec_Seq *h = (Vec_Seq *)v->data;
22: *x = h->array;
23: }
24: if (x_d) {
25: Vec_CUDA *d = (Vec_CUDA *)v->spptr;
27: *x_d = d ? d->GPUarray : NULL;
28: }
29: if (flg) *flg = v->offloadmask;
30: return 0;
31: }
33: /*
34: Allocates space for the vector array on the Host if it does not exist.
35: Does NOT change the PetscCUDAFlag for the vector
36: Does NOT zero the CUDA array
37: */
38: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
39: {
40: PetscScalar *array;
41: Vec_Seq *s = (Vec_Seq *)v->data;
42: PetscInt n = v->map->n;
44: if (!s) {
45: PetscNew(&s);
46: v->data = s;
47: }
48: if (!s->array) {
49: if (n * sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
50: PetscMallocSetCUDAHost();
51: v->pinned_memory = PETSC_TRUE;
52: }
53: PetscMalloc1(n, &array);
54: s->array = array;
55: s->array_allocated = array;
56: if (n * sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) PetscMallocResetCUDAHost();
57: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
58: }
59: return 0;
60: }
62: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin, Vec yin)
63: {
64: PetscScalar *ya;
65: const PetscScalar *xa;
67: VecCUDAAllocateCheckHost(xin);
68: VecCUDAAllocateCheckHost(yin);
69: if (xin != yin) {
70: VecGetArrayRead(xin, &xa);
71: VecGetArray(yin, &ya);
72: PetscArraycpy(ya, xa, xin->map->n);
73: VecRestoreArrayRead(xin, &xa);
74: VecRestoreArray(yin, &ya);
75: }
76: return 0;
77: }
79: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin, PetscRandom r)
80: {
81: PetscInt n = xin->map->n;
82: PetscBool iscurand;
83: PetscScalar *xx;
85: PetscObjectTypeCompare((PetscObject)r, PETSCCURAND, &iscurand);
86: if (iscurand) {
87: VecCUDAGetArrayWrite(xin, &xx);
88: } else {
89: VecGetArrayWrite(xin, &xx);
90: }
91: PetscRandomGetValues(r, n, xx);
92: if (iscurand) {
93: VecCUDARestoreArrayWrite(xin, &xx);
94: } else {
95: VecRestoreArrayWrite(xin, &xx);
96: }
97: return 0;
98: }
100: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
101: {
102: Vec_Seq *vs = (Vec_Seq *)v->data;
104: PetscObjectSAWsViewOff(v);
105: #if defined(PETSC_USE_LOG)
106: PetscLogObjectState((PetscObject)v, "Length=%" PetscInt_FMT, v->map->n);
107: #endif
108: if (vs) {
109: if (vs->array_allocated) {
110: if (v->pinned_memory) PetscMallocSetCUDAHost();
111: PetscFree(vs->array_allocated);
112: if (v->pinned_memory) {
113: PetscMallocResetCUDAHost();
114: v->pinned_memory = PETSC_FALSE;
115: }
116: }
117: VecDestroy_Seq(v);
118: }
119: return 0;
120: }
122: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
123: {
124: Vec_Seq *v = (Vec_Seq *)vin->data;
126: v->array = v->unplacedarray;
127: v->unplacedarray = 0;
128: return 0;
129: }
131: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
132: {
133: VecCUDACopyFromGPU(vin);
134: VecResetArray_SeqCUDA_Private(vin);
135: vin->offloadmask = PETSC_OFFLOAD_CPU;
136: return 0;
137: }
139: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin, const PetscScalar *a)
140: {
141: VecCUDACopyFromGPU(vin);
142: VecPlaceArray_Seq(vin, a);
143: vin->offloadmask = PETSC_OFFLOAD_CPU;
144: return 0;
145: }
147: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin, const PetscScalar *a)
148: {
149: Vec_Seq *vs = (Vec_Seq *)vin->data;
151: if (vs->array != vs->array_allocated) {
152: /* make sure the users array has the latest values */
153: VecCUDACopyFromGPU(vin);
154: }
155: if (vs->array_allocated) {
156: if (vin->pinned_memory) PetscMallocSetCUDAHost();
157: PetscFree(vs->array_allocated);
158: if (vin->pinned_memory) PetscMallocResetCUDAHost();
159: }
160: vin->pinned_memory = PETSC_FALSE;
161: vs->array_allocated = vs->array = (PetscScalar *)a;
162: vin->offloadmask = PETSC_OFFLOAD_CPU;
163: return 0;
164: }
166: /*@
167: VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
169: Collective
171: Input Parameter:
172: + comm - the communicator, should be PETSC_COMM_SELF
173: - n - the vector length
175: Output Parameter:
176: . v - the vector
178: Notes:
179: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
180: same type as an existing vector.
182: Level: intermediate
184: .seealso: `VecCreateMPICUDA()`, `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
185: @*/
186: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm, PetscInt n, Vec *v)
187: {
188: VecCreate(comm, v);
189: VecSetSizes(*v, n, n);
190: VecSetType(*v, VECSEQCUDA);
191: return 0;
192: }
194: PetscErrorCode VecDuplicate_SeqCUDA(Vec win, Vec *V)
195: {
196: VecCreateSeqCUDA(PetscObjectComm((PetscObject)win), win->map->n, V);
197: PetscLayoutReference(win->map, &(*V)->map);
198: PetscObjectListDuplicate(((PetscObject)win)->olist, &((PetscObject)(*V))->olist);
199: PetscFunctionListDuplicate(((PetscObject)win)->qlist, &((PetscObject)(*V))->qlist);
200: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
201: return 0;
202: }
204: PetscErrorCode VecCreate_SeqCUDA(Vec V)
205: {
206: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
207: PetscLayoutSetUp(V->map);
208: VecCUDAAllocateCheck(V);
209: VecCreate_SeqCUDA_Private(V, ((Vec_CUDA *)V->spptr)->GPUarray_allocated);
210: VecSet_SeqCUDA(V, 0.0);
211: return 0;
212: }
214: /*@C
215: VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
216: where the user provides the array space to store the vector values. The array
217: provided must be a GPU array.
219: Collective
221: Input Parameters:
222: + comm - the communicator, should be PETSC_COMM_SELF
223: . bs - the block size
224: . n - the vector length
225: - array - GPU memory where the vector elements are to be stored.
227: Output Parameter:
228: . V - the vector
230: Notes:
231: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
232: same type as an existing vector.
234: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
235: at a later stage to SET the array for storing the vector values.
237: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
238: The user should not free the array until the vector is destroyed.
240: Level: intermediate
242: .seealso: `VecCreateMPICUDAWithArray()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`,
243: `VecCreateGhost()`, `VecCreateSeq()`, `VecCUDAPlaceArray()`, `VecCreateSeqWithArray()`,
244: `VecCreateMPIWithArray()`
245: @*/
246: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar array[], Vec *V)
247: {
248: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
249: VecCreate(comm, V);
250: VecSetSizes(*V, n, n);
251: VecSetBlockSize(*V, bs);
252: VecCreate_SeqCUDA_Private(*V, array);
253: return 0;
254: }
256: /*@C
257: VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
258: where the user provides the array space to store the vector values.
260: Collective
262: Input Parameters:
263: + comm - the communicator, should be PETSC_COMM_SELF
264: . bs - the block size
265: . n - the vector length
266: - cpuarray - CPU memory where the vector elements are to be stored.
267: - gpuarray - GPU memory where the vector elements are to be stored.
269: Output Parameter:
270: . V - the vector
272: Notes:
273: If both cpuarray and gpuarray are provided, the caller must ensure that
274: the provided arrays have identical values.
276: PETSc does NOT free the provided arrays when the vector is destroyed via
277: VecDestroy(). The user should not free the array until the vector is
278: destroyed.
280: Level: intermediate
282: .seealso: `VecCreateMPICUDAWithArrays()`, `VecCreate()`, `VecCreateSeqWithArray()`,
283: `VecCUDAPlaceArray()`, `VecCreateSeqCUDAWithArray()`,
284: `VecCUDAAllocateCheckHost()`
285: @*/
286: PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *V)
287: {
288: // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
289: VecCreateSeqCUDAWithArray(comm, bs, n, gpuarray, V);
291: if (cpuarray && gpuarray) {
292: Vec_Seq *s = (Vec_Seq *)((*V)->data);
293: s->array = (PetscScalar *)cpuarray;
294: (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
295: } else if (cpuarray) {
296: Vec_Seq *s = (Vec_Seq *)((*V)->data);
297: s->array = (PetscScalar *)cpuarray;
298: (*V)->offloadmask = PETSC_OFFLOAD_CPU;
299: } else if (gpuarray) {
300: (*V)->offloadmask = PETSC_OFFLOAD_GPU;
301: } else {
302: (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
303: }
305: return 0;
306: }
308: PetscErrorCode VecGetArray_SeqCUDA(Vec v, PetscScalar **a)
309: {
310: VecCUDACopyFromGPU(v);
311: *a = *((PetscScalar **)v->data);
312: return 0;
313: }
315: PetscErrorCode VecRestoreArray_SeqCUDA(Vec v, PetscScalar **a)
316: {
317: v->offloadmask = PETSC_OFFLOAD_CPU;
318: return 0;
319: }
321: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v, PetscScalar **a)
322: {
323: VecCUDAAllocateCheckHost(v);
324: *a = *((PetscScalar **)v->data);
325: return 0;
326: }
328: PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec v, PetscScalar **a, PetscMemType *mtype)
329: {
330: VecCUDACopyToGPU(v);
331: *a = ((Vec_CUDA *)v->spptr)->GPUarray;
332: if (mtype) *mtype = ((Vec_CUDA *)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
333: return 0;
334: }
336: PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec v, PetscScalar **a)
337: {
338: v->offloadmask = PETSC_OFFLOAD_GPU;
339: return 0;
340: }
342: PetscErrorCode VecGetArrayWriteAndMemType_SeqCUDA(Vec v, PetscScalar **a, PetscMemType *mtype)
343: {
344: /* Allocate memory (not zeroed) on device if not yet, but no need to sync data from host to device */
345: VecCUDAAllocateCheck(v);
346: *a = ((Vec_CUDA *)v->spptr)->GPUarray;
347: if (mtype) *mtype = ((Vec_CUDA *)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
348: return 0;
349: }
351: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V, PetscBool bind)
352: {
353: V->boundtocpu = bind;
354: if (bind) {
355: VecCUDACopyFromGPU(V);
356: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
357: V->ops->dot = VecDot_Seq;
358: V->ops->norm = VecNorm_Seq;
359: V->ops->tdot = VecTDot_Seq;
360: V->ops->scale = VecScale_Seq;
361: V->ops->copy = VecCopy_Seq;
362: V->ops->set = VecSet_Seq;
363: V->ops->swap = VecSwap_Seq;
364: V->ops->axpy = VecAXPY_Seq;
365: V->ops->axpby = VecAXPBY_Seq;
366: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
367: V->ops->pointwisemult = VecPointwiseMult_Seq;
368: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
369: V->ops->setrandom = VecSetRandom_Seq;
370: V->ops->dot_local = VecDot_Seq;
371: V->ops->tdot_local = VecTDot_Seq;
372: V->ops->norm_local = VecNorm_Seq;
373: V->ops->mdot_local = VecMDot_Seq;
374: V->ops->mtdot_local = VecMTDot_Seq;
375: V->ops->maxpy = VecMAXPY_Seq;
376: V->ops->mdot = VecMDot_Seq;
377: V->ops->mtdot = VecMTDot_Seq;
378: V->ops->aypx = VecAYPX_Seq;
379: V->ops->waxpy = VecWAXPY_Seq;
380: V->ops->dotnorm2 = NULL;
381: V->ops->placearray = VecPlaceArray_Seq;
382: V->ops->replacearray = VecReplaceArray_SeqCUDA;
383: V->ops->resetarray = VecResetArray_Seq;
384: V->ops->duplicate = VecDuplicate_Seq;
385: V->ops->conjugate = VecConjugate_Seq;
386: V->ops->getlocalvector = NULL;
387: V->ops->restorelocalvector = NULL;
388: V->ops->getlocalvectorread = NULL;
389: V->ops->restorelocalvectorread = NULL;
390: V->ops->getarraywrite = NULL;
391: V->ops->getarrayandmemtype = NULL;
392: V->ops->getarraywriteandmemtype = NULL;
393: V->ops->restorearrayandmemtype = NULL;
394: V->ops->max = VecMax_Seq;
395: V->ops->min = VecMin_Seq;
396: V->ops->reciprocal = VecReciprocal_Default;
397: V->ops->sum = NULL;
398: V->ops->shift = NULL;
399: V->ops->setpreallocationcoo = VecSetPreallocationCOO_Seq;
400: V->ops->setvaluescoo = VecSetValuesCOO_Seq;
401: /* default random number generator */
402: PetscFree(V->defaultrandtype);
403: PetscStrallocpy(PETSCRANDER48, &V->defaultrandtype);
404: } else {
405: V->ops->dot = VecDot_SeqCUDA;
406: V->ops->norm = VecNorm_SeqCUDA;
407: V->ops->tdot = VecTDot_SeqCUDA;
408: V->ops->scale = VecScale_SeqCUDA;
409: V->ops->copy = VecCopy_SeqCUDA;
410: V->ops->set = VecSet_SeqCUDA;
411: V->ops->swap = VecSwap_SeqCUDA;
412: V->ops->axpy = VecAXPY_SeqCUDA;
413: V->ops->axpby = VecAXPBY_SeqCUDA;
414: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
415: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
416: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
417: V->ops->setrandom = VecSetRandom_SeqCUDA;
418: V->ops->dot_local = VecDot_SeqCUDA;
419: V->ops->tdot_local = VecTDot_SeqCUDA;
420: V->ops->norm_local = VecNorm_SeqCUDA;
421: V->ops->mdot_local = VecMDot_SeqCUDA;
422: V->ops->maxpy = VecMAXPY_SeqCUDA;
423: V->ops->mdot = VecMDot_SeqCUDA;
424: V->ops->aypx = VecAYPX_SeqCUDA;
425: V->ops->waxpy = VecWAXPY_SeqCUDA;
426: V->ops->dotnorm2 = VecDotNorm2_SeqCUDA;
427: V->ops->placearray = VecPlaceArray_SeqCUDA;
428: V->ops->replacearray = VecReplaceArray_SeqCUDA;
429: V->ops->resetarray = VecResetArray_SeqCUDA;
430: V->ops->destroy = VecDestroy_SeqCUDA;
431: V->ops->duplicate = VecDuplicate_SeqCUDA;
432: V->ops->conjugate = VecConjugate_SeqCUDA;
433: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
434: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
435: V->ops->getlocalvectorread = VecGetLocalVectorRead_SeqCUDA;
436: V->ops->restorelocalvectorread = VecRestoreLocalVectorRead_SeqCUDA;
437: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
438: V->ops->getarray = VecGetArray_SeqCUDA;
439: V->ops->restorearray = VecRestoreArray_SeqCUDA;
440: V->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqCUDA;
441: V->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqCUDA;
442: V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
443: V->ops->max = VecMax_SeqCUDA;
444: V->ops->min = VecMin_SeqCUDA;
445: V->ops->reciprocal = VecReciprocal_SeqCUDA;
446: V->ops->sum = VecSum_SeqCUDA;
447: V->ops->shift = VecShift_SeqCUDA;
448: V->ops->setpreallocationcoo = VecSetPreallocationCOO_SeqCUDA;
449: V->ops->setvaluescoo = VecSetValuesCOO_SeqCUDA;
451: /* default random number generator */
452: PetscFree(V->defaultrandtype);
453: PetscStrallocpy(PETSCCURAND, &V->defaultrandtype);
454: }
455: return 0;
456: }
458: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V, const PetscScalar *array)
459: {
460: Vec_CUDA *veccuda;
461: PetscMPIInt size;
462: PetscBool option_set;
464: MPI_Comm_size(PetscObjectComm((PetscObject)V), &size);
466: VecCreate_Seq_Private(V, 0);
467: PetscObjectChangeTypeName((PetscObject)V, VECSEQCUDA);
468: VecBindToCPU_SeqCUDA(V, PETSC_FALSE);
469: V->ops->bindtocpu = VecBindToCPU_SeqCUDA;
471: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
472: if (array) {
473: if (!V->spptr) {
474: PetscReal pinned_memory_min;
475: PetscCalloc(sizeof(Vec_CUDA), &V->spptr);
476: veccuda = (Vec_CUDA *)V->spptr;
477: V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
479: pinned_memory_min = 0;
480: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
481: Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
482: PetscOptionsBegin(PetscObjectComm((PetscObject)V), ((PetscObject)V)->prefix, "VECCUDA Options", "Vec");
483: PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &option_set);
484: if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
485: PetscOptionsEnd();
486: }
487: veccuda = (Vec_CUDA *)V->spptr;
488: veccuda->GPUarray = (PetscScalar *)array;
489: V->offloadmask = PETSC_OFFLOAD_GPU;
490: }
491: return 0;
492: }