Actual source code: mpicuda.cu
/*
  This file contains routines for Parallel vector operations.
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petscsf.h>
#include <../src/vec/vec/impls/mpi/pvecimpl.h>
#include <petsc/private/cudavecimpl.h>

static PetscErrorCode VecResetPreallocationCOO_MPICUDA(Vec x)
{
  Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);

  if (veccuda) {
    cudaFree(veccuda->jmap1_d);
    cudaFree(veccuda->perm1_d);
    cudaFree(veccuda->imap2_d);
    cudaFree(veccuda->jmap2_d);
    cudaFree(veccuda->perm2_d);
    cudaFree(veccuda->Cperm_d);
    cudaFree(veccuda->sendbuf_d);
    cudaFree(veccuda->recvbuf_d);
  }
  return 0;
}

PetscErrorCode VecSetPreallocationCOO_MPICUDA(Vec x, PetscCount ncoo, const PetscInt coo_i[])
{
  Vec_MPI  *vecmpi  = static_cast<Vec_MPI *>(x->data);
  Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);
  PetscInt  m;

  VecResetPreallocationCOO_MPICUDA(x);
  VecSetPreallocationCOO_MPI(x, ncoo, coo_i);
  VecGetLocalSize(x, &m);

  /* jmap1[] holds m+1 CSR-like offsets into perm1[] (VecAddCOOValues reads jmap1[i+1] for i < m), hence the m+1 sizes below */
  cudaMalloc((void **)&veccuda->jmap1_d, sizeof(PetscCount) * (m + 1));
  cudaMalloc((void **)&veccuda->perm1_d, sizeof(PetscCount) * vecmpi->tot1);
  cudaMalloc((void **)&veccuda->imap2_d, sizeof(PetscInt) * vecmpi->nnz2);
  cudaMalloc((void **)&veccuda->jmap2_d, sizeof(PetscCount) * (vecmpi->nnz2 + 1));
  cudaMalloc((void **)&veccuda->perm2_d, sizeof(PetscCount) * vecmpi->recvlen);
  cudaMalloc((void **)&veccuda->Cperm_d, sizeof(PetscCount) * vecmpi->sendlen);
  cudaMalloc((void **)&veccuda->sendbuf_d, sizeof(PetscScalar) * vecmpi->sendlen);
  cudaMalloc((void **)&veccuda->recvbuf_d, sizeof(PetscScalar) * vecmpi->recvlen);

  cudaMemcpy(veccuda->jmap1_d, vecmpi->jmap1, sizeof(PetscCount) * (m + 1), cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->perm1_d, vecmpi->perm1, sizeof(PetscCount) * vecmpi->tot1, cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->imap2_d, vecmpi->imap2, sizeof(PetscInt) * vecmpi->nnz2, cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->jmap2_d, vecmpi->jmap2, sizeof(PetscCount) * (vecmpi->nnz2 + 1), cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->perm2_d, vecmpi->perm2, sizeof(PetscCount) * vecmpi->recvlen, cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->Cperm_d, vecmpi->Cperm, sizeof(PetscCount) * vecmpi->sendlen, cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->sendbuf_d, vecmpi->sendbuf, sizeof(PetscScalar) * vecmpi->sendlen, cudaMemcpyHostToDevice);
  cudaMemcpy(veccuda->recvbuf_d, vecmpi->recvbuf, sizeof(PetscScalar) * vecmpi->recvlen, cudaMemcpyHostToDevice);
  return 0;
}
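
/*
  A minimal usage sketch (not part of this file) of the COO assembly path that the
  routine above and VecSetValuesCOO_MPICUDA() below implement; it uses only the
  public PETSc API, and the index/value data is made up for illustration:

    Vec         x;
    PetscInt    coo_i[] = {0, 1, 1};       // repeated indices are summed
    PetscScalar coo_v[] = {1.0, 2.0, 3.0};

    VecCreateMPICUDA(PETSC_COMM_WORLD, PETSC_DECIDE, 100, &x);
    VecSetPreallocationCOO(x, 3, coo_i);   // reaches the routine above for VECMPICUDA
    VecSetValuesCOO(x, coo_v, ADD_VALUES); // may be called repeatedly with new values
    VecDestroy(&x);
*/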

__global__ static void VecPackCOOValues(const PetscScalar vv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = vv[perm[i]];
}

__global__ static void VecAddCOOValues(const PetscScalar vv[], PetscCount m, const PetscCount jmap1[], const PetscCount perm1[], InsertMode imode, PetscScalar xv[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < m; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap1[i]; k < jmap1[i + 1]; k++) sum += vv[perm1[k]];
    xv[i] = (imode == INSERT_VALUES ? 0.0 : xv[i]) + sum;
  }
}

__global__ static void VecAddRemoteCOOValues(const PetscScalar vv[], PetscCount nnz2, const PetscCount imap2[], const PetscCount jmap2[], const PetscCount perm2[], PetscScalar xv[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz2; i += grid_size) {
    for (PetscCount k = jmap2[i]; k < jmap2[i + 1]; k++) xv[imap2[i]] += vv[perm2[k]];
  }
}
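
/*
  All three kernels above use the standard grid-stride-loop pattern: each thread
  starts at its global index and advances by gridDim.x * blockDim.x, so any launch
  geometry covers the whole index range. The caller below launches them as
  <<<(n + 255) / 256, 256>>>, i.e. 256 threads per block with enough blocks to give
  every element its own thread on the first pass.
*/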

PetscErrorCode VecSetValuesCOO_MPICUDA(Vec x, const PetscScalar v[], InsertMode imode)
{
  Vec_MPI           *vecmpi  = static_cast<Vec_MPI *>(x->data);
  Vec_CUDA          *veccuda = static_cast<Vec_CUDA *>(x->spptr);
  const PetscCount  *jmap1   = veccuda->jmap1_d;
  const PetscCount  *perm1   = veccuda->perm1_d;
  const PetscCount  *imap2   = veccuda->imap2_d;
  const PetscCount  *jmap2   = veccuda->jmap2_d;
  const PetscCount  *perm2   = veccuda->perm2_d;
  const PetscCount  *Cperm   = veccuda->Cperm_d;
  PetscScalar       *sendbuf = veccuda->sendbuf_d;
  PetscScalar       *recvbuf = veccuda->recvbuf_d;
  PetscScalar       *xv      = NULL;
  const PetscScalar *vv      = v;
  PetscMemType       memtype;
  PetscInt           m;

  VecGetLocalSize(x, &m);
  PetscGetMemType(v, &memtype);
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] on host, copy it to device */
    cudaMalloc((void **)&vv, sizeof(PetscScalar) * vecmpi->coo_n);
    cudaMemcpy((void *)vv, v, sizeof(PetscScalar) * vecmpi->coo_n, cudaMemcpyHostToDevice);
  } else {
    vv = (PetscScalar *)v;
  }

  /* Pack entries to be sent to remote */
  if (vecmpi->sendlen) {
    VecPackCOOValues<<<(vecmpi->sendlen + 255) / 256, 256>>>(vv, vecmpi->sendlen, Cperm, sendbuf);
    cudaPeekAtLastError();
  }

  PetscSFReduceWithMemTypeBegin(vecmpi->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, sendbuf, PETSC_MEMTYPE_CUDA, recvbuf, MPI_REPLACE);
  if (imode == INSERT_VALUES) VecCUDAGetArrayWrite(x, &xv); /* write vector */
  else VecCUDAGetArray(x, &xv);                             /* read & write vector */
  if (m) {
    VecAddCOOValues<<<(m + 255) / 256, 256>>>(vv, m, jmap1, perm1, imode, xv);
    cudaPeekAtLastError();
  }
  PetscSFReduceEnd(vecmpi->coo_sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE);

  /* Add received remote entries */
  if (vecmpi->nnz2) {
    VecAddRemoteCOOValues<<<(vecmpi->nnz2 + 255) / 256, 256>>>(recvbuf, vecmpi->nnz2, imap2, jmap2, perm2, xv);
    cudaPeekAtLastError();
  }

  if (PetscMemTypeHost(memtype)) cudaFree((void *)vv);
  if (imode == INSERT_VALUES) VecCUDARestoreArrayWrite(x, &xv);
  else VecCUDARestoreArray(x, &xv);
  return 0;
}
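
/*
  Note the overlap in VecSetValuesCOO_MPICUDA() above: the local kernel
  VecAddCOOValues runs between PetscSFReduceWithMemTypeBegin() and
  PetscSFReduceEnd(), so locally owned entries are summed while remotely owned
  entries are still in flight; VecAddRemoteCOOValues then folds in the received
  values.
*/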

/*MC
   VECCUDA - VECCUDA = "cuda" - A VECSEQCUDA on a single-process communicator, and VECMPICUDA otherwise.

   Options Database Keys:
. -vec_type cuda - sets the vector type to VECCUDA during a call to VecSetFromOptions()

   Level: beginner

.seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECSEQCUDA`, `VECMPICUDA`, `VECSTANDARD`, `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`
M*/
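
/*
  For example (a sketch using only public PETSc calls), the type can be selected
  from the command line:

    Vec x;
    VecCreate(PETSC_COMM_WORLD, &x);
    VecSetSizes(x, PETSC_DECIDE, 100);
    VecSetFromOptions(x); // becomes VECCUDA when run with -vec_type cuda
    VecDestroy(&x);
*/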

PetscErrorCode VecDestroy_MPICUDA(Vec v)
{
  Vec_MPI  *vecmpi = (Vec_MPI *)v->data;
  Vec_CUDA *veccuda;

  if (v->spptr) {
    veccuda = (Vec_CUDA *)v->spptr;
    if (veccuda->GPUarray_allocated) {
      cudaFree(veccuda->GPUarray_allocated);
      veccuda->GPUarray_allocated = NULL;
    }
    if (veccuda->stream) cudaStreamDestroy(veccuda->stream);
    if (v->pinned_memory) {
      PetscMallocSetCUDAHost();
      PetscFree(vecmpi->array_allocated);
      PetscMallocResetCUDAHost();
      v->pinned_memory = PETSC_FALSE;
    }
    VecResetPreallocationCOO_MPICUDA(v);
    PetscFree(v->spptr);
  }
  VecDestroy_MPI(v);
  return 0;
}

PetscErrorCode VecNorm_MPICUDA(Vec xin, NormType type, PetscReal *z)
{
  PetscReal sum, work = 0.0;

  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecNorm_SeqCUDA(xin, NORM_2, &work);
    work *= work;
    MPIU_Allreduce(&work, &sum, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
    *z = PetscSqrtReal(sum);
  } else if (type == NORM_1) {
    /* Find the local part */
    VecNorm_SeqCUDA(xin, NORM_1, &work);
    /* Find the global sum */
    MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
  } else if (type == NORM_INFINITY) {
    /* Find the local max */
    VecNorm_SeqCUDA(xin, NORM_INFINITY, &work);
    /* Find the global max */
    MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
  } else if (type == NORM_1_AND_2) {
    PetscReal temp[2];
    VecNorm_SeqCUDA(xin, NORM_1, temp);
    VecNorm_SeqCUDA(xin, NORM_2, temp + 1);
    temp[1] = temp[1] * temp[1];
    MPIU_Allreduce(temp, z, 2, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
    z[1] = PetscSqrtReal(z[1]);
  }
  return 0;
}
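
/*
  The 2-norm above is assembled from local pieces: each rank computes the 2-norm
  of its local part x_p, squares it, and the squares are summed across ranks,
  using ||x||_2 = sqrt(sum_p ||x_p||_2^2).
*/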

PetscErrorCode VecDot_MPICUDA(Vec xin, Vec yin, PetscScalar *z)
{
  PetscScalar sum, work;

  VecDot_SeqCUDA(xin, yin, &work);
  MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
  *z = sum;
  return 0;
}

PetscErrorCode VecTDot_MPICUDA(Vec xin, Vec yin, PetscScalar *z)
{
  PetscScalar sum, work;

  VecTDot_SeqCUDA(xin, yin, &work);
  MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
  *z = sum;
  return 0;
}

PetscErrorCode VecMDot_MPICUDA(Vec xin, PetscInt nv, const Vec y[], PetscScalar *z)
{
  PetscScalar awork[128], *work = awork;

  if (nv > 128) PetscMalloc1(nv, &work);
  VecMDot_SeqCUDA(xin, nv, y, work);
  MPIU_Allreduce(work, z, nv, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
  if (nv > 128) PetscFree(work);
  return 0;
}

/*MC
   VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

   Options Database Keys:
. -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

   Level: beginner

.seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`
M*/

PetscErrorCode VecDuplicate_MPICUDA(Vec win, Vec *v)
{
  Vec_MPI     *vw, *w = (Vec_MPI *)win->data;
  PetscScalar *array;

  VecCreate(PetscObjectComm((PetscObject)win), v);
  PetscLayoutReference(win->map, &(*v)->map);

  VecCreate_MPICUDA_Private(*v, PETSC_TRUE, w->nghost, 0);
  vw = (Vec_MPI *)(*v)->data;
  PetscMemcpy((*v)->ops, win->ops, sizeof(struct _VecOps));

  /* save local representation of the parallel vector (and scatter) if it exists */
  if (w->localrep) {
    VecGetArray(*v, &array);
    VecCreateSeqWithArray(PETSC_COMM_SELF, 1, win->map->n + w->nghost, array, &vw->localrep);
    PetscMemcpy(vw->localrep->ops, w->localrep->ops, sizeof(struct _VecOps));
    VecRestoreArray(*v, &array);
    vw->localupdate = w->localupdate;
    if (vw->localupdate) PetscObjectReference((PetscObject)vw->localupdate);
  }

  /* New vector should inherit stashing property of parent */
  (*v)->stash.donotstash   = win->stash.donotstash;
  (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

  /* change type_name appropriately */
  VecCUDAAllocateCheck(*v);
  PetscObjectChangeTypeName((PetscObject)(*v), VECMPICUDA);

  PetscObjectListDuplicate(((PetscObject)win)->olist, &((PetscObject)(*v))->olist);
  PetscFunctionListDuplicate(((PetscObject)win)->qlist, &((PetscObject)(*v))->qlist);
  (*v)->map->bs   = PetscAbs(win->map->bs);
  (*v)->bstash.bs = win->bstash.bs;
  return 0;
}

PetscErrorCode VecDotNorm2_MPICUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscScalar work[2], sum[2];

  VecDotNorm2_SeqCUDA(s, t, work, work + 1);
  MPIU_Allreduce(&work, &sum, 2, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)s));
  *dp = sum[0];
  *nm = sum[1];
  return 0;
}
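
/*
  VecDotNorm2 fuses the dot product and the squared norm of t into one length-2
  allreduce, paying a single reduction latency instead of the two that separate
  VecDot() and VecNorm() calls would incur.
*/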

PetscErrorCode VecCreate_MPICUDA(Vec vv)
{
  PetscDeviceInitialize(PETSC_DEVICE_CUDA);
  PetscLayoutSetUp(vv->map);
  VecCUDAAllocateCheck(vv);
  VecCreate_MPICUDA_Private(vv, PETSC_FALSE, 0, ((Vec_CUDA *)vv->spptr)->GPUarray_allocated);
  VecCUDAAllocateCheckHost(vv);
  VecSet(vv, 0.0);
  VecSet_Seq(vv, 0.0);
  vv->offloadmask = PETSC_OFFLOAD_BOTH;
  return 0;
}

PetscErrorCode VecCreate_CUDA(Vec v)
{
  PetscMPIInt size;

  MPI_Comm_size(PetscObjectComm((PetscObject)v), &size);
  if (size == 1) {
    VecSetType(v, VECSEQCUDA);
  } else {
    VecSetType(v, VECMPICUDA);
  }
  return 0;
}

/*@
   VecCreateMPICUDA - Creates a standard, parallel array-style vector for CUDA devices.

   Collective

   Input Parameters:
+ comm - the MPI communicator to use
. n - local vector length (or PETSC_DECIDE to have calculated if N is given)
- N - global vector length (or PETSC_DETERMINE to have calculated if n is given)

   Output Parameter:
. v - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.
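
   Example (a minimal sketch):
.vb
   Vec x;
   VecCreateMPICUDA(PETSC_COMM_WORLD, PETSC_DECIDE, 100, &x);
   VecSet(x, 1.0);
   VecDestroy(&x);
.ve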

   Level: intermediate

.seealso: `VecCreateMPICUDAWithArray()`, `VecCreateMPICUDAWithArrays()`, `VecCreateSeqCUDA()`, `VecCreateSeq()`,
          `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
          `VecCreateMPIWithArray()`, `VecCreateGhostWithArray()`, `VecMPISetGhost()`
@*/
PetscErrorCode VecCreateMPICUDA(MPI_Comm comm, PetscInt n, PetscInt N, Vec *v)
{
  VecCreate(comm, v);
  VecSetSizes(*v, n, N);
  VecSetType(*v, VECMPICUDA);
  return 0;
}

/*@C
   VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
   where the user provides the GPU array space to store the vector values.

   Collective

   Input Parameters:
+ comm - the MPI communicator to use
. bs - block size, same meaning as VecSetBlockSize()
. n - local vector length, cannot be PETSC_DECIDE
. N - global vector length (or PETSC_DECIDE to have calculated)
- array - the user provided GPU array to store the vector values

   Output Parameter:
. vv - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.
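
   Example (a minimal sketch; the device buffer here is a plain cudaMalloc() allocation, made up for illustration):
.vb
   PetscScalar *gpuarray;
   Vec          x;
   cudaMalloc((void **)&gpuarray, 10 * sizeof(PetscScalar));
   VecCreateMPICUDAWithArray(PETSC_COMM_WORLD, 1, 10, PETSC_DECIDE, gpuarray, &x);
   VecSet(x, 1.0);
   VecDestroy(&x);
   cudaFree(gpuarray); // PETSc did not free the user-provided array
.ve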

   Level: intermediate

.seealso: `VecCreateMPICUDA()`, `VecCreateSeqCUDAWithArray()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
          `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
          `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecPlaceArray()`
@*/
PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar array[], Vec *vv)
{
  PetscDeviceInitialize(PETSC_DEVICE_CUDA);
  VecCreate(comm, vv);
  VecSetSizes(*vv, n, N);
  VecSetBlockSize(*vv, bs);
  VecCreate_MPICUDA_Private(*vv, PETSC_FALSE, 0, array);
  return 0;
}

/*@C
   VecCreateMPICUDAWithArrays - Creates a parallel, array-style vector,
   where the user provides the GPU array space to store the vector values.

   Collective

   Input Parameters:
+ comm - the MPI communicator to use
. bs - block size, same meaning as VecSetBlockSize()
. n - local vector length, cannot be PETSC_DECIDE
. N - global vector length (or PETSC_DECIDE to have calculated)
. cpuarray - the user provided CPU array to store the vector values
- gpuarray - the user provided GPU array to store the vector values

   Output Parameter:
. vv - the vector

   Notes:
   If both cpuarray and gpuarray are provided, the caller must ensure that
   the provided arrays have identical values.

   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   PETSc does NOT free the provided arrays when the vector is destroyed via
   VecDestroy(). The user should not free the array until the vector is
   destroyed.
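
   Example (a minimal sketch; both buffers are made up for illustration and must hold identical values when both are passed):
.vb
   PetscScalar  cpuarray[10];
   PetscScalar *gpuarray;
   Vec          x;
   cudaMalloc((void **)&gpuarray, 10 * sizeof(PetscScalar));
   // ... fill cpuarray and gpuarray with the same values ...
   VecCreateMPICUDAWithArrays(PETSC_COMM_WORLD, 1, 10, PETSC_DECIDE, cpuarray, gpuarray, &x);
.ve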

   Level: intermediate

.seealso: `VecCreateSeqCUDAWithArrays()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
          `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
          `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecCUDAPlaceArray()`, `VecPlaceArray()`,
          `VecCUDAAllocateCheckHost()`
@*/
PetscErrorCode VecCreateMPICUDAWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *vv)
{
  VecCreateMPICUDAWithArray(comm, bs, n, N, gpuarray, vv);

  if (cpuarray && gpuarray) {
    Vec_MPI *s         = (Vec_MPI *)((*vv)->data);
    s->array           = (PetscScalar *)cpuarray;
    (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
  } else if (cpuarray) {
    Vec_MPI *s         = (Vec_MPI *)((*vv)->data);
    s->array           = (PetscScalar *)cpuarray;
    (*vv)->offloadmask = PETSC_OFFLOAD_CPU;
  } else if (gpuarray) {
    (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
  } else {
    (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  return 0;
}

PetscErrorCode VecMax_MPICUDA(Vec xin, PetscInt *idx, PetscReal *z)
{
  PetscReal work;

  VecMax_SeqCUDA(xin, idx, &work);
#if defined(PETSC_HAVE_MPIUNI)
  *z = work;
#else
  if (!idx) {
    MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
  } else {
    struct {
      PetscReal v;
      PetscInt  i;
    } in, out;

    in.v = work;
    in.i = *idx + xin->map->rstart;
    MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MAXLOC, PetscObjectComm((PetscObject)xin));
    *z   = out.v;
    *idx = out.i;
  }
#endif
  return 0;
}
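
/*
  The anonymous {PetscReal v; PetscInt i;} pair above matches the layout of the
  MPIU_REAL_INT datatype, so the MPIU_MAXLOC (or MPIU_MINLOC below) reduction
  returns both the global extremum and the global index of the entry achieving
  it; the local index is shifted by rstart into the global numbering before the
  reduction.
*/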

PetscErrorCode VecMin_MPICUDA(Vec xin, PetscInt *idx, PetscReal *z)
{
  PetscReal work;

  VecMin_SeqCUDA(xin, idx, &work);
#if defined(PETSC_HAVE_MPIUNI)
  *z = work;
#else
  if (!idx) {
    MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)xin));
  } else {
    struct {
      PetscReal v;
      PetscInt  i;
    } in, out;

    in.v = work;
    in.i = *idx + xin->map->rstart;
    MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MINLOC, PetscObjectComm((PetscObject)xin));
    *z   = out.v;
    *idx = out.i;
  }
#endif
  return 0;
}

PetscErrorCode VecBindToCPU_MPICUDA(Vec V, PetscBool bind)
{
  V->boundtocpu = bind;
  if (bind) {
    VecCUDACopyFromGPU(V);
    V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
    V->ops->dotnorm2                = NULL;
    V->ops->waxpy                   = VecWAXPY_Seq;
    V->ops->dot                     = VecDot_MPI;
    V->ops->mdot                    = VecMDot_MPI;
    V->ops->tdot                    = VecTDot_MPI;
    V->ops->norm                    = VecNorm_MPI;
    V->ops->scale                   = VecScale_Seq;
    V->ops->copy                    = VecCopy_Seq;
    V->ops->set                     = VecSet_Seq;
    V->ops->swap                    = VecSwap_Seq;
    V->ops->axpy                    = VecAXPY_Seq;
    V->ops->axpby                   = VecAXPBY_Seq;
    V->ops->maxpy                   = VecMAXPY_Seq;
    V->ops->aypx                    = VecAYPX_Seq;
    V->ops->axpbypcz                = VecAXPBYPCZ_Seq;
    V->ops->pointwisemult           = VecPointwiseMult_Seq;
    V->ops->setrandom               = VecSetRandom_Seq;
    V->ops->placearray              = VecPlaceArray_Seq;
    V->ops->replacearray            = VecReplaceArray_SeqCUDA;
    V->ops->resetarray              = VecResetArray_Seq;
    V->ops->dot_local               = VecDot_Seq;
    V->ops->tdot_local              = VecTDot_Seq;
    V->ops->norm_local              = VecNorm_Seq;
    V->ops->mdot_local              = VecMDot_Seq;
    V->ops->pointwisedivide         = VecPointwiseDivide_Seq;
    V->ops->getlocalvector          = NULL;
    V->ops->restorelocalvector      = NULL;
    V->ops->getlocalvectorread      = NULL;
    V->ops->restorelocalvectorread  = NULL;
    V->ops->getarraywrite           = NULL;
    V->ops->getarrayandmemtype      = NULL;
    V->ops->restorearrayandmemtype  = NULL;
    V->ops->getarraywriteandmemtype = NULL;
    V->ops->max                     = VecMax_MPI;
    V->ops->min                     = VecMin_MPI;
    V->ops->reciprocal              = VecReciprocal_Default;
    V->ops->sum                     = NULL;
    V->ops->shift                   = NULL;
    /* default random number generator */
    PetscFree(V->defaultrandtype);
    PetscStrallocpy(PETSCRANDER48, &V->defaultrandtype);
  } else {
    V->ops->dotnorm2                = VecDotNorm2_MPICUDA;
    V->ops->waxpy                   = VecWAXPY_SeqCUDA;
    V->ops->duplicate               = VecDuplicate_MPICUDA;
    V->ops->dot                     = VecDot_MPICUDA;
    V->ops->mdot                    = VecMDot_MPICUDA;
    V->ops->tdot                    = VecTDot_MPICUDA;
    V->ops->norm                    = VecNorm_MPICUDA;
    V->ops->scale                   = VecScale_SeqCUDA;
    V->ops->copy                    = VecCopy_SeqCUDA;
    V->ops->set                     = VecSet_SeqCUDA;
    V->ops->swap                    = VecSwap_SeqCUDA;
    V->ops->axpy                    = VecAXPY_SeqCUDA;
    V->ops->axpby                   = VecAXPBY_SeqCUDA;
    V->ops->maxpy                   = VecMAXPY_SeqCUDA;
    V->ops->aypx                    = VecAYPX_SeqCUDA;
    V->ops->axpbypcz                = VecAXPBYPCZ_SeqCUDA;
    V->ops->pointwisemult           = VecPointwiseMult_SeqCUDA;
    V->ops->setrandom               = VecSetRandom_SeqCUDA;
    V->ops->placearray              = VecPlaceArray_SeqCUDA;
    V->ops->replacearray            = VecReplaceArray_SeqCUDA;
    V->ops->resetarray              = VecResetArray_SeqCUDA;
    V->ops->dot_local               = VecDot_SeqCUDA;
    V->ops->tdot_local              = VecTDot_SeqCUDA;
    V->ops->norm_local              = VecNorm_SeqCUDA;
    V->ops->mdot_local              = VecMDot_SeqCUDA;
    V->ops->destroy                 = VecDestroy_MPICUDA;
    V->ops->pointwisedivide         = VecPointwiseDivide_SeqCUDA;
    V->ops->getlocalvector          = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvector      = VecRestoreLocalVector_SeqCUDA;
    V->ops->getlocalvectorread      = VecGetLocalVectorRead_SeqCUDA;
    V->ops->restorelocalvectorread  = VecRestoreLocalVectorRead_SeqCUDA;
    V->ops->getarraywrite           = VecGetArrayWrite_SeqCUDA;
    V->ops->getarray                = VecGetArray_SeqCUDA;
    V->ops->restorearray            = VecRestoreArray_SeqCUDA;
    V->ops->getarrayandmemtype      = VecGetArrayAndMemType_SeqCUDA;
    V->ops->restorearrayandmemtype  = VecRestoreArrayAndMemType_SeqCUDA;
    V->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqCUDA;
    V->ops->max                     = VecMax_MPICUDA;
    V->ops->min                     = VecMin_MPICUDA;
    V->ops->reciprocal              = VecReciprocal_SeqCUDA;
    V->ops->sum                     = VecSum_SeqCUDA;
    V->ops->shift                   = VecShift_SeqCUDA;
    /* default random number generator */
    PetscFree(V->defaultrandtype);
    PetscStrallocpy(PETSCCURAND, &V->defaultrandtype);
  }
  return 0;
}
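
/*
  VecBindToCPU_MPICUDA() works by swapping the vector's virtual method table:
  when bound, operations fall back to the host (_Seq/_MPI) implementations and
  the drand48-based random type; when unbound, the CUDA implementations and the
  cuRAND-based type are installed. Users toggle this at runtime with
  VecBindToCPU(V, PETSC_TRUE) / VecBindToCPU(V, PETSC_FALSE).
*/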

PetscErrorCode VecCreate_MPICUDA_Private(Vec vv, PetscBool alloc, PetscInt nghost, const PetscScalar array[])
{
  Vec_CUDA *veccuda;

  VecCreate_MPI_Private(vv, PETSC_FALSE, 0, 0);
  PetscObjectChangeTypeName((PetscObject)vv, VECMPICUDA);

  VecBindToCPU_MPICUDA(vv, PETSC_FALSE);
  vv->ops->bindtocpu = VecBindToCPU_MPICUDA;

  /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
  if (alloc && !array) {
    VecCUDAAllocateCheck(vv);
    VecCUDAAllocateCheckHost(vv);
    VecSet(vv, 0.0);
    VecSet_Seq(vv, 0.0);
    vv->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  if (array) {
    if (!vv->spptr) {
      PetscReal pinned_memory_min;
      PetscBool flag;
      /* Cannot use PetscNew() here because spptr is void* */
      PetscCalloc(sizeof(Vec_CUDA), &vv->spptr);
      veccuda                         = (Vec_CUDA *)vv->spptr;
      vv->minimum_bytes_pinned_memory = 0;

      /* Need to parse the command line here for the minimum size at which to use pinned memory for host allocations.
         Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCUDAAllocateCheck(). Is there a good way to avoid this? */
      PetscOptionsBegin(PetscObjectComm((PetscObject)vv), ((PetscObject)vv)->prefix, "VECCUDA Options", "Vec");
      pinned_memory_min = vv->minimum_bytes_pinned_memory;
      PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &flag);
      if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
      PetscOptionsEnd();
    }
    veccuda           = (Vec_CUDA *)vv->spptr;
    veccuda->GPUarray = (PetscScalar *)array;
    vv->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  return 0;
}