Actual source code: mpihip.hip.cpp
2: /*
3: This file contains routines for Parallel vector operations.
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
9: #include <petsc/private/hipvecimpl.h>
11: /*MC
12: VECHIP - VECHIP = "hip" - A VECSEQHIP on a single-process communicator, and VECMPIHIP otherwise.
14: Options Database Keys:
15: . -vec_type hip - sets the vector type to VECHIP during a call to VecSetFromOptions()
17: Level: beginner
19: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECSEQHIP`, `VECMPIHIP`, `VECSTANDARD`, `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`
20: M*/
22: PetscErrorCode VecDestroy_MPIHIP(Vec v)
23: {
24: Vec_MPI *vecmpi = (Vec_MPI *)v->data;
25: Vec_HIP *vechip;
27: if (v->spptr) {
28: vechip = (Vec_HIP *)v->spptr;
29: if (vechip->GPUarray_allocated) {
30: hipFree(vechip->GPUarray_allocated);
31: vechip->GPUarray_allocated = NULL;
32: }
33: if (vechip->stream) hipStreamDestroy(vechip->stream);
34: if (v->pinned_memory) {
35: PetscMallocSetHIPHost();
36: PetscFree(vecmpi->array_allocated);
37: PetscMallocResetHIPHost();
38: v->pinned_memory = PETSC_FALSE;
39: }
40: PetscFree(v->spptr);
41: }
42: VecDestroy_MPI(v);
43: return 0;
44: }
46: PetscErrorCode VecNorm_MPIHIP(Vec xin, NormType type, PetscReal *z)
47: {
48: PetscReal sum, work = 0.0;
50: if (type == NORM_2 || type == NORM_FROBENIUS) {
51: VecNorm_SeqHIP(xin, NORM_2, &work);
52: work *= work;
53: MPIU_Allreduce(&work, &sum, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
54: *z = PetscSqrtReal(sum);
55: } else if (type == NORM_1) {
56: /* Find the local part */
57: VecNorm_SeqHIP(xin, NORM_1, &work);
58: /* Find the global max */
59: MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
60: } else if (type == NORM_INFINITY) {
61: /* Find the local max */
62: VecNorm_SeqHIP(xin, NORM_INFINITY, &work);
63: /* Find the global max */
64: MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
65: } else if (type == NORM_1_AND_2) {
66: PetscReal temp[2];
67: VecNorm_SeqHIP(xin, NORM_1, temp);
68: VecNorm_SeqHIP(xin, NORM_2, temp + 1);
69: temp[1] = temp[1] * temp[1];
70: MPIU_Allreduce(temp, z, 2, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
71: z[1] = PetscSqrtReal(z[1]);
72: }
73: return 0;
74: }
76: PetscErrorCode VecDot_MPIHIP(Vec xin, Vec yin, PetscScalar *z)
77: {
78: PetscScalar sum, work;
80: VecDot_SeqHIP(xin, yin, &work);
81: MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
82: *z = sum;
83: return 0;
84: }
86: PetscErrorCode VecTDot_MPIHIP(Vec xin, Vec yin, PetscScalar *z)
87: {
88: PetscScalar sum, work;
90: VecTDot_SeqHIP(xin, yin, &work);
91: MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
92: *z = sum;
93: return 0;
94: }
96: PetscErrorCode VecMDot_MPIHIP(Vec xin, PetscInt nv, const Vec y[], PetscScalar *z)
97: {
98: PetscScalar awork[128], *work = awork;
100: if (nv > 128) PetscMalloc1(nv, &work);
101: VecMDot_SeqHIP(xin, nv, y, work);
102: MPIU_Allreduce(work, z, nv, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
103: if (nv > 128) PetscFree(work);
104: return 0;
105: }
107: /*MC
108: VECMPIHIP - VECMPIHIP = "mpihip" - The basic parallel vector, modified to use HIP
110: Options Database Keys:
111: . -vec_type mpihip - sets the vector type to VECMPIHIP during a call to VecSetFromOptions()
113: Level: beginner
115: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`
116: M*/
118: PetscErrorCode VecDuplicate_MPIHIP(Vec win, Vec *v)
119: {
120: Vec_MPI *vw, *w = (Vec_MPI *)win->data;
121: PetscScalar *array;
123: VecCreate(PetscObjectComm((PetscObject)win), v);
124: PetscLayoutReference(win->map, &(*v)->map);
126: VecCreate_MPIHIP_Private(*v, PETSC_TRUE, w->nghost, 0);
127: vw = (Vec_MPI *)(*v)->data;
128: PetscMemcpy((*v)->ops, win->ops, sizeof(struct _VecOps));
130: /* save local representation of the parallel vector (and scatter) if it exists */
131: if (w->localrep) {
132: VecGetArray(*v, &array);
133: VecCreateSeqWithArray(PETSC_COMM_SELF, 1, win->map->n + w->nghost, array, &vw->localrep);
134: PetscMemcpy(vw->localrep->ops, w->localrep->ops, sizeof(struct _VecOps));
135: VecRestoreArray(*v, &array);
136: vw->localupdate = w->localupdate;
137: if (vw->localupdate) PetscObjectReference((PetscObject)vw->localupdate);
138: }
140: /* New vector should inherit stashing property of parent */
141: (*v)->stash.donotstash = win->stash.donotstash;
142: (*v)->stash.ignorenegidx = win->stash.ignorenegidx;
144: /* change type_name appropriately */
145: VecHIPAllocateCheck(*v);
146: PetscObjectChangeTypeName((PetscObject)(*v), VECMPIHIP);
148: PetscObjectListDuplicate(((PetscObject)win)->olist, &((PetscObject)(*v))->olist);
149: PetscFunctionListDuplicate(((PetscObject)win)->qlist, &((PetscObject)(*v))->qlist);
150: (*v)->map->bs = PetscAbs(win->map->bs);
151: (*v)->bstash.bs = win->bstash.bs;
152: return 0;
153: }
155: PetscErrorCode VecDotNorm2_MPIHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
156: {
157: PetscScalar work[2], sum[2];
159: VecDotNorm2_SeqHIP(s, t, work, work + 1);
160: MPIU_Allreduce(&work, &sum, 2, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)s));
161: *dp = sum[0];
162: *nm = sum[1];
163: return 0;
164: }
166: PetscErrorCode VecCreate_MPIHIP(Vec vv)
167: {
168: PetscDeviceInitialize(PETSC_DEVICE_HIP);
169: PetscLayoutSetUp(vv->map);
170: VecHIPAllocateCheck(vv);
171: VecCreate_MPIHIP_Private(vv, PETSC_FALSE, 0, ((Vec_HIP *)vv->spptr)->GPUarray_allocated);
172: VecHIPAllocateCheckHost(vv);
173: VecSet(vv, 0.0);
174: VecSet_Seq(vv, 0.0);
175: vv->offloadmask = PETSC_OFFLOAD_BOTH;
176: return 0;
177: }
179: PetscErrorCode VecCreate_HIP(Vec v)
180: {
181: PetscMPIInt size;
183: MPI_Comm_size(PetscObjectComm((PetscObject)v), &size);
184: if (size == 1) {
185: VecSetType(v, VECSEQHIP);
186: } else {
187: VecSetType(v, VECMPIHIP);
188: }
189: return 0;
190: }
192: /*@
193: VecCreateMPIHIP - Creates a standard, parallel array-style vector for HIP devices.
195: Collective
197: Input Parameters:
198: + comm - the MPI communicator to use
199: . n - local vector length (or PETSC_DECIDE to have calculated if N is given)
200: - N - global vector length (or PETSC_DETERMINE to have calculated if n is given)
202: Output Parameter:
203: . v - the vector
205: Notes:
206: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
207: same type as an existing vector.
209: Level: intermediate
211: .seealso: `VecCreateMPIHIPWithArray()`, `VecCreateMPIHIPWithArrays()`, `VecCreateSeqHIP()`, `VecCreateSeq()`,
212: `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
213: `VecCreateMPIWithArray()`, `VecCreateGhostWithArray()`, `VecMPISetGhost()`
215: @*/
216: PetscErrorCode VecCreateMPIHIP(MPI_Comm comm, PetscInt n, PetscInt N, Vec *v)
217: {
218: VecCreate(comm, v);
219: VecSetSizes(*v, n, N);
220: VecSetType(*v, VECMPIHIP);
221: return 0;
222: }
224: /*@C
225: VecCreateMPIHIPWithArray - Creates a parallel, array-style vector,
226: where the user provides the GPU array space to store the vector values.
228: Collective
230: Input Parameters:
231: + comm - the MPI communicator to use
232: . bs - block size, same meaning as VecSetBlockSize()
233: . n - local vector length, cannot be PETSC_DECIDE
234: . N - global vector length (or PETSC_DECIDE to have calculated)
235: - array - the user provided GPU array to store the vector values
237: Output Parameter:
238: . vv - the vector
240: Notes:
241: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
242: same type as an existing vector.
244: If the user-provided array is NULL, then VecHIPPlaceArray() can be used
245: at a later stage to SET the array for storing the vector values.
247: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
248: The user should not free the array until the vector is destroyed.
250: Level: intermediate
252: .seealso: `VecCreateSeqHIPWithArray()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
253: `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
254: `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecPlaceArray()`
256: @*/
257: PetscErrorCode VecCreateMPIHIPWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar array[], Vec *vv)
258: {
260: PetscDeviceInitialize(PETSC_DEVICE_HIP);
261: VecCreate(comm, vv);
262: VecSetSizes(*vv, n, N);
263: VecSetBlockSize(*vv, bs);
264: VecCreate_MPIHIP_Private(*vv, PETSC_FALSE, 0, array);
265: return 0;
266: }
268: /*@C
269: VecCreateMPIHIPWithArrays - Creates a parallel, array-style vector,
270: where the user provides the GPU array space to store the vector values.
272: Collective
274: Input Parameters:
275: + comm - the MPI communicator to use
276: . bs - block size, same meaning as VecSetBlockSize()
277: . n - local vector length, cannot be PETSC_DECIDE
278: . N - global vector length (or PETSC_DECIDE to have calculated)
279: - cpuarray - the user provided CPU array to store the vector values
280: - gpuarray - the user provided GPU array to store the vector values
282: Output Parameter:
283: . vv - the vector
285: Notes:
286: If both cpuarray and gpuarray are provided, the caller must ensure that
287: the provided arrays have identical values.
289: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
290: same type as an existing vector.
292: PETSc does NOT free the provided arrays when the vector is destroyed via
293: VecDestroy(). The user should not free the array until the vector is
294: destroyed.
296: Level: intermediate
298: .seealso: `VecCreateSeqHIPWithArrays()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
299: `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
300: `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecHIPPlaceArray()`, `VecPlaceArray()`,
301: `VecHIPAllocateCheckHost()`
302: @*/
303: PetscErrorCode VecCreateMPIHIPWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *vv)
304: {
305: VecCreateMPIHIPWithArray(comm, bs, n, N, gpuarray, vv);
307: if (cpuarray && gpuarray) {
308: Vec_MPI *s = (Vec_MPI *)((*vv)->data);
309: s->array = (PetscScalar *)cpuarray;
310: (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
311: } else if (cpuarray) {
312: Vec_MPI *s = (Vec_MPI *)((*vv)->data);
313: s->array = (PetscScalar *)cpuarray;
314: (*vv)->offloadmask = PETSC_OFFLOAD_CPU;
315: } else if (gpuarray) {
316: (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
317: } else {
318: (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
319: }
321: return 0;
322: }
324: PetscErrorCode VecMax_MPIHIP(Vec xin, PetscInt *idx, PetscReal *z)
325: {
326: PetscReal work;
328: VecMax_SeqHIP(xin, idx, &work);
329: #if defined(PETSC_HAVE_MPIUNI)
330: *z = work;
331: #else
332: if (!idx) {
333: MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
334: } else {
335: struct {
336: PetscReal v;
337: PetscInt i;
338: } in, out;
340: in.v = work;
341: in.i = *idx + xin->map->rstart;
342: MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MAXLOC, PetscObjectComm((PetscObject)xin));
343: *z = out.v;
344: *idx = out.i;
345: }
346: #endif
347: return 0;
348: }
350: PetscErrorCode VecMin_MPIHIP(Vec xin, PetscInt *idx, PetscReal *z)
351: {
352: PetscReal work;
354: VecMin_SeqHIP(xin, idx, &work);
355: #if defined(PETSC_HAVE_MPIUNI)
356: *z = work;
357: #else
358: if (!idx) {
359: MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)xin));
360: } else {
361: struct {
362: PetscReal v;
363: PetscInt i;
364: } in, out;
366: in.v = work;
367: in.i = *idx + xin->map->rstart;
368: MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MINLOC, PetscObjectComm((PetscObject)xin));
369: *z = out.v;
370: *idx = out.i;
371: }
372: #endif
373: return 0;
374: }
376: PetscErrorCode VecBindToCPU_MPIHIP(Vec V, PetscBool bind)
377: {
378: V->boundtocpu = bind;
379: if (bind) {
380: VecHIPCopyFromGPU(V);
381: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
382: V->ops->dotnorm2 = NULL;
383: V->ops->waxpy = VecWAXPY_Seq;
384: V->ops->dot = VecDot_MPI;
385: V->ops->mdot = VecMDot_MPI;
386: V->ops->tdot = VecTDot_MPI;
387: V->ops->norm = VecNorm_MPI;
388: V->ops->scale = VecScale_Seq;
389: V->ops->copy = VecCopy_Seq;
390: V->ops->set = VecSet_Seq;
391: V->ops->swap = VecSwap_Seq;
392: V->ops->axpy = VecAXPY_Seq;
393: V->ops->axpby = VecAXPBY_Seq;
394: V->ops->maxpy = VecMAXPY_Seq;
395: V->ops->aypx = VecAYPX_Seq;
396: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
397: V->ops->pointwisemult = VecPointwiseMult_Seq;
398: V->ops->setrandom = VecSetRandom_Seq;
399: V->ops->placearray = VecPlaceArray_Seq;
400: V->ops->replacearray = VecReplaceArray_SeqHIP;
401: V->ops->resetarray = VecResetArray_Seq;
402: V->ops->dot_local = VecDot_Seq;
403: V->ops->tdot_local = VecTDot_Seq;
404: V->ops->norm_local = VecNorm_Seq;
405: V->ops->mdot_local = VecMDot_Seq;
406: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
407: V->ops->getlocalvector = NULL;
408: V->ops->restorelocalvector = NULL;
409: V->ops->getlocalvectorread = NULL;
410: V->ops->restorelocalvectorread = NULL;
411: V->ops->getarraywrite = NULL;
412: V->ops->getarrayandmemtype = NULL;
413: V->ops->restorearrayandmemtype = NULL;
414: V->ops->getarraywriteandmemtype = NULL;
415: V->ops->max = VecMax_MPI;
416: V->ops->min = VecMin_MPI;
417: V->ops->reciprocal = VecReciprocal_Default;
418: V->ops->sum = NULL;
419: V->ops->shift = NULL;
420: } else {
421: V->ops->dotnorm2 = VecDotNorm2_MPIHIP;
422: V->ops->waxpy = VecWAXPY_SeqHIP;
423: V->ops->duplicate = VecDuplicate_MPIHIP;
424: V->ops->dot = VecDot_MPIHIP;
425: V->ops->mdot = VecMDot_MPIHIP;
426: V->ops->tdot = VecTDot_MPIHIP;
427: V->ops->norm = VecNorm_MPIHIP;
428: V->ops->scale = VecScale_SeqHIP;
429: V->ops->copy = VecCopy_SeqHIP;
430: V->ops->set = VecSet_SeqHIP;
431: V->ops->swap = VecSwap_SeqHIP;
432: V->ops->axpy = VecAXPY_SeqHIP;
433: V->ops->axpby = VecAXPBY_SeqHIP;
434: V->ops->maxpy = VecMAXPY_SeqHIP;
435: V->ops->aypx = VecAYPX_SeqHIP;
436: V->ops->axpbypcz = VecAXPBYPCZ_SeqHIP;
437: V->ops->pointwisemult = VecPointwiseMult_SeqHIP;
438: V->ops->setrandom = VecSetRandom_SeqHIP;
439: V->ops->placearray = VecPlaceArray_SeqHIP;
440: V->ops->replacearray = VecReplaceArray_SeqHIP;
441: V->ops->resetarray = VecResetArray_SeqHIP;
442: V->ops->dot_local = VecDot_SeqHIP;
443: V->ops->tdot_local = VecTDot_SeqHIP;
444: V->ops->norm_local = VecNorm_SeqHIP;
445: V->ops->mdot_local = VecMDot_SeqHIP;
446: V->ops->destroy = VecDestroy_MPIHIP;
447: V->ops->pointwisedivide = VecPointwiseDivide_SeqHIP;
448: V->ops->getlocalvector = VecGetLocalVector_SeqHIP;
449: V->ops->restorelocalvector = VecRestoreLocalVector_SeqHIP;
450: V->ops->getlocalvectorread = VecGetLocalVectorRead_SeqHIP;
451: V->ops->restorelocalvectorread = VecRestoreLocalVectorRead_SeqHIP;
452: V->ops->getarraywrite = VecGetArrayWrite_SeqHIP;
453: V->ops->getarray = VecGetArray_SeqHIP;
454: V->ops->restorearray = VecRestoreArray_SeqHIP;
455: V->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqHIP;
456: V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqHIP;
457: V->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqHIP;
458: V->ops->max = VecMax_MPIHIP;
459: V->ops->min = VecMin_MPIHIP;
460: V->ops->reciprocal = VecReciprocal_SeqHIP;
461: V->ops->sum = VecSum_SeqHIP;
462: V->ops->shift = VecShift_SeqHIP;
463: }
464: return 0;
465: }
467: PetscErrorCode VecCreate_MPIHIP_Private(Vec vv, PetscBool alloc, PetscInt nghost, const PetscScalar array[])
468: {
469: Vec_HIP *vechip;
471: VecCreate_MPI_Private(vv, PETSC_FALSE, 0, 0);
472: PetscObjectChangeTypeName((PetscObject)vv, VECMPIHIP);
474: VecBindToCPU_MPIHIP(vv, PETSC_FALSE);
475: vv->ops->bindtocpu = VecBindToCPU_MPIHIP;
477: /* Later, functions check for the Vec_HIP structure existence, so do not create it without array */
478: if (alloc && !array) {
479: VecHIPAllocateCheck(vv);
480: VecHIPAllocateCheckHost(vv);
481: VecSet(vv, 0.0);
482: VecSet_Seq(vv, 0.0);
483: vv->offloadmask = PETSC_OFFLOAD_BOTH;
484: }
485: if (array) {
486: if (!vv->spptr) {
487: PetscReal pinned_memory_min;
488: PetscBool flag;
490: /* Cannot use PetscNew() here because spptr is void* */
491: PetscCalloc(sizeof(Vec_HIP), &vv->spptr);
492: vechip = (Vec_HIP *)vv->spptr;
493: vv->minimum_bytes_pinned_memory = 0;
495: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
496: Note: This same code duplicated in VecCreate_SeqHIP_Private() and VecHIPAllocateCheck(). Is there a good way to avoid this? */
497: PetscOptionsBegin(PetscObjectComm((PetscObject)vv), ((PetscObject)vv)->prefix, "VECHIP Options", "Vec");
498: pinned_memory_min = vv->minimum_bytes_pinned_memory;
499: PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &flag);
500: if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
501: PetscOptionsEnd();
502: }
503: vechip = (Vec_HIP *)vv->spptr;
504: vechip->GPUarray = (PetscScalar *)array;
505: vv->offloadmask = PETSC_OFFLOAD_GPU;
506: }
507: return 0;
508: }