Actual source code: vechip2.hip.cpp
/*
   Implements the sequential hip vectors.
*/

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/hipvecimpl.h>

#include <hip/hip_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
/* additional thrust headers for fill/transform_reduce and the iterators used below */
#include <thrust/fill.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issues, need to unroll division and multiplication with complex numbers */
struct PetscDivideComplex {
  __host__ __device__ PetscScalar operator()(const PetscScalar &lhs, const PetscScalar &rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    PetscReal n  = rx * rx + ry * ry;
    return PetscComplex((lx * rx + ly * ry) / n, (rx * ly - lx * ry) / n);
  }
};

struct PetscMultiplyComplex {
  __host__ __device__ PetscScalar operator()(const PetscScalar &lhs, const PetscScalar &rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    return PetscComplex(lx * rx - ly * ry, ly * rx + lx * ry);
  }
};
#endif
/*
   Allocates space for the vector array on the GPU if it does not exist.
   Does NOT change the PetscHIPFlag for the vector
   Does NOT zero the HIP array
*/
PetscErrorCode VecHIPAllocateCheck(Vec v)
{
  Vec_HIP  *vechip;
  PetscBool option_set;

  if (!v->spptr) {
    PetscReal pinned_memory_min;

    PetscCalloc(sizeof(Vec_HIP), &v->spptr);
    vechip = (Vec_HIP *)v->spptr;
    hipMalloc((void **)&vechip->GPUarray_allocated, sizeof(PetscScalar) * ((PetscBLASInt)v->map->n));
    vechip->GPUarray = vechip->GPUarray_allocated;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      if (v->data && ((Vec_Seq *)v->data)->array) {
        v->offloadmask = PETSC_OFFLOAD_CPU;
      } else {
        v->offloadmask = PETSC_OFFLOAD_GPU;
      }
    }
    pinned_memory_min = 0;

    /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
       Note: This same code is duplicated in VecCreate_SeqHIP_Private() and VecCreate_MPIHIP_Private(). Is there a good way to avoid this? */
    PetscOptionsBegin(PetscObjectComm((PetscObject)v), ((PetscObject)v)->prefix, "VECHIP Options", "Vec");
    PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &option_set);
    if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
    PetscOptionsEnd();
  }
  return 0;
}
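
/*
   Example (a minimal sketch; "./app" is a placeholder executable name): with the option
   registered above, a run such as

     ./app -vec_type seqhip -vec_pinned_memory_min 4096

   requests that host allocations of at least 4096 bytes for this vector use pinned
   (page-locked) memory, which allows faster host<->device transfers.
*/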
/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecHIPCopyToGPU(Vec v)
{
  Vec_HIP     *vechip;
  PetscScalar *varray;

  VecHIPAllocateCheck(v);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscLogEventBegin(VEC_HIPCopyToGPU, v, 0, 0, 0);
    vechip = (Vec_HIP *)v->spptr;
    varray = vechip->GPUarray;
    hipMemcpy(varray, ((Vec_Seq *)v->data)->array, v->map->n * sizeof(PetscScalar), hipMemcpyHostToDevice);
    PetscLogCpuToGpu((v->map->n) * sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyToGPU, v, 0, 0, 0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
/*
   VecHIPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecHIPCopyFromGPU(Vec v)
{
  Vec_HIP     *vechip;
  PetscScalar *varray;

  VecHIPAllocateCheckHost(v);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(VEC_HIPCopyFromGPU, v, 0, 0, 0);
    vechip = (Vec_HIP *)v->spptr;
    varray = vechip->GPUarray;
    hipMemcpy(((Vec_Seq *)v->data)->array, varray, v->map->n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
    PetscLogGpuToCpu((v->map->n) * sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyFromGPU, v, 0, 0, 0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
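
/*
   Informal sketch of the offload-mask transitions performed by the two copy routines above:

     PETSC_OFFLOAD_CPU  --VecHIPCopyToGPU()-->   PETSC_OFFLOAD_BOTH
     PETSC_OFFLOAD_GPU  --VecHIPCopyFromGPU()--> PETSC_OFFLOAD_BOTH
     any other state    --> no copy is performed
*/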
/*MC
   VECSEQHIP - VECSEQHIP = "seqhip" - The basic sequential vector, modified to use HIP

   Options Database Keys:
. -vec_type seqhip - sets the vector type to VECSEQHIP during a call to VecSetFromOptions()

   Level: beginner

.seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateSeqWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecCreateSeq()`
M*/
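
/*
   A minimal usage sketch (assumes a PETSc build configured with HIP support):

     Vec x;
     PetscCall(VecCreate(PETSC_COMM_SELF, &x));
     PetscCall(VecSetSizes(x, PETSC_DECIDE, 100));
     PetscCall(VecSetType(x, VECSEQHIP)); // or -vec_type seqhip with VecSetFromOptions()
     PetscCall(VecSet(x, 1.0));
     PetscCall(VecDestroy(&x));
*/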
PetscErrorCode VecAYPX_SeqHIP(Vec yin, PetscScalar alpha, Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt       one = 1, bn = 0;
  PetscScalar        sone = 1.0;
  hipblasHandle_t    hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n, &bn);
  VecHIPGetArrayRead(xin, &xarray);
  VecHIPGetArray(yin, &yarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    hipMemcpy(yarray, xarray, bn * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
  } else if (alpha == (PetscScalar)1.0) {
    hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, yarray, one);
    PetscLogGpuFlops(1.0 * yin->map->n);
  } else {
    hipblasXscal(hipblasv2handle, bn, &alpha, yarray, one);
    hipblasXaxpy(hipblasv2handle, bn, &sone, xarray, one, yarray, one);
    PetscLogGpuFlops(2.0 * yin->map->n);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin, &xarray);
  VecHIPRestoreArray(yin, &yarray);
  PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  return 0;
}
PetscErrorCode VecAXPY_SeqHIP(Vec yin, PetscScalar alpha, Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt       one = 1, bn = 0;
  hipblasHandle_t    hipblasv2handle;
  PetscBool          xiship;

  if (alpha == (PetscScalar)0.0) return 0;
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscObjectTypeCompareAny((PetscObject)xin, &xiship, VECSEQHIP, VECMPIHIP, "");
  if (xiship) {
    PetscBLASIntCast(yin->map->n, &bn);
    VecHIPGetArrayRead(xin, &xarray);
    VecHIPGetArray(yin, &yarray);
    PetscLogGpuTimeBegin();
    hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, yarray, one);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin, &xarray);
    VecHIPRestoreArray(yin, &yarray);
    PetscLogGpuFlops(2.0 * yin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  } else {
    VecAXPY_Seq(yin, alpha, xin);
  }
  return 0;
}
PetscErrorCode VecPointwiseDivide_SeqHIP(Vec win, Vec xin, Vec yin)
{
  PetscInt                               n = xin->map->n;
  const PetscScalar                     *xarray = NULL, *yarray = NULL;
  PetscScalar                           *warray = NULL;
  thrust::device_ptr<const PetscScalar>  xptr, yptr;
  thrust::device_ptr<PetscScalar>        wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseDivide_Seq(win, xin, yin);
    return 0;
  }
  VecHIPGetArrayWrite(win, &warray);
  VecHIPGetArrayRead(xin, &xarray);
  VecHIPGetArrayRead(yin, &yarray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr, xptr + n, yptr, wptr, PetscDivideComplex());
#else
    thrust::transform(xptr, xptr + n, yptr, wptr, thrust::divides<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(n);
  VecHIPRestoreArrayRead(xin, &xarray);
  VecHIPRestoreArrayRead(yin, &yarray);
  VecHIPRestoreArrayWrite(win, &warray);
  return 0;
}
PetscErrorCode VecWAXPY_SeqHIP(Vec win, PetscScalar alpha, Vec xin, Vec yin)
{
  const PetscScalar *xarray = NULL, *yarray = NULL;
  PetscScalar       *warray = NULL;
  PetscBLASInt       one = 1, bn = 0;
  hipblasHandle_t    hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(win->map->n, &bn);
  if (alpha == (PetscScalar)0.0) {
    VecCopy_SeqHIP(yin, win);
  } else {
    VecHIPGetArrayRead(xin, &xarray);
    VecHIPGetArrayRead(yin, &yarray);
    VecHIPGetArrayWrite(win, &warray);
    PetscLogGpuTimeBegin();
    hipMemcpy(warray, yarray, win->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
    hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, warray, one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(2 * win->map->n);
    VecHIPRestoreArrayRead(xin, &xarray);
    VecHIPRestoreArrayRead(yin, &yarray);
    VecHIPRestoreArrayWrite(win, &warray);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  return 0;
}
PetscErrorCode VecMAXPY_SeqHIP(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
{
  PetscInt           n = xin->map->n, j;
  PetscScalar       *xarray;
  const PetscScalar *yarray;
  PetscBLASInt       one = 1, bn = 0;
  hipblasHandle_t    hipblasv2handle;

  PetscLogGpuFlops(nv * 2.0 * n);
  PetscLogCpuToGpuScalar(nv * sizeof(PetscScalar));
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n, &bn);
  VecHIPGetArray(xin, &xarray);
  PetscLogGpuTimeBegin();
  for (j = 0; j < nv; j++) {
    VecHIPGetArrayRead(y[j], &yarray);
    hipblasXaxpy(hipblasv2handle, bn, alpha + j, yarray, one, xarray, one);
    VecHIPRestoreArrayRead(y[j], &yarray);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin, &xarray);
  return 0;
}
PetscErrorCode VecDot_SeqHIP(Vec xin, Vec yin, PetscScalar *z)
{
  const PetscScalar *xarray, *yarray;
  PetscBLASInt       one = 1, bn = 0;
  hipblasHandle_t    hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n, &bn);
  VecHIPGetArrayRead(xin, &xarray);
  VecHIPGetArrayRead(yin, &yarray);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
  PetscLogGpuTimeBegin();
  hipblasXdot(hipblasv2handle, bn, yarray, one, xarray, one, z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(xin, &xarray);
  VecHIPRestoreArrayRead(yin, &yarray);
  return 0;
}
//
// HIP kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128
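
// Layout of the scratch buffer used by the kernels below: the partial result of work group
// blockIdx.x for the m-th y vector is stored at
//   group_results[m * gridDim.x + blockIdx.x]
// so each dot product owns a contiguous block of gridDim.x (== MDOT_WORKGROUP_NUM) entries,
// which VecMDot_SeqHIP() then sums on the host.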
#if !defined(PETSC_USE_COMPLEX)
// M = 2:
__global__ void VecMDot_SeqHIP_kernel2(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3:
__global__ void VecMDot_SeqHIP_kernel3(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]                 = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqHIP_kernel4(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]                 = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqHIP_kernel8(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, const PetscScalar *y4, const PetscScalar *y5, const PetscScalar *y6, const PetscScalar *y7, PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8 * MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]                 = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#endif /* !defined(PETSC_USE_COMPLEX) */
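
// Note that the kernels above accumulate entry_x * y[i] without complex conjugation, so they
// are valid only for real scalars; complex builds instead issue one hipblasXdot() per y
// vector inside VecMDot_SeqHIP() below.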
PetscErrorCode VecMDot_SeqHIP(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscInt           i, n = xin->map->n, current_y_index = 0;
  const PetscScalar *xptr, *y0ptr, *y1ptr, *y2ptr, *y3ptr, *y4ptr, *y5ptr, *y6ptr, *y7ptr;
#if !defined(PETSC_USE_COMPLEX)
  /* when nv % 4 == 1 the final dot product is computed directly with hipblasXdot() (case 1 below),
     so it needs no scratch space; nv1 counts only the kernel-computed dot products */
  PetscInt     nv1 = ((nv % 4) == 1) ? nv - 1 : nv, j;
  PetscScalar *group_results_gpu, *group_results_cpu;
#endif
  PetscBLASInt    one = 1, bn = 0;
  hipblasHandle_t hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n, &bn);

  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i = 0; i < nv; ++i) z[i] = 0;
    return 0;
  }

#if !defined(PETSC_USE_COMPLEX)
  // allocate scratchpad memory for the results of individual work groups:
  hipMalloc((void **)&group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
  PetscMalloc1(nv1 * MDOT_WORKGROUP_NUM, &group_results_cpu);
#endif
  VecHIPGetArrayRead(xin, &xptr);
  PetscLogGpuTimeBegin();

  while (current_y_index < nv) {
    switch (nv - current_y_index) {
    case 7:
    case 6:
    case 5:
    case 4:
      VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
      VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
      VecHIPGetArrayRead(yin[current_y_index + 2], &y2ptr);
      VecHIPGetArrayRead(yin[current_y_index + 3], &y3ptr);
#if defined(PETSC_USE_COMPLEX)
      hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
      hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
      hipblasXdot(hipblasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
      hipblasXdot(hipblasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
#else
      hipLaunchKernelGGL(VecMDot_SeqHIP_kernel4, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, y2ptr, y3ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
#endif
      VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 2], &y2ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 3], &y3ptr);
      current_y_index += 4;
      break;

    case 3:
      VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
      VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
      VecHIPGetArrayRead(yin[current_y_index + 2], &y2ptr);

#if defined(PETSC_USE_COMPLEX)
      hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
      hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
      hipblasXdot(hipblasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
#else
      hipLaunchKernelGGL(VecMDot_SeqHIP_kernel3, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, y2ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
#endif
      VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 2], &y2ptr);
      current_y_index += 3;
      break;

    case 2:
      VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
      VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
#if defined(PETSC_USE_COMPLEX)
      hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
      hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
#else
      hipLaunchKernelGGL(VecMDot_SeqHIP_kernel2, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
#endif
      VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
      current_y_index += 2;
      break;

    case 1:
      VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
      hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
      VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
      current_y_index += 1;
      break;

    default: // 8 or more vectors left
      VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
      VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
      VecHIPGetArrayRead(yin[current_y_index + 2], &y2ptr);
      VecHIPGetArrayRead(yin[current_y_index + 3], &y3ptr);
      VecHIPGetArrayRead(yin[current_y_index + 4], &y4ptr);
      VecHIPGetArrayRead(yin[current_y_index + 5], &y5ptr);
      VecHIPGetArrayRead(yin[current_y_index + 6], &y6ptr);
      VecHIPGetArrayRead(yin[current_y_index + 7], &y7ptr);
#if defined(PETSC_USE_COMPLEX)
      hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
      hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
      hipblasXdot(hipblasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
      hipblasXdot(hipblasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
      hipblasXdot(hipblasv2handle, bn, y4ptr, one, xptr, one, &z[current_y_index + 4]);
      hipblasXdot(hipblasv2handle, bn, y5ptr, one, xptr, one, &z[current_y_index + 5]);
      hipblasXdot(hipblasv2handle, bn, y6ptr, one, xptr, one, &z[current_y_index + 6]);
      hipblasXdot(hipblasv2handle, bn, y7ptr, one, xptr, one, &z[current_y_index + 7]);
#else
      hipLaunchKernelGGL(VecMDot_SeqHIP_kernel8, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, y2ptr, y3ptr, y4ptr, y5ptr, y6ptr, y7ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
#endif
      VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 2], &y2ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 3], &y3ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 4], &y4ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 5], &y5ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 6], &y6ptr);
      VecHIPRestoreArrayRead(yin[current_y_index + 7], &y7ptr);
      current_y_index += 8;
      break;
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin, &xptr);

#if defined(PETSC_USE_COMPLEX)
  PetscLogGpuToCpu(nv * sizeof(PetscScalar));
#else
  // copy results to CPU
  hipMemcpy(group_results_cpu, group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM, hipMemcpyDeviceToHost);

  // sum group results into z
  for (j = 0; j < nv1; ++j) {
    z[j] = 0;
    for (i = j * MDOT_WORKGROUP_NUM; i < (j + 1) * MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
  }
  PetscLogFlops(nv1 * MDOT_WORKGROUP_NUM);
  hipFree(group_results_gpu);
  PetscFree(group_results_cpu);
  PetscLogGpuToCpu(nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
#endif
  PetscLogGpuFlops(PetscMax(nv * (2.0 * n - 1), 0.0));
  return 0;
}
#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM
PetscErrorCode VecSet_SeqHIP(Vec xin, PetscScalar alpha)
{
  PetscInt                        n = xin->map->n;
  PetscScalar                    *xarray = NULL;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArrayWrite(xin, &xarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    hipMemset(xarray, 0, n * sizeof(PetscScalar));
  } else {
    try {
      xptr = thrust::device_pointer_cast(xarray);
      thrust::fill(xptr, xptr + n, alpha);
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayWrite(xin, &xarray);
  return 0;
}
struct PetscScalarReciprocal {
  __host__ __device__ PetscScalar operator()(const PetscScalar &s)
  {
#if defined(PETSC_USE_COMPLEX)
    /* SPOCK compilation issue, need to unroll division */
    PetscReal sx = PetscRealPart(s);
    PetscReal sy = PetscImaginaryPart(s);
    PetscReal n  = sx * sx + sy * sy;
    return n != 0.0 ? PetscComplex(sx / n, -sy / n) : 0.0;
#else
    return (s != (PetscScalar)0.0) ? (PetscScalar)1.0 / s : 0.0;
#endif
  }
};
PetscErrorCode VecReciprocal_SeqHIP(Vec v)
{
  PetscInt     n;
  PetscScalar *x;

  VecGetLocalSize(v, &n);
  VecHIPGetArray(v, &x);
  PetscLogGpuTimeBegin();
  try {
    auto xptr = thrust::device_pointer_cast(x);
    thrust::transform(xptr, xptr + n, xptr, PetscScalarReciprocal());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v, &x);
  return 0;
}
PetscErrorCode VecScale_SeqHIP(Vec xin, PetscScalar alpha)
{
  PetscScalar    *xarray;
  PetscBLASInt    one = 1, bn = 0;
  hipblasHandle_t hipblasv2handle;

  if (alpha == (PetscScalar)0.0) {
    VecSet_SeqHIP(xin, alpha);
  } else if (alpha != (PetscScalar)1.0) {
    PetscHIPBLASGetHandle(&hipblasv2handle);
    PetscBLASIntCast(xin->map->n, &bn);
    VecHIPGetArray(xin, &xarray);
    PetscLogGpuTimeBegin();
    hipblasXscal(hipblasv2handle, bn, &alpha, xarray, one);
    VecHIPRestoreArray(xin, &xarray);
    PetscLogGpuTimeEnd();
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    PetscLogGpuFlops(xin->map->n);
  }
  return 0;
}
PetscErrorCode VecTDot_SeqHIP(Vec xin, Vec yin, PetscScalar *z)
{
  const PetscScalar *xarray, *yarray;
  PetscBLASInt       one = 1, bn = 0;
  hipblasHandle_t    hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n, &bn);
  VecHIPGetArrayRead(xin, &xarray);
  VecHIPGetArrayRead(yin, &yarray);
  PetscLogGpuTimeBegin();
  hipblasXdotu(hipblasv2handle, bn, xarray, one, yarray, one, z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(yin, &yarray);
  VecHIPRestoreArrayRead(xin, &xarray);
  return 0;
}
PetscErrorCode VecCopy_SeqHIP(Vec xin, Vec yin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;

  if (xin != yin) {
    if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
      PetscBool yiship;

      PetscObjectTypeCompareAny((PetscObject)yin, &yiship, VECSEQHIP, VECMPIHIP, "");
      VecHIPGetArrayRead(xin, &xarray);
      if (yiship) {
        VecHIPGetArrayWrite(yin, &yarray);
      } else {
        VecGetArrayWrite(yin, &yarray);
      }
      PetscLogGpuTimeBegin();
      if (yiship) {
        hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
      } else {
        hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
      }
      PetscLogGpuTimeEnd();
      VecHIPRestoreArrayRead(xin, &xarray);
      if (yiship) {
        VecHIPRestoreArrayWrite(yin, &yarray);
      } else {
        VecRestoreArrayWrite(yin, &yarray);
      }
    } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
      /* copy in CPU if we are on the CPU */
      VecCopy_SeqHIP_Private(xin, yin);
    } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
        /* copy in CPU */
        VecCopy_SeqHIP_Private(xin, yin);
      } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
        /* copy in GPU */
        VecHIPGetArrayRead(xin, &xarray);
        VecHIPGetArrayWrite(yin, &yarray);
        PetscLogGpuTimeBegin();
        hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin, &xarray);
        VecHIPRestoreArrayWrite(yin, &yarray);
      } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to copy in GPU (this is an arbitrary choice) */
        VecHIPGetArrayRead(xin, &xarray);
        VecHIPGetArrayWrite(yin, &yarray);
        PetscLogGpuTimeBegin();
        hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin, &xarray);
        VecHIPRestoreArrayWrite(yin, &yarray);
      } else {
        VecCopy_SeqHIP_Private(xin, yin);
      }
    }
  }
  return 0;
}
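
/*
   Summary of the copy paths taken above (x = source offload state, y = destination):

     x on GPU                    -> device-to-device copy if y is a HIP vector, else device-to-host
     x on CPU                    -> host-side copy via VecCopy_SeqHIP_Private()
     x on BOTH, y on CPU         -> host-side copy
     x on BOTH, y on GPU or BOTH -> device-to-device copy
*/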
PetscErrorCode VecSwap_SeqHIP(Vec xin, Vec yin)
{
  PetscBLASInt    one = 1, bn;
  PetscScalar    *xarray, *yarray;
  hipblasHandle_t hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n, &bn);
  if (xin != yin) {
    VecHIPGetArray(xin, &xarray);
    VecHIPGetArray(yin, &yarray);
    PetscLogGpuTimeBegin();
    hipblasXswap(hipblasv2handle, bn, xarray, one, yarray, one);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArray(xin, &xarray);
    VecHIPRestoreArray(yin, &yarray);
  }
  return 0;
}
PetscErrorCode VecAXPBY_SeqHIP(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
{
  PetscScalar        a = alpha, b = beta;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt       one = 1, bn = 0;
  hipblasHandle_t    hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n, &bn);
  if (a == (PetscScalar)0.0) {
    VecScale_SeqHIP(yin, beta);
  } else if (b == (PetscScalar)1.0) {
    VecAXPY_SeqHIP(yin, alpha, xin);
  } else if (a == (PetscScalar)1.0) {
    VecAYPX_SeqHIP(yin, beta, xin);
  } else if (b == (PetscScalar)0.0) {
    VecHIPGetArrayRead(xin, &xarray);
    VecHIPGetArray(yin, &yarray);
    PetscLogGpuTimeBegin();
    hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
    hipblasXscal(hipblasv2handle, bn, &alpha, yarray, one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(xin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    VecHIPRestoreArrayRead(xin, &xarray);
    VecHIPRestoreArray(yin, &yarray);
  } else {
    VecHIPGetArrayRead(xin, &xarray);
    VecHIPGetArray(yin, &yarray);
    PetscLogGpuTimeBegin();
    hipblasXscal(hipblasv2handle, bn, &beta, yarray, one);
    hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, yarray, one);
    VecHIPRestoreArrayRead(xin, &xarray);
    VecHIPRestoreArray(yin, &yarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(3.0 * xin->map->n);
    PetscLogCpuToGpuScalar(2 * sizeof(PetscScalar));
  }
  return 0;
}
PetscErrorCode VecAXPBYPCZ_SeqHIP(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
{
  PetscInt n = zin->map->n;

  if (gamma == (PetscScalar)1.0) {
    /* z = a*x + b*y + z */
    VecAXPY_SeqHIP(zin, alpha, xin);
    VecAXPY_SeqHIP(zin, beta, yin);
    PetscLogGpuFlops(4.0 * n);
  } else {
    /* z = a*x + b*y + c*z */
    VecScale_SeqHIP(zin, gamma);
    VecAXPY_SeqHIP(zin, alpha, xin);
    VecAXPY_SeqHIP(zin, beta, yin);
    PetscLogGpuFlops(5.0 * n);
  }
  return 0;
}
PetscErrorCode VecPointwiseMult_SeqHIP(Vec win, Vec xin, Vec yin)
{
  PetscInt                               n = win->map->n;
  const PetscScalar                     *xarray, *yarray;
  PetscScalar                           *warray;
  thrust::device_ptr<const PetscScalar>  xptr, yptr;
  thrust::device_ptr<PetscScalar>        wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseMult_Seq(win, xin, yin);
    return 0;
  }
  VecHIPGetArrayRead(xin, &xarray);
  VecHIPGetArrayRead(yin, &yarray);
  VecHIPGetArrayWrite(win, &warray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr, xptr + n, yptr, wptr, PetscMultiplyComplex());
#else
    thrust::transform(xptr, xptr + n, yptr, wptr, thrust::multiplies<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin, &xarray);
  VecHIPRestoreArrayRead(yin, &yarray);
  VecHIPRestoreArrayWrite(win, &warray);
  PetscLogGpuFlops(n);
  return 0;
}
/* should do infinity norm in hip */

PetscErrorCode VecNorm_SeqHIP(Vec xin, NormType type, PetscReal *z)
{
  PetscInt           n = xin->map->n;
  PetscBLASInt       one = 1, bn = 0;
  const PetscScalar *xarray;
  hipblasHandle_t    hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n, &bn);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecHIPGetArrayRead(xin, &xarray);
    PetscLogGpuTimeBegin();
    hipblasXnrm2(hipblasv2handle, bn, xarray, one, z);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin, &xarray);
    PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0));
  } else if (type == NORM_INFINITY) {
    int i;
    VecHIPGetArrayRead(xin, &xarray);
    PetscLogGpuTimeBegin();
    hipblasIXamax(hipblasv2handle, bn, xarray, one, &i);
    PetscLogGpuTimeEnd();
    if (bn) {
      PetscScalar zs;
      /* amax returns a 1-based index, following the BLAS convention */
      hipMemcpy(&zs, xarray + i - 1, sizeof(PetscScalar), hipMemcpyDeviceToHost);
      *z = PetscAbsScalar(zs);
    } else *z = 0.0;
    VecHIPRestoreArrayRead(xin, &xarray);
  } else if (type == NORM_1) {
    VecHIPGetArrayRead(xin, &xarray);
    PetscLogGpuTimeBegin();
    hipblasXasum(hipblasv2handle, bn, xarray, one, z);
    VecHIPRestoreArrayRead(xin, &xarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(PetscMax(n - 1.0, 0.0));
  } else if (type == NORM_1_AND_2) {
    VecNorm_SeqHIP(xin, NORM_1, z);
    VecNorm_SeqHIP(xin, NORM_2, z + 1);
  }
  PetscLogGpuToCpu(sizeof(PetscReal));
  return 0;
}
PetscErrorCode VecDotNorm2_SeqHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  VecDot_SeqHIP(s, t, dp);
  VecDot_SeqHIP(t, t, nm);
  return 0;
}
PetscErrorCode VecDestroy_SeqHIP(Vec v)
{
  if (v->spptr) {
    if (((Vec_HIP *)v->spptr)->GPUarray_allocated) {
      hipFree(((Vec_HIP *)v->spptr)->GPUarray_allocated);
      ((Vec_HIP *)v->spptr)->GPUarray_allocated = NULL;
    }
    if (((Vec_HIP *)v->spptr)->stream) hipStreamDestroy(((Vec_HIP *)v->spptr)->stream);
  }
  VecDestroy_SeqHIP_Private(v);
  PetscFree(v->spptr);
  return 0;
}
#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issue, need to do conjugation ourselves */
struct conjugate {
  __host__ __device__ PetscScalar operator()(const PetscScalar &x) { return PetscScalar(PetscRealPart(x), -PetscImaginaryPart(x)); }
};
#endif
PetscErrorCode VecConjugate_SeqHIP(Vec xin)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar                    *xarray;
  PetscInt                        n = xin->map->n;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArray(xin, &xarray);
  PetscLogGpuTimeBegin();
  try {
    xptr = thrust::device_pointer_cast(xarray);
    thrust::transform(xptr, xptr + n, xptr, conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin, &xarray);
#endif
  return 0;
}
static inline PetscErrorCode VecGetLocalVectorK_SeqHIP(Vec v, Vec w, PetscBool read)
{
  PetscBool wisseqhip;

  PetscObjectTypeCompare((PetscObject)w, VECSEQHIP, &wisseqhip);
  if (w->data && wisseqhip) {
    if (((Vec_Seq *)w->data)->array_allocated) {
      if (w->pinned_memory) PetscMallocSetHIPHost();
      PetscFree(((Vec_Seq *)w->data)->array_allocated);
      if (w->pinned_memory) {
        PetscMallocResetHIPHost();
        w->pinned_memory = PETSC_FALSE;
      }
    }
    ((Vec_Seq *)w->data)->array         = NULL;
    ((Vec_Seq *)w->data)->unplacedarray = NULL;
  }
  if (w->spptr && wisseqhip) {
    if (((Vec_HIP *)w->spptr)->GPUarray) {
      hipFree(((Vec_HIP *)w->spptr)->GPUarray);
      ((Vec_HIP *)w->spptr)->GPUarray = NULL;
    }
    if (((Vec_HIP *)w->spptr)->stream) hipStreamDestroy(((Vec_HIP *)w->spptr)->stream);
    PetscFree(w->spptr);
  }

  if (v->petscnative && wisseqhip) {
    PetscFree(w->data);
    w->data          = v->data;
    w->offloadmask   = v->offloadmask;
    w->pinned_memory = v->pinned_memory;
    w->spptr         = v->spptr;
    PetscObjectStateIncrease((PetscObject)w);
  } else {
    if (read) {
      VecGetArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
    } else {
      VecGetArray(v, &((Vec_Seq *)w->data)->array);
    }
    w->offloadmask = PETSC_OFFLOAD_CPU;
    if (wisseqhip) VecHIPAllocateCheck(w);
  }
  return 0;
}
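
/* When v is petscnative and w is a seqhip vector, VecGetLocalVectorK_SeqHIP() above hands
   v's host and device representations directly to w instead of copying them;
   VecRestoreLocalVectorK_SeqHIP() below returns them to v. */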
static inline PetscErrorCode VecRestoreLocalVectorK_SeqHIP(Vec v, Vec w, PetscBool read)
{
  PetscBool wisseqhip;

  PetscObjectTypeCompare((PetscObject)w, VECSEQHIP, &wisseqhip);
  if (v->petscnative && wisseqhip) {
    v->data          = w->data;
    v->offloadmask   = w->offloadmask;
    v->pinned_memory = w->pinned_memory;
    v->spptr         = w->spptr;
    w->data          = 0;
    w->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
    w->spptr         = 0;
  } else {
    if (read) {
      VecRestoreArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
    } else {
      VecRestoreArray(v, &((Vec_Seq *)w->data)->array);
    }
    if (w->spptr && wisseqhip) {
      hipFree(((Vec_HIP *)w->spptr)->GPUarray);
      ((Vec_HIP *)w->spptr)->GPUarray = NULL;
      if (((Vec_HIP *)w->spptr)->stream) hipStreamDestroy(((Vec_HIP *)w->spptr)->stream);
      PetscFree(w->spptr);
    }
  }
  return 0;
}
PetscErrorCode VecGetLocalVector_SeqHIP(Vec v, Vec w)
{
  VecGetLocalVectorK_SeqHIP(v, w, PETSC_FALSE);
  return 0;
}

PetscErrorCode VecGetLocalVectorRead_SeqHIP(Vec v, Vec w)
{
  VecGetLocalVectorK_SeqHIP(v, w, PETSC_TRUE);
  return 0;
}

PetscErrorCode VecRestoreLocalVector_SeqHIP(Vec v, Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v, w, PETSC_FALSE);
  return 0;
}

PetscErrorCode VecRestoreLocalVectorRead_SeqHIP(Vec v, Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v, w, PETSC_TRUE);
  return 0;
}
struct petscrealpart : public thrust::unary_function<PetscScalar, PetscReal> {
  __host__ __device__ PetscReal operator()(const PetscScalar &x) { return PetscRealPart(x); }
};

struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
  __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt> &x) { return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>()); }
};

struct petscmax : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
  __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? y : x; }
};

struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
  __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
  {
    return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};

struct petscmin : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
  __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? x : y; }
};

struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
  __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
  {
    return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};
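
/* The indexed reductions petscmaxi/petscmini operate on (value, index) tuples; when two
   values compare equal, the tuple with the smaller index wins, so VecMax_SeqHIP() and
   VecMin_SeqHIP() below report the first occurrence of the extremum. */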
PetscErrorCode VecMax_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                               n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar>  avpt;

  if (!n) {
    *m = PETSC_MIN_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecHIPGetArrayRead(v, &av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal, PetscInt> res(PETSC_MIN_REAL, -1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit, zibit + n, petscrealparti(), res, petscmaxi());
#else
      res = thrust::reduce(zibit, zibit + n, res, petscmaxi());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt, avpt + n, petscrealpart(), PETSC_MIN_REAL, petscmax());
#else
      *m = thrust::reduce(avpt, avpt + n, PETSC_MIN_REAL, petscmax());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v, &av);
  return 0;
}
PetscErrorCode VecMin_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                               n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar>  avpt;

  if (!n) {
    *m = PETSC_MAX_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecHIPGetArrayRead(v, &av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal, PetscInt> res(PETSC_MAX_REAL, -1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit, zibit + n, petscrealparti(), res, petscmini());
#else
      res = thrust::reduce(zibit, zibit + n, res, petscmini());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt, avpt + n, petscrealpart(), PETSC_MAX_REAL, petscmin());
#else
      *m = thrust::reduce(avpt, avpt + n, PETSC_MAX_REAL, petscmin());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v, &av);
  return 0;
}
PetscErrorCode VecSum_SeqHIP(Vec v, PetscScalar *sum)
{
  PetscInt                               n = v->map->n;
  const PetscScalar                     *a;
  thrust::device_ptr<const PetscScalar>  dptr;

  VecHIPGetArrayRead(v, &a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    *sum = thrust::reduce(dptr, dptr + n, PetscScalar(0.0));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v, &a);
  return 0;
}
struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift) { }
  __host__ __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
};
PetscErrorCode VecShift_SeqHIP(Vec v, PetscScalar shift)
{
  PetscInt                        n = v->map->n;
  PetscScalar                    *a;
  thrust::device_ptr<PetscScalar> dptr;

  VecHIPGetArray(v, &a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    thrust::transform(dptr, dptr + n, dptr, petscshift(shift)); /* in-place transform */
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v, &a);
  return 0;
}