Actual source code: vechip2.hip.cpp

  1: /*
  2:    Implements the sequential hip vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/hipvecimpl.h>

 12: #include <hip/hip_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>
 16: #include <thrust/reduce.h>

 18: #if defined(PETSC_USE_COMPLEX)
 19: /* Workaround for SPOCK compilation issues: unroll complex division and multiplication by hand */
 20: struct PetscDivideComplex {
 21:   __host__ __device__ PetscScalar operator()(const PetscScalar &lhs, const PetscScalar &rhs)
 22:   {
 23:     PetscReal lx = PetscRealPart(lhs);
 24:     PetscReal ly = PetscImaginaryPart(lhs);
 25:     PetscReal rx = PetscRealPart(rhs);
 26:     PetscReal ry = PetscImaginaryPart(rhs);
 27:     PetscReal n  = rx * rx + ry * ry;
 28:     return PetscComplex((lx * rx + ly * ry) / n, (rx * ly - lx * ry) / n);
 29:   }
 30: };

 32: struct PetscMultiplyComplex {
 33:   __host__ __device__ PetscScalar operator()(const PetscScalar &lhs, const PetscScalar &rhs)
 34:   {
 35:     PetscReal lx = PetscRealPart(lhs);
 36:     PetscReal ly = PetscImaginaryPart(lhs);
 37:     PetscReal rx = PetscRealPart(rhs);
 38:     PetscReal ry = PetscImaginaryPart(rhs);
 39:     return PetscComplex(lx * rx - ly * ry, ly * rx + lx * ry);
 40:   }
 41: };
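/* These unroll the standard identities, with n = rx*rx + ry*ry = |rhs|^2:
     lhs / rhs = lhs * conj(rhs) / |rhs|^2 = ((lx*rx + ly*ry) + i*(rx*ly - lx*ry)) / n
     lhs * rhs = (lx*rx - ly*ry) + i*(ly*rx + lx*ry)                                   */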
 42: #endif

 44: /*
 45:     Allocates space for the vector array on the GPU if it does not already exist.
 46:     Does NOT change the vector's offloadmask, except to initialize it when it was PETSC_OFFLOAD_UNALLOCATED.
 47:     Does NOT zero the HIP array.
 49:  */
 50: PetscErrorCode VecHIPAllocateCheck(Vec v)
 51: {
 52:   Vec_HIP  *vechip;
 53:   PetscBool option_set;

 55:   if (!v->spptr) {
 56:     PetscReal pinned_memory_min;

 58:     PetscCalloc(sizeof(Vec_HIP), &v->spptr);
 59:     vechip = (Vec_HIP *)v->spptr;
 60:     hipMalloc((void **)&vechip->GPUarray_allocated, sizeof(PetscScalar) * ((PetscBLASInt)v->map->n));
 61:     vechip->GPUarray = vechip->GPUarray_allocated;
 62:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 63:       if (v->data && ((Vec_Seq *)v->data)->array) {
 64:         v->offloadmask = PETSC_OFFLOAD_CPU;
 65:       } else {
 66:         v->offloadmask = PETSC_OFFLOAD_GPU;
 67:       }
 68:     }
 69:     pinned_memory_min = 0;

 71:     /* Parse the command line here for the minimum size at which host allocations use pinned memory.
 72:        Note: this same code is duplicated in VecCreate_SeqHIP_Private() and VecCreate_MPIHIP_Private(). Is there a good way to avoid this? */
 73:     PetscOptionsBegin(PetscObjectComm((PetscObject)v), ((PetscObject)v)->prefix, "VECHIP Options", "Vec");
 74:     PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &option_set);
 75:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 76:     PetscOptionsEnd();
 77:   }
 78:   return 0;
 79: }
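/* For example, pinned host memory can be requested for every vector of at
   least one megabyte by passing the option parsed above on the command line
   (program name illustrative):

     ./app -vec_pinned_memory_min 1048576                                     */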

 81: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 82: PetscErrorCode VecHIPCopyToGPU(Vec v)
 83: {
 84:   Vec_HIP     *vechip;
 85:   PetscScalar *varray;

 88:   VecHIPAllocateCheck(v);
 89:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 90:     PetscLogEventBegin(VEC_HIPCopyToGPU, v, 0, 0, 0);
 91:     vechip = (Vec_HIP *)v->spptr;
 92:     varray = vechip->GPUarray;
 93:     hipMemcpy(varray, ((Vec_Seq *)v->data)->array, v->map->n * sizeof(PetscScalar), hipMemcpyHostToDevice);
 94:     PetscLogCpuToGpu((v->map->n) * sizeof(PetscScalar));
 95:     PetscLogEventEnd(VEC_HIPCopyToGPU, v, 0, 0, 0);
 96:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 97:   }
 98:   return 0;
 99: }

101: /*
102:      VecHIPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
103: */
104: PetscErrorCode VecHIPCopyFromGPU(Vec v)
105: {
106:   Vec_HIP     *vechip;
107:   PetscScalar *varray;

110:   VecHIPAllocateCheckHost(v);
111:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
112:     PetscLogEventBegin(VEC_HIPCopyFromGPU, v, 0, 0, 0);
113:     vechip = (Vec_HIP *)v->spptr;
114:     varray = vechip->GPUarray;
115:     hipMemcpy(((Vec_Seq *)v->data)->array, varray, v->map->n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
116:     PetscLogGpuToCpu((v->map->n) * sizeof(PetscScalar));
117:     PetscLogEventEnd(VEC_HIPCopyFromGPU, v, 0, 0, 0);
118:     v->offloadmask = PETSC_OFFLOAD_BOTH;
119:   }
120:   return 0;
121: }
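/* Offload-state bookkeeping performed by the two copy routines above:

     PETSC_OFFLOAD_CPU  --VecHIPCopyToGPU()-->   PETSC_OFFLOAD_BOTH
     PETSC_OFFLOAD_GPU  --VecHIPCopyFromGPU()--> PETSC_OFFLOAD_BOTH

   For any other mask the destination side already holds current data and the
   copy is skipped. */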

123: /*MC
124:    VECSEQHIP - VECSEQHIP = "seqhip" - The basic sequential vector, modified to use HIP

126:    Options Database Keys:
127: . -vec_type seqhip - sets the vector type to VECSEQHIP during a call to VecSetFromOptions()

129:   Level: beginner

131: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateSeqWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecCreateSeq()`
132: M*/
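/* A minimal usage sketch, illustrative only (the helper name below is
   hypothetical, and error checking is omitted as elsewhere in this file): */
#if 0
static PetscErrorCode ExampleCreateSeqHIP(PetscInt n, Vec *v)
{
  VecCreate(PETSC_COMM_SELF, v);
  VecSetSizes(*v, n, n);
  VecSetType(*v, VECSEQHIP); /* or VecSetFromOptions(*v) together with -vec_type seqhip */
  VecSet(*v, 1.0);           /* dispatches to VecSet_SeqHIP() below */
  return 0;
}
#endif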

134: PetscErrorCode VecAYPX_SeqHIP(Vec yin, PetscScalar alpha, Vec xin)
135: {
136:   const PetscScalar *xarray;
137:   PetscScalar       *yarray;
138:   PetscBLASInt       one = 1, bn = 0;
139:   PetscScalar        sone = 1.0;
140:   hipblasHandle_t    hipblasv2handle;

142:   PetscHIPBLASGetHandle(&hipblasv2handle);
143:   PetscBLASIntCast(yin->map->n, &bn);
144:   VecHIPGetArrayRead(xin, &xarray);
145:   VecHIPGetArray(yin, &yarray);
146:   PetscLogGpuTimeBegin();
147:   if (alpha == (PetscScalar)0.0) {
148:     hipMemcpy(yarray, xarray, bn * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
149:   } else if (alpha == (PetscScalar)1.0) {
150:     hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, yarray, one);
151:     PetscLogGpuFlops(1.0 * yin->map->n);
152:   } else {
153:     hipblasXscal(hipblasv2handle, bn, &alpha, yarray, one);
154:     hipblasXaxpy(hipblasv2handle, bn, &sone, xarray, one, yarray, one);
155:     PetscLogGpuFlops(2.0 * yin->map->n);
156:   }
157:   PetscLogGpuTimeEnd();
158:   VecHIPRestoreArrayRead(xin, &xarray);
159:   VecHIPRestoreArray(yin, &yarray);
160:   PetscLogCpuToGpuScalar(sizeof(PetscScalar));
161:   return 0;
162: }

164: PetscErrorCode VecAXPY_SeqHIP(Vec yin, PetscScalar alpha, Vec xin)
165: {
166:   const PetscScalar *xarray;
167:   PetscScalar       *yarray;
168:   PetscBLASInt       one = 1, bn = 0;
169:   hipblasHandle_t    hipblasv2handle;
170:   PetscBool          xiship;

172:   if (alpha == (PetscScalar)0.0) return 0;
173:   PetscHIPBLASGetHandle(&hipblasv2handle);
174:   PetscObjectTypeCompareAny((PetscObject)xin, &xiship, VECSEQHIP, VECMPIHIP, "");
175:   if (xiship) {
176:     PetscBLASIntCast(yin->map->n, &bn);
177:     VecHIPGetArrayRead(xin, &xarray);
178:     VecHIPGetArray(yin, &yarray);
179:     PetscLogGpuTimeBegin();
180:     hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, yarray, one);
181:     PetscLogGpuTimeEnd();
182:     VecHIPRestoreArrayRead(xin, &xarray);
183:     VecHIPRestoreArray(yin, &yarray);
184:     PetscLogGpuFlops(2.0 * yin->map->n);
185:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
186:   } else {
187:     VecAXPY_Seq(yin, alpha, xin);
188:   }
189:   return 0;
190: }

192: PetscErrorCode VecPointwiseDivide_SeqHIP(Vec win, Vec xin, Vec yin)
193: {
194:   PetscInt                              n      = xin->map->n;
195:   const PetscScalar                    *xarray = NULL, *yarray = NULL;
196:   PetscScalar                          *warray = NULL;
197:   thrust::device_ptr<const PetscScalar> xptr, yptr;
198:   thrust::device_ptr<PetscScalar>       wptr;

200:   if (xin->boundtocpu || yin->boundtocpu) {
201:     VecPointwiseDivide_Seq(win, xin, yin);
202:     return 0;
203:   }
204:   VecHIPGetArrayWrite(win, &warray);
205:   VecHIPGetArrayRead(xin, &xarray);
206:   VecHIPGetArrayRead(yin, &yarray);
207:   PetscLogGpuTimeBegin();
208:   try {
209:     wptr = thrust::device_pointer_cast(warray);
210:     xptr = thrust::device_pointer_cast(xarray);
211:     yptr = thrust::device_pointer_cast(yarray);
212: #if defined(PETSC_USE_COMPLEX)
213:     thrust::transform(xptr, xptr + n, yptr, wptr, PetscDivideComplex());
214: #else
215:     thrust::transform(xptr, xptr + n, yptr, wptr, thrust::divides<PetscScalar>());
216: #endif
217:   } catch (char *ex) {
218:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
219:   }
220:   PetscLogGpuTimeEnd();
221:   PetscLogGpuFlops(n);
222:   VecHIPRestoreArrayRead(xin, &xarray);
223:   VecHIPRestoreArrayRead(yin, &yarray);
224:   VecHIPRestoreArrayWrite(win, &warray);
225:   return 0;
226: }

228: PetscErrorCode VecWAXPY_SeqHIP(Vec win, PetscScalar alpha, Vec xin, Vec yin)
229: {
230:   const PetscScalar *xarray = NULL, *yarray = NULL;
231:   PetscScalar       *warray = NULL;
232:   PetscBLASInt       one = 1, bn = 0;
233:   hipblasHandle_t    hipblasv2handle;

235:   PetscHIPBLASGetHandle(&hipblasv2handle);
236:   PetscBLASIntCast(win->map->n, &bn);
237:   if (alpha == (PetscScalar)0.0) {
238:     VecCopy_SeqHIP(yin, win);
239:   } else {
240:     VecHIPGetArrayRead(xin, &xarray);
241:     VecHIPGetArrayRead(yin, &yarray);
242:     VecHIPGetArrayWrite(win, &warray);
243:     PetscLogGpuTimeBegin();
244:     hipMemcpy(warray, yarray, win->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
245:     hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, warray, one);
246:     PetscLogGpuTimeEnd();
247:     PetscLogGpuFlops(2 * win->map->n);
248:     VecHIPRestoreArrayRead(xin, &xarray);
249:     VecHIPRestoreArrayRead(yin, &yarray);
250:     VecHIPRestoreArrayWrite(win, &warray);
251:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
252:   }
253:   return 0;
254: }

256: PetscErrorCode VecMAXPY_SeqHIP(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
257: {
258:   PetscInt           n = xin->map->n, j;
259:   PetscScalar       *xarray;
260:   const PetscScalar *yarray;
261:   PetscBLASInt       one = 1, bn = 0;
262:   hipblasHandle_t    hipblasv2handle;

264:   PetscLogGpuFlops(nv * 2.0 * n);
265:   PetscLogCpuToGpuScalar(nv * sizeof(PetscScalar));
266:   PetscHIPBLASGetHandle(&hipblasv2handle);
267:   PetscBLASIntCast(n, &bn);
268:   VecHIPGetArray(xin, &xarray);
269:   PetscLogGpuTimeBegin();
270:   for (j = 0; j < nv; j++) {
271:     VecHIPGetArrayRead(y[j], &yarray);
272:     hipblasXaxpy(hipblasv2handle, bn, alpha + j, yarray, one, xarray, one);
273:     VecHIPRestoreArrayRead(y[j], &yarray);
274:   }
275:   PetscLogGpuTimeEnd();
276:   VecHIPRestoreArray(xin, &xarray);
277:   return 0;
278: }

280: PetscErrorCode VecDot_SeqHIP(Vec xin, Vec yin, PetscScalar *z)
281: {
282:   const PetscScalar *xarray, *yarray;
283:   PetscBLASInt       one = 1, bn = 0;
284:   hipblasHandle_t    hipblasv2handle;

286:   PetscHIPBLASGetHandle(&hipblasv2handle);
287:   PetscBLASIntCast(yin->map->n, &bn);
288:   VecHIPGetArrayRead(xin, &xarray);
289:   VecHIPGetArrayRead(yin, &yarray);
290:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
291:   PetscLogGpuTimeBegin();
292:   hipblasXdot(hipblasv2handle, bn, yarray, one, xarray, one, z);
293:   PetscLogGpuTimeEnd();
294:   if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
295:   PetscLogGpuToCpu(sizeof(PetscScalar));
296:   VecHIPRestoreArrayRead(xin, &xarray);
297:   VecHIPRestoreArrayRead(yin, &yarray);
298:   return 0;
299: }
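/* In other words, for complex scalars the call above computes
     z = sum_i conj(y_i) * x_i,
   which is precisely PETSc's definition of VecDot(x, y). */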

301: //
302: // HIP kernels for MDot to follow
303: //

305: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
306: #define MDOT_WORKGROUP_SIZE 128
307: #define MDOT_WORKGROUP_NUM  128
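// The kernels below compute partial dot products of x against m = 2, 3, 4 or 8
// vectors y_k at once: each of the MDOT_WORKGROUP_NUM blocks reduces its slice
// of the data to one value per y_k, stored at
//   group_results[k * gridDim.x + blockIdx.x];
// the final sum over blocks is done on the host in VecMDot_SeqHIP(). The tree
// reduction within a block assumes blockDim.x is a power of two, which
// MDOT_WORKGROUP_SIZE = 128 guarantees.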

309: #if !defined(PETSC_USE_COMPLEX)
310: // M = 2:
311: __global__ void VecMDot_SeqHIP_kernel2(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, PetscInt size, PetscScalar *group_results)
312: {
313:   __shared__ PetscScalar tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
314:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
315:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
316:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
317:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

319:   PetscScalar entry_x    = 0;
320:   PetscScalar group_sum0 = 0;
321:   PetscScalar group_sum1 = 0;
322:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
323:     entry_x = x[i]; // load only once from global memory!
324:     group_sum0 += entry_x * y0[i];
325:     group_sum1 += entry_x * y1[i];
326:   }
327:   tmp_buffer[threadIdx.x]                       = group_sum0;
328:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

330:   // parallel reduction
331:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
332:     __syncthreads();
333:     if (threadIdx.x < stride) {
334:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
335:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
336:     }
337:   }

339:   // write result of group to group_results
340:   if (threadIdx.x == 0) {
341:     group_results[blockIdx.x]             = tmp_buffer[0];
342:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
343:   }
344: }

346: // M = 3:
347: __global__ void VecMDot_SeqHIP_kernel3(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, PetscInt size, PetscScalar *group_results)
348: {
349:   __shared__ PetscScalar tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
350:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
351:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
352:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
353:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

355:   PetscScalar entry_x    = 0;
356:   PetscScalar group_sum0 = 0;
357:   PetscScalar group_sum1 = 0;
358:   PetscScalar group_sum2 = 0;
359:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
360:     entry_x = x[i]; // load only once from global memory!
361:     group_sum0 += entry_x * y0[i];
362:     group_sum1 += entry_x * y1[i];
363:     group_sum2 += entry_x * y2[i];
364:   }
365:   tmp_buffer[threadIdx.x]                           = group_sum0;
366:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
367:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

369:   // parallel reduction
370:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
371:     __syncthreads();
372:     if (threadIdx.x < stride) {
373:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
374:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
375:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
376:     }
377:   }

379:   // write result of group to group_results
380:   if (threadIdx.x == 0) {
381:     group_results[blockIdx.x]                 = tmp_buffer[0];
382:     group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
383:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
384:   }
385: }

387: // M = 4:
388: __global__ void VecMDot_SeqHIP_kernel4(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, PetscInt size, PetscScalar *group_results)
389: {
390:   __shared__ PetscScalar tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
391:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
392:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
393:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
394:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

396:   PetscScalar entry_x    = 0;
397:   PetscScalar group_sum0 = 0;
398:   PetscScalar group_sum1 = 0;
399:   PetscScalar group_sum2 = 0;
400:   PetscScalar group_sum3 = 0;
401:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
402:     entry_x = x[i]; // load only once from global memory!
403:     group_sum0 += entry_x * y0[i];
404:     group_sum1 += entry_x * y1[i];
405:     group_sum2 += entry_x * y2[i];
406:     group_sum3 += entry_x * y3[i];
407:   }
408:   tmp_buffer[threadIdx.x]                           = group_sum0;
409:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
410:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
411:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

413:   // parallel reduction
414:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
415:     __syncthreads();
416:     if (threadIdx.x < stride) {
417:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
418:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
419:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
420:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
421:     }
422:   }

424:   // write result of group to group_results
425:   if (threadIdx.x == 0) {
426:     group_results[blockIdx.x]                 = tmp_buffer[0];
427:     group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
428:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
429:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
430:   }
431: }

433: // M = 8:
434: __global__ void VecMDot_SeqHIP_kernel8(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, const PetscScalar *y4, const PetscScalar *y5, const PetscScalar *y6, const PetscScalar *y7, PetscInt size, PetscScalar *group_results)
435: {
436:   __shared__ PetscScalar tmp_buffer[8 * MDOT_WORKGROUP_SIZE];
437:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
438:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
439:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
440:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

442:   PetscScalar entry_x    = 0;
443:   PetscScalar group_sum0 = 0;
444:   PetscScalar group_sum1 = 0;
445:   PetscScalar group_sum2 = 0;
446:   PetscScalar group_sum3 = 0;
447:   PetscScalar group_sum4 = 0;
448:   PetscScalar group_sum5 = 0;
449:   PetscScalar group_sum6 = 0;
450:   PetscScalar group_sum7 = 0;
451:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
452:     entry_x = x[i]; // load only once from global memory!
453:     group_sum0 += entry_x * y0[i];
454:     group_sum1 += entry_x * y1[i];
455:     group_sum2 += entry_x * y2[i];
456:     group_sum3 += entry_x * y3[i];
457:     group_sum4 += entry_x * y4[i];
458:     group_sum5 += entry_x * y5[i];
459:     group_sum6 += entry_x * y6[i];
460:     group_sum7 += entry_x * y7[i];
461:   }
462:   tmp_buffer[threadIdx.x]                           = group_sum0;
463:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
464:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
465:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
466:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
467:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
468:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
469:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

471:   // parallel reduction
472:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
473:     __syncthreads();
474:     if (threadIdx.x < stride) {
475:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
476:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
477:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
478:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
479:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 4 * MDOT_WORKGROUP_SIZE];
480:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 5 * MDOT_WORKGROUP_SIZE];
481:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 6 * MDOT_WORKGROUP_SIZE];
482:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 7 * MDOT_WORKGROUP_SIZE];
483:     }
484:   }

486:   // write result of group to group_results
487:   if (threadIdx.x == 0) {
488:     group_results[blockIdx.x]                 = tmp_buffer[0];
489:     group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
490:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
491:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
492:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
493:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
494:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
495:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
496:   }
497: }
498: #endif /* !defined(PETSC_USE_COMPLEX) */
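// In complex builds the kernels above are not compiled; VecMDot_SeqHIP() below
// falls back to one hipblasXdot() call per right-hand-side vector instead.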

500: PetscErrorCode VecMDot_SeqHIP(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
501: {
502:   PetscInt           i, n = xin->map->n, current_y_index = 0;
503:   const PetscScalar *xptr, *y0ptr, *y1ptr, *y2ptr, *y3ptr, *y4ptr, *y5ptr, *y6ptr, *y7ptr;
504: #if !defined(PETSC_USE_COMPLEX)
505:   PetscInt     nv1 = ((nv % 4) == 1) ? nv - 1 : nv, j; /* a lone trailing vector is handled by hipblasXdot directly and needs no scratch space */
506:   PetscScalar *group_results_gpu, *group_results_cpu;
507: #endif
508:   PetscBLASInt    one = 1, bn = 0;
509:   hipblasHandle_t hipblasv2handle;

511:   PetscHIPBLASGetHandle(&hipblasv2handle);
512:   PetscBLASIntCast(xin->map->n, &bn);
514:   /* Handle the case of local size zero first */
515:   if (!xin->map->n) {
516:     for (i = 0; i < nv; ++i) z[i] = 0;
517:     return 0;
518:   }

520: #if !defined(PETSC_USE_COMPLEX)
521:   // allocate scratchpad memory for the results of individual work groups:
522:   hipMalloc((void **)&group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
523:   PetscMalloc1(nv1 * MDOT_WORKGROUP_NUM, &group_results_cpu);
524: #endif
525:   VecHIPGetArrayRead(xin, &xptr);
526:   PetscLogGpuTimeBegin();

528:   while (current_y_index < nv) {
529:     switch (nv - current_y_index) {
530:     case 7:
531:     case 6:
532:     case 5:
533:     case 4:
534:       VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
535:       VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
536:       VecHIPGetArrayRead(yin[current_y_index + 2], &y2ptr);
537:       VecHIPGetArrayRead(yin[current_y_index + 3], &y3ptr);
538: #if defined(PETSC_USE_COMPLEX)
539:       hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
540:       hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
541:       hipblasXdot(hipblasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
542:       hipblasXdot(hipblasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
543: #else
544:       hipLaunchKernelGGL(VecMDot_SeqHIP_kernel4, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, y2ptr, y3ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
545: #endif
546:       VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
547:       VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
548:       VecHIPRestoreArrayRead(yin[current_y_index + 2], &y2ptr);
549:       VecHIPRestoreArrayRead(yin[current_y_index + 3], &y3ptr);
550:       current_y_index += 4;
551:       break;

553:     case 3:
554:       VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
555:       VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
556:       VecHIPGetArrayRead(yin[current_y_index + 2], &y2ptr);

558: #if defined(PETSC_USE_COMPLEX)
559:       hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
560:       hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
561:       hipblasXdot(hipblasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
562: #else
563:       hipLaunchKernelGGL(VecMDot_SeqHIP_kernel3, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, y2ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
564: #endif
565:       VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
566:       VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
567:       VecHIPRestoreArrayRead(yin[current_y_index + 2], &y2ptr);
568:       current_y_index += 3;
569:       break;

571:     case 2:
572:       VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
573:       VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
574: #if defined(PETSC_USE_COMPLEX)
575:       hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
576:       hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
577: #else
578:       hipLaunchKernelGGL(VecMDot_SeqHIP_kernel2, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
579: #endif
580:       VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
581:       VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
582:       current_y_index += 2;
583:       break;

585:     case 1:
586:       VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
587:       hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
588:       VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
589:       current_y_index += 1;
590:       break;

592:     default: // 8 or more vectors left
593:       VecHIPGetArrayRead(yin[current_y_index], &y0ptr);
594:       VecHIPGetArrayRead(yin[current_y_index + 1], &y1ptr);
595:       VecHIPGetArrayRead(yin[current_y_index + 2], &y2ptr);
596:       VecHIPGetArrayRead(yin[current_y_index + 3], &y3ptr);
597:       VecHIPGetArrayRead(yin[current_y_index + 4], &y4ptr);
598:       VecHIPGetArrayRead(yin[current_y_index + 5], &y5ptr);
599:       VecHIPGetArrayRead(yin[current_y_index + 6], &y6ptr);
600:       VecHIPGetArrayRead(yin[current_y_index + 7], &y7ptr);
601: #if defined(PETSC_USE_COMPLEX)
602:       hipblasXdot(hipblasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
603:       hipblasXdot(hipblasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
604:       hipblasXdot(hipblasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
605:       hipblasXdot(hipblasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
606:       hipblasXdot(hipblasv2handle, bn, y4ptr, one, xptr, one, &z[current_y_index + 4]);
607:       hipblasXdot(hipblasv2handle, bn, y5ptr, one, xptr, one, &z[current_y_index + 5]);
608:       hipblasXdot(hipblasv2handle, bn, y6ptr, one, xptr, one, &z[current_y_index + 6]);
609:       hipblasXdot(hipblasv2handle, bn, y7ptr, one, xptr, one, &z[current_y_index + 7]);
610: #else
611:       hipLaunchKernelGGL(VecMDot_SeqHIP_kernel8, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr, y0ptr, y1ptr, y2ptr, y3ptr, y4ptr, y5ptr, y6ptr, y7ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
612: #endif
613:       VecHIPRestoreArrayRead(yin[current_y_index], &y0ptr);
614:       VecHIPRestoreArrayRead(yin[current_y_index + 1], &y1ptr);
615:       VecHIPRestoreArrayRead(yin[current_y_index + 2], &y2ptr);
616:       VecHIPRestoreArrayRead(yin[current_y_index + 3], &y3ptr);
617:       VecHIPRestoreArrayRead(yin[current_y_index + 4], &y4ptr);
618:       VecHIPRestoreArrayRead(yin[current_y_index + 5], &y5ptr);
619:       VecHIPRestoreArrayRead(yin[current_y_index + 6], &y6ptr);
620:       VecHIPRestoreArrayRead(yin[current_y_index + 7], &y7ptr);
621:       current_y_index += 8;
622:       break;
623:     }
624:   }
625:   PetscLogGpuTimeEnd();
626:   VecHIPRestoreArrayRead(xin, &xptr);

628: #if defined(PETSC_USE_COMPLEX)
629:   PetscLogGpuToCpu(nv * sizeof(PetscScalar));
630: #else
631:   // copy results to CPU
632:   hipMemcpy(group_results_cpu, group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM, hipMemcpyDeviceToHost);

634:   // sum group results into z
635:   for (j = 0; j < nv1; ++j) {
636:     z[j] = 0;
637:     for (i = j * MDOT_WORKGROUP_NUM; i < (j + 1) * MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
638:   }
639:   PetscLogFlops(nv1 * MDOT_WORKGROUP_NUM);
640:   hipFree(group_results_gpu);
641:   PetscFree(group_results_cpu);
642:   PetscLogGpuToCpu(nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
643: #endif
644:   PetscLogGpuFlops(PetscMax(nv * (2.0 * n - 1), 0.0));
645:   return 0;
646: }

648: #undef MDOT_WORKGROUP_SIZE
649: #undef MDOT_WORKGROUP_NUM

651: PetscErrorCode VecSet_SeqHIP(Vec xin, PetscScalar alpha)
652: {
653:   PetscInt                        n      = xin->map->n;
654:   PetscScalar                    *xarray = NULL;
655:   thrust::device_ptr<PetscScalar> xptr;

657:   VecHIPGetArrayWrite(xin, &xarray);
658:   PetscLogGpuTimeBegin();
659:   if (alpha == (PetscScalar)0.0) {
660:     hipMemset(xarray, 0, n * sizeof(PetscScalar));
661:   } else {
662:     try {
663:       xptr = thrust::device_pointer_cast(xarray);
664:       thrust::fill(xptr, xptr + n, alpha);
665:     } catch (char *ex) {
666:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
667:     }
668:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
669:   }
670:   PetscLogGpuTimeEnd();
671:   VecHIPRestoreArrayWrite(xin, &xarray);
672:   return 0;
673: }
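/* Note: the hipMemset() fast path above relies on the all-zero bit pattern
   representing 0.0 for (real or complex) IEEE-754 PetscScalar. */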

675: struct PetscScalarReciprocal {
676:   __host__ __device__ PetscScalar operator()(const PetscScalar &s)
677:   {
678: #if defined(PETSC_USE_COMPLEX)
679:     /* Workaround for a SPOCK compilation issue: unroll the complex division by hand */
680:     PetscReal sx = PetscRealPart(s);
681:     PetscReal sy = PetscImaginaryPart(s);
682:     PetscReal n  = sx * sx + sy * sy;
683:     return n != 0.0 ? PetscComplex(sx / n, -sy / n) : 0.0;
684: #else
685:     return (s != (PetscScalar)0.0) ? (PetscScalar)1.0 / s : 0.0;
686: #endif
687:   }
688: };
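/* In the complex case this is the identity 1/s = conj(s)/|s|^2; zero entries
   are left at zero rather than producing Inf or NaN. */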

690: PetscErrorCode VecReciprocal_SeqHIP(Vec v)
691: {
692:   PetscInt     n;
693:   PetscScalar *x;

695:   VecGetLocalSize(v, &n);
696:   VecHIPGetArray(v, &x);
697:   PetscLogGpuTimeBegin();
698:   try {
699:     auto xptr = thrust::device_pointer_cast(x);
700:     thrust::transform(xptr, xptr + n, xptr, PetscScalarReciprocal());
701:   } catch (char *ex) {
702:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
703:   }
704:   PetscLogGpuTimeEnd();
705:   VecHIPRestoreArray(v, &x);
706:   return 0;
707: }

709: PetscErrorCode VecScale_SeqHIP(Vec xin, PetscScalar alpha)
710: {
711:   PetscScalar    *xarray;
712:   PetscBLASInt    one = 1, bn = 0;
713:   hipblasHandle_t hipblasv2handle;

715:   if (alpha == (PetscScalar)0.0) {
716:     VecSet_SeqHIP(xin, alpha);
717:   } else if (alpha != (PetscScalar)1.0) {
718:     PetscHIPBLASGetHandle(&hipblasv2handle);
719:     PetscBLASIntCast(xin->map->n, &bn);
720:     VecHIPGetArray(xin, &xarray);
721:     PetscLogGpuTimeBegin();
722:     hipblasXscal(hipblasv2handle, bn, &alpha, xarray, one);
723:     VecHIPRestoreArray(xin, &xarray);
724:     PetscLogGpuTimeEnd();
725:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
726:     PetscLogGpuFlops(xin->map->n);
727:   }
728:   return 0;
729: }

731: PetscErrorCode VecTDot_SeqHIP(Vec xin, Vec yin, PetscScalar *z)
732: {
733:   const PetscScalar *xarray, *yarray;
734:   PetscBLASInt       one = 1, bn = 0;
735:   hipblasHandle_t    hipblasv2handle;

737:   PetscHIPBLASGetHandle(&hipblasv2handle);
738:   PetscBLASIntCast(xin->map->n, &bn);
739:   VecHIPGetArrayRead(xin, &xarray);
740:   VecHIPGetArrayRead(yin, &yarray);
741:   PetscLogGpuTimeBegin();
742:   hipblasXdotu(hipblasv2handle, bn, xarray, one, yarray, one, z);
743:   PetscLogGpuTimeEnd();
744:   if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
745:   PetscLogGpuToCpu(sizeof(PetscScalar));
746:   VecHIPRestoreArrayRead(yin, &yarray);
747:   VecHIPRestoreArrayRead(xin, &xarray);
748:   return 0;
749: }

751: PetscErrorCode VecCopy_SeqHIP(Vec xin, Vec yin)
752: {
753:   const PetscScalar *xarray;
754:   PetscScalar       *yarray;

756:   if (xin != yin) {
757:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
758:       PetscBool yiship;

760:       PetscObjectTypeCompareAny((PetscObject)yin, &yiship, VECSEQHIP, VECMPIHIP, "");
761:       VecHIPGetArrayRead(xin, &xarray);
762:       if (yiship) {
763:         VecHIPGetArrayWrite(yin, &yarray);
764:       } else {
765:         VecGetArrayWrite(yin, &yarray);
766:       }
767:       PetscLogGpuTimeBegin();
768:       if (yiship) {
769:         hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
770:       } else {
771:         hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToHost);
772:       }
773:       PetscLogGpuTimeEnd();
774:       VecHIPRestoreArrayRead(xin, &xarray);
775:       if (yiship) {
776:         VecHIPRestoreArrayWrite(yin, &yarray);
777:       } else {
778:         VecRestoreArrayWrite(yin, &yarray);
779:       }
780:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
781:       /* copy in CPU if we are on the CPU */
782:       VecCopy_SeqHIP_Private(xin, yin);
783:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
784:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
785:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
786:         /* copy in CPU */
787:         VecCopy_SeqHIP_Private(xin, yin);
788:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
789:         /* copy in GPU */
790:         VecHIPGetArrayRead(xin, &xarray);
791:         VecHIPGetArrayWrite(yin, &yarray);
792:         PetscLogGpuTimeBegin();
793:         hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
794:         PetscLogGpuTimeEnd();
795:         VecHIPRestoreArrayRead(xin, &xarray);
796:         VecHIPRestoreArrayWrite(yin, &yarray);
797:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
798:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
799:            default to copying on the GPU (this is an arbitrary choice) */
800:         VecHIPGetArrayRead(xin, &xarray);
801:         VecHIPGetArrayWrite(yin, &yarray);
802:         PetscLogGpuTimeBegin();
803:         hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
804:         PetscLogGpuTimeEnd();
805:         VecHIPRestoreArrayRead(xin, &xarray);
806:         VecHIPRestoreArrayWrite(yin, &yarray);
807:       } else {
808:         VecCopy_SeqHIP_Private(xin, yin);
809:       }
810:     }
811:   }
812:   return 0;
813: }

815: PetscErrorCode VecSwap_SeqHIP(Vec xin, Vec yin)
816: {
817:   PetscBLASInt    one = 1, bn;
818:   PetscScalar    *xarray, *yarray;
819:   hipblasHandle_t hipblasv2handle;

821:   PetscHIPBLASGetHandle(&hipblasv2handle);
822:   PetscBLASIntCast(xin->map->n, &bn);
823:   if (xin != yin) {
824:     VecHIPGetArray(xin, &xarray);
825:     VecHIPGetArray(yin, &yarray);
826:     PetscLogGpuTimeBegin();
827:     hipblasXswap(hipblasv2handle, bn, xarray, one, yarray, one);
828:     PetscLogGpuTimeEnd();
829:     VecHIPRestoreArray(xin, &xarray);
830:     VecHIPRestoreArray(yin, &yarray);
831:   }
832:   return 0;
833: }

835: PetscErrorCode VecAXPBY_SeqHIP(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
836: {
837:   PetscScalar        a = alpha, b = beta;
838:   const PetscScalar *xarray;
839:   PetscScalar       *yarray;
840:   PetscBLASInt       one = 1, bn = 0;
841:   hipblasHandle_t    hipblasv2handle;

843:   PetscHIPBLASGetHandle(&hipblasv2handle);
844:   PetscBLASIntCast(yin->map->n, &bn);
845:   if (a == (PetscScalar)0.0) {
846:     VecScale_SeqHIP(yin, beta);
847:   } else if (b == (PetscScalar)1.0) {
848:     VecAXPY_SeqHIP(yin, alpha, xin);
849:   } else if (a == (PetscScalar)1.0) {
850:     VecAYPX_SeqHIP(yin, beta, xin);
851:   } else if (b == (PetscScalar)0.0) {
852:     VecHIPGetArrayRead(xin, &xarray);
853:     VecHIPGetArray(yin, &yarray);
854:     PetscLogGpuTimeBegin();
855:     hipMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice);
856:     hipblasXscal(hipblasv2handle, bn, &alpha, yarray, one);
857:     PetscLogGpuTimeEnd();
858:     PetscLogGpuFlops(xin->map->n);
859:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
860:     VecHIPRestoreArrayRead(xin, &xarray);
861:     VecHIPRestoreArray(yin, &yarray);
862:   } else {
863:     VecHIPGetArrayRead(xin, &xarray);
864:     VecHIPGetArray(yin, &yarray);
865:     PetscLogGpuTimeBegin();
866:     hipblasXscal(hipblasv2handle, bn, &beta, yarray, one);
867:     hipblasXaxpy(hipblasv2handle, bn, &alpha, xarray, one, yarray, one);
868:     VecHIPRestoreArrayRead(xin, &xarray);
869:     VecHIPRestoreArray(yin, &yarray);
870:     PetscLogGpuTimeEnd();
871:     PetscLogGpuFlops(3.0 * xin->map->n);
872:     PetscLogCpuToGpuScalar(2 * sizeof(PetscScalar));
873:   }
874:   return 0;
875: }

877: PetscErrorCode VecAXPBYPCZ_SeqHIP(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
878: {
879:   PetscInt n = zin->map->n;

881:   if (gamma == (PetscScalar)1.0) {
882:     /* z = ax + b*y + z */
883:     VecAXPY_SeqHIP(zin, alpha, xin);
884:     VecAXPY_SeqHIP(zin, beta, yin);
885:     PetscLogGpuFlops(4.0 * n);
886:   } else {
887:     /* z = a*x + b*y + c*z */
888:     VecScale_SeqHIP(zin, gamma);
889:     VecAXPY_SeqHIP(zin, alpha, xin);
890:     VecAXPY_SeqHIP(zin, beta, yin);
891:     PetscLogGpuFlops(5.0 * n);
892:   }
893:   return 0;
894: }

896: PetscErrorCode VecPointwiseMult_SeqHIP(Vec win, Vec xin, Vec yin)
897: {
898:   PetscInt                              n = win->map->n;
899:   const PetscScalar                    *xarray, *yarray;
900:   PetscScalar                          *warray;
901:   thrust::device_ptr<const PetscScalar> xptr, yptr;
902:   thrust::device_ptr<PetscScalar>       wptr;

904:   if (xin->boundtocpu || yin->boundtocpu) {
905:     VecPointwiseMult_Seq(win, xin, yin);
906:     return 0;
907:   }
908:   VecHIPGetArrayRead(xin, &xarray);
909:   VecHIPGetArrayRead(yin, &yarray);
910:   VecHIPGetArrayWrite(win, &warray);
911:   PetscLogGpuTimeBegin();
912:   try {
913:     wptr = thrust::device_pointer_cast(warray);
914:     xptr = thrust::device_pointer_cast(xarray);
915:     yptr = thrust::device_pointer_cast(yarray);
916: #if defined(PETSC_USE_COMPLEX)
917:     thrust::transform(xptr, xptr + n, yptr, wptr, PetscMultiplyComplex());
918: #else
919:     thrust::transform(xptr, xptr + n, yptr, wptr, thrust::multiplies<PetscScalar>());
920: #endif
921:   } catch (char *ex) {
922:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
923:   }
924:   PetscLogGpuTimeEnd();
925:   VecHIPRestoreArrayRead(xin, &xarray);
926:   VecHIPRestoreArrayRead(yin, &yarray);
927:   VecHIPRestoreArrayWrite(win, &warray);
928:   PetscLogGpuFlops(n);
929:   return 0;
930: }

932: /* TODO: compute the infinity norm directly in a HIP kernel; the IXamax-based code below needs an extra device-to-host copy of the selected entry */

934: PetscErrorCode VecNorm_SeqHIP(Vec xin, NormType type, PetscReal *z)
935: {
936:   PetscInt           n   = xin->map->n;
937:   PetscBLASInt       one = 1, bn = 0;
938:   const PetscScalar *xarray;
939:   hipblasHandle_t    hipblasv2handle;

941:   PetscHIPBLASGetHandle(&hipblasv2handle);
942:   PetscBLASIntCast(n, &bn);
943:   if (type == NORM_2 || type == NORM_FROBENIUS) {
944:     VecHIPGetArrayRead(xin, &xarray);
945:     PetscLogGpuTimeBegin();
946:     hipblasXnrm2(hipblasv2handle, bn, xarray, one, z);
947:     PetscLogGpuTimeEnd();
948:     VecHIPRestoreArrayRead(xin, &xarray);
949:     PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0));
950:   } else if (type == NORM_INFINITY) {
951:     int i;
952:     VecHIPGetArrayRead(xin, &xarray);
953:     PetscLogGpuTimeBegin();
954:     hipblasIXamax(hipblasv2handle, bn, xarray, one, &i);
955:     PetscLogGpuTimeEnd();
956:     if (bn) {
957:       PetscScalar zs;
958:       hipMemcpy(&zs, xarray + i - 1, sizeof(PetscScalar), hipMemcpyDeviceToHost); /* IXamax returns a 1-based (Fortran-style) index */
959:       *z = PetscAbsScalar(zs);
960:     } else *z = 0.0;
961:     VecHIPRestoreArrayRead(xin, &xarray);
962:   } else if (type == NORM_1) {
963:     VecHIPGetArrayRead(xin, &xarray);
964:     PetscLogGpuTimeBegin();
965:     hipblasXasum(hipblasv2handle, bn, xarray, one, z);
966:     VecHIPRestoreArrayRead(xin, &xarray);
967:     PetscLogGpuTimeEnd();
968:     PetscLogGpuFlops(PetscMax(n - 1.0, 0.0));
969:   } else if (type == NORM_1_AND_2) {
970:     VecNorm_SeqHIP(xin, NORM_1, z);
971:     VecNorm_SeqHIP(xin, NORM_2, z + 1);
972:   }
973:   PetscLogGpuToCpu(sizeof(PetscReal));
974:   return 0;
975: }

977: PetscErrorCode VecDotNorm2_SeqHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
978: {
979:   VecDot_SeqHIP(s, t, dp);
980:   VecDot_SeqHIP(t, t, nm);
981:   return 0;
982: }

984: PetscErrorCode VecDestroy_SeqHIP(Vec v)
985: {
986:   if (v->spptr) {
987:     if (((Vec_HIP *)v->spptr)->GPUarray_allocated) {
988:       hipFree(((Vec_HIP *)v->spptr)->GPUarray_allocated);
989:       ((Vec_HIP *)v->spptr)->GPUarray_allocated = NULL;
990:     }
991:     if (((Vec_HIP *)v->spptr)->stream) hipStreamDestroy(((Vec_HIP *)v->spptr)->stream);
992:   }
993:   VecDestroy_SeqHIP_Private(v);
994:   PetscFree(v->spptr);
995:   return 0;
996: }

998: #if defined(PETSC_USE_COMPLEX)
999: /* Workaround for a SPOCK compilation issue: perform the conjugation ourselves */
1000: struct conjugate {
1001:   __host__ __device__ PetscScalar operator()(const PetscScalar &x) { return PetscScalar(PetscRealPart(x), -PetscImaginaryPart(x)); }
1002: };
1003: #endif

1005: PetscErrorCode VecConjugate_SeqHIP(Vec xin)
1006: {
1007: #if defined(PETSC_USE_COMPLEX)
1008:   PetscScalar                    *xarray;
1009:   PetscInt                        n = xin->map->n;
1010:   thrust::device_ptr<PetscScalar> xptr;

1012:   VecHIPGetArray(xin, &xarray);
1013:   PetscLogGpuTimeBegin();
1014:   try {
1015:     xptr = thrust::device_pointer_cast(xarray);
1016:     thrust::transform(xptr, xptr + n, xptr, conjugate());
1017:   } catch (char *ex) {
1018:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1019:   }
1020:   PetscLogGpuTimeEnd();
1021:   VecHIPRestoreArray(xin, &xarray);
1023: #endif
1024:   return 0;
1025: }

1027: static inline PetscErrorCode VecGetLocalVectorK_SeqHIP(Vec v, Vec w, PetscBool read)
1028: {
1029:   PetscBool wisseqhip;

1034:   PetscObjectTypeCompare((PetscObject)w, VECSEQHIP, &wisseqhip);
1035:   if (w->data && wisseqhip) {
1036:     if (((Vec_Seq *)w->data)->array_allocated) {
1037:       if (w->pinned_memory) PetscMallocSetHIPHost();
1038:       PetscFree(((Vec_Seq *)w->data)->array_allocated);
1039:       if (w->pinned_memory) {
1040:         PetscMallocResetHIPHost();
1041:         w->pinned_memory = PETSC_FALSE;
1042:       }
1043:     }
1044:     ((Vec_Seq *)w->data)->array         = NULL;
1045:     ((Vec_Seq *)w->data)->unplacedarray = NULL;
1046:   }
1047:   if (w->spptr && wisseqhip) {
1048:     if (((Vec_HIP *)w->spptr)->GPUarray) {
1049:       hipFree(((Vec_HIP *)w->spptr)->GPUarray);
1050:       ((Vec_HIP *)w->spptr)->GPUarray = NULL;
1051:     }
1052:     if (((Vec_HIP *)w->spptr)->stream) hipStreamDestroy(((Vec_HIP *)w->spptr)->stream); /* test w's stream, not v's: it is w's resources that are released here */
1053:     PetscFree(w->spptr);
1054:   }

1056:   if (v->petscnative && wisseqhip) {
1057:     PetscFree(w->data);
1058:     w->data          = v->data;
1059:     w->offloadmask   = v->offloadmask;
1060:     w->pinned_memory = v->pinned_memory;
1061:     w->spptr         = v->spptr;
1062:     PetscObjectStateIncrease((PetscObject)w);
1063:   } else {
1064:     if (read) {
1065:       VecGetArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
1066:     } else {
1067:       VecGetArray(v, &((Vec_Seq *)w->data)->array);
1068:     }
1069:     w->offloadmask = PETSC_OFFLOAD_CPU;
1070:     if (wisseqhip) VecHIPAllocateCheck(w);
1071:   }
1072:   return 0;
1073: }

1075: static inline PetscErrorCode VecRestoreLocalVectorK_SeqHIP(Vec v, Vec w, PetscBool read)
1076: {
1077:   PetscBool wisseqhip;

1082:   PetscObjectTypeCompare((PetscObject)w, VECSEQHIP, &wisseqhip);
1083:   if (v->petscnative && wisseqhip) {
1084:     v->data          = w->data;
1085:     v->offloadmask   = w->offloadmask;
1086:     v->pinned_memory = w->pinned_memory;
1087:     v->spptr         = w->spptr;
1088:     w->data          = NULL;
1089:     w->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
1090:     w->spptr         = NULL;
1091:   } else {
1092:     if (read) {
1093:       VecRestoreArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
1094:     } else {
1095:       VecRestoreArray(v, &((Vec_Seq *)w->data)->array);
1096:     }
1097:     if (w->spptr && wisseqhip) {
1098:       hipFree(((Vec_HIP *)w->spptr)->GPUarray);
1099:       ((Vec_HIP *)w->spptr)->GPUarray = NULL;
1100:       if (((Vec_HIP *)w->spptr)->stream) hipStreamDestroy(((Vec_HIP *)w->spptr)->stream);
1101:       PetscFree(w->spptr);
1102:     }
1103:   }
1104:   return 0;
1105: }

1107: PetscErrorCode VecGetLocalVector_SeqHIP(Vec v, Vec w)
1108: {
1109:   VecGetLocalVectorK_SeqHIP(v, w, PETSC_FALSE);
1110:   return 0;
1111: }

1113: PetscErrorCode VecGetLocalVectorRead_SeqHIP(Vec v, Vec w)
1114: {
1115:   VecGetLocalVectorK_SeqHIP(v, w, PETSC_TRUE);
1116:   return 0;
1117: }

1119: PetscErrorCode VecRestoreLocalVector_SeqHIP(Vec v, Vec w)
1120: {
1121:   VecRestoreLocalVectorK_SeqHIP(v, w, PETSC_FALSE);
1122:   return 0;
1123: }

1125: PetscErrorCode VecRestoreLocalVectorRead_SeqHIP(Vec v, Vec w)
1126: {
1127:   VecRestoreLocalVectorK_SeqHIP(v, w, PETSC_TRUE);
1128:   return 0;
1129: }
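/* Illustrative pairing of the routines above (sketch only; the helper name is
   hypothetical, and error checking is omitted). w must be a VECSEQHIP vector
   with the same local size as v: */
#if 0
static PetscErrorCode ExampleUseLocalVector(Vec v)
{
  Vec      w;
  PetscInt n;

  VecGetLocalSize(v, &n);
  VecCreate(PETSC_COMM_SELF, &w);
  VecSetSizes(w, n, n);
  VecSetType(w, VECSEQHIP);
  VecGetLocalVectorRead_SeqHIP(v, w); /* w now aliases v's local data when possible */
  /* ... read-only operations on w ... */
  VecRestoreLocalVectorRead_SeqHIP(v, w);
  VecDestroy(&w);
  return 0;
}
#endif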

1131: struct petscrealpart : public thrust::unary_function<PetscScalar, PetscReal> {
1132:   __host__ __device__ PetscReal operator()(const PetscScalar &x) { return PetscRealPart(x); }
1133: };

1135: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1136:   __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt> &x) { return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>()); }
1137: };

1139: struct petscmax : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
1140:   __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? y : x; }
1141: };

1143: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1144:   __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
1145:   {
1146:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1147:   }
1148: };

1150: struct petscmin : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
1151:   __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? x : y; }
1152: };

1154: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1155:   __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
1156:   {
1157:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1158:   }
1159: };
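/* Note: on ties in value, the indexed reductions above return the smaller
   index, so the first occurrence of the extremum wins. */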

1161: PetscErrorCode VecMax_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
1162: {
1163:   PetscInt                              n = v->map->n;
1164:   const PetscScalar                    *av;
1165:   thrust::device_ptr<const PetscScalar> avpt;

1168:   if (!n) {
1169:     *m = PETSC_MIN_REAL;
1170:     if (p) *p = -1;
1171:     return 0;
1172:   }
1173:   VecHIPGetArrayRead(v, &av);
1174:   avpt = thrust::device_pointer_cast(av);
1175:   PetscLogGpuTimeBegin();
1176:   if (p) {
1177:     thrust::tuple<PetscReal, PetscInt> res(PETSC_MIN_REAL, -1);
1178:     auto                               zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
1179:     try {
1180: #if defined(PETSC_USE_COMPLEX)
1181:       res = thrust::transform_reduce(zibit, zibit + n, petscrealparti(), res, petscmaxi());
1182: #else
1183:       res = thrust::reduce(zibit, zibit + n, res, petscmaxi());
1184: #endif
1185:     } catch (char *ex) {
1186:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1187:     }
1188:     *m = res.get<0>();
1189:     *p = res.get<1>();
1190:   } else {
1191:     try {
1192: #if defined(PETSC_USE_COMPLEX)
1193:       *m = thrust::transform_reduce(avpt, avpt + n, petscrealpart(), PETSC_MIN_REAL, petscmax());
1194: #else
1195:       *m = thrust::reduce(avpt, avpt + n, PETSC_MIN_REAL, petscmax());
1196: #endif
1197:     } catch (char *ex) {
1198:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1199:     }
1200:   }
1201:   PetscLogGpuTimeEnd();
1202:   VecHIPRestoreArrayRead(v, &av);
1203:   return 0;
1204: }

1206: PetscErrorCode VecMin_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
1207: {
1208:   PetscInt                              n = v->map->n;
1209:   const PetscScalar                    *av;
1210:   thrust::device_ptr<const PetscScalar> avpt;

1213:   if (!n) {
1214:     *m = PETSC_MAX_REAL;
1215:     if (p) *p = -1;
1216:     return 0;
1217:   }
1218:   VecHIPGetArrayRead(v, &av);
1219:   avpt = thrust::device_pointer_cast(av);
1220:   PetscLogGpuTimeBegin();
1221:   if (p) {
1222:     thrust::tuple<PetscReal, PetscInt> res(PETSC_MAX_REAL, -1);
1223:     auto                               zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
1224:     try {
1225: #if defined(PETSC_USE_COMPLEX)
1226:       res = thrust::transform_reduce(zibit, zibit + n, petscrealparti(), res, petscmini());
1227: #else
1228:       res = thrust::reduce(zibit, zibit + n, res, petscmini());
1229: #endif
1230:     } catch (char *ex) {
1231:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1232:     }
1233:     *m = res.get<0>();
1234:     *p = res.get<1>();
1235:   } else {
1236:     try {
1237: #if defined(PETSC_USE_COMPLEX)
1238:       *m = thrust::transform_reduce(avpt, avpt + n, petscrealpart(), PETSC_MAX_REAL, petscmin());
1239: #else
1240:       *m = thrust::reduce(avpt, avpt + n, PETSC_MAX_REAL, petscmin());
1241: #endif
1242:     } catch (char *ex) {
1243:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1244:     }
1245:   }
1246:   PetscLogGpuTimeEnd();
1247:   VecHIPRestoreArrayRead(v, &av);
1248:   return 0;
1249: }

1251: PetscErrorCode VecSum_SeqHIP(Vec v, PetscScalar *sum)
1252: {
1253:   PetscInt                              n = v->map->n;
1254:   const PetscScalar                    *a;
1255:   thrust::device_ptr<const PetscScalar> dptr;

1258:   VecHIPGetArrayRead(v, &a);
1259:   dptr = thrust::device_pointer_cast(a);
1260:   PetscLogGpuTimeBegin();
1261:   try {
1262:     *sum = thrust::reduce(dptr, dptr + n, PetscScalar(0.0));
1263:   } catch (char *ex) {
1264:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1265:   }
1266:   PetscLogGpuTimeEnd();
1267:   VecHIPRestoreArrayRead(v, &a);
1268:   return 0;
1269: }

1271: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1272:   const PetscScalar shift_;
1273:   petscshift(PetscScalar shift) : shift_(shift) { }
1274:   __host__ __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1275: };

1277: PetscErrorCode VecShift_SeqHIP(Vec v, PetscScalar shift)
1278: {
1279:   PetscInt                        n = v->map->n;
1280:   PetscScalar                    *a;
1281:   thrust::device_ptr<PetscScalar> dptr;

1284:   VecHIPGetArray(v, &a);
1285:   dptr = thrust::device_pointer_cast(a);
1286:   PetscLogGpuTimeBegin();
1287:   try {
1288:     thrust::transform(dptr, dptr + n, dptr, petscshift(shift)); /* in-place transform */
1289:   } catch (char *ex) {
1290:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1291:   }
1292:   PetscLogGpuTimeEnd();
1293:   VecHIPRestoreArray(v, &a);
1294:   return 0;
1295: }