23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
29#if !defined(__HIPCC_RTC__)
30#include <hip/hip_runtime_api.h>
34#include <hip/hip_vector_types.h>
38extern "C" __device__
int printf(
const char *fmt, ...);
40template <
typename... All>
41static inline __device__
void printf(
const char* format, All... all) {}
44extern "C" __device__
unsigned long long __ockl_steadyctr_u64();
51__device__
static inline unsigned int __popc(
unsigned int input) {
52 return __builtin_popcount(input);
54__device__
static inline unsigned int __popcll(
unsigned long long int input) {
55 return __builtin_popcountll(input);
58__device__
static inline int __clz(
int input) {
59 return __ockl_clz_u32((uint)input);
62__device__
static inline int __clzll(
long long int input) {
63 return __ockl_clz_u64((uint64_t)input);
66__device__
static inline unsigned int __ffs(
unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
70__device__
static inline unsigned int __ffsll(
unsigned long long int input) {
71 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
74__device__
static inline unsigned int __ffsll(
unsigned long int input) {
75 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
78__device__
static inline unsigned int __ffs(
int input) {
79 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
82__device__
static inline unsigned int __ffsll(
long long int input) {
83 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
86__device__
static inline unsigned int __ffsll(
long int input) {
87 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
93__device__
static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
94 uint64_t temp_mask = mask;
95 int32_t temp_offset = offset;
98 temp_mask &= (1 << base);
101 else if (offset < 0) {
102 temp_mask = __builtin_bitreverse64(mask);
104 temp_offset = -offset;
107 temp_mask = temp_mask & ((~0ULL) << base);
108 if (__builtin_popcountll(temp_mask) < temp_offset)
111 for (
int i = 0x20; i > 0; i >>= 1) {
112 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
113 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
114 if (pcnt < temp_offset) {
115 temp_mask = temp_mask >> i;
120 temp_mask = temp_mask_lo;
129__device__
static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
130 uint64_t temp_mask = mask;
131 int32_t temp_offset = offset;
133 temp_mask &= (1 << base);
136 else if (offset < 0) {
137 temp_mask = __builtin_bitreverse64(mask);
139 temp_offset = -offset;
141 temp_mask = temp_mask & ((~0ULL) << base);
142 if (__builtin_popcountll(temp_mask) < temp_offset)
145 for (
int i = 0x20; i > 0; i >>= 1) {
146 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
147 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
148 if (pcnt < temp_offset) {
149 temp_mask = temp_mask >> i;
154 temp_mask = temp_mask_lo;
162__device__
static inline unsigned int __brev(
unsigned int input) {
163 return __builtin_bitreverse32(input);
166__device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
167 return __builtin_bitreverse64(input);
170__device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
171 return input == 0 ? -1 : __builtin_ctzl(input);
174__device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
175 uint32_t offset = src1 & 31;
176 uint32_t width = src2 & 31;
177 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
180__device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
181 uint64_t offset = src1 & 63;
182 uint64_t width = src2 & 63;
183 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
186__device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
187 uint32_t offset = src2 & 31;
188 uint32_t width = src3 & 31;
189 uint32_t mask = (1 << width) - 1;
190 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
193__device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
194 uint64_t offset = src2 & 63;
195 uint64_t width = src3 & 63;
196 uint64_t mask = (1ULL << width) - 1;
197 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
200__device__
inline unsigned int __funnelshift_l(
unsigned int lo,
unsigned int hi,
unsigned int shift)
202 uint32_t mask_shift = shift & 31;
203 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
206__device__
inline unsigned int __funnelshift_lc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
208 uint32_t min_shift = shift >= 32 ? 32 : shift;
209 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
212__device__
inline unsigned int __funnelshift_r(
unsigned int lo,
unsigned int hi,
unsigned int shift)
214 return __builtin_amdgcn_alignbit(hi, lo, shift);
217__device__
inline unsigned int __funnelshift_rc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
219 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
222__device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
223__device__
static unsigned int __hadd(
int x,
int y);
224__device__
static int __mul24(
int x,
int y);
225__device__
static long long int __mul64hi(
long long int x,
long long int y);
226__device__
static int __mulhi(
int x,
int y);
227__device__
static int __rhadd(
int x,
int y);
228__device__
static unsigned int __sad(
int x,
int y,
unsigned int z);
229__device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
230__device__
static int __umul24(
unsigned int x,
unsigned int y);
231__device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
232__device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
233__device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
234__device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
241} __attribute__((aligned(4)));
248} __attribute__((aligned(8)));
251static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
258 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
259 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
260 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
261 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
265__device__
static inline unsigned int __hadd(
int x,
int y) {
267 int sign = z & 0x8000000;
268 int value = z & 0x7FFFFFFF;
269 return ((value) >> 1 || sign);
272__device__
static inline int __mul24(
int x,
int y) {
273 return __ockl_mul24_i32(x, y);
276__device__
static inline long long __mul64hi(
long long int x,
long long int y) {
277 ulong x0 = (ulong)x & 0xffffffffUL;
279 ulong y0 = (ulong)y & 0xffffffffUL;
282 long t = x1*y0 + (z0 >> 32);
283 long z1 = t & 0xffffffffL;
286 return x1*y1 + z2 + (z1 >> 32);
289__device__
static inline int __mulhi(
int x,
int y) {
290 return __ockl_mul_hi_i32(x, y);
293__device__
static inline int __rhadd(
int x,
int y) {
295 int sign = z & 0x8000000;
296 int value = z & 0x7FFFFFFF;
297 return ((value) >> 1 || sign);
299__device__
static inline unsigned int __sad(
int x,
int y,
unsigned int z) {
300 return x > y ? x - y + z : y - x + z;
302__device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
305__device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
306 return __ockl_mul24_u32(x, y);
310static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
311 ulong x0 = x & 0xffffffffUL;
313 ulong y0 = y & 0xffffffffUL;
316 ulong t = x1*y0 + (z0 >> 32);
317 ulong z1 = t & 0xffffffffUL;
320 return x1*y1 + z2 + (z1 >> 32);
323__device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
324 return __ockl_mul_hi_u32(x, y);
326__device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
327 return (x + y + 1) >> 1;
329__device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
330 return __ockl_sadd_u32(x, y, z);
333__device__
static inline unsigned int __lane_id() {
334 return __builtin_amdgcn_mbcnt_hi(
335 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
339static inline unsigned int __mbcnt_lo(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_lo(x,y);};
342static inline unsigned int __mbcnt_hi(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_hi(x,y);};
348#if !defined(__HIPCC_RTC__)
349#include "amd_warp_functions.h"
352#define MASK1 0x00ff00ff
353#define MASK2 0xff00ff00
357 unsigned one1 = in1.w & MASK1;
358 unsigned one2 = in2.w & MASK1;
359 out.w = (one1 + one2) & MASK1;
360 one1 = in1.w & MASK2;
361 one2 = in2.w & MASK2;
362 out.w = out.w | ((one1 + one2) & MASK2);
368 unsigned one1 = in1.w & MASK1;
369 unsigned one2 = in2.w & MASK1;
370 out.w = (one1 - one2) & MASK1;
371 one1 = in1.w & MASK2;
372 one2 = in2.w & MASK2;
373 out.w = out.w | ((one1 - one2) & MASK2);
379 unsigned one1 = in1.w & MASK1;
380 unsigned one2 = in2.w & MASK1;
381 out.w = (one1 * one2) & MASK1;
382 one1 = in1.w & MASK2;
383 one2 = in2.w & MASK2;
384 out.w = out.w | ((one1 * one2) & MASK2);
388__device__
static inline float __double2float_rd(
double x) {
389 return __ocml_cvtrtn_f32_f64(x);
391__device__
static inline float __double2float_rn(
double x) {
return x; }
392__device__
static inline float __double2float_ru(
double x) {
393 return __ocml_cvtrtp_f32_f64(x);
395__device__
static inline float __double2float_rz(
double x) {
396 return __ocml_cvtrtz_f32_f64(x);
399__device__
static inline int __double2hiint(
double x) {
400 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
403 __builtin_memcpy(tmp, &x,
sizeof(tmp));
407__device__
static inline int __double2loint(
double x) {
408 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
411 __builtin_memcpy(tmp, &x,
sizeof(tmp));
416__device__
static inline int __double2int_rd(
double x) {
return (
int)__ocml_floor_f64(x); }
417__device__
static inline int __double2int_rn(
double x) {
return (
int)__ocml_rint_f64(x); }
418__device__
static inline int __double2int_ru(
double x) {
return (
int)__ocml_ceil_f64(x); }
419__device__
static inline int __double2int_rz(
double x) {
return (
int)x; }
421__device__
static inline long long int __double2ll_rd(
double x) {
422 return (
long long)__ocml_floor_f64(x);
424__device__
static inline long long int __double2ll_rn(
double x) {
425 return (
long long)__ocml_rint_f64(x);
427__device__
static inline long long int __double2ll_ru(
double x) {
428 return (
long long)__ocml_ceil_f64(x);
430__device__
static inline long long int __double2ll_rz(
double x) {
return (
long long)x; }
432__device__
static inline unsigned int __double2uint_rd(
double x) {
433 return (
unsigned int)__ocml_floor_f64(x);
435__device__
static inline unsigned int __double2uint_rn(
double x) {
436 return (
unsigned int)__ocml_rint_f64(x);
438__device__
static inline unsigned int __double2uint_ru(
double x) {
439 return (
unsigned int)__ocml_ceil_f64(x);
441__device__
static inline unsigned int __double2uint_rz(
double x) {
return (
unsigned int)x; }
443__device__
static inline unsigned long long int __double2ull_rd(
double x) {
444 return (
unsigned long long int)__ocml_floor_f64(x);
446__device__
static inline unsigned long long int __double2ull_rn(
double x) {
447 return (
unsigned long long int)__ocml_rint_f64(x);
449__device__
static inline unsigned long long int __double2ull_ru(
double x) {
450 return (
unsigned long long int)__ocml_ceil_f64(x);
452__device__
static inline unsigned long long int __double2ull_rz(
double x) {
453 return (
unsigned long long int)x;
455#if defined(__clang__)
456#pragma clang diagnostic push
457#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
459__device__
static inline long long int __double_as_longlong(
double x) {
460 static_assert(
sizeof(
long long) ==
sizeof(
double),
"");
463 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
467#if defined(__clang__)
468#pragma clang diagnostic pop
485__device__
static inline int __float2int_rd(
float x) {
return (
int)__ocml_floor_f32(x); }
486__device__
static inline int __float2int_rn(
float x) {
return (
int)__ocml_rint_f32(x); }
487__device__
static inline int __float2int_ru(
float x) {
return (
int)__ocml_ceil_f32(x); }
488__device__
static inline int __float2int_rz(
float x) {
return (
int)__ocml_trunc_f32(x); }
490__device__
static inline long long int __float2ll_rd(
float x) {
491 return (
long long int)__ocml_floor_f32(x);
493__device__
static inline long long int __float2ll_rn(
float x) {
494 return (
long long int)__ocml_rint_f32(x);
496__device__
static inline long long int __float2ll_ru(
float x) {
497 return (
long long int)__ocml_ceil_f32(x);
499__device__
static inline long long int __float2ll_rz(
float x) {
return (
long long int)x; }
501__device__
static inline unsigned int __float2uint_rd(
float x) {
502 return (
unsigned int)__ocml_floor_f32(x);
504__device__
static inline unsigned int __float2uint_rn(
float x) {
505 return (
unsigned int)__ocml_rint_f32(x);
507__device__
static inline unsigned int __float2uint_ru(
float x) {
508 return (
unsigned int)__ocml_ceil_f32(x);
510__device__
static inline unsigned int __float2uint_rz(
float x) {
return (
unsigned int)x; }
512__device__
static inline unsigned long long int __float2ull_rd(
float x) {
513 return (
unsigned long long int)__ocml_floor_f32(x);
515__device__
static inline unsigned long long int __float2ull_rn(
float x) {
516 return (
unsigned long long int)__ocml_rint_f32(x);
518__device__
static inline unsigned long long int __float2ull_ru(
float x) {
519 return (
unsigned long long int)__ocml_ceil_f32(x);
521__device__
static inline unsigned long long int __float2ull_rz(
float x) {
522 return (
unsigned long long int)x;
525__device__
static inline int __float_as_int(
float x) {
526 static_assert(
sizeof(int) ==
sizeof(
float),
"");
529 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
534__device__
static inline unsigned int __float_as_uint(
float x) {
535 static_assert(
sizeof(
unsigned int) ==
sizeof(
float),
"");
538 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
543__device__
static inline double __hiloint2double(
int hi,
int lo) {
544 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
546 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
548 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
553__device__
static inline double __int2double_rn(
int x) {
return (
double)x; }
555__device__
static inline float __int2float_rd(
int x) {
556 return __ocml_cvtrtn_f32_s32(x);
558__device__
static inline float __int2float_rn(
int x) {
return (
float)x; }
559__device__
static inline float __int2float_ru(
int x) {
560 return __ocml_cvtrtp_f32_s32(x);
562__device__
static inline float __int2float_rz(
int x) {
563 return __ocml_cvtrtz_f32_s32(x);
566__device__
static inline float __int_as_float(
int x) {
567 static_assert(
sizeof(float) ==
sizeof(
int),
"");
570 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
575__device__
static inline double __ll2double_rd(
long long int x) {
576 return __ocml_cvtrtn_f64_s64(x);
578__device__
static inline double __ll2double_rn(
long long int x) {
return (
double)x; }
579__device__
static inline double __ll2double_ru(
long long int x) {
580 return __ocml_cvtrtp_f64_s64(x);
582__device__
static inline double __ll2double_rz(
long long int x) {
583 return __ocml_cvtrtz_f64_s64(x);
586__device__
static inline float __ll2float_rd(
long long int x) {
587 return __ocml_cvtrtn_f32_s64(x);
589__device__
static inline float __ll2float_rn(
long long int x) {
return (
float)x; }
590__device__
static inline float __ll2float_ru(
long long int x) {
591 return __ocml_cvtrtp_f32_s64(x);
593__device__
static inline float __ll2float_rz(
long long int x) {
594 return __ocml_cvtrtz_f32_s64(x);
597__device__
static inline double __longlong_as_double(
long long int x) {
598 static_assert(
sizeof(double) ==
sizeof(
long long),
"");
601 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
606__device__
static inline double __uint2double_rn(
unsigned int x) {
return (
double)x; }
608__device__
static inline float __uint2float_rd(
unsigned int x) {
609 return __ocml_cvtrtn_f32_u32(x);
611__device__
static inline float __uint2float_rn(
unsigned int x) {
return (
float)x; }
612__device__
static inline float __uint2float_ru(
unsigned int x) {
613 return __ocml_cvtrtp_f32_u32(x);
615__device__
static inline float __uint2float_rz(
unsigned int x) {
616 return __ocml_cvtrtz_f32_u32(x);
619__device__
static inline float __uint_as_float(
unsigned int x) {
620 static_assert(
sizeof(float) ==
sizeof(
unsigned int),
"");
623 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
628__device__
static inline double __ull2double_rd(
unsigned long long int x) {
629 return __ocml_cvtrtn_f64_u64(x);
631__device__
static inline double __ull2double_rn(
unsigned long long int x) {
return (
double)x; }
632__device__
static inline double __ull2double_ru(
unsigned long long int x) {
633 return __ocml_cvtrtp_f64_u64(x);
635__device__
static inline double __ull2double_rz(
unsigned long long int x) {
636 return __ocml_cvtrtz_f64_u64(x);
639__device__
static inline float __ull2float_rd(
unsigned long long int x) {
640 return __ocml_cvtrtn_f32_u64(x);
642__device__
static inline float __ull2float_rn(
unsigned long long int x) {
return (
float)x; }
643__device__
static inline float __ull2float_ru(
unsigned long long int x) {
644 return __ocml_cvtrtp_f32_u64(x);
646__device__
static inline float __ull2float_rz(
unsigned long long int x) {
647 return __ocml_cvtrtz_f32_u64(x);
650#if __HIP_CLANG_ONLY__
653__device__
long long int __clock64();
654__device__
long long int __clock();
655__device__
long long int clock64();
656__device__
long long int clock();
657__device__
long long int wall_clock64();
659__device__
void __named_sync();
661#ifdef __HIP_DEVICE_COMPILE__
667inline __attribute((always_inline))
668long long int __clock64() {
669#if __has_builtin(__builtin_amdgcn_s_memtime)
671 return (
long long int) __builtin_amdgcn_s_memtime();
674 return (
long long int) __builtin_readcyclecounter();
679inline __attribute((always_inline))
680long long int __clock() {
return __clock64(); }
685inline __attribute__((always_inline))
686long long int wall_clock64() {
687 return (
long long int) __ockl_steadyctr_u64();
691inline __attribute__((always_inline))
692long long int clock64() {
return __clock64(); }
695inline __attribute__((always_inline))
696long long int clock() {
return __clock(); }
701void __named_sync() { __builtin_amdgcn_s_barrier(); }
708int __all(
int predicate) {
709 return __ockl_wfall_i32(predicate);
714int __any(
int predicate) {
715 return __ockl_wfany_i32(predicate);
723unsigned long long int __ballot(
int predicate) {
724 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
729unsigned long long int __ballot64(
int predicate) {
730 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
736uint64_t __lanemask_gt()
738 uint32_t lane = __ockl_lane_u32();
741 uint64_t ballot = __ballot64(1);
742 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
743 return mask & ballot;
748uint64_t __lanemask_lt()
750 uint32_t lane = __ockl_lane_u32();
751 int64_t ballot = __ballot64(1);
752 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
753 return mask & ballot;
758uint64_t __lanemask_eq()
760 uint32_t lane = __ockl_lane_u32();
761 int64_t mask = ((uint64_t)1 << lane);
766__device__
inline void* __local_to_generic(
void* p) {
return p; }
768#ifdef __HIP_DEVICE_COMPILE__
771void* __get_dynamicgroupbaseptr()
774 return (
char*)__local_to_generic((
void*)__to_local(__builtin_amdgcn_groupstaticsize()));
778void* __get_dynamicgroupbaseptr();
783void *__amdgcn_get_dynamicgroupbaseptr() {
784 return __get_dynamicgroupbaseptr();
790static void __threadfence()
792 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"agent");
797static void __threadfence_block()
799 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"workgroup");
804static void __threadfence_system()
806 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"");
814 return __builtin_trap();
822#if defined(_WIN32) || defined(_WIN64)
823extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
824void _wassert(
const wchar_t *_msg,
const wchar_t *_file,
unsigned _line) {
829extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
830void __assert_fail(
const char *assertion,
833 const char *function)
835 const char fmt[] =
"%s:%u: %s: Device-side assertion `%s' failed.\n";
847#define __hip_get_string_length(LEN, STR) \
849 const char *tmp = STR; \
854 auto msg = __ockl_fprintf_stderr_begin();
856 __hip_get_string_length(len, fmt);
857 msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
858 __hip_get_string_length(len, file);
859 msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
860 msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
861 __hip_get_string_length(len, function);
862 msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
863 __hip_get_string_length(len, assertion);
864 __ockl_fprintf_append_string_n(msg, assertion, len, 1);
866#undef __hip_get_string_length
871extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
879__device__
inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
881 __builtin_amdgcn_fence(__ATOMIC_RELEASE,
"workgroup");
882 __builtin_amdgcn_s_barrier();
883 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE,
"workgroup");
885 __builtin_amdgcn_s_barrier();
891static void __barrier(
int n)
893 __work_group_barrier((__cl_mem_fence_flags)n);
898__attribute__((convergent))
901 __barrier(__CLK_LOCAL_MEM_FENCE);
906__attribute__((convergent))
907int __syncthreads_count(
int predicate)
909 return __ockl_wgred_add_i32(!!predicate);
914__attribute__((convergent))
915int __syncthreads_and(
int predicate)
917 return __ockl_wgred_and_i32(!!predicate);
922__attribute__((convergent))
923int __syncthreads_or(
int predicate)
925 return __ockl_wgred_or_i32(!!predicate);
957#if (defined (__GFX10__) || defined (__GFX11__))
963#if (defined(__GFX10__) || defined(__GFX11__))
964 #define HW_ID_WGP_ID_SIZE 4
965 #define HW_ID_WGP_ID_OFFSET 10
967 #define HW_ID_CU_ID_SIZE 4
968 #define HW_ID_CU_ID_OFFSET 8
971#if (defined(__gfx908__) || defined(__gfx90a__) || \
973 #define HW_ID_SE_ID_SIZE 3
975 #define HW_ID_SE_ID_SIZE 2
977#if (defined(__GFX10__) || defined(__GFX11__))
978 #define HW_ID_SE_ID_OFFSET 18
979 #define HW_ID_SA_ID_OFFSET 16
980 #define HW_ID_SA_ID_SIZE 1
982 #define HW_ID_SE_ID_OFFSET 13
985#if (defined(__gfx940__))
987 #define XCC_ID_XCC_ID_SIZE 4
988 #define XCC_ID_XCC_ID_OFFSET 0
991#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
992 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
993 #define __HIP_NO_IMAGE_SUPPORT 1
1003#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1013unsigned __smid(
void)
1015 unsigned se_id = __builtin_amdgcn_s_getreg(
1016 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1017 #if (defined(__GFX10__) || defined(__GFX11__))
1018 unsigned wgp_id = __builtin_amdgcn_s_getreg(
1019 GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
1020 unsigned sa_id = __builtin_amdgcn_s_getreg(
1021 GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
1023 #if defined(__gfx940__)
1024 unsigned xcc_id = __builtin_amdgcn_s_getreg(
1025 GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
1027 unsigned cu_id = __builtin_amdgcn_s_getreg(
1028 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
1030 #if (defined(__GFX10__) || defined(__GFX11__))
1031 unsigned temp = se_id;
1032 temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
1033 temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
1036 #elif defined(__gfx940__)
1037 unsigned temp = xcc_id;
1038 temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
1039 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
1042 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1050#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1051#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1057static inline __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size) {
1058 auto dstPtr =
static_cast<unsigned char*
>(dst);
1059 auto srcPtr =
static_cast<const unsigned char*
>(src);
1061 while (size >= 4u) {
1062 dstPtr[0] = srcPtr[0];
1063 dstPtr[1] = srcPtr[1];
1064 dstPtr[2] = srcPtr[2];
1065 dstPtr[3] = srcPtr[3];
1073 dstPtr[2] = srcPtr[2];
1075 dstPtr[1] = srcPtr[1];
1077 dstPtr[0] = srcPtr[0];
1083static inline __device__
void* __hip_hc_memset(
void* dst,
unsigned char val,
size_t size) {
1084 auto dstPtr =
static_cast<unsigned char*
>(dst);
1086 while (size >= 4u) {
1106#ifndef __OPENMP_AMDGCN__
1107static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size) {
1108 return __hip_hc_memcpy(dst, src, size);
1111static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1112 unsigned char val8 =
static_cast<unsigned char>(val);
1113 return __hip_hc_memset(ptr, val8, size);
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
Definition amd_device_functions.h:236
Definition amd_device_functions.h:243
Definition amd_hip_vector_types.h:1623