HIP: Heterogeneous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_device_functions.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
26#include "host_defines.h"
27#include "math_fwd.h"
28
29#if !defined(__HIPCC_RTC__)
30#include <hip/hip_runtime_api.h>
31#include <stddef.h>
32#endif // !defined(__HIPCC_RTC__)
33
34#include <hip/hip_vector_types.h>
36
#if __HIP_CLANG_ONLY__
// HIP-Clang provides a real device-side printf (lowered via hostcall).
extern "C" __device__ int printf(const char *fmt, ...);
#else
// Fallback for non-HIP-Clang compilation: printf compiles but is a no-op.
// NOTE(review): output is silently discarded on this path — confirm this is
// only reached by host-tooling/pass-through configurations.
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // __HIP_CLANG_ONLY__
43
44extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
45
46/*
47Integer Intrinsics
48*/
49
// integer intrinsic functions: __popc __clz __ffs __brev
51__device__ static inline unsigned int __popc(unsigned int input) {
52 return __builtin_popcount(input);
53}
54__device__ static inline unsigned int __popcll(unsigned long long int input) {
55 return __builtin_popcountll(input);
56}
57
// Count leading zero bits of the 32-bit argument; the int argument is
// reinterpreted as unsigned before the OCKL call.
// NOTE(review): the result for input == 0 is whatever __ockl_clz_u32
// returns (presumably 32) — confirm against the ROCm device-libs.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

// 64-bit count-leading-zeros; see __clz above.
__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}
65
66__device__ static inline unsigned int __ffs(unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68}
69
70__device__ static inline unsigned int __ffsll(unsigned long long int input) {
71 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
72}
73
74__device__ static inline unsigned int __ffsll(unsigned long int input) {
75 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
76}
77
78__device__ static inline unsigned int __ffs(int input) {
79 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
80}
81
82__device__ static inline unsigned int __ffsll(long long int input) {
83 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
84}
85
86__device__ static inline unsigned int __ffsll(long int input) {
87 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
88}
89
90// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
91// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
92// If not found, return -1.
93__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
94 uint64_t temp_mask = mask;
95 int32_t temp_offset = offset;
96
97 if (offset == 0) {
98 temp_mask &= (1 << base);
99 temp_offset = 1;
100 }
101 else if (offset < 0) {
102 temp_mask = __builtin_bitreverse64(mask);
103 base = 63 - base;
104 temp_offset = -offset;
105 }
106
107 temp_mask = temp_mask & ((~0ULL) << base);
108 if (__builtin_popcountll(temp_mask) < temp_offset)
109 return -1;
110 int32_t total = 0;
111 for (int i = 0x20; i > 0; i >>= 1) {
112 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
113 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
114 if (pcnt < temp_offset) {
115 temp_mask = temp_mask >> i;
116 temp_offset -= pcnt;
117 total += i;
118 }
119 else {
120 temp_mask = temp_mask_lo;
121 }
122 }
123 if (offset < 0)
124 return 63 - total;
125 else
126 return total;
127}
128
129__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
130 uint64_t temp_mask = mask;
131 int32_t temp_offset = offset;
132 if (offset == 0) {
133 temp_mask &= (1 << base);
134 temp_offset = 1;
135 }
136 else if (offset < 0) {
137 temp_mask = __builtin_bitreverse64(mask);
138 base = 63 - base;
139 temp_offset = -offset;
140 }
141 temp_mask = temp_mask & ((~0ULL) << base);
142 if (__builtin_popcountll(temp_mask) < temp_offset)
143 return -1;
144 int32_t total = 0;
145 for (int i = 0x20; i > 0; i >>= 1) {
146 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
147 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
148 if (pcnt < temp_offset) {
149 temp_mask = temp_mask >> i;
150 temp_offset -= pcnt;
151 total += i;
152 }
153 else {
154 temp_mask = temp_mask_lo;
155 }
156 }
157 if (offset < 0)
158 return 63 - total;
159 else
160 return total;
161}
162__device__ static inline unsigned int __brev(unsigned int input) {
163 return __builtin_bitreverse32(input);
164}
165
166__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
167 return __builtin_bitreverse64(input);
168}
169
170__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
171 return input == 0 ? -1 : __builtin_ctzl(input);
172}
173
174__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
175 uint32_t offset = src1 & 31;
176 uint32_t width = src2 & 31;
177 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
178}
179
180__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
181 uint64_t offset = src1 & 63;
182 uint64_t width = src2 & 63;
183 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
184}
185
186__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
187 uint32_t offset = src2 & 31;
188 uint32_t width = src3 & 31;
189 uint32_t mask = (1 << width) - 1;
190 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
191}
192
193__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
194 uint64_t offset = src2 & 63;
195 uint64_t width = src3 & 63;
196 uint64_t mask = (1ULL << width) - 1;
197 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
198}
199
// Funnel shifts: treat hi:lo as one 64-bit value and shift it, returning
// a 32-bit window, matching the CUDA __funnelshift_* intrinsics.
// NOTE(review): these rely on __builtin_amdgcn_alignbit(hi, lo, n)
// returning bits [n+31:n] of hi:lo — confirm against the AMDGCN ISA docs.

// Left funnel shift by (shift & 31); returns the high 32 bits.
// shift == 0 is special-cased because an alignbit amount of 32 is invalid.
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

// Left funnel shift with the amount clamped to 32 instead of masked.
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

// Right funnel shift; returns the low 32 bits of (hi:lo) >> shift.
// NOTE(review): 'shift' is passed through unmasked — presumably the
// hardware masks it; confirm.
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

// Right funnel shift with the amount clamped: shift >= 32 yields hi.
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
221
// Forward declarations of the integer intrinsics defined later in this file.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y,unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
235
// FIX: the 'struct ucharHolder {' / 'struct uchar2Holder {' header lines
// were lost from this copy of the file; restored to match their use in
// __byte_perm below. Helper unions for byte-wise access to 32/64-bit values.
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
249
250__device__
251static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
252 struct uchar2Holder cHoldVal;
253 struct ucharHolder cHoldKey;
254 cHoldKey.ui = s;
255 cHoldVal.ui[0] = x;
256 cHoldVal.ui[1] = y;
257 unsigned int result;
258 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
259 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
260 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
261 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
262 return result;
263}
264
265__device__ static inline unsigned int __hadd(int x, int y) {
266 int z = x + y;
267 int sign = z & 0x8000000;
268 int value = z & 0x7FFFFFFF;
269 return ((value) >> 1 || sign);
270}
271
// 24-bit signed multiply (low 32 bits of the product of the operands'
// low 24 bits), delegated to the OCKL device library.
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
275
276__device__ static inline long long __mul64hi(long long int x, long long int y) {
277 ulong x0 = (ulong)x & 0xffffffffUL;
278 long x1 = x >> 32;
279 ulong y0 = (ulong)y & 0xffffffffUL;
280 long y1 = y >> 32;
281 ulong z0 = x0*y0;
282 long t = x1*y0 + (z0 >> 32);
283 long z1 = t & 0xffffffffL;
284 long z2 = t >> 32;
285 z1 = x0*y1 + z1;
286 return x1*y1 + z2 + (z1 >> 32);
287}
288
// High 32 bits of the signed 64-bit product x * y (OCKL device library).
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
292
293__device__ static inline int __rhadd(int x, int y) {
294 int z = x + y + 1;
295 int sign = z & 0x8000000;
296 int value = z & 0x7FFFFFFF;
297 return ((value) >> 1 || sign);
298}
299__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
300 return x > y ? x - y + z : y - x + z;
301}
302__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
303 return (x + y) >> 1;
304}
// 24-bit unsigned multiply (low 32 bits of the product of the operands'
// low 24 bits), delegated to the OCKL device library.
// NOTE(review): return type is int while the operation is unsigned —
// matches the forward declaration above, but looks like a historical wart.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
308
309__device__
310static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
311 ulong x0 = x & 0xffffffffUL;
312 ulong x1 = x >> 32;
313 ulong y0 = y & 0xffffffffUL;
314 ulong y1 = y >> 32;
315 ulong z0 = x0*y0;
316 ulong t = x1*y0 + (z0 >> 32);
317 ulong z1 = t & 0xffffffffUL;
318 ulong z2 = t >> 32;
319 z1 = x0*y1 + z1;
320 return x1*y1 + z2 + (z1 >> 32);
321}
322
// High 32 bits of the unsigned 64-bit product x * y (OCKL device library).
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
326__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
327 return (x + y + 1) >> 1;
328}
// Unsigned sum of absolute difference: |x - y| + z (OCKL device library).
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
332
// Lane index of the calling thread within its wavefront, computed by
// counting (via mbcnt) the lanes below this one against an all-ones mask.
__device__ static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(
        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}

// Thin wrappers over the AMDGCN mbcnt builtins (masked bit count of lanes
// below the caller, low/high 32 lanes, plus accumulator y).
// NOTE(review): exact builtin semantics per the AMDGCN ISA — confirm there.
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
343
344/*
345HIP specific device functions
346*/
347
348#if !defined(__HIPCC_RTC__)
349#include "amd_warp_functions.h"
350#endif
351
// Byte-lane masks: MASK1 selects bytes 0 and 2, MASK2 selects bytes 1 and 3.
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Packed per-byte add: even and odd byte lanes are processed separately
// under the masks so a carry out of one byte cannot ripple into the next.
// NOTE(review): only the .w member of each char4 is read/written here —
// this appears to assume .w aliases the full 32-bit payload; confirm
// against char4's definition in hip_vector_types.h.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Packed per-byte subtract; same masking scheme (and .w caveat) as
// __hip_hc_add8pk above.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed per-byte multiply.
// NOTE(review): unlike add/sub, multiplying the masked pairs lets partial
// products from the lower lane reach the upper lane of each pair — verify
// this produces the intended per-byte products for all inputs.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
387
/*
Double-precision conversions with explicit rounding modes. Suffixes map to
the OCML calls used below:
  _rd -> round toward -infinity (floor / cvtrtn)
  _rn -> round to nearest       (rint / default C++ conversion)
  _ru -> round toward +infinity (ceil / cvtrtp)
  _rz -> round toward zero      (truncating cast / cvtrtz)
The final integer casts themselves do not saturate.
*/
__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
// Round-to-nearest is the default conversion, so a plain cast suffices.
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}

// Reinterpret the upper 32 bits of a double's bit pattern as an int.
// NOTE(review): tmp[1] is the high word only on little-endian targets,
// which AMD GPUs are.
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Reinterpret the lower 32 bits of a double's bit pattern as an int.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

// double -> int
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

// double -> long long
__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }

// double -> unsigned int
__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

// double -> unsigned long long
__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
455#if defined(__clang__)
456#pragma clang diagnostic push
457#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
458#endif
459__device__ static inline long long int __double_as_longlong(double x) {
460 static_assert(sizeof(long long) == sizeof(double), "");
461
462 long long tmp;
463 __builtin_memcpy(&tmp, &x, sizeof(tmp));
464
465 return tmp;
466}
467#if defined(__clang__)
468#pragma clang diagnostic pop
469#endif
470
471/*
472__device__ unsigned short __float2half_rn(float x);
473__device__ float __half2float(unsigned short);
474
The above device functions are not valid.
476Use
477__device__ __half __float2half_rn(float x);
478__device__ float __half2float(__half);
479from hip_fp16.h
480
481CUDA implements half as unsigned short whereas, HIP doesn't.
482
483*/
484
// float -> int/long long/unsigned conversions with explicit rounding:
// _rd = floor, _rn = rint (nearest), _ru = ceil, _rz = truncate.
// The final integer casts themselves do not saturate.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
524
525__device__ static inline int __float_as_int(float x) {
526 static_assert(sizeof(int) == sizeof(float), "");
527
528 int tmp;
529 __builtin_memcpy(&tmp, &x, sizeof(tmp));
530
531 return tmp;
532}
533
534__device__ static inline unsigned int __float_as_uint(float x) {
535 static_assert(sizeof(unsigned int) == sizeof(float), "");
536
537 unsigned int tmp;
538 __builtin_memcpy(&tmp, &x, sizeof(tmp));
539
540 return tmp;
541}
542
543__device__ static inline double __hiloint2double(int hi, int lo) {
544 static_assert(sizeof(double) == sizeof(uint64_t), "");
545
546 uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
547 double tmp1;
548 __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
549
550 return tmp1;
551}
552
// int -> double is exact for every 32-bit int, so one rounding mode suffices.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// int -> float with explicit rounding (rtn/rtp/rtz via OCML; _rn uses the
// default C++ conversion, which rounds to nearest).
__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}
565
566__device__ static inline float __int_as_float(int x) {
567 static_assert(sizeof(float) == sizeof(int), "");
568
569 float tmp;
570 __builtin_memcpy(&tmp, &x, sizeof(tmp));
571
572 return tmp;
573}
574
// long long -> double/float with explicit rounding (rtn/rtp/rtz via OCML;
// _rn uses the default C++ conversion, which rounds to nearest).
__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}

__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}
596
597__device__ static inline double __longlong_as_double(long long int x) {
598 static_assert(sizeof(double) == sizeof(long long), "");
599
600 double tmp;
601 __builtin_memcpy(&tmp, &x, sizeof(tmp));
602
603 return tmp;
604}
605
// unsigned int -> double is exact, so one rounding mode suffices.
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

// unsigned int -> float with explicit rounding (rtn/rtp/rtz via OCML;
// _rn uses the default C++ conversion).
__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}
618
619__device__ static inline float __uint_as_float(unsigned int x) {
620 static_assert(sizeof(float) == sizeof(unsigned int), "");
621
622 float tmp;
623 __builtin_memcpy(&tmp, &x, sizeof(tmp));
624
625 return tmp;
626}
627
// unsigned long long -> double/float with explicit rounding (rtn/rtp/rtz
// via OCML; _rn uses the default C++ conversion).
__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}

__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
649
650#if __HIP_CLANG_ONLY__
651
// Clock functions
// Forward declarations; device-compile definitions follow below.
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();
660
#ifdef __HIP_DEVICE_COMPILE__

// Clock function to return GPU core cycle count.
// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
// through hipDeviceAttributeClockRate attribute.
__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
  // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
  return (long long int) __builtin_amdgcn_s_memtime();
#else
  // Subject to change when better solution available
  return (long long int) __builtin_readcyclecounter();
#endif
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// Clock function to return wall clock count at a constant frequency that can be queried
// through hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
  // Constant-rate ("steady") counter supplied by the OCKL device library.
  return (long long int) __ockl_steadyctr_u64();
}

// CUDA-compatible spellings of the cycle counter above.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// NOTE(review): despite the name, this is a plain s_barrier with no name
// argument — confirm the intended semantics.
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__
704
// warp vote function __all __any __ballot
// __all: nonzero iff 'predicate' is nonzero on every active lane of the
// wavefront (OCKL wavefront-all reduction).
__device__
inline
int __all(int predicate) {
  return __ockl_wfall_i32(predicate);
}

// __any: nonzero iff 'predicate' is nonzero on at least one active lane.
__device__
inline
int __any(int predicate) {
  return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

// __ballot: bit i of the result is set iff lane i's 'predicate' is nonzero,
// implemented as a wavefront-wide unsigned compare-not-equal against 0.
__device__
inline
unsigned long long int __ballot(int predicate) {
  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// 64-bit spelling; identical to __ballot.
__device__
inline
unsigned long long int __ballot64(int predicate) {
  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
732
// hip.amdgcn.bc - lanemask
// Mask of currently-active lanes with an id strictly greater than this
// lane's (0 for the last lane).
__device__
inline
uint64_t __lanemask_gt()
{
  uint32_t lane = __ockl_lane_u32();
  if (lane == 63)
    return 0;
  uint64_t ballot = __ballot64(1);
  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
  return mask & ballot;
}

// Mask of currently-active lanes with an id strictly less than this lane's.
__device__
inline
uint64_t __lanemask_lt()
{
  uint32_t lane = __ockl_lane_u32();
  int64_t ballot = __ballot64(1);
  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  return mask & ballot;
}

// Single-bit mask identifying this lane (set regardless of lane activity).
__device__
inline
uint64_t __lanemask_eq()
{
  uint32_t lane = __ockl_lane_u32();
  int64_t mask = ((uint64_t)1 << lane);
  return mask;
}
764
765
// Identity helper: at this level an LDS (local) pointer is returned as a
// generic pointer unchanged — presumably the compiler inserts the actual
// address-space cast; confirm.
__device__ inline void* __local_to_generic(void* p) { return p; }
767
#ifdef __HIP_DEVICE_COMPILE__
// Returns a pointer to the start of the dynamically-allocated
// (extern __shared__) portion of LDS, which begins immediately after the
// statically-allocated group segment (offset = groupstaticsize).
__device__
inline
void* __get_dynamicgroupbaseptr()
{
  // Get group segment base pointer.
  return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

// Thin alias over __get_dynamicgroupbaseptr() kept for existing callers.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
  return __get_dynamicgroupbaseptr();
}
786
// Memory Fence Functions
// Device-scope fence ("agent" scope = all threads on this GPU).
__device__
inline
static void __threadfence()
{
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

// Block-scope fence (ordering visible within the work-group).
__device__
inline
static void __threadfence_block()
{
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

// System-scope fence (empty scope string = widest scope, including host).
__device__
inline
static void __threadfence_system()
{
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
808
// abort
// Device-side abort(): raises a trap. Declared weak so the definition may
// appear in every translation unit without conflict.
__device__
inline
__attribute__((weak))
void abort() {
  return __builtin_trap();
}
816
// The noinline attribute helps encapsulate the printf expansion,
// which otherwise has a performance impact just by increasing the
// size of the calling function. Additionally, the weak attribute
// allows the function to exist as a global although its definition is
// included in every compilation unit.
#if defined(_WIN32) || defined(_WIN64)
// Windows assertion handler: no wide-string formatting support yet, so the
// message is dropped and the wave simply traps.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
  // FIXME: Need `wchar_t` support to generate assertion message.
  __builtin_trap();
}
#else /* defined(_WIN32) || defined(_WIN64) */
// Device-side handler for a failed assert(): streams
// "<file>:<line>: <function>: Device-side assertion `<assertion>' failed."
// to stderr via the OCKL fprintf hostcall machinery, then traps.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char *assertion,
                   const char *file,
                   unsigned int line,
                   const char *function)
{
  const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";

  // strlen is not available as a built-in yet, so we create our own
  // loop in a macro. With a string literal argument, the compiler
  // usually manages to replace the loop with a constant.
  //
  // The macro does not check for null pointer, since all the string
  // arguments are defined to be constant literals when called from
  // the assert() macro.
  //
  // NOTE: The loop below includes the null terminator in the length
  // as required by append_string_n().
#define __hip_get_string_length(LEN, STR) \
  do { \
    const char *tmp = STR; \
    while (*tmp++); \
    LEN = tmp - STR; \
  } while (0)

  auto msg = __ockl_fprintf_stderr_begin();
  int len = 0;
  __hip_get_string_length(len, fmt);
  msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
  __hip_get_string_length(len, file);
  msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
  msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
  __hip_get_string_length(len, function);
  msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
  __hip_get_string_length(len, assertion);
  __ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1);

#undef __hip_get_string_length

  __builtin_trap();
}

// Fallback assertion handler that discards all context.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail()
{
  // ignore all the args for now.
  __builtin_trap();
}
#endif /* defined(_WIN32) || defined(_WIN64) */
878
// Work-group barrier. When 'flags' requests memory ordering, a release
// fence precedes and an acquire fence follows the hardware barrier so that
// writes made before the barrier are visible group-wide after it.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
  if (flags) {
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
    __builtin_amdgcn_s_barrier();
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
  } else {
    __builtin_amdgcn_s_barrier();
  }
}

// Adapter accepting the OpenCL-style integer flag used by callers.
__device__
inline
static void __barrier(int n)
{
  __work_group_barrier((__cl_mem_fence_flags)n);
}
895
// Block-wide barrier with local-memory ordering (CUDA __syncthreads()).
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
  __barrier(__CLK_LOCAL_MEM_FENCE);
}

// Returns the number of threads in the block whose 'predicate' was nonzero
// (work-group add-reduction of !!predicate).
// NOTE(review): the implied barrier is provided by the OCKL work-group
// reduction — confirm against the device-libs.
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
  return __ockl_wgred_add_i32(!!predicate);
}

// Nonzero iff 'predicate' was nonzero on every thread in the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
  return __ockl_wgred_and_i32(!!predicate);
}

// Nonzero iff 'predicate' was nonzero on any thread in the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
  return __ockl_wgred_or_i32(!!predicate);
}
927
928// hip.amdgcn.bc - device routine
929/*
930 HW_ID Register bit structure for RDNA2 & RDNA3
931 WAVE_ID 4:0 Wave id within the SIMD.
932 SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
933 WGP_ID 13:10 Physical WGP ID.
934 SA_ID 16 Shader Array ID
935 SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
936 SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
937 DP_RATE 31:29 Number of double-precision float units per SIMD
938
939 HW_ID Register bit structure for GCN and CDNA
940 WAVE_ID 3:0 Wave buffer slot number. 0-9.
941 SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
942 PIPE_ID 7:6 Pipeline from which the wave was dispatched.
943 CU_ID 11:8 Compute Unit the wave is assigned to.
944 SH_ID 12 Shader Array (within an SE) the wave is assigned to.
945 SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940
946 14:13 Shader Engine the wave is assigned to for Vega.
947 TG_ID 19:16 Thread-group ID
948 VM_ID 23:20 Virtual Memory ID
949 QUEUE_ID 26:24 Queue from which this wave was dispatched.
950 STATE_ID 29:27 State ID (graphics only, not compute).
951 ME_ID 31:30 Micro-engine ID.
952
953 XCC_ID Register bit structure for gfx940
954 XCC_ID 3:0 XCC the wave is assigned to.
955 */
956
// Register ids and field layouts consumed by __smid() below; the bit
// assignments are documented in the HW_ID comment block above.
#if (defined (__GFX10__) || defined (__GFX11__))
  #define HW_ID 23
#else
  #define HW_ID 4
#endif

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_WGP_ID_SIZE 4
  #define HW_ID_WGP_ID_OFFSET 10
#else
  #define HW_ID_CU_ID_SIZE 4
  #define HW_ID_CU_ID_OFFSET 8
#endif

#if (defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__GFX11__))
  #define HW_ID_SE_ID_SIZE 3
#else //4 SEs/XCC for gfx940
  #define HW_ID_SE_ID_SIZE 2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_SE_ID_OFFSET 18
  #define HW_ID_SA_ID_OFFSET 16
  #define HW_ID_SA_ID_SIZE 1
#else
  #define HW_ID_SE_ID_OFFSET 13
#endif

#if (defined(__gfx940__))
  #define XCC_ID 20
  #define XCC_ID_XCC_ID_SIZE 4
  #define XCC_ID_XCC_ID_OFFSET 0
#endif

// NOTE(review): presumably flags targets without image instruction
// support — confirm against the target feature lists.
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
     (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  #define __HIP_NO_IMAGE_SUPPORT 1
#endif

/*
  Encoding of parameter bitmask
  HW_ID  5:0     HW_ID
  OFFSET 10:6    Range: 0..31
  SIZE   15:11   Range: 1..32
 */

// Packs (size, offset, register) into an s_getreg immediate operand.
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1004
/*
  __smid returns the wave's assigned Compute Unit and Shader Engine.
  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
  Note: the results vary over time.
  SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
    // Shader-engine id is present on every architecture variant.
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
    #if (defined(__GFX10__) || defined(__GFX11__))
    // RDNA: identify the WGP and shader array instead of the CU.
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #else
    #if defined(__gfx940__)
    // gfx940: additionally identify the XCC the wave runs on.
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
    #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #if (defined(__GFX10__) || defined(__GFX11__))
    // Pack as [se | sa | wgp].
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
    return temp;
    //TODO : CU Mode impl
    #elif defined(__gfx940__)
    // Pack as [xcc | se | cu].
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
    #else
    // GCN/CDNA: pack as [se | cu].
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
    #endif
}
1045
// Macro used by HIP sources to declare a dynamically-sized LDS array
// (the HIP analogue of CUDA's 'extern __shared__' declaration).
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE

#endif //defined(__clang__) && defined(__HIP__)
1054
1055
1056// loop unrolling
1057static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1058 auto dstPtr = static_cast<unsigned char*>(dst);
1059 auto srcPtr = static_cast<const unsigned char*>(src);
1060
1061 while (size >= 4u) {
1062 dstPtr[0] = srcPtr[0];
1063 dstPtr[1] = srcPtr[1];
1064 dstPtr[2] = srcPtr[2];
1065 dstPtr[3] = srcPtr[3];
1066
1067 size -= 4u;
1068 srcPtr += 4u;
1069 dstPtr += 4u;
1070 }
1071 switch (size) {
1072 case 3:
1073 dstPtr[2] = srcPtr[2];
1074 case 2:
1075 dstPtr[1] = srcPtr[1];
1076 case 1:
1077 dstPtr[0] = srcPtr[0];
1078 }
1079
1080 return dst;
1081}
1082
1083static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1084 auto dstPtr = static_cast<unsigned char*>(dst);
1085
1086 while (size >= 4u) {
1087 dstPtr[0] = val;
1088 dstPtr[1] = val;
1089 dstPtr[2] = val;
1090 dstPtr[3] = val;
1091
1092 size -= 4u;
1093 dstPtr += 4u;
1094 }
1095 switch (size) {
1096 case 3:
1097 dstPtr[2] = val;
1098 case 2:
1099 dstPtr[1] = val;
1100 case 1:
1101 dstPtr[0] = val;
1102 }
1103
1104 return dst;
1105}
#ifndef __OPENMP_AMDGCN__
// Device-code memcpy/memset overloads that delegate to the unrolled
// helpers above; compiled out under __OPENMP_AMDGCN__.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    // Standard memset contract: 'val' is converted to unsigned char.
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__
1116#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
Definition amd_device_functions.h:236
Definition amd_device_functions.h:243
Definition amd_hip_vector_types.h:1623