HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_hip_fp16.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#pragma once
24#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
25#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
26
27#if defined(__clang__)
28#pragma clang diagnostic push
29#pragma clang diagnostic ignored "-Wreserved-identifier"
30#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
31#pragma clang diagnostic ignored "-Wc++98-compat"
32#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
33#pragma clang diagnostic ignored "-Wsign-conversion"
34#pragma clang diagnostic ignored "-Wfloat-conversion"
35#pragma clang diagnostic ignored "-Wdouble-promotion"
36#pragma clang diagnostic ignored "-Wnested-anon-types"
37#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
38#pragma clang diagnostic ignored "-Wfloat-equal"
39#endif
40
41#if defined(__HIPCC_RTC__)
42 #define __HOST_DEVICE__ __device__
43#else
44 #define __HOST_DEVICE__ __host__ __device__
45 #include <hip/amd_detail/amd_hip_common.h>
47 #include <assert.h>
48 #if defined(__cplusplus)
49 #include <algorithm>
50 #include <type_traits>
51 #include <utility>
52#endif
53#endif // !defined(__HIPCC_RTC__)
54
55#if defined(__clang__) && defined(__HIP__)
56 typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
57
58 struct __half_raw {
59 union {
60 static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
61
62 _Float16 data;
63 unsigned short x;
64 };
65 };
66
67 struct __half2_raw {
68 union {
69 static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
70
71 struct {
72 unsigned short x;
73 unsigned short y;
74 };
75 _Float16_2 data;
76 };
77 };
78
79 #if defined(__cplusplus)
80 #if !defined(__HIPCC_RTC__)
81 #include "hip_fp16_math_fwd.h"
82 #include "amd_hip_vector_types.h"
83 #include "host_defines.h"
84 #include "amd_device_functions.h"
85 #include "amd_warp_functions.h"
86 #endif
87 namespace std
88 {
89 template<> struct is_floating_point<_Float16> : std::true_type {};
90 }
91
92 template<bool cond, typename T = void>
93 using Enable_if_t = typename std::enable_if<cond, T>::type;
94
95 // BEGIN STRUCT __HALF
96 struct __half {
97 protected:
98 union {
99 static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
100
101 _Float16 data;
102 unsigned short __x;
103 };
104 public:
105 // CREATORS
106 __HOST_DEVICE__
107 __half() = default;
108 __HOST_DEVICE__
109 __half(const __half_raw& x) : data{x.data} {}
110 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
111 __HOST_DEVICE__
112 __half(decltype(data) x) : data{x} {}
113 template<
114 typename T,
115 Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
116 __HOST_DEVICE__
117 __half(T x) : data{static_cast<_Float16>(x)} {}
118 #endif
119 __HOST_DEVICE__
120 __half(const __half&) = default;
121 __HOST_DEVICE__
122 __half(__half&&) = default;
123 __HOST_DEVICE__
124 ~__half() = default;
125
126 // CREATORS - DEVICE ONLY
127 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
128 template<
129 typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
130 __HOST_DEVICE__
131 __half(T x) : data{static_cast<_Float16>(x)} {}
132 #endif
133
134 // MANIPULATORS
135 __HOST_DEVICE__
136 __half& operator=(const __half&) = default;
137 __HOST_DEVICE__
138 __half& operator=(__half&&) = default;
139 __HOST_DEVICE__
140 __half& operator=(const __half_raw& x)
141 {
142 data = x.data;
143 return *this;
144 }
145 __HOST_DEVICE__
146 volatile __half& operator=(const __half_raw& x) volatile
147 {
148 data = x.data;
149 return *this;
150 }
151 volatile __half& operator=(const volatile __half_raw& x) volatile
152 {
153 data = x.data;
154 return *this;
155 }
156 __half& operator=(__half_raw&& x)
157 {
158 data = x.data;
159 return *this;
160 }
161 volatile __half& operator=(__half_raw&& x) volatile
162 {
163 data = x.data;
164 return *this;
165 }
166 volatile __half& operator=(volatile __half_raw&& x) volatile
167 {
168 data = x.data;
169 return *this;
170 }
171 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
172 template<
173 typename T,
174 Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
175 __HOST_DEVICE__
176 __half& operator=(T x)
177 {
178 data = static_cast<_Float16>(x);
179 return *this;
180 }
181 #endif
182
183 // MANIPULATORS - DEVICE ONLY
184 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
185 template<
186 typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
187 __device__
188 __half& operator=(T x)
189 {
190 data = static_cast<_Float16>(x);
191 return *this;
192 }
193 #endif
194
195 #if !defined(__HIP_NO_HALF_OPERATORS__)
196 __device__
197 __half& operator+=(const __half& x)
198 {
199 data += x.data;
200 return *this;
201 }
202 __device__
203 __half& operator-=(const __half& x)
204 {
205 data -= x.data;
206 return *this;
207 }
208 __device__
209 __half& operator*=(const __half& x)
210 {
211 data *= x.data;
212 return *this;
213 }
214 __device__
215 __half& operator/=(const __half& x)
216 {
217 data /= x.data;
218 return *this;
219 }
220 __device__
221 __half& operator++() { ++data; return *this; }
222 __device__
223 __half operator++(int)
224 {
225 __half tmp{*this};
226 ++*this;
227 return tmp;
228 }
229 __device__
230 __half& operator--() { --data; return *this; }
231 __device__
232 __half operator--(int)
233 {
234 __half tmp{*this};
235 --*this;
236 return tmp;
237 }
238 #endif
239
240 // ACCESSORS
241 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
242 template<
243 typename T,
244 Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
245 __HOST_DEVICE__
246 operator T() const { return data; }
247 #endif
248 __HOST_DEVICE__
249 operator __half_raw() const { return __half_raw{data}; }
250 __HOST_DEVICE__
251 operator __half_raw() const volatile
252 {
253 return __half_raw{data};
254 }
255
256 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
257 template<
258 typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
259 __HOST_DEVICE__
260 operator T() const { return data; }
261 #endif
262
263 #if !defined(__HIP_NO_HALF_OPERATORS__)
264 __device__
265 __half operator+() const { return *this; }
266 __device__
267 __half operator-() const
268 {
269 __half tmp{*this};
270 tmp.data = -tmp.data;
271 return tmp;
272 }
273 #endif
274
275 // FRIENDS
276 #if !defined(__HIP_NO_HALF_OPERATORS__)
277 friend
278 inline
279 __device__
280 __half operator+(const __half& x, const __half& y)
281 {
282 return __half{x} += y;
283 }
284 friend
285 inline
286 __device__
287 __half operator-(const __half& x, const __half& y)
288 {
289 return __half{x} -= y;
290 }
291 friend
292 inline
293 __device__
294 __half operator*(const __half& x, const __half& y)
295 {
296 return __half{x} *= y;
297 }
298 friend
299 inline
300 __device__
301 __half operator/(const __half& x, const __half& y)
302 {
303 return __half{x} /= y;
304 }
305 friend
306 inline
307 __device__
308 bool operator==(const __half& x, const __half& y)
309 {
310 return x.data == y.data;
311 }
312 friend
313 inline
314 __device__
315 bool operator!=(const __half& x, const __half& y)
316 {
317 return !(x == y);
318 }
319 friend
320 inline
321 __device__
322 bool operator<(const __half& x, const __half& y)
323 {
324 return x.data < y.data;
325 }
326 friend
327 inline
328 __device__
329 bool operator>(const __half& x, const __half& y)
330 {
331 return y.data < x.data;
332 }
333 friend
334 inline
335 __device__
336 bool operator<=(const __half& x, const __half& y)
337 {
338 return !(y < x);
339 }
340 friend
341 inline
342 __device__
343 bool operator>=(const __half& x, const __half& y)
344 {
345 return !(x < y);
346 }
347 #endif // !defined(__HIP_NO_HALF_OPERATORS__)
348 };
349 // END STRUCT __HALF
350
351 // BEGIN STRUCT __HALF2
352 struct __half2 {
353 public:
354 union {
355 static_assert(
356 sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
357
358 struct {
359 unsigned short x;
360 unsigned short y;
361 };
362 _Float16_2 data;
363 };
364
365 // CREATORS
366 __HOST_DEVICE__
367 __half2() = default;
368 __HOST_DEVICE__
369 __half2(const __half2_raw& x) : data{x.data} {}
370 __HOST_DEVICE__
371 __half2(decltype(data) x) : data{x} {}
372 __HOST_DEVICE__
373 __half2(const __half& x, const __half& y)
374 :
375 data{
376 static_cast<__half_raw>(x).data,
377 static_cast<__half_raw>(y).data}
378 {}
379 __HOST_DEVICE__
380 __half2(const __half2&) = default;
381 __HOST_DEVICE__
382 __half2(__half2&&) = default;
383 __HOST_DEVICE__
384 ~__half2() = default;
385
386 // MANIPULATORS
387 __HOST_DEVICE__
388 __half2& operator=(const __half2&) = default;
389 __HOST_DEVICE__
390 __half2& operator=(__half2&&) = default;
391 __HOST_DEVICE__
392 __half2& operator=(const __half2_raw& x)
393 {
394 data = x.data;
395 return *this;
396 }
397
398 // MANIPULATORS - DEVICE ONLY
399 #if !defined(__HIP_NO_HALF_OPERATORS__)
400 __device__
401 __half2& operator+=(const __half2& x)
402 {
403 data += x.data;
404 return *this;
405 }
406 __device__
407 __half2& operator-=(const __half2& x)
408 {
409 data -= x.data;
410 return *this;
411 }
412 __device__
413 __half2& operator*=(const __half2& x)
414 {
415 data *= x.data;
416 return *this;
417 }
418 __device__
419 __half2& operator/=(const __half2& x)
420 {
421 data /= x.data;
422 return *this;
423 }
424 __device__
425 __half2& operator++() { return *this += _Float16_2{1, 1}; }
426 __device__
427 __half2 operator++(int)
428 {
429 __half2 tmp{*this};
430 ++*this;
431 return tmp;
432 }
433 __device__
434 __half2& operator--() { return *this -= _Float16_2{1, 1}; }
435 __device__
436 __half2 operator--(int)
437 {
438 __half2 tmp{*this};
439 --*this;
440 return tmp;
441 }
442 #endif
443
444 // ACCESSORS
445 __HOST_DEVICE__
446 operator decltype(data)() const { return data; }
447 __HOST_DEVICE__
448 operator __half2_raw() const {
449 __half2_raw r;
450 r.data = data;
451 return r;
452 }
453
454 // ACCESSORS - DEVICE ONLY
455 #if !defined(__HIP_NO_HALF_OPERATORS__)
456 __device__
457 __half2 operator+() const { return *this; }
458 __device__
459 __half2 operator-() const
460 {
461 __half2 tmp{*this};
462 tmp.data = -tmp.data;
463 return tmp;
464 }
465 #endif
466
467 // FRIENDS
468 #if !defined(__HIP_NO_HALF_OPERATORS__)
469 friend
470 inline
471 __device__
472 __half2 operator+(const __half2& x, const __half2& y)
473 {
474 return __half2{x} += y;
475 }
476 friend
477 inline
478 __device__
479 __half2 operator-(const __half2& x, const __half2& y)
480 {
481 return __half2{x} -= y;
482 }
483 friend
484 inline
485 __device__
486 __half2 operator*(const __half2& x, const __half2& y)
487 {
488 return __half2{x} *= y;
489 }
490 friend
491 inline
492 __device__
493 __half2 operator/(const __half2& x, const __half2& y)
494 {
495 return __half2{x} /= y;
496 }
497 friend
498 inline
499 __device__
500 bool operator==(const __half2& x, const __half2& y)
501 {
502 auto r = x.data == y.data;
503 return r.x != 0 && r.y != 0;
504 }
505 friend
506 inline
507 __device__
508 bool operator!=(const __half2& x, const __half2& y)
509 {
510 return !(x == y);
511 }
512 friend
513 inline
514 __device__
515 bool operator<(const __half2& x, const __half2& y)
516 {
517 auto r = x.data < y.data;
518 return r.x != 0 && r.y != 0;
519 }
520 friend
521 inline
522 __device__
523 bool operator>(const __half2& x, const __half2& y)
524 {
525 return y < x;
526 }
527 friend
528 inline
529 __device__
530 bool operator<=(const __half2& x, const __half2& y)
531 {
532 return !(y < x);
533 }
534 friend
535 inline
536 __device__
537 bool operator>=(const __half2& x, const __half2& y)
538 {
539 return !(x < y);
540 }
541 #endif // !defined(__HIP_NO_HALF_OPERATORS__)
542 };
543 // END STRUCT __HALF2
544
545 namespace
546 {
547 inline
548 __HOST_DEVICE__
549 __half2 make_half2(__half x, __half y)
550 {
551 return __half2{x, y};
552 }
553
554 inline
555 __HOST_DEVICE__
556 __half __low2half(__half2 x)
557 {
558 return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
559 }
560
561 inline
562 __HOST_DEVICE__
563 __half __high2half(__half2 x)
564 {
565 return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
566 }
567
568 inline
569 __HOST_DEVICE__
570 __half2 __half2half2(__half x)
571 {
572 return __half2{x, x};
573 }
574
575 inline
576 __HOST_DEVICE__
577 __half2 __halves2half2(__half x, __half y)
578 {
579 return __half2{x, y};
580 }
581
582 inline
583 __HOST_DEVICE__
584 __half2 __low2half2(__half2 x)
585 {
586 return __half2{
587 _Float16_2{
588 static_cast<__half2_raw>(x).data.x,
589 static_cast<__half2_raw>(x).data.x}};
590 }
591
592 inline
593 __HOST_DEVICE__
594 __half2 __high2half2(__half2 x)
595 {
596 return __half2{
597 _Float16_2{
598 static_cast<__half2_raw>(x).data.y,
599 static_cast<__half2_raw>(x).data.y}};
600 }
601
602 inline
603 __HOST_DEVICE__
604 __half2 __lows2half2(__half2 x, __half2 y)
605 {
606 return __half2{
607 _Float16_2{
608 static_cast<__half2_raw>(x).data.x,
609 static_cast<__half2_raw>(y).data.x}};
610 }
611
612 inline
613 __HOST_DEVICE__
614 __half2 __highs2half2(__half2 x, __half2 y)
615 {
616 return __half2{
617 _Float16_2{
618 static_cast<__half2_raw>(x).data.y,
619 static_cast<__half2_raw>(y).data.y}};
620 }
621
622 inline
623 __HOST_DEVICE__
624 __half2 __lowhigh2highlow(__half2 x)
625 {
626 return __half2{
627 _Float16_2{
628 static_cast<__half2_raw>(x).data.y,
629 static_cast<__half2_raw>(x).data.x}};
630 }
631
632 // Bitcasts
633 inline
634 __device__
635 short __half_as_short(__half x)
636 {
637 return static_cast<__half_raw>(x).x;
638 }
639
640 inline
641 __device__
642 unsigned short __half_as_ushort(__half x)
643 {
644 return static_cast<__half_raw>(x).x;
645 }
646
647 inline
648 __device__
649 __half __short_as_half(short x)
650 {
651 __half_raw r; r.x = x;
652 return r;
653 }
654
655 inline
656 __device__
657 __half __ushort_as_half(unsigned short x)
658 {
659 __half_raw r; r.x = x;
660 return r;
661 }
662
663 // float -> half | half2
664 inline
665 __HOST_DEVICE__
666 __half __float2half(float x)
667 {
668 return __half_raw{static_cast<_Float16>(x)};
669 }
670 inline
671 __HOST_DEVICE__
672 __half __float2half_rn(float x)
673 {
674 return __half_raw{static_cast<_Float16>(x)};
675 }
676 #if !defined(__HIPCC_RTC__)
677 // TODO: rounding behaviour is not correct for host functions.
678 inline
680 __half __float2half_rz(float x)
681 {
682 return __half_raw{static_cast<_Float16>(x)};
683 }
684 inline
686 __half __float2half_rd(float x)
687 {
688 return __half_raw{static_cast<_Float16>(x)};
689 }
690 inline
692 __half __float2half_ru(float x)
693 {
694 return __half_raw{static_cast<_Float16>(x)};
695 }
696 #endif
697 inline
698 __device__
699 __half __float2half_rz(float x)
700 {
701 return __half_raw{__ocml_cvtrtz_f16_f32(x)};
702 }
703 inline
704 __device__
705 __half __float2half_rd(float x)
706 {
707 return __half_raw{__ocml_cvtrtn_f16_f32(x)};
708 }
709 inline
710 __device__
711 __half __float2half_ru(float x)
712 {
713 return __half_raw{__ocml_cvtrtp_f16_f32(x)};
714 }
715 inline
716 __HOST_DEVICE__
717 __half2 __float2half2_rn(float x)
718 {
719 return __half2{
720 _Float16_2{
721 static_cast<_Float16>(x), static_cast<_Float16>(x)}};
722 }
723 inline
724 __HOST_DEVICE__
725 __half2 __floats2half2_rn(float x, float y)
726 {
727 return __half2{_Float16_2{
728 static_cast<_Float16>(x), static_cast<_Float16>(y)}};
729 }
730 inline
731 __HOST_DEVICE__
732 __half2 __float22half2_rn(float2 x)
733 {
734 return __floats2half2_rn(x.x, x.y);
735 }
736
737 // half | half2 -> float
738 inline
739 __HOST_DEVICE__
740 float __half2float(__half x)
741 {
742 return static_cast<__half_raw>(x).data;
743 }
744 inline
745 __HOST_DEVICE__
746 float __low2float(__half2 x)
747 {
748 return static_cast<__half2_raw>(x).data.x;
749 }
750 inline
751 __HOST_DEVICE__
752 float __high2float(__half2 x)
753 {
754 return static_cast<__half2_raw>(x).data.y;
755 }
756 inline
757 __HOST_DEVICE__
758 float2 __half22float2(__half2 x)
759 {
760 return make_float2(
761 static_cast<__half2_raw>(x).data.x,
762 static_cast<__half2_raw>(x).data.y);
763 }
764
765 // half -> int
766 inline
767 __device__
768 int __half2int_rn(__half x)
769 {
770 return static_cast<__half_raw>(x).data;
771 }
772 inline
773 __device__
774 int __half2int_rz(__half x)
775 {
776 return static_cast<__half_raw>(x).data;
777 }
778 inline
779 __device__
780 int __half2int_rd(__half x)
781 {
782 return static_cast<__half_raw>(x).data;
783 }
784 inline
785 __device__
786 int __half2int_ru(__half x)
787 {
788 return static_cast<__half_raw>(x).data;
789 }
790
791 // int -> half
792 inline
793 __device__
794 __half __int2half_rn(int x)
795 {
796 return __half_raw{static_cast<_Float16>(x)};
797 }
798 inline
799 __device__
800 __half __int2half_rz(int x)
801 {
802 return __half_raw{static_cast<_Float16>(x)};
803 }
804 inline
805 __device__
806 __half __int2half_rd(int x)
807 {
808 return __half_raw{static_cast<_Float16>(x)};
809 }
810 inline
811 __device__
812 __half __int2half_ru(int x)
813 {
814 return __half_raw{static_cast<_Float16>(x)};
815 }
816
817 // half -> short
818 inline
819 __device__
820 short __half2short_rn(__half x)
821 {
822 return static_cast<__half_raw>(x).data;
823 }
824 inline
825 __device__
826 short __half2short_rz(__half x)
827 {
828 return static_cast<__half_raw>(x).data;
829 }
830 inline
831 __device__
832 short __half2short_rd(__half x)
833 {
834 return static_cast<__half_raw>(x).data;
835 }
836 inline
837 __device__
838 short __half2short_ru(__half x)
839 {
840 return static_cast<__half_raw>(x).data;
841 }
842
843 // short -> half
844 inline
845 __device__
846 __half __short2half_rn(short x)
847 {
848 return __half_raw{static_cast<_Float16>(x)};
849 }
850 inline
851 __device__
852 __half __short2half_rz(short x)
853 {
854 return __half_raw{static_cast<_Float16>(x)};
855 }
856 inline
857 __device__
858 __half __short2half_rd(short x)
859 {
860 return __half_raw{static_cast<_Float16>(x)};
861 }
862 inline
863 __device__
864 __half __short2half_ru(short x)
865 {
866 return __half_raw{static_cast<_Float16>(x)};
867 }
868
869 // half -> long long
870 inline
871 __device__
872 long long __half2ll_rn(__half x)
873 {
874 return static_cast<__half_raw>(x).data;
875 }
876 inline
877 __device__
878 long long __half2ll_rz(__half x)
879 {
880 return static_cast<__half_raw>(x).data;
881 }
882 inline
883 __device__
884 long long __half2ll_rd(__half x)
885 {
886 return static_cast<__half_raw>(x).data;
887 }
888 inline
889 __device__
890 long long __half2ll_ru(__half x)
891 {
892 return static_cast<__half_raw>(x).data;
893 }
894
895 // long long -> half
896 inline
897 __device__
898 __half __ll2half_rn(long long x)
899 {
900 return __half_raw{static_cast<_Float16>(x)};
901 }
902 inline
903 __device__
904 __half __ll2half_rz(long long x)
905 {
906 return __half_raw{static_cast<_Float16>(x)};
907 }
908 inline
909 __device__
910 __half __ll2half_rd(long long x)
911 {
912 return __half_raw{static_cast<_Float16>(x)};
913 }
914 inline
915 __device__
916 __half __ll2half_ru(long long x)
917 {
918 return __half_raw{static_cast<_Float16>(x)};
919 }
920
921 // half -> unsigned int
922 inline
923 __device__
924 unsigned int __half2uint_rn(__half x)
925 {
926 return static_cast<__half_raw>(x).data;
927 }
928 inline
929 __device__
930 unsigned int __half2uint_rz(__half x)
931 {
932 return static_cast<__half_raw>(x).data;
933 }
934 inline
935 __device__
936 unsigned int __half2uint_rd(__half x)
937 {
938 return static_cast<__half_raw>(x).data;
939 }
940 inline
941 __device__
942 unsigned int __half2uint_ru(__half x)
943 {
944 return static_cast<__half_raw>(x).data;
945 }
946
947 // unsigned int -> half
948 inline
949 __device__
950 __half __uint2half_rn(unsigned int x)
951 {
952 return __half_raw{static_cast<_Float16>(x)};
953 }
954 inline
955 __device__
956 __half __uint2half_rz(unsigned int x)
957 {
958 return __half_raw{static_cast<_Float16>(x)};
959 }
960 inline
961 __device__
962 __half __uint2half_rd(unsigned int x)
963 {
964 return __half_raw{static_cast<_Float16>(x)};
965 }
966 inline
967 __device__
968 __half __uint2half_ru(unsigned int x)
969 {
970 return __half_raw{static_cast<_Float16>(x)};
971 }
972
973 // half -> unsigned short
974 inline
975 __device__
976 unsigned short __half2ushort_rn(__half x)
977 {
978 return static_cast<__half_raw>(x).data;
979 }
980 inline
981 __device__
982 unsigned short __half2ushort_rz(__half x)
983 {
984 return static_cast<__half_raw>(x).data;
985 }
986 inline
987 __device__
988 unsigned short __half2ushort_rd(__half x)
989 {
990 return static_cast<__half_raw>(x).data;
991 }
992 inline
993 __device__
994 unsigned short __half2ushort_ru(__half x)
995 {
996 return static_cast<__half_raw>(x).data;
997 }
998
999 // unsigned short -> half
1000 inline
1001 __device__
1002 __half __ushort2half_rn(unsigned short x)
1003 {
1004 return __half_raw{static_cast<_Float16>(x)};
1005 }
1006 inline
1007 __device__
1008 __half __ushort2half_rz(unsigned short x)
1009 {
1010 return __half_raw{static_cast<_Float16>(x)};
1011 }
1012 inline
1013 __device__
1014 __half __ushort2half_rd(unsigned short x)
1015 {
1016 return __half_raw{static_cast<_Float16>(x)};
1017 }
1018 inline
1019 __device__
1020 __half __ushort2half_ru(unsigned short x)
1021 {
1022 return __half_raw{static_cast<_Float16>(x)};
1023 }
1024
1025 // half -> unsigned long long
1026 inline
1027 __device__
1028 unsigned long long __half2ull_rn(__half x)
1029 {
1030 return static_cast<__half_raw>(x).data;
1031 }
1032 inline
1033 __device__
1034 unsigned long long __half2ull_rz(__half x)
1035 {
1036 return static_cast<__half_raw>(x).data;
1037 }
1038 inline
1039 __device__
1040 unsigned long long __half2ull_rd(__half x)
1041 {
1042 return static_cast<__half_raw>(x).data;
1043 }
1044 inline
1045 __device__
1046 unsigned long long __half2ull_ru(__half x)
1047 {
1048 return static_cast<__half_raw>(x).data;
1049 }
1050
1051 // unsigned long long -> half
1052 inline
1053 __device__
1054 __half __ull2half_rn(unsigned long long x)
1055 {
1056 return __half_raw{static_cast<_Float16>(x)};
1057 }
1058 inline
1059 __device__
1060 __half __ull2half_rz(unsigned long long x)
1061 {
1062 return __half_raw{static_cast<_Float16>(x)};
1063 }
1064 inline
1065 __device__
1066 __half __ull2half_rd(unsigned long long x)
1067 {
1068 return __half_raw{static_cast<_Float16>(x)};
1069 }
1070 inline
1071 __device__
1072 __half __ull2half_ru(unsigned long long x)
1073 {
1074 return __half_raw{static_cast<_Float16>(x)};
1075 }
1076
1077 // Load primitives
1078 inline
1079 __device__
1080 __half __ldg(const __half* ptr) { return *ptr; }
1081 inline
1082 __device__
1083 __half __ldcg(const __half* ptr) { return *ptr; }
1084 inline
1085 __device__
1086 __half __ldca(const __half* ptr) { return *ptr; }
1087 inline
1088 __device__
1089 __half __ldcs(const __half* ptr) { return *ptr; }
1090
1091 inline
1092 __HOST_DEVICE__
1093 __half2 __ldg(const __half2* ptr) { return *ptr; }
1094 inline
1095 __HOST_DEVICE__
1096 __half2 __ldcg(const __half2* ptr) { return *ptr; }
1097 inline
1098 __HOST_DEVICE__
1099 __half2 __ldca(const __half2* ptr) { return *ptr; }
1100 inline
1101 __HOST_DEVICE__
1102 __half2 __ldcs(const __half2* ptr) { return *ptr; }
1103
1104 // Relations
1105 inline
1106 __device__
1107 bool __heq(__half x, __half y)
1108 {
1109 return static_cast<__half_raw>(x).data ==
1110 static_cast<__half_raw>(y).data;
1111 }
1112 inline
1113 __device__
1114 bool __hne(__half x, __half y)
1115 {
1116 return static_cast<__half_raw>(x).data !=
1117 static_cast<__half_raw>(y).data;
1118 }
1119 inline
1120 __device__
1121 bool __hle(__half x, __half y)
1122 {
1123 return static_cast<__half_raw>(x).data <=
1124 static_cast<__half_raw>(y).data;
1125 }
1126 inline
1127 __device__
1128 bool __hge(__half x, __half y)
1129 {
1130 return static_cast<__half_raw>(x).data >=
1131 static_cast<__half_raw>(y).data;
1132 }
1133 inline
1134 __device__
1135 bool __hlt(__half x, __half y)
1136 {
1137 return static_cast<__half_raw>(x).data <
1138 static_cast<__half_raw>(y).data;
1139 }
1140 inline
1141 __device__
1142 bool __hgt(__half x, __half y)
1143 {
1144 return static_cast<__half_raw>(x).data >
1145 static_cast<__half_raw>(y).data;
1146 }
1147 inline __device__
1148 bool __hequ(__half x, __half y) {
1149 return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) &&
1150 !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
1151 }
1152 inline __device__
1153 bool __hneu(__half x, __half y) {
1154 return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data);
1155 }
1156 inline __device__
1157 bool __hleu(__half x, __half y) {
1158 return !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
1159 }
1160 inline
1161 __device__
1162 bool __hgeu(__half x, __half y) {
1163 return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data);
1164 }
1165 inline
1166 __device__
1167 bool __hltu(__half x, __half y) {
1168 return !(static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data);
1169 }
1170 inline
1171 __device__
1172 bool __hgtu(__half x, __half y) {
1173 return !(static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data);
1174 }
1175
1176 inline
1177 __HOST_DEVICE__
1178 __half2 __heq2(__half2 x, __half2 y)
1179 {
1180 auto r = static_cast<__half2_raw>(x).data ==
1181 static_cast<__half2_raw>(y).data;
1182 return __builtin_convertvector(-r, _Float16_2);
1183 }
1184 inline
1185 __HOST_DEVICE__
1186 __half2 __hne2(__half2 x, __half2 y)
1187 {
1188 auto r = static_cast<__half2_raw>(x).data !=
1189 static_cast<__half2_raw>(y).data;
1190 return __builtin_convertvector(-r, _Float16_2);
1191 }
1192 inline
1193 __HOST_DEVICE__
1194 __half2 __hle2(__half2 x, __half2 y)
1195 {
1196 auto r = static_cast<__half2_raw>(x).data <=
1197 static_cast<__half2_raw>(y).data;
1198 return __builtin_convertvector(-r, _Float16_2);
1199 }
1200 inline
1201 __HOST_DEVICE__
1202 __half2 __hge2(__half2 x, __half2 y)
1203 {
1204 auto r = static_cast<__half2_raw>(x).data >=
1205 static_cast<__half2_raw>(y).data;
1206 return __builtin_convertvector(-r, _Float16_2);
1207 }
1208 inline
1209 __HOST_DEVICE__
1210 __half2 __hlt2(__half2 x, __half2 y)
1211 {
1212 auto r = static_cast<__half2_raw>(x).data <
1213 static_cast<__half2_raw>(y).data;
1214 return __builtin_convertvector(-r, _Float16_2);
1215 }
1216 inline
1217 __HOST_DEVICE__
1218 __half2 __hgt2(__half2 x, __half2 y)
1219 {
1220 auto r = static_cast<__half2_raw>(x).data >
1221 static_cast<__half2_raw>(y).data;
1222 return __builtin_convertvector(-r, _Float16_2);
1223 }
1224 inline __HOST_DEVICE__
1225 __half2 __hequ2(__half2 x, __half2 y) {
1226 auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) &&
1227 !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
1228 return __builtin_convertvector(-r, _Float16_2);
1229 }
1230 inline
1231 __HOST_DEVICE__
1232 __half2 __hneu2(__half2 x, __half2 y) {
1233 auto r = !(static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data);
1234 return __builtin_convertvector(-r, _Float16_2);
1235 }
1236 inline
1237 __HOST_DEVICE__
1238 __half2 __hleu2(__half2 x, __half2 y) {
1239 auto r = !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
1240 return __builtin_convertvector(-r, _Float16_2);
1241 }
1242 inline
1243 __HOST_DEVICE__
1244 __half2 __hgeu2(__half2 x, __half2 y) {
1245 auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data);
1246 return __builtin_convertvector(-r, _Float16_2);
1247 }
1248 inline
1249 __HOST_DEVICE__
1250 __half2 __hltu2(__half2 x, __half2 y) {
1251 auto r = !(static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data);
1252 return __builtin_convertvector(-r, _Float16_2);
1253 }
1254 inline
1255 __HOST_DEVICE__
1256 __half2 __hgtu2(__half2 x, __half2 y) {
1257 auto r = !(static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data);
1258 return __builtin_convertvector(-r, _Float16_2);
1259 }
1260
1261 inline
1262 __HOST_DEVICE__
1263 bool __hbeq2(__half2 x, __half2 y)
1264 {
1265 auto r = static_cast<__half2_raw>(__heq2(x, y));
1266 return r.data.x != 0 && r.data.y != 0;
1267 }
1268 inline
1269 __HOST_DEVICE__
1270 bool __hbne2(__half2 x, __half2 y)
1271 {
1272 auto r = static_cast<__half2_raw>(__hne2(x, y));
1273 return r.data.x != 0 && r.data.y != 0;
1274 }
1275 inline
1276 __HOST_DEVICE__
1277 bool __hble2(__half2 x, __half2 y)
1278 {
1279 auto r = static_cast<__half2_raw>(__hle2(x, y));
1280 return r.data.x != 0 && r.data.y != 0;
1281 }
1282 inline
1283 __HOST_DEVICE__
1284 bool __hbge2(__half2 x, __half2 y)
1285 {
1286 auto r = static_cast<__half2_raw>(__hge2(x, y));
1287 return r.data.x != 0 && r.data.y != 0;
1288 }
1289 inline
1290 __HOST_DEVICE__
1291 bool __hblt2(__half2 x, __half2 y)
1292 {
1293 auto r = static_cast<__half2_raw>(__hlt2(x, y));
1294 return r.data.x != 0 && r.data.y != 0;
1295 }
1296 inline
1297 __HOST_DEVICE__
1298 bool __hbgt2(__half2 x, __half2 y)
1299 {
1300 auto r = static_cast<__half2_raw>(__hgt2(x, y));
1301 return r.data.x != 0 && r.data.y != 0;
1302 }
1303 inline
1304 __HOST_DEVICE__
1305 bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
1306 inline
1307 __HOST_DEVICE__
1308 bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
1309 inline
1310 __HOST_DEVICE__
1311 bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
1312 inline
1313 __HOST_DEVICE__
1314 bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
1315 inline
1316 __HOST_DEVICE__
1317 bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
1318 inline
1319 __HOST_DEVICE__
1320 bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
1321 inline
1322 __device__
1323 __half __hmax(const __half x, const __half y) {
1324 return __half_raw{__ocml_fmax_f16(static_cast<__half_raw>(x).data,
1325 static_cast<__half_raw>(y).data)};
1326 }
1327 inline
1328 __device__
1329 __half __hmax_nan(const __half x, const __half y) {
1330 if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
1331 return x;
1332 } else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
1333 return y;
1334 }
1335 return __hmax(x, y);
1336 }
1337 inline
1338 __device__
1339 __half __hmin(const __half x, const __half y) {
1340 return __half_raw{__ocml_fmin_f16(static_cast<__half_raw>(x).data,
1341 static_cast<__half_raw>(y).data)};
1342 }
1343 inline
1344 __device__
1345 __half __hmin_nan(const __half x, const __half y) {
1346 if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
1347 return x;
1348 } else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
1349 return y;
1350 }
1351 return __hmin(x, y);
1352 }
1353
1354 // Arithmetic
1355 inline
1356 __device__
1357 __half __clamp_01(__half x)
1358 {
1359 auto r = static_cast<__half_raw>(x);
1360
1361 if (__hlt(x, __half_raw{0})) return __half_raw{0};
1362 if (__hlt(__half_raw{1}, x)) return __half_raw{1};
1363 return r;
1364 }
1365
1366 inline
1367 __device__
1368 __half __hadd(__half x, __half y)
1369 {
1370 return __half_raw{
1371 static_cast<__half_raw>(x).data +
1372 static_cast<__half_raw>(y).data};
1373 }
1374 inline
1375 __device__
1376 __half __habs(__half x)
1377 {
1378 return __half_raw{
1379 __ocml_fabs_f16(static_cast<__half_raw>(x).data)};
1380 }
1381 inline
1382 __device__
1383 __half __hsub(__half x, __half y)
1384 {
1385 return __half_raw{
1386 static_cast<__half_raw>(x).data -
1387 static_cast<__half_raw>(y).data};
1388 }
1389 inline
1390 __device__
1391 __half __hmul(__half x, __half y)
1392 {
1393 return __half_raw{
1394 static_cast<__half_raw>(x).data *
1395 static_cast<__half_raw>(y).data};
1396 }
1397 inline
1398 __device__
1399 __half __hadd_sat(__half x, __half y)
1400 {
1401 return __clamp_01(__hadd(x, y));
1402 }
1403 inline
1404 __device__
1405 __half __hsub_sat(__half x, __half y)
1406 {
1407 return __clamp_01(__hsub(x, y));
1408 }
1409 inline
1410 __device__
1411 __half __hmul_sat(__half x, __half y)
1412 {
1413 return __clamp_01(__hmul(x, y));
1414 }
1415 inline
1416 __device__
1417 __half __hfma(__half x, __half y, __half z)
1418 {
1419 return __half_raw{__ocml_fma_f16(
1420 static_cast<__half_raw>(x).data,
1421 static_cast<__half_raw>(y).data,
1422 static_cast<__half_raw>(z).data)};
1423 }
1424 inline
1425 __device__
1426 __half __hfma_sat(__half x, __half y, __half z)
1427 {
1428 return __clamp_01(__hfma(x, y, z));
1429 }
1430 inline
1431 __device__
1432 __half __hdiv(__half x, __half y)
1433 {
1434 return __half_raw{
1435 static_cast<__half_raw>(x).data /
1436 static_cast<__half_raw>(y).data};
1437 }
1438
1439 inline
1440 __HOST_DEVICE__
1441 __half2 __hadd2(__half2 x, __half2 y)
1442 {
1443 return __half2{
1444 static_cast<__half2_raw>(x).data +
1445 static_cast<__half2_raw>(y).data};
1446 }
1447 inline
1448 __HOST_DEVICE__
1449 __half2 __habs2(__half2 x)
1450 {
1451 return __half2{
1452 __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
1453 }
1454 inline
1455 __HOST_DEVICE__
1456 __half2 __hsub2(__half2 x, __half2 y)
1457 {
1458 return __half2{
1459 static_cast<__half2_raw>(x).data -
1460 static_cast<__half2_raw>(y).data};
1461 }
1462 inline
1463 __HOST_DEVICE__
1464 __half2 __hmul2(__half2 x, __half2 y)
1465 {
1466 return __half2{
1467 static_cast<__half2_raw>(x).data *
1468 static_cast<__half2_raw>(y).data};
1469 }
1470 inline
1471 __HOST_DEVICE__
1472 __half2 __hadd2_sat(__half2 x, __half2 y)
1473 {
1474 auto r = static_cast<__half2_raw>(__hadd2(x, y));
1475 return __half2{
1476 __clamp_01(__half_raw{r.data.x}),
1477 __clamp_01(__half_raw{r.data.y})};
1478 }
1479 inline
1480 __HOST_DEVICE__
1481 __half2 __hsub2_sat(__half2 x, __half2 y)
1482 {
1483 auto r = static_cast<__half2_raw>(__hsub2(x, y));
1484 return __half2{
1485 __clamp_01(__half_raw{r.data.x}),
1486 __clamp_01(__half_raw{r.data.y})};
1487 }
1488 inline
1489 __HOST_DEVICE__
1490 __half2 __hmul2_sat(__half2 x, __half2 y)
1491 {
1492 auto r = static_cast<__half2_raw>(__hmul2(x, y));
1493 return __half2{
1494 __clamp_01(__half_raw{r.data.x}),
1495 __clamp_01(__half_raw{r.data.y})};
1496 }
1497 inline
1498 __HOST_DEVICE__
1499 __half2 __hfma2(__half2 x, __half2 y, __half2 z)
1500 {
1501 return __half2{__ocml_fma_2f16(x, y, z)};
1502 }
1503 inline
1504 __HOST_DEVICE__
1505 __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
1506 {
1507 auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
1508 return __half2{
1509 __clamp_01(__half_raw{r.data.x}),
1510 __clamp_01(__half_raw{r.data.y})};
1511 }
1512 inline
1513 __HOST_DEVICE__
1514 __half2 __h2div(__half2 x, __half2 y)
1515 {
1516 return __half2{
1517 static_cast<__half2_raw>(x).data /
1518 static_cast<__half2_raw>(y).data};
1519 }
1520
1521 // Math functions
1522 #if defined(__clang__) && defined(__HIP__)
1523 inline
1524 __device__
1525 float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) {
1526 return __ockl_fdot2(static_cast<__half2_raw>(a).data,
1527 static_cast<__half2_raw>(b).data,
1528 c, saturate);
1529 }
1530 #endif
1531 inline
1532 __device__
1533 __half htrunc(__half x)
1534 {
1535 return __half_raw{
1536 __ocml_trunc_f16(static_cast<__half_raw>(x).data)};
1537 }
1538 inline
1539 __device__
1540 __half hceil(__half x)
1541 {
1542 return __half_raw{
1543 __ocml_ceil_f16(static_cast<__half_raw>(x).data)};
1544 }
1545 inline
1546 __device__
1547 __half hfloor(__half x)
1548 {
1549 return __half_raw{
1550 __ocml_floor_f16(static_cast<__half_raw>(x).data)};
1551 }
1552 inline
1553 __device__
1554 __half hrint(__half x)
1555 {
1556 return __half_raw{
1557 __ocml_rint_f16(static_cast<__half_raw>(x).data)};
1558 }
1559 inline
1560 __device__
1561 __half hsin(__half x)
1562 {
1563 return __half_raw{
1564 __ocml_sin_f16(static_cast<__half_raw>(x).data)};
1565 }
1566 inline
1567 __device__
1568 __half hcos(__half x)
1569 {
1570 return __half_raw{
1571 __ocml_cos_f16(static_cast<__half_raw>(x).data)};
1572 }
1573 inline
1574 __device__
1575 __half hexp(__half x)
1576 {
1577 return __half_raw{
1578 __ocml_exp_f16(static_cast<__half_raw>(x).data)};
1579 }
1580 inline
1581 __device__
1582 __half hexp2(__half x)
1583 {
1584 return __half_raw{
1585 __ocml_exp2_f16(static_cast<__half_raw>(x).data)};
1586 }
1587 inline
1588 __device__
1589 __half hexp10(__half x)
1590 {
1591 return __half_raw{
1592 __ocml_exp10_f16(static_cast<__half_raw>(x).data)};
1593 }
1594 inline
1595 __device__
1596 __half hlog2(__half x)
1597 {
1598 return __half_raw{
1599 __ocml_log2_f16(static_cast<__half_raw>(x).data)};
1600 }
1601 inline
1602 __device__
1603 __half hlog(__half x)
1604 {
1605 return __half_raw{
1606 __ocml_log_f16(static_cast<__half_raw>(x).data)};
1607 }
1608 inline
1609 __device__
1610 __half hlog10(__half x)
1611 {
1612 return __half_raw{
1613 __ocml_log10_f16(static_cast<__half_raw>(x).data)};
1614 }
1615 inline
1616 __device__
1617 __half hrcp(__half x)
1618 {
1619 return __half_raw{
1620 static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
1621 }
1622 inline
1623 __device__
1624 __half hrsqrt(__half x)
1625 {
1626 return __half_raw{
1627 __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
1628 }
1629 inline
1630 __device__
1631 __half hsqrt(__half x)
1632 {
1633 return __half_raw{
1634 __ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
1635 }
1636 inline
1637 __device__
1638 bool __hisinf(__half x)
1639 {
1640 return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
1641 }
1642 inline
1643 __device__
1644 bool __hisnan(__half x)
1645 {
1646 return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
1647 }
1648 inline
1649 __device__
1650 __half __hneg(__half x)
1651 {
1652 return __half_raw{-static_cast<__half_raw>(x).data};
1653 }
1654
1655 inline
1656 __HOST_DEVICE__
1657 __half2 h2trunc(__half2 x)
1658 {
1659 return __half2{__ocml_trunc_2f16(x)};
1660 }
1661 inline
1662 __HOST_DEVICE__
1663 __half2 h2ceil(__half2 x)
1664 {
1665 return __half2{__ocml_ceil_2f16(x)};
1666 }
1667 inline
1668 __HOST_DEVICE__
1669 __half2 h2floor(__half2 x)
1670 {
1671 return __half2{__ocml_floor_2f16(x)};
1672 }
1673 inline
1674 __HOST_DEVICE__
1675 __half2 h2rint(__half2 x)
1676 {
1677 return __half2{__ocml_rint_2f16(x)};
1678 }
1679 inline
1680 __HOST_DEVICE__
1681 __half2 h2sin(__half2 x)
1682 {
1683 return __half2{__ocml_sin_2f16(x)};
1684 }
1685 inline
1686 __HOST_DEVICE__
1687 __half2 h2cos(__half2 x)
1688 {
1689 return __half2{__ocml_cos_2f16(x)};
1690 }
1691 inline
1692 __HOST_DEVICE__
1693 __half2 h2exp(__half2 x)
1694 {
1695 return __half2{__ocml_exp_2f16(x)};
1696 }
1697 inline
1698 __HOST_DEVICE__
1699 __half2 h2exp2(__half2 x)
1700 {
1701 return __half2{__ocml_exp2_2f16(x)};
1702 }
1703 inline
1704 __HOST_DEVICE__
1705 __half2 h2exp10(__half2 x)
1706 {
1707 return __half2{__ocml_exp10_2f16(x)};
1708 }
1709 inline
1710 __HOST_DEVICE__
1711 __half2 h2log2(__half2 x)
1712 {
1713 return __half2{__ocml_log2_2f16(x)};
1714 }
1715 inline
1716 __HOST_DEVICE__
1717 __half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
1718 inline
1719 __HOST_DEVICE__
1720 __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
1721 inline
1722 __HOST_DEVICE__
1723 __half2 h2rcp(__half2 x) {
1724 return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
1725 static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
1726 }
1727 inline
1728 __HOST_DEVICE__
1729 __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
1730 inline
1731 __HOST_DEVICE__
1732 __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
1733 inline
1734 __HOST_DEVICE__
1735 __half2 __hisinf2(__half2 x)
1736 {
1737 auto r = __ocml_isinf_2f16(x);
1738 return __half2{_Float16_2{
1739 static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
1740 }
1741 inline
1742 __HOST_DEVICE__
1743 __half2 __hisnan2(__half2 x)
1744 {
1745 auto r = __ocml_isnan_2f16(x);
1746 return __half2{_Float16_2{
1747 static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
1748 }
1749 inline
1750 __HOST_DEVICE__
1751 __half2 __hneg2(__half2 x)
1752 {
1753 return __half2{-static_cast<__half2_raw>(x).data};
1754 }
1755 } // Anonymous namespace.
1756
1757 #if !defined(HIP_NO_HALF)
1758 using half = __half;
1759 using half2 = __half2;
1760 #endif
1761 __device__
1762 inline
1763 __half __shfl(__half var, int src_lane, int width = warpSize) {
1764 union { int i; __half h; } tmp; tmp.h = var;
1765 tmp.i = __shfl(tmp.i, src_lane, width);
1766 return tmp.h;
1767 }
1768 __device__
1769 inline
1770 __half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
1771 union { int i; __half2 h; } tmp; tmp.h = var;
1772 tmp.i = __shfl(tmp.i, src_lane, width);
1773 return tmp.h;
1774 }
1775 __device__
1776 inline
1777 __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
1778 union { int i; __half h; } tmp; tmp.h = var;
1779 tmp.i = __shfl_up(tmp.i, lane_delta, width);
1780 return tmp.h;
1781 }
1782 __device__
1783 inline
1784 __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
1785 union { int i; __half2 h; } tmp; tmp.h = var;
1786 tmp.i = __shfl_up(tmp.i, lane_delta, width);
1787 return tmp.h;
1788 }
1789 __device__
1790 inline
1791 __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
1792 union { int i; __half h; } tmp; tmp.h = var;
1793 tmp.i = __shfl_down(tmp.i, lane_delta, width);
1794 return tmp.h;
1795 }
1796 __device__
1797 inline
1798 __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
1799 union { int i; __half2 h; } tmp; tmp.h = var;
1800 tmp.i = __shfl_down(tmp.i, lane_delta, width);
1801 return tmp.h;
1802 }
1803 __device__
1804 inline
1805 __half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
1806 union { int i; __half h; } tmp; tmp.h = var;
1807 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
1808 return tmp.h;
1809 }
1810 __device__
1811 inline
1812 __half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
1813 union { int i; __half2 h; } tmp; tmp.h = var;
1814 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
1815 return tmp.h;
1816 }
1817 #endif // defined(__cplusplus)
1818#elif defined(__GNUC__)
1819 #if !defined(__HIPCC_RTC__)
1820 #include "hip_fp16_gcc.h"
1821 #endif
1822#endif // !defined(__clang__) && defined(__GNUC__)
1823
1824#if defined(__clang__)
1825#pragma clang diagnostic pop
1826#endif
1827
1828#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
#define __host__
Definition host_defines.h:170
__device__ __hip_bfloat16 __habs(const __hip_bfloat16 a)
Returns absolute of a bfloat16.
Definition amd_hip_bf16.h:365
__device__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b)
Subtracts two bfloat16 values.
Definition amd_hip_bf16.h:321
__device__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, const __hip_bfloat16 c)
Performs FMA of given bfloat16 values.
Definition amd_hip_bf16.h:337
__device__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b)
Divides two bfloat16 values.
Definition amd_hip_bf16.h:329
__device__ __hip_bfloat16 __hneg(const __hip_bfloat16 a)
Negate a bfloat16 value.
Definition amd_hip_bf16.h:355
__device__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b)
Multiplies two bfloat16 values.
Definition amd_hip_bf16.h:347
__device__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b)
Adds two bfloat16 values.
Definition amd_hip_bf16.h:313
__device__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than equal.
Definition amd_hip_bf16.h:466
__device__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than operator.
Definition amd_hip_bf16.h:514
__device__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered not equal.
Definition amd_hip_bf16.h:490
__device__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than.
Definition amd_hip_bf16.h:450
__device__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values.
Definition amd_hip_bf16.h:433
__device__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than.
Definition amd_hip_bf16.h:522
__device__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than equal.
Definition amd_hip_bf16.h:538
__device__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return min.
Definition amd_hip_bf16.h:506
__device__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - not equal.
Definition amd_hip_bf16.h:482
__device__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered equal.
Definition amd_hip_bf16.h:441
__device__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return max.
Definition amd_hip_bf16.h:498
__device__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than.
Definition amd_hip_bf16.h:458
__device__ int __hisinf(const __hip_bfloat16 a)
Checks if number is inf.
Definition amd_hip_bf16.h:546
__device__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than.
Definition amd_hip_bf16.h:530
__device__ bool __hisnan(const __hip_bfloat16 a)
Checks if number is nan.
Definition amd_hip_bf16.h:552
__device__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than equal.
Definition amd_hip_bf16.h:474
__device__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b - unordered.
Definition amd_hip_bf16.h:582
__device__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:663
__device__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b - unordered.
Definition amd_hip_bf16.h:630
__device__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b.
Definition amd_hip_bf16.h:606
__device__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:700
__device__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal - unordered.
Definition amd_hip_bf16.h:566
__device__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal.
Definition amd_hip_bf16.h:558
__device__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:691
__device__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b, returns 1.0 if equal, otherwise 0.0.
Definition amd_hip_bf16.h:654
__device__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks for not equal to.
Definition amd_hip_bf16.h:729
__device__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b.
Definition amd_hip_bf16.h:622
__device__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b.
Definition amd_hip_bf16.h:574
__device__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b.
Definition amd_hip_bf16.h:590
__device__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition amd_hip_bf16.h:646
__device__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b - unordered.
Definition amd_hip_bf16.h:614
__device__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a)
Check for a is NaN, returns 1.0 if NaN, otherwise 0.0.
Definition amd_hip_bf16.h:681
__device__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition amd_hip_bf16.h:638
__device__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:672
__device__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b - unordered.
Definition amd_hip_bf16.h:598
__device__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c)
Performs FMA of given bfloat162 values.
Definition amd_hip_bf16.h:400
__device__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, const __hip_bfloat162 b)
Divides bfloat162 values.
Definition amd_hip_bf16.h:375
__device__ __hip_bfloat162 __habs2(const __hip_bfloat162 a)
Returns absolute of a bfloat162.
Definition amd_hip_bf16.h:384
__device__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Adds two bfloat162 values.
Definition amd_hip_bf16.h:392
__device__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Subtracts two bfloat162 values.
Definition amd_hip_bf16.h:425
__device__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a)
Converts a bfloat162 into negative.
Definition amd_hip_bf16.h:417
__device__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Multiplies two bfloat162 values.
Definition amd_hip_bf16.h:409
__HOST_DEVICE__ float __high2float(const __hip_bfloat162 a)
Converts high 16 bits of __hip_bfloat162 to float and returns the result.
Definition amd_hip_bf16.h:246
__HOST_DEVICE__ float __low2float(const __hip_bfloat162 a)
Converts low 16 bits of __hip_bfloat162 to float and returns the result.
Definition amd_hip_bf16.h:274
__device__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a)
Swaps both halves.
Definition amd_hip_bf16.h:280
__device__ __hip_bfloat16 hexp(const __hip_bfloat16 h)
Calculate exponential of bfloat16.
Definition amd_hip_bf16.h:754
__device__ __hip_bfloat16 hrint(const __hip_bfloat16 h)
Round to nearest int.
Definition amd_hip_bf16.h:818
__device__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h)
Reciprocal square root.
Definition amd_hip_bf16.h:826
__device__ __hip_bfloat16 hcos(const __hip_bfloat16 h)
Calculate cosine of bfloat16.
Definition amd_hip_bf16.h:746
__device__ __hip_bfloat16 htrunc(const __hip_bfloat16 h)
Calculate truncate of bfloat16.
Definition amd_hip_bf16.h:850
__device__ __hip_bfloat16 hlog10(const __hip_bfloat16 h)
Calculate log 10 of bfloat16.
Definition amd_hip_bf16.h:794
__device__ __hip_bfloat16 hexp10(const __hip_bfloat16 h)
Calculate exponential 10 of bfloat16.
Definition amd_hip_bf16.h:762
__device__ __hip_bfloat16 hceil(const __hip_bfloat16 h)
Calculate ceil of bfloat16.
Definition amd_hip_bf16.h:738
__device__ __hip_bfloat16 hrcp(const __hip_bfloat16 h)
Calculate reciprocal.
Definition amd_hip_bf16.h:810
__device__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h)
Calculate sqrt of bfloat16.
Definition amd_hip_bf16.h:842
__device__ __hip_bfloat16 hfloor(const __hip_bfloat16 h)
Calculate floor of bfloat16.
Definition amd_hip_bf16.h:778
__device__ __hip_bfloat16 hsin(const __hip_bfloat16 h)
Calculate sin of bfloat16.
Definition amd_hip_bf16.h:834
__device__ __hip_bfloat16 hlog(const __hip_bfloat16 h)
Calculate natural log of bfloat16.
Definition amd_hip_bf16.h:786
__device__ __hip_bfloat16 hlog2(const __hip_bfloat16 h)
Calculate log 2 of bfloat16.
Definition amd_hip_bf16.h:802
__device__ __hip_bfloat16 hexp2(const __hip_bfloat16 h)
Calculate exponential 2 of bfloat16.
Definition amd_hip_bf16.h:770
__device__ __hip_bfloat162 h2sin(const __hip_bfloat162 h)
Calculate sin of bfloat162.
Definition amd_hip_bf16.h:954
__device__ __hip_bfloat162 h2log(const __hip_bfloat162 h)
Calculate natural log of bfloat162.
Definition amd_hip_bf16.h:906
__device__ __hip_bfloat162 h2log2(const __hip_bfloat162 h)
Calculate log 2 of bfloat162.
Definition amd_hip_bf16.h:922
__device__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h)
Calculate sqrt of bfloat162.
Definition amd_hip_bf16.h:962
__device__ __hip_bfloat162 h2log10(const __hip_bfloat162 h)
Calculate log 10 of bfloat162.
Definition amd_hip_bf16.h:914
__device__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h)
Calculate ceil of bfloat162.
Definition amd_hip_bf16.h:858
__device__ __hip_bfloat162 h2rint(const __hip_bfloat162 h)
Calculate vector round to nearest int.
Definition amd_hip_bf16.h:938
__device__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h)
Calculate vector reciprocal.
Definition amd_hip_bf16.h:930
__device__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h)
Calculate vector reciprocal square root.
Definition amd_hip_bf16.h:946
__device__ __hip_bfloat162 h2cos(const __hip_bfloat162 h)
Calculate cosine of bfloat162.
Definition amd_hip_bf16.h:866
__device__ __hip_bfloat162 h2floor(const __hip_bfloat162 h)
Calculate floor of bfloat162.
Definition amd_hip_bf16.h:898
__device__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h)
Calculate exponential 10 of bfloat162.
Definition amd_hip_bf16.h:882
__device__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h)
Calculate truncate of bfloat162.
Definition amd_hip_bf16.h:970
__device__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h)
Calculate exponential 2 of bfloat162.
Definition amd_hip_bf16.h:890
__device__ __hip_bfloat162 h2exp(const __hip_bfloat162 h)
Calculate exponential of bfloat162.
Definition amd_hip_bf16.h:874
Definition amd_hip_vector_types.h:1986
Definition hip_fp16_gcc.h:7
Definition hip_fp16_gcc.h:11