[HIPIFY][ROCm#1769][fp16] Support for fp16 math - Part 2 - Functions

+ Updated synthetic tests, the regenerated `hipify-perl`, and `Device` `CUDA2HIP` docs accordingly
emankov · Nov 29, 2024 · 53b4c87 · 53b4c87
1 parent 1681e5d
commit 53b4c87
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 32 deletions.
diff --git a/bin/hipify-perl b/bin/hipify-perl
@@ -6152,6 +6152,9 @@ sub simpleSubstitutions {
     subst("__assertfail", "__assertfail", "device_function");
     subst("__ballot", "__ballot", "device_function");
     subst("__ballot_sync", "__ballot_sync", "device_function");
+    subst("__bfloat1622float2", "__bfloat1622float2", "device_function");
+    subst("__bfloat162bfloat162", "__bfloat162bfloat162", "device_function");
+    subst("__bfloat162float", "__bfloat162float", "device_function");
     subst("__brev", "__brev", "device_function");
     subst("__brevll", "__brevll", "device_function");
     subst("__byte_perm", "__byte_perm", "device_function");
@@ -6161,6 +6164,7 @@ sub simpleSubstitutions {
     subst("__dadd_rn", "__dadd_rn", "device_function");
     subst("__ddiv_rn", "__ddiv_rn", "device_function");
     subst("__dmul_rn", "__dmul_rn", "device_function");
+    subst("__double2bfloat16", "__double2bfloat16", "device_function");
     subst("__double2float_rd", "__double2float_rd", "device_function");
     subst("__double2float_rn", "__double2float_rn", "device_function");
     subst("__double2float_ru", "__double2float_ru", "device_function");
@@ -6195,6 +6199,7 @@ sub simpleSubstitutions {
     subst("__ffs", "__ffs", "device_function");
     subst("__ffsll", "__ffsll", "device_function");
     subst("__float22half2_rn", "__float22half2_rn", "device_function");
+    subst("__float2bfloat16", "__float2bfloat16", "device_function");
     subst("__float2half", "__float2half", "device_function");
     subst("__float2half2_rn", "__float2half2_rn", "device_function");
     subst("__float2half_rd", "__float2half_rd", "device_function");
@@ -6297,9 +6302,11 @@ sub simpleSubstitutions {
     subst("__hgt2", "__hgt2", "device_function");
     subst("__hgtu", "__hgtu", "device_function");
     subst("__hgtu2", "__hgtu2", "device_function");
+    subst("__high2bfloat16", "__high2bfloat16", "device_function");
     subst("__high2float", "__high2float", "device_function");
     subst("__high2half", "__high2half", "device_function");
     subst("__high2half2", "__high2half2", "device_function");
+    subst("__highs2bfloat162", "__highs2bfloat162", "device_function");
     subst("__highs2half2", "__highs2half2", "device_function");
     subst("__hiloint2double", "__hiloint2double", "device_function");
     subst("__hisinf", "__hisinf", "device_function");
@@ -6365,6 +6372,7 @@ sub simpleSubstitutions {
     subst("__low2half", "__low2half", "device_function");
     subst("__low2half2", "__low2half2", "device_function");
     subst("__lowhigh2highlow", "__lowhigh2highlow", "device_function");
+    subst("__lows2bfloat162", "__lows2bfloat162", "device_function");
     subst("__lows2half2", "__lows2half2", "device_function");
     subst("__match_all_sync", "__match_all_sync", "device_function");
     subst("__match_any_sync", "__match_any_sync", "device_function");
@@ -9237,6 +9245,7 @@ sub countSupportedDeviceFunctions {
         "__match_any_sync",
         "__match_all_sync",
         "__lows2half2",
+        "__lows2bfloat162",
         "__lowhigh2highlow",
         "__low2half2",
         "__low2half",
@@ -9302,9 +9311,11 @@ sub countSupportedDeviceFunctions {
         "__hisinf",
         "__hiloint2double",
         "__highs2half2",
+        "__highs2bfloat162",
         "__high2half2",
         "__high2half",
         "__high2float",
+        "__high2bfloat16",
         "__hgtu2",
         "__hgtu",
         "__hgt2",
@@ -9407,6 +9418,7 @@ sub countSupportedDeviceFunctions {
         "__float2half_rd",
         "__float2half2_rn",
         "__float2half",
+        "__float2bfloat16",
         "__float22half2_rn",
         "__ffsll",
         "__ffs",
@@ -9441,6 +9453,7 @@ sub countSupportedDeviceFunctions {
         "__double2float_ru",
         "__double2float_rn",
         "__double2float_rd",
+        "__double2bfloat16",
         "__dmul_rn",
         "__ddiv_rn",
         "__dadd_rn",
@@ -9450,6 +9463,9 @@ sub countSupportedDeviceFunctions {
         "__byte_perm",
         "__brevll",
         "__brev",
+        "__bfloat162float",
+        "__bfloat162bfloat162",
+        "__bfloat1622float2",
         "__ballot_sync",
         "__ballot",
         "__assertfail",
@@ -9610,7 +9626,6 @@ sub warnUnsupportedDeviceFunctions {
         "__pm2",
         "__pm1",
         "__pm0",
-        "__lows2bfloat162",
         "__low2bfloat162",
         "__low2bfloat16",
         "__ll2bfloat16_rz",
@@ -9643,9 +9658,7 @@ sub warnUnsupportedDeviceFunctions {
         "__hlt2_mask",
         "__hleu2_mask",
         "__hle2_mask",
-        "__highs2bfloat162",
         "__high2bfloat162",
-        "__high2bfloat16",
         "__hgtu2_mask",
         "__hgt2_mask",
         "__hgeu2_mask",
@@ -9684,7 +9697,6 @@ sub warnUnsupportedDeviceFunctions {
         "__float2bfloat16_rn",
         "__float2bfloat16_rd",
         "__float2bfloat162_rn",
-        "__float2bfloat16",
         "__float22bfloat162_rn",
         "__finitel",
         "__finitef",
@@ -9705,7 +9717,6 @@ sub warnUnsupportedDeviceFunctions {
         "__drcp_ru",
         "__drcp_rd",
         "__double2half",
-        "__double2bfloat16",
         "__dmul_rz",
         "__dmul_ru",
         "__dmul_rd",
@@ -9743,10 +9754,7 @@ sub warnUnsupportedDeviceFunctions {
         "__bfloat162int_ru",
         "__bfloat162int_rn",
         "__bfloat162int_rd",
-        "__bfloat162float",
         "__bfloat162char_rz",
-        "__bfloat162bfloat162",
-        "__bfloat1622float2",
         "_Pow_int"
     )
     {
@@ -11073,7 +11081,6 @@ sub warnUnsupportedFunctions {
         "__pm2",
         "__pm1",
         "__pm0",
-        "__lows2bfloat162",
         "__low2bfloat162",
         "__low2bfloat16",
         "__ll2bfloat16_rz",
@@ -11106,9 +11113,7 @@ sub warnUnsupportedFunctions {
         "__hlt2_mask",
         "__hleu2_mask",
         "__hle2_mask",
-        "__highs2bfloat162",
         "__high2bfloat162",
-        "__high2bfloat16",
         "__hgtu2_mask",
         "__hgt2_mask",
         "__hgeu2_mask",
@@ -11147,7 +11152,6 @@ sub warnUnsupportedFunctions {
         "__float2bfloat16_rn",
         "__float2bfloat16_rd",
         "__float2bfloat162_rn",
-        "__float2bfloat16",
         "__float22bfloat162_rn",
         "__finitel",
         "__finitef",
@@ -11168,7 +11172,6 @@ sub warnUnsupportedFunctions {
         "__drcp_ru",
         "__drcp_rd",
         "__double2half",
-        "__double2bfloat16",
         "__dmul_rz",
         "__dmul_ru",
         "__dmul_rd",
@@ -11206,10 +11209,7 @@ sub warnUnsupportedFunctions {
         "__bfloat162int_ru",
         "__bfloat162int_rn",
         "__bfloat162int_rd",
-        "__bfloat162float",
         "__bfloat162char_rz",
-        "__bfloat162bfloat162",
-        "__bfloat1622float2",
         "__CUB_LP64__",
         "_Pow_int",
         "_CUB_ASM_PTR_SIZE_",

diff --git a/docs/tables/CUDA_Device_API_supported_by_HIP.md b/docs/tables/CUDA_Device_API_supported_by_HIP.md
@@ -14,10 +14,10 @@
 |`__assertfail`| | | | |`__assertfail`|1.9.0| | | | |
 |`__ballot`| | | | |`__ballot`|1.6.0| | | | |
 |`__ballot_sync`|9.0| | | |`__ballot_sync`|6.2.0| | | | |
-|`__bfloat1622float2`|11.0| | | | | | | | | |
-|`__bfloat162bfloat162`|11.0| | | | | | | | | |
+|`__bfloat1622float2`|11.0| | | |`__bfloat1622float2`|5.7.0| | | | |
+|`__bfloat162bfloat162`|11.0| | | |`__bfloat162bfloat162`|5.7.0| | | | |
 |`__bfloat162char_rz`|12.2| | | | | | | | | |
-|`__bfloat162float`|11.0| | | | | | | | | |
+|`__bfloat162float`|11.0| | | |`__bfloat162float`|5.7.0| | | | |
 |`__bfloat162int_rd`|11.0| | | | | | | | | |
 |`__bfloat162int_rn`|11.0| | | | | | | | | |
 |`__bfloat162int_ru`|11.0| | | | | | | | | |
@@ -64,7 +64,7 @@
 |`__dmul_rn`| | | | |`__dmul_rn`|1.6.0| | | | |
 |`__dmul_ru`| | | | | | | | | | |
 |`__dmul_rz`| | | | | | | | | | |
-|`__double2bfloat16`|11.0| | | | | | | | | |
+|`__double2bfloat16`|11.0| | | |`__double2bfloat16`|5.7.0| | | | |
 |`__double2float_rd`| | | | |`__double2float_rd`|1.6.0| | | | |
 |`__double2float_rn`| | | | |`__double2float_rn`|1.6.0| | | | |
 |`__double2float_ru`| | | | |`__double2float_ru`|1.6.0| | | | |
@@ -119,7 +119,7 @@
 |`__finitel`| | | | | | | | | | |
 |`__float22bfloat162_rn`|11.0| | | | | | | | | |
 |`__float22half2_rn`| | | | |`__float22half2_rn`|1.6.0| | | | |
-|`__float2bfloat16`|11.0| | | | | | | | | |
+|`__float2bfloat16`|11.0| | | |`__float2bfloat16`|5.7.0| | | | |
 |`__float2bfloat162_rn`|11.0| | | | | | | | | |
 |`__float2bfloat16_rd`|11.0| | | | | | | | | |
 |`__float2bfloat16_rn`|11.0| | | | | | | | | |
@@ -260,12 +260,12 @@
 |`__hgtu`| | | | |`__hgtu`|1.9.0| | | | |
 |`__hgtu2`| | | | |`__hgtu2`|1.9.0| | | | |
 |`__hgtu2_mask`|12.0| | | | | | | | | |
-|`__high2bfloat16`|11.0| | | | | | | | | |
+|`__high2bfloat16`|11.0| | | |`__high2bfloat16`|5.7.0| | | | |
 |`__high2bfloat162`|11.0| | | | | | | | | |
 |`__high2float`| | | | |`__high2float`|1.6.0| | | | |
 |`__high2half`| | | | |`__high2half`|1.6.0| | | | |
 |`__high2half2`| | | | |`__high2half2`|1.6.0| | | | |
-|`__highs2bfloat162`|11.0| | | | | | | | | |
+|`__highs2bfloat162`|11.0| | | |`__highs2bfloat162`|5.7.0| | | | |
 |`__highs2half2`| | | | |`__highs2half2`|1.6.0| | | | |
 |`__hiloint2double`| | | | |`__hiloint2double`|1.6.0| | | | |
 |`__hisinf`| | | | |`__hisinf`|1.6.0| | | | |
@@ -363,7 +363,7 @@
 |`__low2half`| | | | |`__low2half`|1.6.0| | | | |
 |`__low2half2`| | | | |`__low2half2`|1.6.0| | | | |
 |`__lowhigh2highlow`| | | | |`__lowhigh2highlow`|1.6.0| | | | |
-|`__lows2bfloat162`|11.0| | | | | | | | | |
+|`__lows2bfloat162`|11.0| | | |`__lows2bfloat162`|5.7.0| | | | |
 |`__lows2half2`| | | | |`__lows2half2`|1.6.0| | | | |
 |`__match_all_sync`|9.0| | | |`__match_all_sync`|6.2.0| | | | |
 |`__match_any_sync`|9.0| | | |`__match_any_sync`|6.2.0| | | | |

diff --git a/src/CUDA2HIP_Device_functions.cpp b/src/CUDA2HIP_Device_functions.cpp
@@ -714,16 +714,16 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DEVICE_FUNCTION_MAP {
   {"__half2char_rz",                    {"__half2char_rz",                     "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__half2uchar_rz",                   {"__half2uchar_rz",                    "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   // bfp16 functions
-  {"__double2bfloat16",                 {"__double2bfloat16",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__float2bfloat16",                  {"__float2bfloat16",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
+  {"__double2bfloat16",                 {"__double2bfloat16",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__float2bfloat16",                  {"__float2bfloat16",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__float2bfloat16_rn",               {"__float2bfloat16_rn",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__float2bfloat16_rz",               {"__float2bfloat16_rz",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__float2bfloat16_rd",               {"__float2bfloat16_rd",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__float2bfloat16_ru",               {"__float2bfloat16_ru",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__bfloat162float",                  {"__bfloat162float",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
+  {"__bfloat162float",                  {"__bfloat162float",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__float2bfloat162_rn",              {"__float2bfloat162_rn",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__floats2bfloat162_rn",             {"__floats2bfloat162_rn",              "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__bfloat1622float2",                {"__bfloat1622float2",                 "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
+  {"__bfloat1622float2",                {"__bfloat1622float2",                 "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__bfloat162int_rn",                 {"__bfloat162int_rn",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__bfloat162int_rz",                 {"__bfloat162int_rz",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__bfloat162int_rd",                 {"__bfloat162int_rd",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
@@ -772,10 +772,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DEVICE_FUNCTION_MAP {
   {"__ll2bfloat16_rz",                  {"__ll2bfloat16_rz",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__ll2bfloat16_rd",                  {"__ll2bfloat16_rd",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__ll2bfloat16_ru",                  {"__ll2bfloat16_ru",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__bfloat162bfloat162",              {"__bfloat162bfloat162",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__lows2bfloat162",                  {"__lows2bfloat162",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__highs2bfloat162",                 {"__highs2bfloat162",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__high2bfloat16",                   {"__high2bfloat16",                    "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
+  {"__bfloat162bfloat162",              {"__bfloat162bfloat162",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__lows2bfloat162",                  {"__lows2bfloat162",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__highs2bfloat162",                 {"__highs2bfloat162",                  "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__high2bfloat16",                   {"__high2bfloat16",                    "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__low2bfloat16",                    {"__low2bfloat16",                     "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__halves2bfloat162",                {"__halves2bfloat162",                 "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__low2bfloat162",                   {"__low2bfloat162",                    "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
@@ -1508,6 +1508,14 @@ const std::map<llvm::StringRef, hipAPIversions> HIP_DEVICE_FUNCTION_VER_MAP {
   {"__hip_cvt_bfloat16raw2_to_fp8x2",   {HIP_6020, HIP_0,    HIP_0   }},
   {"__hip_cvt_fp8_to_halfraw",          {HIP_6020, HIP_0,    HIP_0   }},
   {"__hip_cvt_fp8x2_to_halfraw2",       {HIP_6020, HIP_0,    HIP_0   }},
+  {"__double2bfloat16",                 {HIP_5070, HIP_0,    HIP_0   }},
+  {"__float2bfloat16",                  {HIP_5070, HIP_0,    HIP_0   }},
+  {"__bfloat162float",                  {HIP_5070, HIP_0,    HIP_0   }},
+  {"__bfloat1622float2",                {HIP_5070, HIP_0,    HIP_0   }},
+  {"__bfloat162bfloat162",              {HIP_5070, HIP_0,    HIP_0   }},
+  {"__lows2bfloat162",                  {HIP_5070, HIP_0,    HIP_0   }},
+  {"__highs2bfloat162",                 {HIP_5070, HIP_0,    HIP_0   }},
+  {"__high2bfloat16",                   {HIP_5070, HIP_0,    HIP_0   }},
 };
 
 const std::map<unsigned int, llvm::StringRef> CUDA_DEVICE_FUNCTION_API_SECTION_MAP {

diff --git a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu
@@ -11,7 +11,9 @@
 int main() {
   printf("24. CUDA Device API to HIP Device API synthetic test\n");
 
+  double da = 0.0f;
   double dx = 0.0f;
+  float fa = 0.0f;
   float fx = 0.0f;
   double2 d2 = { 0.0f, 0.0f };
   float2 f2 = { 0.0f, 0.0f };
@@ -27,9 +29,51 @@ int main() {
 
   // CHECK: __hip_bfloat162 bf162 = { 0, 0 };
   __nv_bfloat162 bf162 = { 0, 0 };
+  __nv_bfloat162 bf162a = { 0, 0 };
+  __nv_bfloat162 bf162b = { 0, 0 };
 
   // CHECK: __hip_bfloat162_raw bf162r = { 0, 0 };
   __nv_bfloat162_raw bf162r = { 0, 0 };
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a);
+  // CHECK: bf16 = __double2bfloat16(da);
+  bf16 = __double2bfloat16(da);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f);
+  // CHECK: bf16 = __float2bfloat16(fa);
+  bf16 = __float2bfloat16(fa);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a);
+  // CHECK: fx = __bfloat162float(bf16);
+  fx = __bfloat162float(bf16);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a);
+  // CHECK: f2 = __bfloat1622float2(bf162);
+  f2 = __bfloat1622float2(bf162);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a);
+  // CHECK: bf162 = __bfloat162bfloat162(bf16);
+  bf162 = __bfloat162bfloat162(bf16);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b);
+  // CHECK: bf162 = __lows2bfloat162(bf162a, bf162b);
+  bf162 = __lows2bfloat162(bf162a, bf162b);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b);
+  // CHECK: bf162 = __highs2bfloat162(bf162a, bf162b);
+  bf162 = __highs2bfloat162(bf162a, bf162b);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a);
+  // CHECK: bf16 = __high2bfloat16(bf162a);
+  bf16 = __high2bfloat16(bf162a);
 #endif
 
 #if CUDA_VERSION >= 11080