From 53b4c8710174e50721e38995d7eb793b8b587096 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 29 Nov 2024 19:51:22 +0100 Subject: [PATCH] [HIPIFY][#1769][fp16] Support for `fp16` math - Part 2 - Functions + Updated synthetic tests, the regenerated `hipify-perl`, and `Device` `CUDA2HIP` docs accordingly --- bin/hipify-perl | 32 +++++++------- .../CUDA_Device_API_supported_by_HIP.md | 16 +++---- src/CUDA2HIP_Device_functions.cpp | 24 ++++++---- .../synthetic/libraries/cudevice2hipdevice.cu | 44 +++++++++++++++++++ 4 files changed, 84 insertions(+), 32 deletions(-) diff --git a/bin/hipify-perl b/bin/hipify-perl index 9d240d4a..7d474474 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -6152,6 +6152,9 @@ sub simpleSubstitutions { subst("__assertfail", "__assertfail", "device_function"); subst("__ballot", "__ballot", "device_function"); subst("__ballot_sync", "__ballot_sync", "device_function"); + subst("__bfloat1622float2", "__bfloat1622float2", "device_function"); + subst("__bfloat162bfloat162", "__bfloat162bfloat162", "device_function"); + subst("__bfloat162float", "__bfloat162float", "device_function"); subst("__brev", "__brev", "device_function"); subst("__brevll", "__brevll", "device_function"); subst("__byte_perm", "__byte_perm", "device_function"); @@ -6161,6 +6164,7 @@ sub simpleSubstitutions { subst("__dadd_rn", "__dadd_rn", "device_function"); subst("__ddiv_rn", "__ddiv_rn", "device_function"); subst("__dmul_rn", "__dmul_rn", "device_function"); + subst("__double2bfloat16", "__double2bfloat16", "device_function"); subst("__double2float_rd", "__double2float_rd", "device_function"); subst("__double2float_rn", "__double2float_rn", "device_function"); subst("__double2float_ru", "__double2float_ru", "device_function"); @@ -6195,6 +6199,7 @@ sub simpleSubstitutions { subst("__ffs", "__ffs", "device_function"); subst("__ffsll", "__ffsll", "device_function"); subst("__float22half2_rn", "__float22half2_rn", "device_function"); + subst("__float2bfloat16", 
"__float2bfloat16", "device_function"); subst("__float2half", "__float2half", "device_function"); subst("__float2half2_rn", "__float2half2_rn", "device_function"); subst("__float2half_rd", "__float2half_rd", "device_function"); @@ -6297,9 +6302,11 @@ sub simpleSubstitutions { subst("__hgt2", "__hgt2", "device_function"); subst("__hgtu", "__hgtu", "device_function"); subst("__hgtu2", "__hgtu2", "device_function"); + subst("__high2bfloat16", "__high2bfloat16", "device_function"); subst("__high2float", "__high2float", "device_function"); subst("__high2half", "__high2half", "device_function"); subst("__high2half2", "__high2half2", "device_function"); + subst("__highs2bfloat162", "__highs2bfloat162", "device_function"); subst("__highs2half2", "__highs2half2", "device_function"); subst("__hiloint2double", "__hiloint2double", "device_function"); subst("__hisinf", "__hisinf", "device_function"); @@ -6365,6 +6372,7 @@ sub simpleSubstitutions { subst("__low2half", "__low2half", "device_function"); subst("__low2half2", "__low2half2", "device_function"); subst("__lowhigh2highlow", "__lowhigh2highlow", "device_function"); + subst("__lows2bfloat162", "__lows2bfloat162", "device_function"); subst("__lows2half2", "__lows2half2", "device_function"); subst("__match_all_sync", "__match_all_sync", "device_function"); subst("__match_any_sync", "__match_any_sync", "device_function"); @@ -9237,6 +9245,7 @@ sub countSupportedDeviceFunctions { "__match_any_sync", "__match_all_sync", "__lows2half2", + "__lows2bfloat162", "__lowhigh2highlow", "__low2half2", "__low2half", @@ -9302,9 +9311,11 @@ sub countSupportedDeviceFunctions { "__hisinf", "__hiloint2double", "__highs2half2", + "__highs2bfloat162", "__high2half2", "__high2half", "__high2float", + "__high2bfloat16", "__hgtu2", "__hgtu", "__hgt2", @@ -9407,6 +9418,7 @@ sub countSupportedDeviceFunctions { "__float2half_rd", "__float2half2_rn", "__float2half", + "__float2bfloat16", "__float22half2_rn", "__ffsll", "__ffs", @@ -9441,6 +9453,7 @@ 
sub countSupportedDeviceFunctions { "__double2float_ru", "__double2float_rn", "__double2float_rd", + "__double2bfloat16", "__dmul_rn", "__ddiv_rn", "__dadd_rn", @@ -9450,6 +9463,9 @@ sub countSupportedDeviceFunctions { "__byte_perm", "__brevll", "__brev", + "__bfloat162float", + "__bfloat162bfloat162", + "__bfloat1622float2", "__ballot_sync", "__ballot", "__assertfail", @@ -9610,7 +9626,6 @@ sub warnUnsupportedDeviceFunctions { "__pm2", "__pm1", "__pm0", - "__lows2bfloat162", "__low2bfloat162", "__low2bfloat16", "__ll2bfloat16_rz", @@ -9643,9 +9658,7 @@ sub warnUnsupportedDeviceFunctions { "__hlt2_mask", "__hleu2_mask", "__hle2_mask", - "__highs2bfloat162", "__high2bfloat162", - "__high2bfloat16", "__hgtu2_mask", "__hgt2_mask", "__hgeu2_mask", @@ -9684,7 +9697,6 @@ sub warnUnsupportedDeviceFunctions { "__float2bfloat16_rn", "__float2bfloat16_rd", "__float2bfloat162_rn", - "__float2bfloat16", "__float22bfloat162_rn", "__finitel", "__finitef", @@ -9705,7 +9717,6 @@ sub warnUnsupportedDeviceFunctions { "__drcp_ru", "__drcp_rd", "__double2half", - "__double2bfloat16", "__dmul_rz", "__dmul_ru", "__dmul_rd", @@ -9743,10 +9754,7 @@ sub warnUnsupportedDeviceFunctions { "__bfloat162int_ru", "__bfloat162int_rn", "__bfloat162int_rd", - "__bfloat162float", "__bfloat162char_rz", - "__bfloat162bfloat162", - "__bfloat1622float2", "_Pow_int" ) { @@ -11073,7 +11081,6 @@ sub warnUnsupportedFunctions { "__pm2", "__pm1", "__pm0", - "__lows2bfloat162", "__low2bfloat162", "__low2bfloat16", "__ll2bfloat16_rz", @@ -11106,9 +11113,7 @@ sub warnUnsupportedFunctions { "__hlt2_mask", "__hleu2_mask", "__hle2_mask", - "__highs2bfloat162", "__high2bfloat162", - "__high2bfloat16", "__hgtu2_mask", "__hgt2_mask", "__hgeu2_mask", @@ -11147,7 +11152,6 @@ sub warnUnsupportedFunctions { "__float2bfloat16_rn", "__float2bfloat16_rd", "__float2bfloat162_rn", - "__float2bfloat16", "__float22bfloat162_rn", "__finitel", "__finitef", @@ -11168,7 +11172,6 @@ sub warnUnsupportedFunctions { "__drcp_ru", 
"__drcp_rd", "__double2half", - "__double2bfloat16", "__dmul_rz", "__dmul_ru", "__dmul_rd", @@ -11206,10 +11209,7 @@ sub warnUnsupportedFunctions { "__bfloat162int_ru", "__bfloat162int_rn", "__bfloat162int_rd", - "__bfloat162float", "__bfloat162char_rz", - "__bfloat162bfloat162", - "__bfloat1622float2", "__CUB_LP64__", "_Pow_int", "_CUB_ASM_PTR_SIZE_", diff --git a/docs/tables/CUDA_Device_API_supported_by_HIP.md b/docs/tables/CUDA_Device_API_supported_by_HIP.md index 358c95b8..e9d89fff 100644 --- a/docs/tables/CUDA_Device_API_supported_by_HIP.md +++ b/docs/tables/CUDA_Device_API_supported_by_HIP.md @@ -14,10 +14,10 @@ |`__assertfail`| | | | |`__assertfail`|1.9.0| | | | | |`__ballot`| | | | |`__ballot`|1.6.0| | | | | |`__ballot_sync`|9.0| | | |`__ballot_sync`|6.2.0| | | | | -|`__bfloat1622float2`|11.0| | | | | | | | | | -|`__bfloat162bfloat162`|11.0| | | | | | | | | | +|`__bfloat1622float2`|11.0| | | |`__bfloat1622float2`|5.7.0| | | | | +|`__bfloat162bfloat162`|11.0| | | |`__bfloat162bfloat162`|5.7.0| | | | | |`__bfloat162char_rz`|12.2| | | | | | | | | | -|`__bfloat162float`|11.0| | | | | | | | | | +|`__bfloat162float`|11.0| | | |`__bfloat162float`|5.7.0| | | | | |`__bfloat162int_rd`|11.0| | | | | | | | | | |`__bfloat162int_rn`|11.0| | | | | | | | | | |`__bfloat162int_ru`|11.0| | | | | | | | | | @@ -64,7 +64,7 @@ |`__dmul_rn`| | | | |`__dmul_rn`|1.6.0| | | | | |`__dmul_ru`| | | | | | | | | | | |`__dmul_rz`| | | | | | | | | | | -|`__double2bfloat16`|11.0| | | | | | | | | | +|`__double2bfloat16`|11.0| | | |`__double2bfloat16`|5.7.0| | | | | |`__double2float_rd`| | | | |`__double2float_rd`|1.6.0| | | | | |`__double2float_rn`| | | | |`__double2float_rn`|1.6.0| | | | | |`__double2float_ru`| | | | |`__double2float_ru`|1.6.0| | | | | @@ -119,7 +119,7 @@ |`__finitel`| | | | | | | | | | | |`__float22bfloat162_rn`|11.0| | | | | | | | | | |`__float22half2_rn`| | | | |`__float22half2_rn`|1.6.0| | | | | -|`__float2bfloat16`|11.0| | | | | | | | | | +|`__float2bfloat16`|11.0| | | 
|`__float2bfloat16`|5.7.0| | | | | |`__float2bfloat162_rn`|11.0| | | | | | | | | | |`__float2bfloat16_rd`|11.0| | | | | | | | | | |`__float2bfloat16_rn`|11.0| | | | | | | | | | @@ -260,12 +260,12 @@ |`__hgtu`| | | | |`__hgtu`|1.9.0| | | | | |`__hgtu2`| | | | |`__hgtu2`|1.9.0| | | | | |`__hgtu2_mask`|12.0| | | | | | | | | | -|`__high2bfloat16`|11.0| | | | | | | | | | +|`__high2bfloat16`|11.0| | | |`__high2bfloat16`|5.7.0| | | | | |`__high2bfloat162`|11.0| | | | | | | | | | |`__high2float`| | | | |`__high2float`|1.6.0| | | | | |`__high2half`| | | | |`__high2half`|1.6.0| | | | | |`__high2half2`| | | | |`__high2half2`|1.6.0| | | | | -|`__highs2bfloat162`|11.0| | | | | | | | | | +|`__highs2bfloat162`|11.0| | | |`__highs2bfloat162`|5.7.0| | | | | |`__highs2half2`| | | | |`__highs2half2`|1.6.0| | | | | |`__hiloint2double`| | | | |`__hiloint2double`|1.6.0| | | | | |`__hisinf`| | | | |`__hisinf`|1.6.0| | | | | @@ -363,7 +363,7 @@ |`__low2half`| | | | |`__low2half`|1.6.0| | | | | |`__low2half2`| | | | |`__low2half2`|1.6.0| | | | | |`__lowhigh2highlow`| | | | |`__lowhigh2highlow`|1.6.0| | | | | -|`__lows2bfloat162`|11.0| | | | | | | | | | +|`__lows2bfloat162`|11.0| | | |`__lows2bfloat162`|5.7.0| | | | | |`__lows2half2`| | | | |`__lows2half2`|1.6.0| | | | | |`__match_all_sync`|9.0| | | |`__match_all_sync`|6.2.0| | | | | |`__match_any_sync`|9.0| | | |`__match_any_sync`|6.2.0| | | | | diff --git a/src/CUDA2HIP_Device_functions.cpp b/src/CUDA2HIP_Device_functions.cpp index 46297d3e..84090a9b 100644 --- a/src/CUDA2HIP_Device_functions.cpp +++ b/src/CUDA2HIP_Device_functions.cpp @@ -714,16 +714,16 @@ const std::map CUDA_DEVICE_FUNCTION_MAP { {"__half2char_rz", {"__half2char_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__half2uchar_rz", {"__half2uchar_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, // bfp16 functions - {"__double2bfloat16", {"__double2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__float2bfloat16", {"__float2bfloat16", 
"", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__double2bfloat16", {"__double2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__float2bfloat16", {"__float2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__float2bfloat16_rn", {"__float2bfloat16_rn", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__float2bfloat16_rz", {"__float2bfloat16_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__float2bfloat16_rd", {"__float2bfloat16_rd", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__float2bfloat16_ru", {"__float2bfloat16_ru", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__bfloat162float", {"__bfloat162float", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__bfloat162float", {"__bfloat162float", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__float2bfloat162_rn", {"__float2bfloat162_rn", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__floats2bfloat162_rn", {"__floats2bfloat162_rn", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__bfloat1622float2", {"__bfloat1622float2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__bfloat1622float2", {"__bfloat1622float2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__bfloat162int_rn", {"__bfloat162int_rn", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__bfloat162int_rz", {"__bfloat162int_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__bfloat162int_rd", {"__bfloat162int_rd", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, @@ -772,10 +772,10 @@ const std::map CUDA_DEVICE_FUNCTION_MAP { {"__ll2bfloat16_rz", {"__ll2bfloat16_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__ll2bfloat16_rd", {"__ll2bfloat16_rd", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__ll2bfloat16_ru", {"__ll2bfloat16_ru", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__bfloat162bfloat162", {"__bfloat162bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__lows2bfloat162", {"__lows2bfloat162", "", 
CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__highs2bfloat162", {"__highs2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__high2bfloat16", {"__high2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__bfloat162bfloat162", {"__bfloat162bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__lows2bfloat162", {"__lows2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__highs2bfloat162", {"__highs2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__high2bfloat16", {"__high2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__low2bfloat16", {"__low2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__halves2bfloat162", {"__halves2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__low2bfloat162", {"__low2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, @@ -1508,6 +1508,14 @@ const std::map HIP_DEVICE_FUNCTION_VER_MAP { {"__hip_cvt_bfloat16raw2_to_fp8x2", {HIP_6020, HIP_0, HIP_0 }}, {"__hip_cvt_fp8_to_halfraw", {HIP_6020, HIP_0, HIP_0 }}, {"__hip_cvt_fp8x2_to_halfraw2", {HIP_6020, HIP_0, HIP_0 }}, + {"__double2bfloat16", {HIP_5070, HIP_0, HIP_0 }}, + {"__float2bfloat16", {HIP_5070, HIP_0, HIP_0 }}, + {"__bfloat162float", {HIP_5070, HIP_0, HIP_0 }}, + {"__bfloat1622float2", {HIP_5070, HIP_0, HIP_0 }}, + {"__bfloat162bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"__lows2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"__highs2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"__high2bfloat16", {HIP_5070, HIP_0, HIP_0 }}, }; const std::map CUDA_DEVICE_FUNCTION_API_SECTION_MAP { diff --git a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu index 2f6718c9..8dbda0c7 100644 --- a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu +++ b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu @@ -11,7 +11,9 @@ int main() { printf("24. 
CUDA Device API to HIP Device API synthetic test\n"); + double da = 0.0f; double dx = 0.0f; + float fa = 0.0f; float fx = 0.0f; double2 d2 = { 0.0f, 0.0f }; float2 f2 = { 0.0f, 0.0f }; @@ -27,9 +29,51 @@ int main() { // CHECK: __hip_bfloat162 bf162 = { 0, 0 }; __nv_bfloat162 bf162 = { 0, 0 }; + __nv_bfloat162 bf162a = { 0, 0 }; + __nv_bfloat162 bf162b = { 0, 0 }; // CHECK: __hip_bfloat162_raw bf162r = { 0, 0 }; __nv_bfloat162_raw bf162r = { 0, 0 }; + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a); + // CHECK: bf16 = __double2bfloat16(da); + bf16 = __double2bfloat16(da); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f); + // CHECK: bf16 = __float2bfloat16(fa); + bf16 = __float2bfloat16(fa); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a); + // CHECK: fa = __bfloat162float(bf16); + fa = __bfloat162float(bf16); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a); + // CHECK: f2 = __bfloat1622float2(bf162); + f2 = __bfloat1622float2(bf162); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a); + // CHECK: bf162 = __bfloat162bfloat162(bf16); + bf162 = __bfloat162bfloat162(bf16); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b); + // CHECK: 
bf162 = __lows2bfloat162(bf162a, bf162b); + bf162 = __lows2bfloat162(bf162a, bf162b); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b); + // CHECK: bf162 = __highs2bfloat162(bf162a, bf162b); + bf162 = __highs2bfloat162(bf162a, bf162b); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a); + // CHECK: bf16 = __high2bfloat16(bf162a); + bf16 = __high2bfloat16(bf162a); #endif #if CUDA_VERSION >= 11080