diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 449e4d1256944..34ba0308c6012 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -17,6 +17,7 @@ # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) #endif +#pragma push_macro("SM_53") #pragma push_macro("SM_70") #pragma push_macro("SM_72") #pragma push_macro("SM_75") @@ -30,7 +31,9 @@ #pragma push_macro("SM_60") #define SM_60 "sm_60|sm_61|sm_62|" SM_70 +#define SM_53 "sm_53|" SM_60 +#pragma push_macro("PTX42") #pragma push_macro("PTX60") #pragma push_macro("PTX61") #pragma push_macro("PTX63") @@ -53,6 +56,7 @@ #define PTX63 "ptx63|" PTX64 #define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 +#define PTX42 "ptx42|" PTX60 #pragma push_macro("AND") #define AND(a, b) "(" a "),(" b ")" @@ -110,13 +114,89 @@ BUILTIN(__nvvm_prmt, "UiUiUiUi", "") // Min Max -BUILTIN(__nvvm_fmax_ftz_f, "fff", "") -BUILTIN(__nvvm_fmax_f, "fff", "") -BUILTIN(__nvvm_fmin_ftz_f, "fff", "") -BUILTIN(__nvvm_fmin_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "UsUsUs", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmin_f, "fff", "") +BUILTIN(__nvvm_fmin_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmin_d, "ddd", "") +TARGET_BUILTIN(__nvvm_fmax_f16, 
"hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "UsUsUs", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmax_f, "fff", "") +BUILTIN(__nvvm_fmax_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) BUILTIN(__nvvm_fmax_d, "ddd", "") -BUILTIN(__nvvm_fmin_d, "ddd", "") // Multiplication @@ -205,6 +285,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "") BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_ex2_approx_f, "ff", "") BUILTIN(__nvvm_ex2_approx_d, "dd", "") +TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_lg2_approx_f, "ff", "") @@ -220,6 +302,22 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "") // Fma +TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", 
AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_bf16, "UsUsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "UsUsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70)) BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "") BUILTIN(__nvvm_fma_rn_f, "ffff", "") BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "") @@ -2301,15 +2399,24 @@ TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70)) + +// bf16, bf16x2 abs, neg +TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) + #undef BUILTIN #undef TARGET_BUILTIN #pragma pop_macro("AND") +#pragma pop_macro("SM_53") #pragma pop_macro("SM_60") #pragma pop_macro("SM_70") #pragma pop_macro("SM_72") #pragma pop_macro("SM_75") #pragma pop_macro("SM_80") #pragma pop_macro("SM_86") +#pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") #pragma pop_macro("PTX63") diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c new file mode 100644 index 0000000000000..95021f274cd0f --- /dev/null +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -0,0 +1,174 @@ +// REQUIRES: nvptx-registered-target +// +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_86 -target-feature +ptx72 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s + +// 
RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s + +#define __device__ __attribute__((device)) + +__device__ void nvvm_ex2_sm75() { +#if __CUDA_ARCH__ >= 750 + // CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16 + __nvvm_ex2_approx_f16(0.1f16); + // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2 + __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_min_max_sm80 +__device__ void nvvm_min_max_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.f16 + __nvvm_fmin_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.f16 + __nvvm_fmin_ftz_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.nan.f16 + __nvvm_fmin_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.nan.f16 + __nvvm_fmin_ftz_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.f16x2 + __nvvm_fmin_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.f16x2 + __nvvm_fmin_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.nan.f16x2 + __nvvm_fmin_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2 + __nvvm_fmin_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.f16 + __nvvm_fmax_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.f16 + __nvvm_fmax_ftz_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.nan.f16 + __nvvm_fmax_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.nan.f16 + __nvvm_fmax_ftz_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.f16x2 + __nvvm_fmax_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.f16x2 + __nvvm_fmax_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.nan.f16x2 + __nvvm_fmax_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2 + __nvvm_fmax_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_fma_f16_f16x2_sm80 +__device__ void nvvm_fma_f16_f16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.relu.f16 + __nvvm_fma_rn_relu_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.ftz.relu.f16 + __nvvm_fma_rn_ftz_relu_f16(0.1f16, 0.1f16, 0.1f16); + + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2 + __nvvm_fma_rn_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2 + __nvvm_fma_rn_ftz_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_fma_f16_f16x2_sm53 +__device__ void nvvm_fma_f16_f16x2_sm53() { +#if __CUDA_ARCH__ >= 530 + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.f16 + __nvvm_fma_rn_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.f16 + __nvvm_fma_rn_ftz_f16(0.1f16, 0.1f16, 0.1f16); + // 
CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.sat.f16 + __nvvm_fma_rn_sat_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16 + __nvvm_fma_rn_ftz_sat_f16(0.1f16, 0.1f16, 0.1f16); + + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2 + __nvvm_fma_rn_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2 + __nvvm_fma_rn_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2 + __nvvm_fma_rn_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2 + __nvvm_fma_rn_ftz_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_min_max_sm86 +__device__ void nvvm_min_max_sm86() { +#if __CUDA_ARCH__ >= 860 + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.xorsign.abs.f16 + __nvvm_fmin_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16 + __nvvm_fmin_ftz_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.nan.xorsign.abs.f16 + __nvvm_fmin_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16 + __nvvm_fmin_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2 + __nvvm_fmin_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2 + __nvvm_fmin_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2 + __nvvm_fmin_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2 + __nvvm_fmin_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.xorsign.abs.f16 + __nvvm_fmax_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16 + __nvvm_fmax_ftz_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.nan.xorsign.abs.f16 + __nvvm_fmax_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16 + __nvvm_fmax_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2 + __nvvm_fmax_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2 + __nvvm_fmax_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2 + __nvvm_fmax_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2 + __nvvm_fmax_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index bbd60effc70e4..368974095fad7 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -16,6 +16,12 @@ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \ // RUN: -DERROR_CHECK -fcuda-is-device -S 
-o /dev/null -x cuda -verify %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ +// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ +// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -4231,3 +4237,128 @@ __device__ void nvvm_cvt_sm80() { #endif // CHECK: ret void } + +// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80 +__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.abs.bf16(i16 -1) + __nvvm_abs_bf16(0xFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.abs.bf16x2(i32 -1) + __nvvm_abs_bf16x2(0xFFFFFFFF); + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.neg.bf16(i16 -1) + __nvvm_neg_bf16(0xFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.neg.bf16x2(i32 -1) + __nvvm_neg_bf16x2(0xFFFFFFFF); +#endif + // CHECK: ret void +} + +#define NAN32 0x7FBFFFFF +#define NAN16 0x7FBF +#define BF16 0x1234 +#define BF16_2 0x4321 +#define NANBF16 0xFFC1 +#define BF16X2 0x12341234 +#define BF16X2_2 0x32343234 +#define NANBF16X2 0xFFC1FFC1 + +// CHECK-LABEL: nvvm_min_max_sm80 +__device__ void nvvm_min_max_sm80() { +#if __CUDA_ARCH__ >= 800 + + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f + __nvvm_fmin_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f + __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32); + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.bf16 + __nvvm_fmin_bf16(BF16, BF16_2); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.nan.bf16 + __nvvm_fmin_nan_bf16(BF16, NANBF16); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.bf16x2 + __nvvm_fmin_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.nan.bf16x2 + __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, 0.11f); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32); + + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.bf16 + __nvvm_fmax_bf16(BF16, BF16_2); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.nan.bf16 + __nvvm_fmax_nan_bf16(BF16, NANBF16); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.bf16x2 + __nvvm_fmax_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.nan.bf16x2 + __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32); + +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80 +__device__ void nvvm_fma_bf16_bf16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.bf16 + __nvvm_fma_rn_bf16(0x1234, 0x7FBF, 0x1234); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.relu.bf16 + __nvvm_fma_rn_relu_bf16(0x1234, 0x7FBF, 0x1234); + // CHECK_PTX70_SM80: call i32 
@llvm.nvvm.fma.rn.bf16x2 + __nvvm_fma_rn_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.relu.bf16x2 + __nvvm_fma_rn_relu_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_min_max_sm86 +__device__ void nvvm_min_max_sm86() { +#if __CUDA_ARCH__ >= 860 + + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.xorsign.abs.bf16 + __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2); + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16 + __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2 + __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2 + __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f + __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f + __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f + __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f + __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32); + + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.xorsign.abs.bf16 + __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2); + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16 + __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2 + __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2 + __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f + __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f + __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f + __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f + __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32); +#endif + // CHECK: ret void +} diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h index d4167a8e4529e..addb461aa047d 100644 --- a/libclc/generic/include/clcmacro.h +++ b/libclc/generic/include/clcmacro.h @@ -9,11 +9,7 @@ #ifndef __CLC_MACRO_H #define __CLC_MACRO_H -#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ - } \ - \ +#define _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ } \ @@ -30,12 +26,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ } -#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ +#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ + return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ } \ - \ + _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) + +#define _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, 
RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ FUNCTION(x.z, y.z)); \ @@ -53,6 +51,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ } +#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + } \ + _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) + #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ @@ -76,13 +82,8 @@ return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ } -#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE, ARG3_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ - ARG3_TYPE##2 z) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ - } \ - \ +#define _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \ ARG3_TYPE##3 z) { \ return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \ @@ -107,6 +108,15 @@ FUNCTION(x.hi, y.hi, z.hi)); \ } +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ + ARG3_TYPE##2 z) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + } \ + _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) + #define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \ diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES index 5ec1dea1afc30..613b6aaa6fd16 100644 --- a/libclc/ptx-nvidiacl/libspirv/SOURCES +++ b/libclc/ptx-nvidiacl/libspirv/SOURCES @@ -27,6 +27,7 @@ math/fabs.cl math/fdim.cl math/floor.cl math/fma.cl +math/fma_relu.cl math/fmax.cl math/fmin.cl math/fmod.cl diff --git a/libclc/ptx-nvidiacl/libspirv/math/fabs.cl b/libclc/ptx-nvidiacl/libspirv/math/fabs.cl index 753f449fafefb..0aac0fa4ab0f0 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fabs.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fabs.cl @@ -11,7 +11,27 @@ #include "../../include/libdevice.h" #include +extern int __clc_nvvm_reflect_arch(); + #define __CLC_FUNCTION __spirv_ocl_fabs #define __CLC_BUILTIN __nv_fabs #define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) #include + +_CLC_DEF _CLC_OVERLOAD ushort __clc_fabs(ushort x) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_abs_bf16(x); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fabs, ushort) + +_CLC_DEF _CLC_OVERLOAD uint __clc_fabs(uint x) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_abs_bf16x2(x); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fabs, uint) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fma.cl b/libclc/ptx-nvidiacl/libspirv/math/fma.cl index f887289c03b29..4cfdc2821e12e 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fma.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fma.cl @@ -11,6 +11,8 
@@ #include "../../include/libdevice.h" #include +extern int __clc_nvvm_reflect_arch(); + _CLC_DEFINE_TERNARY_BUILTIN(float, __spirv_ocl_fma, __nv_fmaf, float, float, float) @@ -27,10 +29,45 @@ _CLC_DEFINE_TERNARY_BUILTIN(double, __spirv_ocl_fma, __nv_fma, double, double, #pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_DEFINE_TERNARY_BUILTIN(half, __spirv_ocl_fma, __nv_fmaf, half, half, half) +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_fma(half x, half y, half z) { + if (__clc_nvvm_reflect_arch() >= 530) { + return __nvvm_fma_rn_f16(x, y, z); + } + return __nv_fmaf(x, y, z); +} + +_CLC_DEF _CLC_OVERLOAD half2 __spirv_ocl_fma(half2 x, half2 y, half2 z) { + if (__clc_nvvm_reflect_arch() >= 530) { + return __nvvm_fma_rn_f16x2(x, y, z); + } + return (half2)(__spirv_ocl_fma(x.x, y.x, z.x), + __spirv_ocl_fma(x.y, y.y, z.y)); +} +_CLC_TERNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_fma, + half, half, half) #endif +_CLC_DEF _CLC_OVERLOAD ushort __clc_fma(ushort x, ushort y, ushort z) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fma_rn_bf16(x, y, z); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fma, ushort, + ushort, ushort) + +_CLC_DEF _CLC_OVERLOAD uint __clc_fma(uint x, uint y, uint z) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fma_rn_bf16x2(x, y, z); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fma, uint, + uint, uint) + #undef __CLC_BUILTIN #undef __CLC_BUILTIN_F #undef __CLC_FUNCTION diff --git a/libclc/ptx-nvidiacl/libspirv/math/fma_relu.cl b/libclc/ptx-nvidiacl/libspirv/math/fma_relu.cl new file mode 100755 index 0000000000000..b48e25c7c628d --- /dev/null +++ b/libclc/ptx-nvidiacl/libspirv/math/fma_relu.cl @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "../../include/libdevice.h" +#include + +extern int __clc_nvvm_reflect_arch(); + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __clc_fma_relu(half x, half y, half z) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fma_rn_relu_f16(x, y, z); + } + __builtin_trap(); + __builtin_unreachable(); +} + +_CLC_DEF _CLC_OVERLOAD half2 __clc_fma_relu(half2 x, half2 y, half2 z) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fma_rn_relu_f16x2(x, y, z); + } + return (half2)(__clc_fma_relu(x.x, y.x, z.x), + __clc_fma_relu(x.y, y.y, z.y)); +} +_CLC_TERNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __clc_fma_relu, + half, half, half) + +#endif + +_CLC_DEF _CLC_OVERLOAD ushort __clc_fma_relu(ushort x, ushort y, + ushort z) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fma_rn_relu_bf16(x, y, z); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fma_relu, + ushort, ushort, ushort) + +_CLC_DEF _CLC_OVERLOAD uint __clc_fma_relu(uint x, uint y, uint z) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fma_rn_relu_bf16x2(x, y, z); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fma_relu, uint, + uint, uint) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fmax.cl b/libclc/ptx-nvidiacl/libspirv/math/fmax.cl index 0ff8f83d6cc05..645762000ff53 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fmax.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fmax.cl @@ -6,12 +6,67 @@ // //===----------------------------------------------------------------------===// +#include #include - #include "../../include/libdevice.h" -#include -#define __CLC_FUNCTION __spirv_ocl_fmax -#define __CLC_BUILTIN __nv_fmax -#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) -#include +extern int __clc_nvvm_reflect_arch(); + +_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_fmax(float x, float y) { + return __nvvm_fmax_f(x, y); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_fmax, float, + float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __spirv_ocl_fmax(double x, double y) { + return __nvvm_fmax_d(x, y); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_fmax, double, + double) + +#endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_fmax(half x, half y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmax_f16(x, y); + } + return __nvvm_fmax_f(x,y); +} +_CLC_DEF _CLC_OVERLOAD half2 __spirv_ocl_fmax(half2 x, half2 y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmax_f16x2(x, y); + } + return (half2)(__spirv_ocl_fmax(x.x, y.x), __spirv_ocl_fmax(x.y, y.y)); +} +_CLC_BINARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_fmax, + half, half) + +#endif + +_CLC_DEF _CLC_OVERLOAD ushort __clc_fmax(ushort x, ushort y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmax_bf16(x, y); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fmax, ushort, + ushort) + +_CLC_DEF _CLC_OVERLOAD uint __clc_fmax(uint x, uint y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmax_bf16x2(x, y); + 
} + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fmax, uint, + uint) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fmin.cl b/libclc/ptx-nvidiacl/libspirv/math/fmin.cl index ef09ba1b94ec0..6bdc4b8176be5 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fmin.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fmin.cl @@ -6,12 +6,67 @@ // //===----------------------------------------------------------------------===// +#include #include - #include "../../include/libdevice.h" -#include -#define __CLC_FUNCTION __spirv_ocl_fmin -#define __CLC_BUILTIN __nv_fmin -#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) -#include +extern int __clc_nvvm_reflect_arch(); + +_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_fmin(float x, float y) { + return __nvvm_fmin_f(x, y); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_fmin, float, + float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __spirv_ocl_fmin(double x, double y) { + return __nvvm_fmin_d(x, y); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_fmin, double, + double) + +#endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_fmin(half x, half y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmin_f16(x, y); + } + return __nvvm_fmin_f(x,y); +} +_CLC_DEF _CLC_OVERLOAD half2 __spirv_ocl_fmin(half2 x, half2 y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmin_f16x2(x, y); + } + return (half2)(__spirv_ocl_fmin(x.x, y.x), __spirv_ocl_fmin(x.y, y.y)); +} +_CLC_BINARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_fmin, half, + half) + +#endif + +_CLC_DEF _CLC_OVERLOAD ushort __clc_fmin(ushort x, ushort y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmin_bf16(x, y); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fmin, ushort, + ushort) + +_CLC_DEF _CLC_OVERLOAD uint __clc_fmin(uint x, uint y) { + if (__clc_nvvm_reflect_arch() >= 800) { + return __nvvm_fmin_bf16x2(x, y); + } + __builtin_trap(); + __builtin_unreachable(); +} +_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fmin, uint, + uint)
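
A rough usage sketch (not part of the patch): calling one of the new f16 min builtins from CUDA device code, assuming clang is invoked with -fnative-half-type and targets sm_80 with PTX 7.0 or newer, mirroring the pattern used in builtins-nvptx-native-half-type.c above. The function name and the pre-sm_80 fallback are illustrative, not taken from the patch.

#define __device__ __attribute__((device))

__device__ _Float16 fmin_nan_h(_Float16 a, _Float16 b) {
#if __CUDA_ARCH__ >= 800
  // NaN-propagating minimum; expected to lower to the PTX min.NaN.f16 form
  // that the new SM_80/PTX70-gated builtin exposes.
  return __nvvm_fmin_nan_f16(a, b);
#else
  // Plain comparison fallback for older architectures, where the builtin
  // is unavailable.
  return a < b ? a : b;
#endif
}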