Skip to content

[SYCL][CUDA][libclc] Add bf16 builtins and optimize half builtins for fma, fmin, fmax and fmax #5724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 112 additions & 5 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
#endif

#pragma push_macro("SM_53")
#pragma push_macro("SM_70")
#pragma push_macro("SM_72")
#pragma push_macro("SM_75")
Expand All @@ -30,7 +31,9 @@

#pragma push_macro("SM_60")
#define SM_60 "sm_60|sm_61|sm_62|" SM_70
#define SM_53 "sm_53|" SM_60

#pragma push_macro("PTX42")
#pragma push_macro("PTX60")
#pragma push_macro("PTX61")
#pragma push_macro("PTX63")
Expand All @@ -53,6 +56,7 @@
#define PTX63 "ptx63|" PTX64
#define PTX61 "ptx61|" PTX63
#define PTX60 "ptx60|" PTX61
#define PTX42 "ptx42|" PTX60

#pragma push_macro("AND")
#define AND(a, b) "(" a "),(" b ")"
Expand Down Expand Up @@ -110,13 +114,89 @@ BUILTIN(__nvvm_prmt, "UiUiUiUi", "")

// Min Max

BUILTIN(__nvvm_fmax_ftz_f, "fff", "")
BUILTIN(__nvvm_fmax_f, "fff", "")
BUILTIN(__nvvm_fmin_ftz_f, "fff", "")
BUILTIN(__nvvm_fmin_f, "fff", "")
TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "UsUsUs", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
AND(SM_86, PTX72))
BUILTIN(__nvvm_fmin_f, "fff", "")
BUILTIN(__nvvm_fmin_ftz_f, "fff", "")
TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
BUILTIN(__nvvm_fmin_d, "ddd", "")

TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "UsUsUs", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
AND(SM_86, PTX72))
BUILTIN(__nvvm_fmax_f, "fff", "")
BUILTIN(__nvvm_fmax_ftz_f, "fff", "")
TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
BUILTIN(__nvvm_fmax_d, "ddd", "")
BUILTIN(__nvvm_fmin_d, "ddd", "")

// Multiplication

Expand Down Expand Up @@ -205,6 +285,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "")
BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_ex2_approx_f, "ff", "")
BUILTIN(__nvvm_ex2_approx_d, "dd", "")
TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70))
TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))

BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_lg2_approx_f, "ff", "")
Expand All @@ -220,6 +302,22 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "")

// Fma

TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_bf16, "UsUsUsUs", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "UsUsUsUs", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70))
TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70))
BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
BUILTIN(__nvvm_fma_rn_f, "ffff", "")
BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "")
Expand Down Expand Up @@ -2301,15 +2399,24 @@ TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))


// bf16, bf16x2 abs, neg
TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))

#undef BUILTIN
#undef TARGET_BUILTIN
#pragma pop_macro("AND")
#pragma pop_macro("SM_53")
#pragma pop_macro("SM_60")
#pragma pop_macro("SM_70")
#pragma pop_macro("SM_72")
#pragma pop_macro("SM_75")
#pragma pop_macro("SM_80")
#pragma pop_macro("SM_86")
#pragma pop_macro("PTX42")
#pragma pop_macro("PTX60")
#pragma pop_macro("PTX61")
#pragma pop_macro("PTX63")
Expand Down
174 changes: 174 additions & 0 deletions clang/test/CodeGen/builtins-nvptx-native-half-type.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// REQUIRES: nvptx-registered-target
//
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
// RUN: -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type -S \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
// RUN: -target-cpu sm_86 -target-feature +ptx72 -fcuda-is-device \
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type -S \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
// RUN: -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s

#define __device__ __attribute__((device))

__device__ void nvvm_ex2_sm75() {
#if __CUDA_ARCH__ >= 750
// CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
__nvvm_ex2_approx_f16(0.1f16);
// CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
__nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
#endif
// CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.f16
__nvvm_fmin_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.f16
__nvvm_fmin_ftz_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.nan.f16
__nvvm_fmin_nan_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.nan.f16
__nvvm_fmin_ftz_nan_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.f16x2
__nvvm_fmin_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.f16x2
__nvvm_fmin_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.nan.f16x2
__nvvm_fmin_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2
__nvvm_fmin_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});

// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.f16
__nvvm_fmax_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.f16
__nvvm_fmax_ftz_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.nan.f16
__nvvm_fmax_nan_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.nan.f16
__nvvm_fmax_ftz_nan_f16(0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.f16x2
__nvvm_fmax_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.f16x2
__nvvm_fmax_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.nan.f16x2
__nvvm_fmax_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2
__nvvm_fmax_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
#endif
// CHECK: ret void
}

// CHECK-LABEL: nvvm_fma_f16_f16x2_sm80
__device__ void nvvm_fma_f16_f16x2_sm80() {
#if __CUDA_ARCH__ >= 800
// CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.relu.f16
__nvvm_fma_rn_relu_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.ftz.relu.f16
__nvvm_fma_rn_ftz_relu_f16(0.1f16, 0.1f16, 0.1f16);

// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2
__nvvm_fma_rn_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2
__nvvm_fma_rn_ftz_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
#endif
// CHECK: ret void
}

// CHECK-LABEL: nvvm_fma_f16_f16x2_sm53
__device__ void nvvm_fma_f16_f16x2_sm53() {
#if __CUDA_ARCH__ >= 530
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.f16
__nvvm_fma_rn_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.f16
__nvvm_fma_rn_ftz_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.sat.f16
__nvvm_fma_rn_sat_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16
__nvvm_fma_rn_ftz_sat_f16(0.1f16, 0.1f16, 0.1f16);

// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2
__nvvm_fma_rn_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2
__nvvm_fma_rn_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2
__nvvm_fma_rn_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2
__nvvm_fma_rn_ftz_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
#endif
// CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.xorsign.abs.f16
__nvvm_fmin_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16
__nvvm_fmin_ftz_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.nan.xorsign.abs.f16
__nvvm_fmin_nan_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16
__nvvm_fmin_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2
__nvvm_fmin_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2
__nvvm_fmin_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2
__nvvm_fmin_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2
__nvvm_fmin_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});

// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.xorsign.abs.f16
__nvvm_fmax_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16
__nvvm_fmax_ftz_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.nan.xorsign.abs.f16
__nvvm_fmax_nan_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16
__nvvm_fmax_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16);
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2
__nvvm_fmax_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2
__nvvm_fmax_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2
__nvvm_fmax_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2
__nvvm_fmax_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
#endif
// CHECK: ret void
}
Loading