diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 449e4d1256944..cabfb4aa2090b 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -205,6 +205,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "") BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_ex2_approx_f, "ff", "") BUILTIN(__nvvm_ex2_approx_d, "dd", "") +TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_lg2_approx_f, "ff", "") @@ -218,6 +220,12 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "") BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "") BUILTIN(__nvvm_cos_approx_f, "ff", "") +// Tanh + +TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) + // Fma BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "") diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h index d4167a8e4529e..addb461aa047d 100644 --- a/libclc/generic/include/clcmacro.h +++ b/libclc/generic/include/clcmacro.h @@ -9,11 +9,7 @@ #ifndef __CLC_MACRO_H #define __CLC_MACRO_H -#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ - } \ - \ +#define _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ } \ @@ -30,12 +26,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ } -#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ +#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ + return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ } \ - \ + _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) + +#define _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ FUNCTION(x.z, y.z)); \ @@ -53,6 +51,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ } +#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + } \ + _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) + #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ @@ -76,13 +82,8 @@ return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ } -#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE, ARG3_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ - ARG3_TYPE##2 z) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ - } \ - \ +#define _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \ ARG3_TYPE##3 z) { \ return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \ @@ -107,6 +108,15 @@ FUNCTION(x.hi, y.hi, z.hi)); \ } +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ + ARG3_TYPE##2 z) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + } \ + _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) + #define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \ diff --git a/libclc/generic/include/spirv/spirv_builtins.h b/libclc/generic/include/spirv/spirv_builtins.h index adeb3a63460d6..ca15fdf3c6547 100644 --- a/libclc/generic/include/spirv/spirv_builtins.h +++ b/libclc/generic/include/spirv/spirv_builtins.h @@ -15776,6 +15776,21 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp32_t _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp32_t __spirv_ocl_native_exp2(__clc_vec16_fp32_t); +#ifdef cl_khr_fp16 +_CLC_OVERLOAD +_CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_native_exp2(__clc_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t + __clc_native_exp2(__clc_vec2_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t + __clc_native_exp2(__clc_vec3_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t + __clc_native_exp2(__clc_vec4_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t + __clc_native_exp2(__clc_vec8_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t + __clc_native_exp2(__clc_vec16_fp16_t); +#endif + _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t __spirv_ocl_native_log(__clc_fp32_t); _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t @@ -19077,6 +19092,34 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t __spirv_ocl_tanh(__clc_vec16_fp16_t); #endif +_CLC_OVERLOAD +_CLC_DECL _CLC_CONSTFN __clc_fp32_t __clc_native_tanh(__clc_fp32_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t + __clc_native_tanh(__clc_vec2_fp32_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp32_t + __clc_native_tanh(__clc_vec3_fp32_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp32_t + __clc_native_tanh(__clc_vec4_fp32_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp32_t + __clc_native_tanh(__clc_vec8_fp32_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp32_t + __clc_native_tanh(__clc_vec16_fp32_t); + +#ifdef cl_khr_fp16 +_CLC_OVERLOAD +_CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_native_tanh(__clc_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t + __clc_native_tanh(__clc_vec2_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t + __clc_native_tanh(__clc_vec3_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t + __clc_native_tanh(__clc_vec4_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t + __clc_native_tanh(__clc_vec8_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t + __clc_native_tanh(__clc_vec16_fp16_t); +#endif + _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t __spirv_ocl_tanpi(__clc_fp32_t); _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t diff --git a/libclc/generic/libspirv/float16.cl b/libclc/generic/libspirv/float16.cl index b2cd14e8c63f4..28f5b65ac80e2 100644 --- a/libclc/generic/libspirv/float16.cl +++ b/libclc/generic/libspirv/float16.cl @@ -4344,6 +4344,36 @@ __spirv_ocl_exp2(__clc_vec16_float16_t args_0) { return __spirv_ocl_exp2(as_half16(args_0)); } +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t +__clc_native_exp2(__clc_float16_t args_0) { + return __clc_native_exp2(as_half(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t +__clc_native_exp2(__clc_vec2_float16_t args_0) { + return __clc_native_exp2(as_half2(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t +__clc_native_exp2(__clc_vec3_float16_t args_0) { + return __clc_native_exp2(as_half3(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t +__clc_native_exp2(__clc_vec4_float16_t args_0) { + return __clc_native_exp2(as_half4(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t +__clc_native_exp2(__clc_vec8_float16_t args_0) { + return __clc_native_exp2(as_half8(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t +__clc_native_exp2(__clc_vec16_float16_t args_0) { + return __clc_native_exp2(as_half16(args_0)); +} + _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __spirv_ocl_expm1(__clc_float16_t args_0) { return __spirv_ocl_expm1(as_half(args_0)); @@ -6613,6 +6643,36 @@ __spirv_ocl_tanh(__clc_vec16_float16_t args_0) { return __spirv_ocl_tanh(as_half16(args_0)); } +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t +__clc_native_tanh(__clc_float16_t args_0) { + return __clc_native_tanh(as_half(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t +__clc_native_tanh(__clc_vec2_float16_t args_0) { + return __clc_native_tanh(as_half2(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t +__clc_native_tanh(__clc_vec3_float16_t args_0) { + return __clc_native_tanh(as_half3(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t +__clc_native_tanh(__clc_vec4_float16_t args_0) { + return __clc_native_tanh(as_half4(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t +__clc_native_tanh(__clc_vec8_float16_t args_0) { + return __clc_native_tanh(as_half8(args_0)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t +__clc_native_tanh(__clc_vec16_float16_t args_0) { + return __clc_native_tanh(as_half16(args_0)); +} + _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __spirv_ocl_tanpi(__clc_float16_t args_0) { return __spirv_ocl_tanpi(as_half(args_0)); diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES index 5ec1dea1afc30..9f105b1556ed9 100644 --- a/libclc/ptx-nvidiacl/libspirv/SOURCES +++ b/libclc/ptx-nvidiacl/libspirv/SOURCES @@ -53,6 +53,7 @@ math/native_rsqrt.cl math/native_sin.cl math/native_sqrt.cl math/native_tan.cl +math/native_tanh.cl math/nextafter.cl math/pow.cl math/remainder.cl diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl index 93c12c0aeb067..8c06a7ed9ea9c 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl @@ -14,4 +14,34 @@ #define __CLC_FUNCTION __spirv_ocl_native_exp2 #define __CLC_BUILTIN __nv_exp2 #define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +int __clc_nvvm_reflect_arch(); +#define __USE_HALF_EXP2_APPROX (__clc_nvvm_reflect_arch() >= 750) + +_CLC_DEF _CLC_OVERLOAD half __clc_native_exp2(half x) { + return (__USE_HALF_EXP2_APPROX) ? __nvvm_ex2_approx_f16(x) + : __spirv_ocl_native_exp2((float)x); +} + +_CLC_DEF _CLC_OVERLOAD half2 __clc_native_exp2(half2 x) { + return (__USE_HALF_EXP2_APPROX) + ? __nvvm_ex2_approx_f16x2(x) + : (half2)(__spirv_ocl_native_exp2((float)x.x), + __spirv_ocl_native_exp2((float)x.y)); +} + +_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __clc_native_exp2, + half) + +#undef __USE_HALF_EXP2_APPROX + +#endif // cl_khr_fp16 + +// Undef halfs before uncluding unary builtins, as they are handled above. +#ifdef cl_khr_fp16 +#undef cl_khr_fp16 +#endif // cl_khr_fp16 #include diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl new file mode 100644 index 0000000000000..3216059c5c7ce --- /dev/null +++ b/libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include + +#include "../../include/libdevice.h" +#include + +extern int __clc_nvvm_reflect_arch(); + +#define __USE_TANH_APPROX (__clc_nvvm_reflect_arch() >= 750) + +_CLC_DEF _CLC_OVERLOAD float __clc_native_tanh(float x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f(x) : __nv_tanhf(x); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_native_tanh, float) + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __clc_native_tanh(half x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16(x) : __nv_tanhf(x); +} + +_CLC_DEF _CLC_OVERLOAD half2 __clc_native_tanh(half2 x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16x2(x) + : (half2)(__nv_tanhf(x.x), __nv_tanhf(x.y)); +} + +_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __clc_native_tanh, half) + +#endif + +#undef __USE_TANH_APPROX + diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index f026033782184..3321534a3218c 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -854,6 +854,17 @@ let TargetPrefix = "nvvm" in { def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; +// +// Tanh +// + + def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_nvvm_tanh_approx_f16 : GCCBuiltin<"__nvvm_tanh_approx_f16">, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>; + def int_nvvm_tanh_approx_f16x2 : GCCBuiltin<"__nvvm_tanh_approx_f16x2">, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>; + // // Fma // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c980f6ed4bdc2..377dce99578cc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -933,6 +933,17 @@ def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;", def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_cos_approx_f>; +// +// Tanh +// + + def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>; + def INT_NVVM_TANH_APPROX_F16 : F_MATH_1<"tanh.approx.f16 \t$dst, $src0;", + Float16Regs, Float16Regs, int_nvvm_tanh_approx_f16>; + def INT_NVVM_TANH_APPROX_F16X2 : F_MATH_1<"tanh.approx.f16x2 \t$dst, $src0;", + Float16x2Regs, Float16x2Regs, int_nvvm_tanh_approx_f16x2>; + // // Fma // diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc new file mode 100644 index 0000000000000..9b8daaf0ab6d6 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc @@ -0,0 +1,112 @@ += sycl_ext_oneapi_native_math + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2022-2022 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 4 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* + +== Overview + +The CUDA backend has introduced fast math "approx" versions of the `exp2` and +`tanh` functions for `half` and `float` types. It is considered likely that +other backends will eventually introduce similar functionality, motivated +particularly by deep learning use cases of these functions. We propose that the +appropriate place to call such functionality in SYCL applications would be from +the newly proposed native functions. + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_NATIVE_MATH` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + +=== New functions + +This extension allows the `sycl::native::exp2` function (that originally +supports `genfloatf` types) to support `genfloath` types in the +`sycl::ext::oneapi::experimental::native` namespace. It also introduces a new +native `tanh` function supporting `genfloath` and `genfloatf` types in the +`sycl::ext::oneapi::experimental::native` namespace. + +NOTE: This document does not propose `genfloatd` support for either +`sycl::ext::oneapi::experimental::native::exp2` or +`sycl::ext::oneapi::experimental::native::tanh`. + + +> This extension adds the following new native builtin functions to SYCL: +> +> ``` +> namespace sycl::ext::oneapi::experimental::native { +> +> // Available only when "T" is one of the genfloath types. +> template +> T exp2(T x); +> +> // Available only when "T" is one of the genfloatf or genfloath types. +> template +> T tanh(T x); +> +> } // namespace sycl::ext::oneapi::experimental::native +> ``` + + diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 4878cc4dd5db8..e7b660f9d29e7 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -755,6 +755,44 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...); extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...); #endif +// Native builtin extension + +extern SYCL_EXTERNAL float __clc_native_tanh(float); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_native_tanh(__ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_native_tanh(__ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_native_tanh(__ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_native_tanh(__ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_native_tanh(__ocl_vec_t); + +extern SYCL_EXTERNAL _Float16 __clc_native_tanh(_Float16); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2> + __clc_native_tanh(__ocl_vec_t<_Float16, 2>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3> + __clc_native_tanh(__ocl_vec_t<_Float16, 3>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4> + __clc_native_tanh(__ocl_vec_t<_Float16, 4>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8> + __clc_native_tanh(__ocl_vec_t<_Float16, 8>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16> + __clc_native_tanh(__ocl_vec_t<_Float16, 16>); + +extern SYCL_EXTERNAL _Float16 __clc_native_exp2(_Float16); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2> + __clc_native_exp2(__ocl_vec_t<_Float16, 2>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3> + __clc_native_exp2(__ocl_vec_t<_Float16, 3>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4> + __clc_native_exp2(__ocl_vec_t<_Float16, 4>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8> + __clc_native_exp2(__ocl_vec_t<_Float16, 8>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16> + __clc_native_exp2(__ocl_vec_t<_Float16, 16>); + #else // if !__SYCL_DEVICE_ONLY__ template diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index a7f0ca071ee46..75986f5b3f9ff 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -54,6 +54,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_SRGB 1 #define SYCL_EXT_ONEAPI_SUB_GROUP 1 #define SYCL_EXT_ONEAPI_PROPERTIES 1 +#define SYCL_EXT_ONEAPI_NATIVE_MATH 1 #define SYCL_EXT_INTEL_BF16_CONVERSION 1 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1 #ifdef __clang__ diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index e32e1c70a5a97..c8fa033d8c79e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -8,8 +8,17 @@ #pragma once +#include +#include +#include +#include +#include + #include +// TODO Decide whether to mark functions with this attribute. +#define __NOEXC /*noexcept*/ + #ifdef __SYCL_DEVICE_ONLY__ #define __SYCL_CONSTANT_AS __attribute__((opencl_constant)) #else @@ -72,6 +81,42 @@ int printf(const FormatT *__format, Args... args) { #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__) } +namespace native { + +// genfloatfh tanh (genfloatfh x) +template +inline __SYCL_ALWAYS_INLINE + sycl::detail::enable_if_t::value || + sycl::detail::is_genfloath::value, + T> + tanh(T x) __NOEXC { +#if defined(__NVPTX__) + using _ocl_T = cl::sycl::detail::ConvertToOpenCLType_t; + _ocl_T arg1 = cl::sycl::detail::convertDataToType(x); + return cl::sycl::detail::convertDataToType<_ocl_T, T>( + __clc_native_tanh(arg1)); +#else + return __sycl_std::__invoke_tanh(x); +#endif +} + +// genfloath exp2 (genfloath x) +template +inline __SYCL_ALWAYS_INLINE + sycl::detail::enable_if_t::value, T> + exp2(T x) __NOEXC { +#if defined(__NVPTX__) + using _ocl_T = cl::sycl::detail::ConvertToOpenCLType_t; + _ocl_T arg1 = cl::sycl::detail::convertDataToType(x); + return cl::sycl::detail::convertDataToType<_ocl_T, T>( + __clc_native_exp2(arg1)); +#else + return __sycl_std::__invoke_exp2(x); +#endif +} + +} // namespace native + } // namespace experimental } // namespace oneapi } // namespace ext