From eaae1e7b619f28587a80e677dcea470ebd202770 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 02:56:48 -0700
Subject: [PATCH 1/8] This PR adds support for __half and nv_bfloat16 to most math functions

---
 cpp/include/raft/core/detail/macros.hpp |  12 +
 cpp/include/raft/core/math.hpp          | 531 ++++++++++++++++++++----
 cpp/test/core/math_device.cu            | 130 +++++-
 3 files changed, 582 insertions(+), 91 deletions(-)

diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp
index 390acea697..b14114214f 100644
--- a/cpp/include/raft/core/detail/macros.hpp
+++ b/cpp/include/raft/core/detail/macros.hpp
@@ -22,6 +22,12 @@
 #endif
 #endif
 
+#if defined(_RAFT_HAS_CUDA)
+#define CUDA_ONLY_CONDITION(condition) condition
+#else
+#define CUDA_ONLY_CONDITION(condition) true
+#endif
+
 #ifndef _RAFT_HOST_DEVICE
 #if defined(_RAFT_HAS_CUDA)
 #define _RAFT_DEVICE __device__
@@ -40,6 +46,10 @@
 #define RAFT_INLINE_FUNCTION _RAFT_HOST_DEVICE _RAFT_FORCEINLINE
 #endif
 
+#ifndef RAFT_DEVICE_INLINE_FUNCTION
+#define RAFT_DEVICE_INLINE_FUNCTION _RAFT_DEVICE _RAFT_FORCEINLINE
+#endif
+
 // The RAFT_INLINE_CONDITIONAL is a conditional inline specifier that removes
 // the inline specification when RAFT_COMPILED is defined.
 //
@@ -60,6 +70,8 @@
 #define RAFT_INLINE_CONDITIONAL inline
 #endif // RAFT_COMPILED
 
+
+
 // The RAFT_WEAK_FUNCTION specificies that:
 //
 // 1. A function may be defined in multiple translation units (like inline)
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index c5f08b84b7..71c297fbba 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -22,6 +22,11 @@
 
 #include <type_traits>
 
+#if _RAFT_HAS_CUDA
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#endif
+
 namespace raft {
 
 /**
@@ -30,32 +35,32 @@ namespace raft {
  */
 template <typename T>
 RAFT_INLINE_FUNCTION auto abs(T x)
-  -> std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T> ||
+-> std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T> ||
                         std::is_same_v<int, T> || std::is_same_v<long int, T> ||
                         std::is_same_v<long long int, T>,
- T>
+ T>
 {
 #ifdef __CUDA_ARCH__
-  return ::abs(x);
+return ::abs(x);
 #else
-  return std::abs(x);
+return std::abs(x);
 #endif
 }
 
 template <typename T>
 constexpr RAFT_INLINE_FUNCTION auto abs(T x)
-  -> std::enable_if_t<!std::is_same_v<float, T> && !std::is_same_v<double, T> &&
+-> std::enable_if_t<!std::is_same_v<float, T> && !std::is_same_v<double, T> &&
                       !std::is_same_v<int, T> && !std::is_same_v<long int, T> &&
                       !std::is_same_v<long long int, T>,
- T>
+ T>
 {
-  return x < T{0} ? -x : x;
+return x < T{0} ? 
-x : x; } /** @} */ /** - * @defgroup Trigonometry Trigonometry functions - * @{ - */ +* @defgroup Trigonometry Trigonometry functions +* @{ +*/ /** Inverse cosine */ template RAFT_INLINE_FUNCTION auto acos(T x) @@ -91,7 +96,13 @@ RAFT_INLINE_FUNCTION auto atanh(T x) /** Cosine */ template -RAFT_INLINE_FUNCTION auto cos(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +cos(T x) { #ifdef __CUDA_ARCH__ return ::cos(x); @@ -100,9 +111,47 @@ RAFT_INLINE_FUNCTION auto cos(T x) #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +cos(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hcos(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +cos(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hcos(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Sine */ template -RAFT_INLINE_FUNCTION auto sin(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +sin(T x) { #ifdef __CUDA_ARCH__ return ::sin(x); @@ -111,16 +160,48 @@ RAFT_INLINE_FUNCTION auto sin(T x) #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +sin(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hsin(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +sin(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hsin(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Sine and cosine */ template RAFT_INLINE_FUNCTION std::enable_if_t || std::is_same_v> sincos( - const T& x, T* s, T* c) +const T& x, T* s, T* c) { #ifdef __CUDA_ARCH__ - ::sincos(x, s, c); +::sincos(x, s, c); #else - *s = std::sin(x); - *c = std::cos(x); +*s = std::sin(x); +*c = std::cos(x); #endif } @@ -129,20 +210,26 @@ template RAFT_INLINE_FUNCTION auto tanh(T x) { #ifdef __CUDA_ARCH__ - return ::tanh(x); +return ::tanh(x); 
#else - return std::tanh(x); +return std::tanh(x); #endif } /** @} */ /** - * @defgroup Exponential Exponential and logarithm - * @{ - */ +* @defgroup Exponential Exponential and logarithm +* @{ +*/ /** Exponential function */ template -RAFT_INLINE_FUNCTION auto exp(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +exp(T x) { #ifdef __CUDA_ARCH__ return ::exp(x); @@ -151,63 +238,179 @@ RAFT_INLINE_FUNCTION auto exp(T x) #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +exp(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hexp(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +exp(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hexp(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Natural logarithm */ template -RAFT_INLINE_FUNCTION auto log(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +log(T x) { #ifdef __CUDA_ARCH__ - return ::log(x); +return ::log(x); +#else +return std::log(x); +#endif +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +log(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hlog(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +log(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hlog(x); #else - return std::log(x); + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; #endif } +#endif /** @} */ /** - * @defgroup Maximum Maximum of two or more values. - * - * The CUDA Math API has overloads for all combinations of float/double. We provide similar - * functionality while wrapping around std::max, which only supports arguments of the same type. - * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is - * very error-prone so we do not support that and require the user to cast instead. 
(e.g the max of - * -1 and 1u is 4294967295u...) - * - * When no overload matches, we provide a generic implementation but require that both types be the - * same (and that the less-than operator be defined). - * @{ - */ -template -RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) +* @defgroup Maximum Maximum of two or more values. +* +* The CUDA Math API has overloads for all combinations of float/double. We provide similar +* functionality while wrapping around std::max, which only supports arguments of the same type. +* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is +* very error-prone so we do not support that and require the user to cast instead. (e.g the max of +* -1 and 1u is 4294967295u...) +* +* When no overload matches, we provide a generic implementation but require that both types be the +* same (and that the less-than operator be defined). +* @{ +*/ +template < + typename T1, + typename T2, + std::enable_if_t< + CUDA_ONLY_CONDITION( + RAFT_DEPAREN(( + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v) + )) + ), + int + > = 0 +> +RAFT_INLINE_FUNCTION +auto +max(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ - // Combinations of types supported by the CUDA Math API - if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || +// Combinations of types supported by the CUDA Math API +if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::max(x, y); - } - // Else, check that the types are the same and provide a generic implementation - else { +} +// Else, check that the types are the same and provide a generic implementation +else { static_assert( - std::is_same_v, - "No native max overload for these types. Both argument types must be the same to use " - "the generic max. Please cast appropriately."); + std::is_same_v, + "No native max overload for these types. Both argument types must be the same to use " + "the generic max. Please cast appropriately."); return (x < y) ? y : x; - } +} #else - if constexpr (std::is_same_v && std::is_same_v) { +if constexpr (std::is_same_v && std::is_same_v) { return std::max(static_cast(x), y); - } else if constexpr (std::is_same_v && std::is_same_v) { +} else if constexpr (std::is_same_v && std::is_same_v) { return std::max(x, static_cast(y)); - } else { +} else { static_assert( - std::is_same_v, - "std::max requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::max requires that both argument types be the same. 
Please cast appropriately."); return std::max(x, y); - } +} #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +max(T x, T y) +{ +#if (__CUDA_ARCH__ >= 530) + return ::__hmax(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +max(T x, T y) +{ +#if (__CUDA_ARCH__ >= 800) + return ::__hmax(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ template RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args) @@ -219,54 +422,134 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args) template constexpr RAFT_INLINE_FUNCTION auto max(const T& x) { +return x; +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +max(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +max(T x) +{ +#if (__CUDA_ARCH__ >= 800) return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif } +#endif + + /** @} */ /** - * @defgroup Minimum Minimum of two or more values. - * - * The CUDA Math API has overloads for all combinations of float/double. We provide similar - * functionality while wrapping around std::min, which only supports arguments of the same type. - * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is - * very error-prone so we do not support that and require the user to cast instead. (e.g the min of - * -1 and 1u is 1u...) - * - * When no overload matches, we provide a generic implementation but require that both types be the - * same (and that the less-than operator be defined). - * @{ - */ -template -RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) +* @defgroup Minimum Minimum of two or more values. +* +* The CUDA Math API has overloads for all combinations of float/double. 
We provide similar +* functionality while wrapping around std::min, which only supports arguments of the same type. +* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is +* very error-prone so we do not support that and require the user to cast instead. (e.g the min of +* -1 and 1u is 1u...) +* +* When no overload matches, we provide a generic implementation but require that both types be the +* same (and that the less-than operator be defined). +* @{ +*/ +template < + typename T1, + typename T2, + std::enable_if_t< + CUDA_ONLY_CONDITION( + RAFT_DEPAREN(( + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v) + )) + ), + int + > = 0 +> +RAFT_INLINE_FUNCTION +auto +min(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ - // Combinations of types supported by the CUDA Math API - if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || +// Combinations of types supported by the CUDA Math API +if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::min(x, y); - } - // Else, check that the types are the same and provide a generic implementation - else { +} +// Else, check that the types are the same and provide a generic implementation +else { static_assert( - std::is_same_v, - "No native min overload for these types. Both argument types must be the same to use " - "the generic min. Please cast appropriately."); + std::is_same_v, + "No native min overload for these types. Both argument types must be the same to use " + "the generic min. Please cast appropriately."); return (y < x) ? y : x; - } +} #else - if constexpr (std::is_same_v && std::is_same_v) { +if constexpr (std::is_same_v && std::is_same_v) { return std::min(static_cast(x), y); - } else if constexpr (std::is_same_v && std::is_same_v) { +} else if constexpr (std::is_same_v && std::is_same_v) { return std::min(x, static_cast(y)); - } else { +} else { static_assert( - std::is_same_v, - "std::min requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::min requires that both argument types be the same. 
Please cast appropriately."); return std::min(x, y); - } +} +#endif +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +min(T x, T y) +{ +#if (__CUDA_ARCH__ >= 530) + return ::__hmin(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +min(T x, T y) +{ +#if (__CUDA_ARCH__ >= 800) + return ::__hmin(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; #endif } +#endif /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ template @@ -279,8 +562,40 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... args) template constexpr RAFT_INLINE_FUNCTION auto min(const T& x) { +return x; +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +min(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +min(T x) +{ +#if (__CUDA_ARCH__ >= 800) return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif } +#endif /** @} */ /** @@ -300,21 +615,59 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) /** Square root */ template -RAFT_INLINE_FUNCTION auto sqrt(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +sqrt(T x) { #ifdef __CUDA_ARCH__ - return ::sqrt(x); +return ::sqrt(x); #else - return std::sqrt(x); +return std::sqrt(x); #endif } + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +sqrt(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hsqrt(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif 
+} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +sqrt(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hsqrt(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif /** @} */ /** Sign */ template RAFT_INLINE_FUNCTION auto sgn(T val) -> int { - return (T(0) < val) - (val < T(0)); +return (T(0) < val) - (val < T(0)); } } // namespace raft diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu index ff4b343d9e..80359b2334 100644 --- a/cpp/test/core/math_device.cu +++ b/cpp/test/core/math_device.cu @@ -21,6 +21,11 @@ #include #include +#if _RAFT_HAS_CUDA +#include +#include +#endif + template __global__ void math_eval_kernel(OutT* out, OpT op, Args... args) { @@ -118,8 +123,32 @@ struct cos_test_op { } }; +struct cos_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::cos(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::cos(__bfloat162float(in))); + } + else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::cos(in); + } +}; + TEST(MathDevice, Cos) { + ASSERT_TRUE(raft::match( + std::cos(12.34f), __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::cos(12.34f), __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -134,14 +163,58 @@ struct exp_test_op { } }; +struct exp_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::exp(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::exp(__bfloat162float(in))); + } + else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::exp(in); + } +}; + TEST(MathDevice, Exp) { ASSERT_TRUE(raft::match( - std::exp(12.34f), math_eval(exp_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + std::exp(3.4f), __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::exp(3.4f), __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), raft::CompareApprox(0.01f))); + ASSERT_TRUE(raft::match( + std::exp(3.4f), math_eval(exp_test_op{}, 3.4f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( - std::exp(12.34), math_eval(exp_test_op{}, 12.34), raft::CompareApprox(0.000001))); + std::exp(3.4), math_eval(exp_test_op{}, 3.4), raft::CompareApprox(0.000001))); } +struct log_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return 
__float2half(raft::log(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::log(__bfloat162float(in))); + } + else // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::log(in); + } +}; + struct log_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -152,6 +225,10 @@ struct log_test_op { TEST(MathDevice, Log) { + ASSERT_TRUE(raft::match( + std::log(12.34f), __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::log(12.34f), __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -277,6 +354,26 @@ TEST(MathDevice, Sgn) ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 12.34f), raft::Compare())); } +struct sin_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::sin(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::sin(__bfloat162float(in))); + } + else // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::sin(in); + } +}; + struct sin_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -287,6 +384,10 @@ struct sin_test_op { TEST(MathDevice, Sin) { + ASSERT_TRUE(raft::match( + std::sin(12.34f), __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.01f))); + ASSERT_TRUE(raft::match( + std::sin(12.34f), __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.1f))); ASSERT_TRUE(raft::match( std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -319,6 +420,27 @@ TEST(MathDevice, SinCos) ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox(0.0001f))); } + +struct sqrt_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::sqrt(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::sqrt(__bfloat162float(in))); + } + else // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::sqrt(in); + } +}; + struct sqrt_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -329,6 +451,10 @@ struct sqrt_test_op { TEST(MathDevice, Sqrt) { + ASSERT_TRUE(raft::match( + std::sqrt(12.34f), __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::sqrt(12.34f), __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( From ae65a7afccf1aa20dee55ca279fdc80d9cb433c3 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 30 May 2023 06:15:13 -0700 Subject: [PATCH 2/8] 
clang format --- cpp/include/raft/core/detail/macros.hpp | 2 - cpp/include/raft/core/math.hpp | 399 ++++++++++++------------ cpp/test/core/math_device.cu | 130 ++++---- 3 files changed, 262 insertions(+), 269 deletions(-) diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index b14114214f..6d97b830fb 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -70,8 +70,6 @@ #define RAFT_INLINE_CONDITIONAL inline #endif // RAFT_COMPILED - - // The RAFT_WEAK_FUNCTION specificies that: // // 1. A function may be defined in multiple translation units (like inline) diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index 71c297fbba..e912e5e428 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -23,8 +23,8 @@ #include #if _RAFT_HAS_CUDA -#include #include +#include #endif namespace raft { @@ -35,32 +35,32 @@ namespace raft { */ template RAFT_INLINE_FUNCTION auto abs(T x) --> std::enable_if_t || std::is_same_v || + -> std::enable_if_t || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, - T> + T> { #ifdef __CUDA_ARCH__ -return ::abs(x); + return ::abs(x); #else -return std::abs(x); + return std::abs(x); #endif } template constexpr RAFT_INLINE_FUNCTION auto abs(T x) --> std::enable_if_t && !std::is_same_v && + -> std::enable_if_t && !std::is_same_v && !std::is_same_v && !std::is_same_v && !std::is_same_v, - T> + T> { -return x < T{0} ? -x : x; + return x < T{0} ? -x : x; } /** @} */ /** -* @defgroup Trigonometry Trigonometry functions -* @{ -*/ + * @defgroup Trigonometry Trigonometry functions + * @{ + */ /** Inverse cosine */ template RAFT_INLINE_FUNCTION auto acos(T x) @@ -98,11 +98,12 @@ RAFT_INLINE_FUNCTION auto atanh(T x) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -cos(T x) + cos(T x) { #ifdef __CUDA_ARCH__ return ::cos(x); @@ -113,15 +114,15 @@ cos(T x) #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type cos(T x) { #if (__CUDA_ARCH__ >= 530) return ::hcos(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -129,14 +130,15 @@ cos(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -cos(T x) + typename std::enable_if::value, nv_bfloat16>::type + cos(T x) { #if (__CUDA_ARCH__ >= 800) return ::hcos(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) 
would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -147,11 +149,12 @@ cos(T x) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -sin(T x) + sin(T x) { #ifdef __CUDA_ARCH__ return ::sin(x); @@ -162,15 +165,15 @@ sin(T x) #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type sin(T x) { #if (__CUDA_ARCH__ >= 530) return ::hsin(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -178,14 +181,15 @@ sin(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -sin(T x) + typename std::enable_if::value, nv_bfloat16>::type + sin(T x) { #if (__CUDA_ARCH__ >= 800) return ::hsin(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -195,13 +199,13 @@ sin(T x) /** Sine and cosine */ template RAFT_INLINE_FUNCTION std::enable_if_t || std::is_same_v> sincos( -const T& x, T* s, T* c) + const T& x, T* s, T* c) { #ifdef __CUDA_ARCH__ -::sincos(x, s, c); + ::sincos(x, s, c); #else -*s = std::sin(x); -*c = std::cos(x); + *s = std::sin(x); + *c = std::cos(x); #endif } @@ -210,26 +214,27 @@ template RAFT_INLINE_FUNCTION auto tanh(T x) { #ifdef __CUDA_ARCH__ -return ::tanh(x); + return ::tanh(x); #else -return std::tanh(x); + return std::tanh(x); #endif } /** @} */ /** -* @defgroup Exponential Exponential and logarithm -* @{ -*/ + * @defgroup Exponential Exponential and logarithm + * @{ + */ /** Exponential function */ template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -exp(T x) + exp(T x) { #ifdef __CUDA_ARCH__ return ::exp(x); @@ -240,15 +245,15 @@ exp(T x) #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION 
-typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type exp(T x) { #if (__CUDA_ARCH__ >= 530) return ::hexp(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -256,14 +261,15 @@ exp(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -exp(T x) + typename std::enable_if::value, nv_bfloat16>::type + exp(T x) { #if (__CUDA_ARCH__ >= 800) return ::hexp(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -274,30 +280,31 @@ exp(T x) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -log(T x) + log(T x) { #ifdef __CUDA_ARCH__ -return ::log(x); + return ::log(x); #else -return std::log(x); + return std::log(x); #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type log(T x) { #if (__CUDA_ARCH__ >= 530) return ::hlog(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -305,14 +312,15 @@ log(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -log(T x) + typename std::enable_if::value, nv_bfloat16>::type + log(T x) { #if (__CUDA_ARCH__ >= 800) return ::hlog(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // 
static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -321,75 +329,67 @@ log(T x) /** @} */ /** -* @defgroup Maximum Maximum of two or more values. -* -* The CUDA Math API has overloads for all combinations of float/double. We provide similar -* functionality while wrapping around std::max, which only supports arguments of the same type. -* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is -* very error-prone so we do not support that and require the user to cast instead. (e.g the max of -* -1 and 1u is 4294967295u...) -* -* When no overload matches, we provide a generic implementation but require that both types be the -* same (and that the less-than operator be defined). -* @{ -*/ + * @defgroup Maximum Maximum of two or more values. + * + * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * functionality while wrapping around std::max, which only supports arguments of the same type. + * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is + * very error-prone so we do not support that and require the user to cast instead. (e.g the max of + * -1 and 1u is 4294967295u...) + * + * When no overload matches, we provide a generic implementation but require that both types be the + * same (and that the less-than operator be defined). + * @{ + */ template < - typename T1, - typename T2, - std::enable_if_t< - CUDA_ONLY_CONDITION( - RAFT_DEPAREN(( - (!std::is_same_v && !std::is_same_v) || - (!std::is_same_v && !std::is_same_v) - )) - ), - int - > = 0 -> -RAFT_INLINE_FUNCTION -auto -max(const T1& x, const T2& y) + typename T1, + typename T2, + std::enable_if_t && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v)))), + int> = 0> +RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ -// Combinations of types supported by the CUDA Math API -if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || + // Combinations of types supported by the CUDA Math API + if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::max(x, y); -} -// Else, check that the types are the same and provide a generic implementation -else { + } + // Else, check that the types are the same and provide a generic implementation + else { static_assert( - std::is_same_v, - "No native max overload for these types. Both argument types must be the same to use " - "the generic max. Please cast appropriately."); + std::is_same_v, + "No native max overload for these types. Both argument types must be the same to use " + "the generic max. Please cast appropriately."); return (x < y) ? 
y : x; -} + } #else -if constexpr (std::is_same_v && std::is_same_v) { + if constexpr (std::is_same_v && std::is_same_v) { return std::max(static_cast(x), y); -} else if constexpr (std::is_same_v && std::is_same_v) { + } else if constexpr (std::is_same_v && std::is_same_v) { return std::max(x, static_cast(y)); -} else { + } else { static_assert( - std::is_same_v, - "std::max requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::max requires that both argument types be the same. Please cast appropriately."); return std::max(x, y); -} + } #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type max(T x, T y) { #if (__CUDA_ARCH__ >= 530) return ::__hmax(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -397,14 +397,15 @@ max(T x, T y) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -max(T x, T y) + typename std::enable_if::value, nv_bfloat16>::type + max(T x, T y) { #if (__CUDA_ARCH__ >= 800) return ::__hmax(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -422,20 +423,20 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... 
args) template constexpr RAFT_INLINE_FUNCTION auto max(const T& x) { -return x; + return x; } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type max(T x) { #if (__CUDA_ARCH__ >= 530) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -443,93 +444,85 @@ max(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -max(T x) + typename std::enable_if::value, nv_bfloat16>::type + max(T x) { #if (__CUDA_ARCH__ >= 800) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif } #endif - /** @} */ /** -* @defgroup Minimum Minimum of two or more values. -* -* The CUDA Math API has overloads for all combinations of float/double. We provide similar -* functionality while wrapping around std::min, which only supports arguments of the same type. -* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is -* very error-prone so we do not support that and require the user to cast instead. (e.g the min of -* -1 and 1u is 1u...) -* -* When no overload matches, we provide a generic implementation but require that both types be the -* same (and that the less-than operator be defined). -* @{ -*/ + * @defgroup Minimum Minimum of two or more values. + * + * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * functionality while wrapping around std::min, which only supports arguments of the same type. + * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is + * very error-prone so we do not support that and require the user to cast instead. (e.g the min of + * -1 and 1u is 1u...) + * + * When no overload matches, we provide a generic implementation but require that both types be the + * same (and that the less-than operator be defined). 
+ * @{ + */ template < - typename T1, - typename T2, - std::enable_if_t< - CUDA_ONLY_CONDITION( - RAFT_DEPAREN(( - (!std::is_same_v && !std::is_same_v) || - (!std::is_same_v && !std::is_same_v) - )) - ), - int - > = 0 -> -RAFT_INLINE_FUNCTION -auto -min(const T1& x, const T2& y) + typename T1, + typename T2, + std::enable_if_t && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v)))), + int> = 0> +RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ -// Combinations of types supported by the CUDA Math API -if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || + // Combinations of types supported by the CUDA Math API + if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::min(x, y); -} -// Else, check that the types are the same and provide a generic implementation -else { + } + // Else, check that the types are the same and provide a generic implementation + else { static_assert( - std::is_same_v, - "No native min overload for these types. Both argument types must be the same to use " - "the generic min. Please cast appropriately."); + std::is_same_v, + "No native min overload for these types. Both argument types must be the same to use " + "the generic min. Please cast appropriately."); return (y < x) ? y : x; -} + } #else -if constexpr (std::is_same_v && std::is_same_v) { + if constexpr (std::is_same_v && std::is_same_v) { return std::min(static_cast(x), y); -} else if constexpr (std::is_same_v && std::is_same_v) { + } else if constexpr (std::is_same_v && std::is_same_v) { return std::min(x, static_cast(y)); -} else { + } else { static_assert( - std::is_same_v, - "std::min requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::min requires that both argument types be the same. 
Please cast appropriately."); return std::min(x, y); -} + } #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type min(T x, T y) { #if (__CUDA_ARCH__ >= 530) return ::__hmin(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -537,14 +530,15 @@ min(T x, T y) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -min(T x, T y) + typename std::enable_if::value, nv_bfloat16>::type + min(T x, T y) { #if (__CUDA_ARCH__ >= 800) return ::__hmin(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -562,20 +556,20 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... 
args) template constexpr RAFT_INLINE_FUNCTION auto min(const T& x) { -return x; + return x; } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type min(T x) { #if (__CUDA_ARCH__ >= 530) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -583,14 +577,15 @@ min(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -min(T x) + typename std::enable_if::value, nv_bfloat16>::type + min(T x) { #if (__CUDA_ARCH__ >= 800) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -617,30 +612,31 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -sqrt(T x) + sqrt(T x) { #ifdef __CUDA_ARCH__ -return ::sqrt(x); + return ::sqrt(x); #else -return std::sqrt(x); + return std::sqrt(x); #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type sqrt(T x) { #if (__CUDA_ARCH__ >= 530) return ::hsqrt(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -648,14 +644,15 @@ sqrt(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -sqrt(T x) + typename std::enable_if::value, nv_bfloat16>::type + sqrt(T x) { #if (__CUDA_ARCH__ >= 800) return ::hsqrt(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device 
compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -667,7 +664,7 @@ sqrt(T x) template RAFT_INLINE_FUNCTION auto sgn(T val) -> int { -return (T(0) < val) - (val < T(0)); + return (T(0) < val) - (val < T(0)); } } // namespace raft diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu index 80359b2334..15c7b2b33a 100644 --- a/cpp/test/core/math_device.cu +++ b/cpp/test/core/math_device.cu @@ -22,8 +22,8 @@ #include #if _RAFT_HAS_CUDA -#include #include +#include #endif template @@ -128,27 +128,27 @@ struct cos_test_op_device { constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const { #if (__CUDA_ARCH__ < 530) - if constexpr (std::is_same_v) - { + if constexpr (std::is_same_v) { return __float2half(raft::cos(__half2float(in))); } - #elif (__CUDA_ARCH__ < 800) - if constexpr (std::is_same_v) - { +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { return __float2bfloat16(raft::cos(__bfloat162float(in))); - } - else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 + } else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 #endif - return raft::cos(in); + return raft::cos(in); } }; TEST(MathDevice, Cos) { - ASSERT_TRUE(raft::match( - std::cos(12.34f), __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); - ASSERT_TRUE(raft::match( - std::cos(12.34f), __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); + ASSERT_TRUE(raft::match(std::cos(12.34f), + __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), + raft::CompareApprox(0.001f))); + ASSERT_TRUE( + raft::match(std::cos(12.34f), + __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), + raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -167,28 +167,27 @@ struct exp_test_op_device { template constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const { - #if (__CUDA_ARCH__ < 530) - if constexpr (std::is_same_v) - { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { return __float2half(raft::exp(__half2float(in))); } - #elif (__CUDA_ARCH__ < 800) - if constexpr (std::is_same_v) - { +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { return __float2bfloat16(raft::exp(__bfloat162float(in))); - } - else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 + } else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 #endif - return raft::exp(in); + return raft::exp(in); } }; TEST(MathDevice, Exp) { - ASSERT_TRUE(raft::match( - std::exp(3.4f), __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), raft::CompareApprox(0.001f))); - ASSERT_TRUE(raft::match( - std::exp(3.4f), __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), raft::CompareApprox(0.01f))); + 
diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu
index 80359b2334..15c7b2b33a 100644
--- a/cpp/test/core/math_device.cu
+++ b/cpp/test/core/math_device.cu
@@ -22,8 +22,8 @@
 #include <raft/core/math.hpp>
 
 #if _RAFT_HAS_CUDA
-#include <cuda_fp16.h>
 #include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #endif
 
 template <typename OutT, typename OpT, typename... Args>
@@ -128,27 +128,27 @@ struct cos_test_op_device {
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
 #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::cos(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::cos(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::cos(in);
+      return raft::cos(in);
   }
 };
 
 TEST(MathDevice, Cos)
 {
-  ASSERT_TRUE(raft::match(
-    std::cos(12.34f), __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::cos(12.34f), __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::cos(12.34f),
+                          __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::cos(12.34f),
+                __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -167,28 +167,27 @@ struct exp_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::exp(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::exp(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::exp(in);
+      return raft::exp(in);
   }
 };
 
 TEST(MathDevice, Exp)
 {
-  ASSERT_TRUE(raft::match(
-    std::exp(3.4f), __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::exp(3.4f), __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::exp(3.4f),
+                          __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(raft::match(std::exp(3.4f),
+                          __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))),
+                          raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::exp(3.4f), math_eval(exp_test_op{}, 3.4f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
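Each *_test_op_device functor in this file follows the same shape: on architectures without native 16-bit math it widens the input to float, evaluates the float overload, and narrows the result back, and the dangling else guarantees the 16-bit instantiation of raft::cos and friends is never even compiled there. A generic sketch of that pattern with a hypothetical functor name (not part of the patch):

    template <typename MathOp>
    struct widen_on_old_arch {
      template <typename Type>
      constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
      {
    #if (__CUDA_ARCH__ < 530)
        if constexpr (std::is_same_v<Type, __half>) {
          // No native __half math below sm_53: round-trip through float.
          return __float2half(MathOp{}(__half2float(in)));
        } else
    #endif
          return MathOp{}(in);
      }
    };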
@@ -199,19 +198,16 @@ struct log_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::log(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::log(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::log(in);
+      return raft::log(in);
   }
 };
@@ -225,10 +221,13 @@ struct log_test_op {
 
 TEST(MathDevice, Log)
 {
-  ASSERT_TRUE(raft::match(
-    std::log(12.34f), __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::log(12.34f), __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::log(12.34f),
+                          __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::log(12.34f),
+                __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -358,19 +357,16 @@ struct sin_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::sin(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::sin(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::sin(in);
+      return raft::sin(in);
   }
 };
@@ -384,10 +380,13 @@ struct sin_test_op {
 
 TEST(MathDevice, Sin)
 {
-  ASSERT_TRUE(raft::match(
-    std::sin(12.34f), __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.01f)));
-  ASSERT_TRUE(raft::match(
-    std::sin(12.34f), __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.1f)));
+  ASSERT_TRUE(raft::match(std::sin(12.34f),
+                          __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(
+    raft::match(std::sin(12.34f),
+                __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.1f)));
   ASSERT_TRUE(raft::match(
     std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -420,24 +419,20 @@ TEST(MathDevice, SinCos)
   ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox<double>(0.0001f)));
 }
 
-
 struct sqrt_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::sqrt(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::sqrt(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::sqrt(in);
+      return raft::sqrt(in);
   }
 };
@@ -451,10 +446,13 @@ struct sqrt_test_op {
 
 TEST(MathDevice, Sqrt)
 {
-  ASSERT_TRUE(raft::match(
-    std::sqrt(12.34f), __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::sqrt(12.34f), __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::sqrt(12.34f),
+                          __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::sqrt(12.34f),
+                __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
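The tolerances in these assertions line up with the storage formats rather than with the math library: __half keeps 10 explicit mantissa bits, so one rounding of the result costs about 2^-10, matching CompareApprox<float>(0.001f), while nv_bfloat16 keeps only 7, about 2^-7, matching CompareApprox<float>(0.01f). Sin gets looser bounds (0.01f / 0.1f) because 12.34 itself is rounded before the call, so the function is evaluated at a slightly different point. A back-of-the-envelope check (host-only sketch, not part of the test suite):

    #include <cmath>
    #include <cstdio>

    int main()
    {
      std::printf("__half relative step      ~ %g\n", std::ldexp(1.0, -10));  // ~9.8e-4
      std::printf("nv_bfloat16 relative step ~ %g\n", std::ldexp(1.0, -7));   // ~7.8e-3
    }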
From 39308613ceebe3a2fda2504a8c3f9736970eb4a8 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 06:38:18 -0700
Subject: [PATCH 3/8] fix cmake formating

---
 cpp/CMakeLists.txt              | 8 ++++++--
 python/pylibraft/pyproject.toml | 4 ++--
 python/raft-dask/pyproject.toml | 6 +++---
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3a598c5fad..6fa1b5830e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -602,7 +602,9 @@ target_link_libraries(raft::raft INTERFACE
 # Use `rapids_export` for 22.04 as it will have COMPONENT support
 rapids_export(
   INSTALL raft
-  EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets}
+  EXPORT_SET raft-exports
+  COMPONENTS ${raft_components}
+  COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
   NAMESPACE raft::
   DOCUMENTATION doc_string
@@ -613,7 +615,9 @@ rapids_export(
 # * build export -------------------------------------------------------------
 rapids_export(
   BUILD raft
-  EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets}
+  EXPORT_SET raft-exports
+  COMPONENTS ${raft_components}
+  COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
   DOCUMENTATION doc_string
   NAMESPACE raft::
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index c344128e6a..b53ad43740 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -19,7 +19,7 @@ requires = [
     "cuda-python>=11.7.1,<12.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "rmm==23.8.*",
+    "rmm==23.6.*",
     "scikit-build>=0.13.1,<0.17.2",
     "setuptools",
     "wheel",
@@ -39,7 +39,7 @@ requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "numpy>=1.21",
-    "rmm==23.8.*",
+    "rmm==23.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index bb703238b9..9957bf9d42 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -34,14 +34,14 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "dask-cuda==23.8.*",
+    "dask-cuda==23.6.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
     "joblib>=0.11",
     "numba>=0.49",
     "numpy>=1.21",
-    "pylibraft==23.8.*",
-    "ucx-py==0.33.*",
+    "pylibraft==23.6.*",
+    "ucx-py=0.33.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 42a43c3e1910d8832fe0ead7864439f18cd08a0f Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 07:58:49 -0700
Subject: [PATCH 4/8] fix too long comment issue, std::is_same_v, use
 enable_if_t & CUDA_CONDITION_ELSE_TRUE

---
 cpp/include/raft/core/detail/macros.hpp |   6 +-
 cpp/include/raft/core/math.hpp          | 200 +++++++++---------------
 2 files changed, 76 insertions(+), 130 deletions(-)

diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp
index 6d97b830fb..bb4207938b 100644
--- a/cpp/include/raft/core/detail/macros.hpp
+++ b/cpp/include/raft/core/detail/macros.hpp
@@ -23,9 +23,11 @@
 #endif
 
 #if defined(_RAFT_HAS_CUDA)
-#define CUDA_ONLY_CONDITION(condition) condition
+#define CUDA_CONDITION_ELSE_TRUE(condition) condition
+#define CUDA_CONDITION_ELSE_FALSE(condition) condition
 #else
-#define CUDA_ONLY_CONDITION(condition) true
+#define CUDA_CONDITION_ELSE_TRUE(condition) true
+#define CUDA_CONDITION_ELSE_FALSE(condition) false
 #endif
 
 #ifndef _RAFT_HOST_DEVICE
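CUDA_CONDITION_ELSE_TRUE has to be a macro rather than a constexpr helper: in a build without CUDA, __half is not a type at all, so the guard must vanish during preprocessing, before the compiler ever parses it. The extra parentheses wrapped around the conditions in the hunks below are also load-bearing, since the comma inside std::is_same_v<T, __half> would otherwise split the macro argument in two. A small sketch of the intended use (hypothetical function, assuming the macros and headers from this diff):

    template <typename T,
              std::enable_if_t<CUDA_CONDITION_ELSE_TRUE((!std::is_same_v<T, __half>)), int> = 0>
    RAFT_INLINE_FUNCTION auto twice(T x)
    {
      return x + x;  // without CUDA the guard collapses to plain `true`
    }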
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index e912e5e428..1b425e8e4e 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -95,15 +95,11 @@ RAFT_INLINE_FUNCTION auto atanh(T x)
 }
 
 /** Cosine */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  cos(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto cos(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::cos(x);
@@ -114,15 +110,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-cos(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type cos(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hcos(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -130,15 +124,13 @@ cos(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   cos(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hcos(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -146,15 +138,11 @@ RAFT_DEVICE_INLINE_FUNCTION
 #endif
 
 /** Sine */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  sin(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto sin(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::sin(x);
@@ -165,15 +153,13 @@ RAFT_INLINE_FUNCTION auto sin(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-sin(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sin(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsin(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -181,15 +167,13 @@ sin(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   sin(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsin(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
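With these guards in place the overload set is straightforward at the call site: half-precision arguments select the device-only overloads and dispatch to the hardware intrinsics, everything else keeps going through the generic path. A sketch of what device code can now write (assumes sm_53 or newer for __half):

    __global__ void cos_kernel(__half* h, float* f)
    {
      *h = raft::cos(*h);  // __half overload, ::hcos under the hood
      *f = raft::cos(*f);  // generic overload, ::cos as before
    }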
@@ -226,15 +210,11 @@ RAFT_INLINE_FUNCTION auto tanh(T x)
  * @{
  */
 /** Exponential function */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  exp(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto exp(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::exp(x);
@@ -245,15 +225,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-exp(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type exp(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hexp(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -261,15 +239,13 @@ exp(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   exp(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hexp(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -277,15 +253,11 @@ RAFT_DEVICE_INLINE_FUNCTION
 #endif
 
 /** Natural logarithm */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  log(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto log(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::log(x);
@@ -296,15 +268,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-log(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type log(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hlog(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -312,15 +282,13 @@ log(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   log(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hlog(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -344,7 +312,7 @@ RAFT_DEVICE_INLINE_FUNCTION
 template <
   typename T1,
   typename T2,
-  std::enable_if_t<CUDA_ONLY_CONDITION(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
+  std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
 RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 {
 #ifdef __CUDA_ARCH__
@@ -381,15 +349,13 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-max(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
+  T x, T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmax(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -397,15 +363,13 @@ max(T x, T y)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   max(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmax(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -428,15 +392,13 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-max(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -444,15 +406,13 @@ max(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   max(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
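The same scheme extends to the binary functions: when both arguments are __half (or both nv_bfloat16) the dedicated overload calls the ::__hmax / ::__hmin intrinsics, while mixed or wider argument types fail the is_same_v tests and fall through to the generic promoting overload. A sketch (assumes sm_53 or newer; the comments refer to the overloads above):

    __global__ void max_kernel(__half a, __half b, float x, double y, __half* out_h, double* out_d)
    {
      *out_h = raft::max(a, b);  // __half overload, ::__hmax
      *out_d = raft::max(x, y);  // generic overload, promotes float/double
    }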
@@ -477,7 +437,7 @@ RAFT_DEVICE_INLINE_FUNCTION
 template <
   typename T1,
   typename T2,
-  std::enable_if_t<CUDA_ONLY_CONDITION(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
+  std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
 RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 {
 #ifdef __CUDA_ARCH__
@@ -514,15 +474,13 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-min(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
+  T x, T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmin(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -530,15 +488,13 @@ min(T x, T y)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   min(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmin(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -561,15 +517,13 @@ constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-min(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -577,15 +531,13 @@ min(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   min(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -609,15 +561,11 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y)
 }
 
 /** Square root */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  sqrt(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto sqrt(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::sqrt(x);
@@ -628,15 +576,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-sqrt(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sqrt(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsqrt(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -644,15 +590,13 @@ sqrt(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   sqrt(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsqrt(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif

From 464e5b27e9ba8c28e84bcb64c2ad4bfe497478a6 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 08:59:45 -0700
Subject: [PATCH 5/8] switch enable_if to enable_if_t

---
 cpp/include/raft/core/math.hpp | 73 ++++++++++++++--------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index 1b425e8e4e..f251c14e4a 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -110,8 +110,7 @@ RAFT_INLINE_FUNCTION auto cos(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type cos(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> cos(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hcos(x);
@@ -123,9 +122,8 @@ cos(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  cos(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+cos(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hcos(x);
@@ -153,8 +151,7 @@ RAFT_INLINE_FUNCTION auto sin(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sin(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> sin(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsin(x);
@@ -166,9 +163,8 @@ sin(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  sin(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+sin(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsin(x);
@@ -225,8 +221,7 @@ RAFT_INLINE_FUNCTION auto exp(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type exp(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> exp(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hexp(x);
@@ -238,9 +233,8 @@ exp(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  exp(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+exp(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hexp(x);
@@ -268,8 +262,7 @@ RAFT_INLINE_FUNCTION auto log(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type log(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> log(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hlog(x);
@@ -281,9 +274,8 @@ log(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  log(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+log(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hlog(x);
@@ -349,8 +341,8 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
-  T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> max(T x,
+                                                                                             T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmax(x, y);
@@ -362,9 +354,8 @@ max(T x, T y)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  max(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+max(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmax(x, y);
@@ -392,8 +383,7 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> max(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return x;
@@ -405,9 +395,8 @@ max(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  max(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+max(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
@@ -474,8 +463,8 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
-  T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> min(T x,
+                                                                                             T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmin(x, y);
@@ -487,9 +476,8 @@ min(T x, T y)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  min(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+min(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmin(x, y);
@@ -517,7 +505,7 @@ constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> min(
   T x)
 {
 #if (__CUDA_ARCH__ >= 530)
@@ -530,9 +518,8 @@ min(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  min(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+min(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
@@ -576,8 +563,7 @@ RAFT_INLINE_FUNCTION auto sqrt(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sqrt(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> sqrt(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsqrt(x);
@@ -589,9 +575,8 @@ sqrt(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  sqrt(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+sqrt(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsqrt(x);
From 4cdf6e1b72fd089a139aead589151da0839c6fa8 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 09:08:52 -0700
Subject: [PATCH 6/8] fix rmm version

---
 python/pylibraft/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index b53ad43740..1e4e1f40e3 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -19,7 +19,7 @@ requires = [
     "cuda-python>=11.7.1,<12.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "rmm==23.6.*",
+    "rmm==23.8.*",
     "scikit-build>=0.13.1,<0.17.2",
     "setuptools",
     "wheel",

From 6fa4c131ce8d9b2992017466daced8568b522bbf Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 09:25:20 -0700
Subject: [PATCH 7/8] fix back other toml file

---
 python/raft-dask/pyproject.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 9957bf9d42..bb703238b9 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -34,14 +34,14 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "dask-cuda==23.6.*",
+    "dask-cuda==23.8.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
     "joblib>=0.11",
     "numba>=0.49",
     "numpy>=1.21",
-    "pylibraft==23.6.*",
-    "ucx-py=0.33.*",
+    "pylibraft==23.8.*",
+    "ucx-py==0.33.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From dde8caf3f7a30c78d248da5e3c81754b1194189a Mon Sep 17 00:00:00 2001
From: Nicolas Blin <31096601+Kh4ster@users.noreply.github.com>
Date: Wed, 31 May 2023 08:38:32 +0200
Subject: [PATCH 8/8] Fix pyproject.toml

---
 python/pylibraft/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index 1e4e1f40e3..c344128e6a 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -39,7 +39,7 @@ requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "numpy>=1.21",
-    "rmm==23.6.*",
+    "rmm==23.8.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",