From eaae1e7b619f28587a80e677dcea470ebd202770 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 02:56:48 -0700
Subject: [PATCH 1/8] This PR adds support for __half and nv_bfloat16 to most math functions

---
 cpp/include/raft/core/detail/macros.hpp |  12 +
 cpp/include/raft/core/math.hpp          | 531 ++++++++++++++++++++----
 cpp/test/core/math_device.cu            | 130 +++++-
 3 files changed, 582 insertions(+), 91 deletions(-)

diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp
index 390acea697..b14114214f 100644
--- a/cpp/include/raft/core/detail/macros.hpp
+++ b/cpp/include/raft/core/detail/macros.hpp
@@ -22,6 +22,12 @@
 #endif
 #endif
 
+#if defined(_RAFT_HAS_CUDA)
+#define CUDA_ONLY_CONDITION(condition) condition
+#else
+#define CUDA_ONLY_CONDITION(condition) true
+#endif
+
 #ifndef _RAFT_HOST_DEVICE
 #if defined(_RAFT_HAS_CUDA)
 #define _RAFT_DEVICE __device__
@@ -40,6 +46,10 @@
 #define RAFT_INLINE_FUNCTION _RAFT_HOST_DEVICE _RAFT_FORCEINLINE
 #endif
 
+#ifndef RAFT_DEVICE_INLINE_FUNCTION
+#define RAFT_DEVICE_INLINE_FUNCTION _RAFT_DEVICE _RAFT_FORCEINLINE
+#endif
+
 // The RAFT_INLINE_CONDITIONAL is a conditional inline specifier that removes
 // the inline specification when RAFT_COMPILED is defined.
 //
@@ -60,6 +70,8 @@
 #define RAFT_INLINE_CONDITIONAL inline
 #endif // RAFT_COMPILED
 
+
+
 // The RAFT_WEAK_FUNCTION specificies that:
 //
 // 1. A function may be defined in multiple translation units (like inline)
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index c5f08b84b7..71c297fbba 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -22,6 +22,11 @@
 
 #include <type_traits>
 
+#if _RAFT_HAS_CUDA
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#endif
+
 namespace raft {
 
 /**
@@ -30,32 +35,32 @@ namespace raft {
  */
 template <typename T>
 RAFT_INLINE_FUNCTION auto abs(T x)
-  -> std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T> ||
+-> std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T> ||
                         std::is_same_v<int, T> || std::is_same_v<long int, T> ||
                         std::is_same_v<long long int, T>,
- T>
+ T>
 {
 #ifdef __CUDA_ARCH__
-  return ::abs(x);
+return ::abs(x);
 #else
-  return std::abs(x);
+return std::abs(x);
 #endif
 }
 
 template <typename T>
 constexpr RAFT_INLINE_FUNCTION auto abs(T x)
-  -> std::enable_if_t<!std::is_same_v<float, T> && !std::is_same_v<double, T> &&
+-> std::enable_if_t<!std::is_same_v<float, T> && !std::is_same_v<double, T> &&
                       !std::is_same_v<int, T> && !std::is_same_v<long int, T> &&
                       !std::is_same_v<long long int, T>,
- T>
+ T>
 {
-  return x < T{0} ? -x : x;
+return x < T{0} ? 
-x : x; } /** @} */ /** - * @defgroup Trigonometry Trigonometry functions - * @{ - */ +* @defgroup Trigonometry Trigonometry functions +* @{ +*/ /** Inverse cosine */ template RAFT_INLINE_FUNCTION auto acos(T x) @@ -91,7 +96,13 @@ RAFT_INLINE_FUNCTION auto atanh(T x) /** Cosine */ template -RAFT_INLINE_FUNCTION auto cos(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +cos(T x) { #ifdef __CUDA_ARCH__ return ::cos(x); @@ -100,9 +111,47 @@ RAFT_INLINE_FUNCTION auto cos(T x) #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +cos(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hcos(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +cos(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hcos(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Sine */ template -RAFT_INLINE_FUNCTION auto sin(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +sin(T x) { #ifdef __CUDA_ARCH__ return ::sin(x); @@ -111,16 +160,48 @@ RAFT_INLINE_FUNCTION auto sin(T x) #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +sin(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hsin(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +sin(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hsin(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Sine and cosine */ template RAFT_INLINE_FUNCTION std::enable_if_t || std::is_same_v> sincos( - const T& x, T* s, T* c) +const T& x, T* s, T* c) { #ifdef __CUDA_ARCH__ - ::sincos(x, s, c); +::sincos(x, s, c); #else - *s = std::sin(x); - *c = std::cos(x); +*s = std::sin(x); +*c = std::cos(x); #endif } @@ -129,20 +210,26 @@ template RAFT_INLINE_FUNCTION auto tanh(T x) { #ifdef __CUDA_ARCH__ - return ::tanh(x); +return ::tanh(x); 
#else - return std::tanh(x); +return std::tanh(x); #endif } /** @} */ /** - * @defgroup Exponential Exponential and logarithm - * @{ - */ +* @defgroup Exponential Exponential and logarithm +* @{ +*/ /** Exponential function */ template -RAFT_INLINE_FUNCTION auto exp(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +exp(T x) { #ifdef __CUDA_ARCH__ return ::exp(x); @@ -151,63 +238,179 @@ RAFT_INLINE_FUNCTION auto exp(T x) #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +exp(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hexp(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +exp(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hexp(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Natural logarithm */ template -RAFT_INLINE_FUNCTION auto log(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +log(T x) { #ifdef __CUDA_ARCH__ - return ::log(x); +return ::log(x); +#else +return std::log(x); +#endif +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +log(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hlog(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +log(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hlog(x); #else - return std::log(x); + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; #endif } +#endif /** @} */ /** - * @defgroup Maximum Maximum of two or more values. - * - * The CUDA Math API has overloads for all combinations of float/double. We provide similar - * functionality while wrapping around std::max, which only supports arguments of the same type. - * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is - * very error-prone so we do not support that and require the user to cast instead. 
(e.g the max of - * -1 and 1u is 4294967295u...) - * - * When no overload matches, we provide a generic implementation but require that both types be the - * same (and that the less-than operator be defined). - * @{ - */ -template -RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) +* @defgroup Maximum Maximum of two or more values. +* +* The CUDA Math API has overloads for all combinations of float/double. We provide similar +* functionality while wrapping around std::max, which only supports arguments of the same type. +* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is +* very error-prone so we do not support that and require the user to cast instead. (e.g the max of +* -1 and 1u is 4294967295u...) +* +* When no overload matches, we provide a generic implementation but require that both types be the +* same (and that the less-than operator be defined). +* @{ +*/ +template < + typename T1, + typename T2, + std::enable_if_t< + CUDA_ONLY_CONDITION( + RAFT_DEPAREN(( + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v) + )) + ), + int + > = 0 +> +RAFT_INLINE_FUNCTION +auto +max(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ - // Combinations of types supported by the CUDA Math API - if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || +// Combinations of types supported by the CUDA Math API +if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::max(x, y); - } - // Else, check that the types are the same and provide a generic implementation - else { +} +// Else, check that the types are the same and provide a generic implementation +else { static_assert( - std::is_same_v, - "No native max overload for these types. Both argument types must be the same to use " - "the generic max. Please cast appropriately."); + std::is_same_v, + "No native max overload for these types. Both argument types must be the same to use " + "the generic max. Please cast appropriately."); return (x < y) ? y : x; - } +} #else - if constexpr (std::is_same_v && std::is_same_v) { +if constexpr (std::is_same_v && std::is_same_v) { return std::max(static_cast(x), y); - } else if constexpr (std::is_same_v && std::is_same_v) { +} else if constexpr (std::is_same_v && std::is_same_v) { return std::max(x, static_cast(y)); - } else { +} else { static_assert( - std::is_same_v, - "std::max requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::max requires that both argument types be the same. 
Please cast appropriately."); return std::max(x, y); - } +} #endif } +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +max(T x, T y) +{ +#if (__CUDA_ARCH__ >= 530) + return ::__hmax(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +max(T x, T y) +{ +#if (__CUDA_ARCH__ >= 800) + return ::__hmax(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ template RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args) @@ -219,54 +422,134 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args) template constexpr RAFT_INLINE_FUNCTION auto max(const T& x) { +return x; +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +max(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +max(T x) +{ +#if (__CUDA_ARCH__ >= 800) return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif } +#endif + + /** @} */ /** - * @defgroup Minimum Minimum of two or more values. - * - * The CUDA Math API has overloads for all combinations of float/double. We provide similar - * functionality while wrapping around std::min, which only supports arguments of the same type. - * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is - * very error-prone so we do not support that and require the user to cast instead. (e.g the min of - * -1 and 1u is 1u...) - * - * When no overload matches, we provide a generic implementation but require that both types be the - * same (and that the less-than operator be defined). - * @{ - */ -template -RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) +* @defgroup Minimum Minimum of two or more values. +* +* The CUDA Math API has overloads for all combinations of float/double. 
We provide similar +* functionality while wrapping around std::min, which only supports arguments of the same type. +* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is +* very error-prone so we do not support that and require the user to cast instead. (e.g the min of +* -1 and 1u is 1u...) +* +* When no overload matches, we provide a generic implementation but require that both types be the +* same (and that the less-than operator be defined). +* @{ +*/ +template < + typename T1, + typename T2, + std::enable_if_t< + CUDA_ONLY_CONDITION( + RAFT_DEPAREN(( + (!std::is_same_v && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v) + )) + ), + int + > = 0 +> +RAFT_INLINE_FUNCTION +auto +min(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ - // Combinations of types supported by the CUDA Math API - if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || +// Combinations of types supported by the CUDA Math API +if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::min(x, y); - } - // Else, check that the types are the same and provide a generic implementation - else { +} +// Else, check that the types are the same and provide a generic implementation +else { static_assert( - std::is_same_v, - "No native min overload for these types. Both argument types must be the same to use " - "the generic min. Please cast appropriately."); + std::is_same_v, + "No native min overload for these types. Both argument types must be the same to use " + "the generic min. Please cast appropriately."); return (y < x) ? y : x; - } +} #else - if constexpr (std::is_same_v && std::is_same_v) { +if constexpr (std::is_same_v && std::is_same_v) { return std::min(static_cast(x), y); - } else if constexpr (std::is_same_v && std::is_same_v) { +} else if constexpr (std::is_same_v && std::is_same_v) { return std::min(x, static_cast(y)); - } else { +} else { static_assert( - std::is_same_v, - "std::min requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::min requires that both argument types be the same. 
Please cast appropriately."); return std::min(x, y); - } +} +#endif +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +min(T x, T y) +{ +#if (__CUDA_ARCH__ >= 530) + return ::__hmin(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +min(T x, T y) +{ +#if (__CUDA_ARCH__ >= 800) + return ::__hmin(x, y); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; #endif } +#endif /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ template @@ -279,8 +562,40 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... args) template constexpr RAFT_INLINE_FUNCTION auto min(const T& x) { +return x; +} + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +min(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +min(T x) +{ +#if (__CUDA_ARCH__ >= 800) return x; +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif } +#endif /** @} */ /** @@ -300,21 +615,59 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) /** Square root */ template -RAFT_INLINE_FUNCTION auto sqrt(T x) +RAFT_INLINE_FUNCTION +#if _RAFT_HAS_CUDA +typename std::enable_if::value && !std::is_same::value, T>::type +#else +auto +#endif +sqrt(T x) { #ifdef __CUDA_ARCH__ - return ::sqrt(x); +return ::sqrt(x); #else - return std::sqrt(x); +return std::sqrt(x); #endif } + +#if _RAFT_HAS_CUDA +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, __half>::type +sqrt(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hsqrt(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif 
+} + +template +RAFT_DEVICE_INLINE_FUNCTION +typename std::enable_if::value, nv_bfloat16>::type +sqrt(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hsqrt(x); +#else + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage + // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif /** @} */ /** Sign */ template RAFT_INLINE_FUNCTION auto sgn(T val) -> int { - return (T(0) < val) - (val < T(0)); +return (T(0) < val) - (val < T(0)); } } // namespace raft diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu index ff4b343d9e..80359b2334 100644 --- a/cpp/test/core/math_device.cu +++ b/cpp/test/core/math_device.cu @@ -21,6 +21,11 @@ #include #include +#if _RAFT_HAS_CUDA +#include +#include +#endif + template __global__ void math_eval_kernel(OutT* out, OpT op, Args... args) { @@ -118,8 +123,32 @@ struct cos_test_op { } }; +struct cos_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::cos(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::cos(__bfloat162float(in))); + } + else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::cos(in); + } +}; + TEST(MathDevice, Cos) { + ASSERT_TRUE(raft::match( + std::cos(12.34f), __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::cos(12.34f), __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -134,14 +163,58 @@ struct exp_test_op { } }; +struct exp_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::exp(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::exp(__bfloat162float(in))); + } + else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::exp(in); + } +}; + TEST(MathDevice, Exp) { ASSERT_TRUE(raft::match( - std::exp(12.34f), math_eval(exp_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + std::exp(3.4f), __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::exp(3.4f), __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), raft::CompareApprox(0.01f))); + ASSERT_TRUE(raft::match( + std::exp(3.4f), math_eval(exp_test_op{}, 3.4f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( - std::exp(12.34), math_eval(exp_test_op{}, 12.34), raft::CompareApprox(0.000001))); + std::exp(3.4), math_eval(exp_test_op{}, 3.4), raft::CompareApprox(0.000001))); } +struct log_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return 
__float2half(raft::log(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::log(__bfloat162float(in))); + } + else // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::log(in); + } +}; + struct log_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -152,6 +225,10 @@ struct log_test_op { TEST(MathDevice, Log) { + ASSERT_TRUE(raft::match( + std::log(12.34f), __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::log(12.34f), __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -277,6 +354,26 @@ TEST(MathDevice, Sgn) ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 12.34f), raft::Compare())); } +struct sin_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::sin(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::sin(__bfloat162float(in))); + } + else // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::sin(in); + } +}; + struct sin_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -287,6 +384,10 @@ struct sin_test_op { TEST(MathDevice, Sin) { + ASSERT_TRUE(raft::match( + std::sin(12.34f), __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.01f))); + ASSERT_TRUE(raft::match( + std::sin(12.34f), __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.1f))); ASSERT_TRUE(raft::match( std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -319,6 +420,27 @@ TEST(MathDevice, SinCos) ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox(0.0001f))); } + +struct sqrt_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { + #if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) + { + return __float2half(raft::sqrt(__half2float(in))); + } + #elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) + { + return __float2bfloat16(raft::sqrt(__bfloat162float(in))); + } + else // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::sqrt(in); + } +}; + struct sqrt_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -329,6 +451,10 @@ struct sqrt_test_op { TEST(MathDevice, Sqrt) { + ASSERT_TRUE(raft::match( + std::sqrt(12.34f), __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match( + std::sqrt(12.34f), __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( From ae65a7afccf1aa20dee55ca279fdc80d9cb433c3 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 30 May 2023 06:15:13 -0700 Subject: [PATCH 2/8] 
clang format --- cpp/include/raft/core/detail/macros.hpp | 2 - cpp/include/raft/core/math.hpp | 399 ++++++++++++------------ cpp/test/core/math_device.cu | 130 ++++---- 3 files changed, 262 insertions(+), 269 deletions(-) diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index b14114214f..6d97b830fb 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -70,8 +70,6 @@ #define RAFT_INLINE_CONDITIONAL inline #endif // RAFT_COMPILED - - // The RAFT_WEAK_FUNCTION specificies that: // // 1. A function may be defined in multiple translation units (like inline) diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index 71c297fbba..e912e5e428 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -23,8 +23,8 @@ #include #if _RAFT_HAS_CUDA -#include #include +#include #endif namespace raft { @@ -35,32 +35,32 @@ namespace raft { */ template RAFT_INLINE_FUNCTION auto abs(T x) --> std::enable_if_t || std::is_same_v || + -> std::enable_if_t || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, - T> + T> { #ifdef __CUDA_ARCH__ -return ::abs(x); + return ::abs(x); #else -return std::abs(x); + return std::abs(x); #endif } template constexpr RAFT_INLINE_FUNCTION auto abs(T x) --> std::enable_if_t && !std::is_same_v && + -> std::enable_if_t && !std::is_same_v && !std::is_same_v && !std::is_same_v && !std::is_same_v, - T> + T> { -return x < T{0} ? -x : x; + return x < T{0} ? -x : x; } /** @} */ /** -* @defgroup Trigonometry Trigonometry functions -* @{ -*/ + * @defgroup Trigonometry Trigonometry functions + * @{ + */ /** Inverse cosine */ template RAFT_INLINE_FUNCTION auto acos(T x) @@ -98,11 +98,12 @@ RAFT_INLINE_FUNCTION auto atanh(T x) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -cos(T x) + cos(T x) { #ifdef __CUDA_ARCH__ return ::cos(x); @@ -113,15 +114,15 @@ cos(T x) #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type cos(T x) { #if (__CUDA_ARCH__ >= 530) return ::hcos(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -129,14 +130,15 @@ cos(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -cos(T x) + typename std::enable_if::value, nv_bfloat16>::type + cos(T x) { #if (__CUDA_ARCH__ >= 800) return ::hcos(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) 
would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -147,11 +149,12 @@ cos(T x) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -sin(T x) + sin(T x) { #ifdef __CUDA_ARCH__ return ::sin(x); @@ -162,15 +165,15 @@ sin(T x) #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type sin(T x) { #if (__CUDA_ARCH__ >= 530) return ::hsin(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -178,14 +181,15 @@ sin(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -sin(T x) + typename std::enable_if::value, nv_bfloat16>::type + sin(T x) { #if (__CUDA_ARCH__ >= 800) return ::hsin(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -195,13 +199,13 @@ sin(T x) /** Sine and cosine */ template RAFT_INLINE_FUNCTION std::enable_if_t || std::is_same_v> sincos( -const T& x, T* s, T* c) + const T& x, T* s, T* c) { #ifdef __CUDA_ARCH__ -::sincos(x, s, c); + ::sincos(x, s, c); #else -*s = std::sin(x); -*c = std::cos(x); + *s = std::sin(x); + *c = std::cos(x); #endif } @@ -210,26 +214,27 @@ template RAFT_INLINE_FUNCTION auto tanh(T x) { #ifdef __CUDA_ARCH__ -return ::tanh(x); + return ::tanh(x); #else -return std::tanh(x); + return std::tanh(x); #endif } /** @} */ /** -* @defgroup Exponential Exponential and logarithm -* @{ -*/ + * @defgroup Exponential Exponential and logarithm + * @{ + */ /** Exponential function */ template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -exp(T x) + exp(T x) { #ifdef __CUDA_ARCH__ return ::exp(x); @@ -240,15 +245,15 @@ exp(T x) #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION 
-typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type exp(T x) { #if (__CUDA_ARCH__ >= 530) return ::hexp(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -256,14 +261,15 @@ exp(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -exp(T x) + typename std::enable_if::value, nv_bfloat16>::type + exp(T x) { #if (__CUDA_ARCH__ >= 800) return ::hexp(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -274,30 +280,31 @@ exp(T x) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -log(T x) + log(T x) { #ifdef __CUDA_ARCH__ -return ::log(x); + return ::log(x); #else -return std::log(x); + return std::log(x); #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type log(T x) { #if (__CUDA_ARCH__ >= 530) return ::hlog(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -305,14 +312,15 @@ log(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -log(T x) + typename std::enable_if::value, nv_bfloat16>::type + log(T x) { #if (__CUDA_ARCH__ >= 800) return ::hlog(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // 
static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -321,75 +329,67 @@ log(T x) /** @} */ /** -* @defgroup Maximum Maximum of two or more values. -* -* The CUDA Math API has overloads for all combinations of float/double. We provide similar -* functionality while wrapping around std::max, which only supports arguments of the same type. -* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is -* very error-prone so we do not support that and require the user to cast instead. (e.g the max of -* -1 and 1u is 4294967295u...) -* -* When no overload matches, we provide a generic implementation but require that both types be the -* same (and that the less-than operator be defined). -* @{ -*/ + * @defgroup Maximum Maximum of two or more values. + * + * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * functionality while wrapping around std::max, which only supports arguments of the same type. + * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is + * very error-prone so we do not support that and require the user to cast instead. (e.g the max of + * -1 and 1u is 4294967295u...) + * + * When no overload matches, we provide a generic implementation but require that both types be the + * same (and that the less-than operator be defined). + * @{ + */ template < - typename T1, - typename T2, - std::enable_if_t< - CUDA_ONLY_CONDITION( - RAFT_DEPAREN(( - (!std::is_same_v && !std::is_same_v) || - (!std::is_same_v && !std::is_same_v) - )) - ), - int - > = 0 -> -RAFT_INLINE_FUNCTION -auto -max(const T1& x, const T2& y) + typename T1, + typename T2, + std::enable_if_t && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v)))), + int> = 0> +RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ -// Combinations of types supported by the CUDA Math API -if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || + // Combinations of types supported by the CUDA Math API + if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::max(x, y); -} -// Else, check that the types are the same and provide a generic implementation -else { + } + // Else, check that the types are the same and provide a generic implementation + else { static_assert( - std::is_same_v, - "No native max overload for these types. Both argument types must be the same to use " - "the generic max. Please cast appropriately."); + std::is_same_v, + "No native max overload for these types. Both argument types must be the same to use " + "the generic max. Please cast appropriately."); return (x < y) ? 
y : x; -} + } #else -if constexpr (std::is_same_v && std::is_same_v) { + if constexpr (std::is_same_v && std::is_same_v) { return std::max(static_cast(x), y); -} else if constexpr (std::is_same_v && std::is_same_v) { + } else if constexpr (std::is_same_v && std::is_same_v) { return std::max(x, static_cast(y)); -} else { + } else { static_assert( - std::is_same_v, - "std::max requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::max requires that both argument types be the same. Please cast appropriately."); return std::max(x, y); -} + } #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type max(T x, T y) { #if (__CUDA_ARCH__ >= 530) return ::__hmax(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -397,14 +397,15 @@ max(T x, T y) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -max(T x, T y) + typename std::enable_if::value, nv_bfloat16>::type + max(T x, T y) { #if (__CUDA_ARCH__ >= 800) return ::__hmax(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -422,20 +423,20 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... 
args) template constexpr RAFT_INLINE_FUNCTION auto max(const T& x) { -return x; + return x; } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type max(T x) { #if (__CUDA_ARCH__ >= 530) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -443,93 +444,85 @@ max(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -max(T x) + typename std::enable_if::value, nv_bfloat16>::type + max(T x) { #if (__CUDA_ARCH__ >= 800) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif } #endif - /** @} */ /** -* @defgroup Minimum Minimum of two or more values. -* -* The CUDA Math API has overloads for all combinations of float/double. We provide similar -* functionality while wrapping around std::min, which only supports arguments of the same type. -* However, though the CUDA Math API supports combinations of unsigned and signed integers, this is -* very error-prone so we do not support that and require the user to cast instead. (e.g the min of -* -1 and 1u is 1u...) -* -* When no overload matches, we provide a generic implementation but require that both types be the -* same (and that the less-than operator be defined). -* @{ -*/ + * @defgroup Minimum Minimum of two or more values. + * + * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * functionality while wrapping around std::min, which only supports arguments of the same type. + * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is + * very error-prone so we do not support that and require the user to cast instead. (e.g the min of + * -1 and 1u is 1u...) + * + * When no overload matches, we provide a generic implementation but require that both types be the + * same (and that the less-than operator be defined). 
+ * @{ + */ template < - typename T1, - typename T2, - std::enable_if_t< - CUDA_ONLY_CONDITION( - RAFT_DEPAREN(( - (!std::is_same_v && !std::is_same_v) || - (!std::is_same_v && !std::is_same_v) - )) - ), - int - > = 0 -> -RAFT_INLINE_FUNCTION -auto -min(const T1& x, const T2& y) + typename T1, + typename T2, + std::enable_if_t && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v)))), + int> = 0> +RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ -// Combinations of types supported by the CUDA Math API -if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || + // Combinations of types supported by the CUDA Math API + if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || ((std::is_same_v || std::is_same_v)&&( - std::is_same_v || std::is_same_v))) { + std::is_same_v || std::is_same_v))) { return ::min(x, y); -} -// Else, check that the types are the same and provide a generic implementation -else { + } + // Else, check that the types are the same and provide a generic implementation + else { static_assert( - std::is_same_v, - "No native min overload for these types. Both argument types must be the same to use " - "the generic min. Please cast appropriately."); + std::is_same_v, + "No native min overload for these types. Both argument types must be the same to use " + "the generic min. Please cast appropriately."); return (y < x) ? y : x; -} + } #else -if constexpr (std::is_same_v && std::is_same_v) { + if constexpr (std::is_same_v && std::is_same_v) { return std::min(static_cast(x), y); -} else if constexpr (std::is_same_v && std::is_same_v) { + } else if constexpr (std::is_same_v && std::is_same_v) { return std::min(x, static_cast(y)); -} else { + } else { static_assert( - std::is_same_v, - "std::min requires that both argument types be the same. Please cast appropriately."); + std::is_same_v, + "std::min requires that both argument types be the same. 
Please cast appropriately."); return std::min(x, y); -} + } #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type min(T x, T y) { #if (__CUDA_ARCH__ >= 530) return ::__hmin(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -537,14 +530,15 @@ min(T x, T y) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -min(T x, T y) + typename std::enable_if::value, nv_bfloat16>::type + min(T x, T y) { #if (__CUDA_ARCH__ >= 800) return ::__hmin(x, y); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -562,20 +556,20 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... 
args) template constexpr RAFT_INLINE_FUNCTION auto min(const T& x) { -return x; + return x; } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type min(T x) { #if (__CUDA_ARCH__ >= 530) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -583,14 +577,15 @@ min(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -min(T x) + typename std::enable_if::value, nv_bfloat16>::type + min(T x) { #if (__CUDA_ARCH__ >= 800) return x; #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -617,30 +612,31 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) template RAFT_INLINE_FUNCTION #if _RAFT_HAS_CUDA -typename std::enable_if::value && !std::is_same::value, T>::type + typename std::enable_if::value && !std::is_same::value, + T>::type #else -auto + auto #endif -sqrt(T x) + sqrt(T x) { #ifdef __CUDA_ARCH__ -return ::sqrt(x); + return ::sqrt(x); #else -return std::sqrt(x); + return std::sqrt(x); #endif } #if _RAFT_HAS_CUDA template -RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, __half>::type +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if::value, __half>::type sqrt(T x) { #if (__CUDA_ARCH__ >= 530) return ::hsqrt(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); return T{}; #endif @@ -648,14 +644,15 @@ sqrt(T x) template RAFT_DEVICE_INLINE_FUNCTION -typename std::enable_if::value, nv_bfloat16>::type -sqrt(T x) + typename std::enable_if::value, nv_bfloat16>::type + sqrt(T x) { #if (__CUDA_ARCH__ >= 800) return ::hsqrt(x); #else - // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at device 
compilation stage - // Using this sizeof(T) != sizeof(T) makes it work as it's only triggered during template instantiation and thus at device compilation stage + // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at + // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered + // during template instantiation and thus at device compilation stage static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); return T{}; #endif @@ -667,7 +664,7 @@ sqrt(T x) template RAFT_INLINE_FUNCTION auto sgn(T val) -> int { -return (T(0) < val) - (val < T(0)); + return (T(0) < val) - (val < T(0)); } } // namespace raft diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu index 80359b2334..15c7b2b33a 100644 --- a/cpp/test/core/math_device.cu +++ b/cpp/test/core/math_device.cu @@ -22,8 +22,8 @@ #include #if _RAFT_HAS_CUDA -#include #include +#include #endif template @@ -128,27 +128,27 @@ struct cos_test_op_device { constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const { #if (__CUDA_ARCH__ < 530) - if constexpr (std::is_same_v) - { + if constexpr (std::is_same_v) { return __float2half(raft::cos(__half2float(in))); } - #elif (__CUDA_ARCH__ < 800) - if constexpr (std::is_same_v) - { +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { return __float2bfloat16(raft::cos(__bfloat162float(in))); - } - else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 + } else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 #endif - return raft::cos(in); + return raft::cos(in); } }; TEST(MathDevice, Cos) { - ASSERT_TRUE(raft::match( - std::cos(12.34f), __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), raft::CompareApprox(0.001f))); - ASSERT_TRUE(raft::match( - std::cos(12.34f), __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox(0.01f))); + ASSERT_TRUE(raft::match(std::cos(12.34f), + __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), + raft::CompareApprox(0.001f))); + ASSERT_TRUE( + raft::match(std::cos(12.34f), + __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), + raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -167,28 +167,27 @@ struct exp_test_op_device { template constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const { - #if (__CUDA_ARCH__ < 530) - if constexpr (std::is_same_v) - { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { return __float2half(raft::exp(__half2float(in))); } - #elif (__CUDA_ARCH__ < 800) - if constexpr (std::is_same_v) - { +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { return __float2bfloat16(raft::exp(__bfloat162float(in))); - } - else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 + } else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 #endif - return raft::exp(in); + return raft::exp(in); } }; TEST(MathDevice, Exp) { - ASSERT_TRUE(raft::match( - std::exp(3.4f), __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), raft::CompareApprox(0.001f))); - ASSERT_TRUE(raft::match( - std::exp(3.4f), __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), raft::CompareApprox(0.01f))); + 
diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu
index 80359b2334..15c7b2b33a 100644
--- a/cpp/test/core/math_device.cu
+++ b/cpp/test/core/math_device.cu
@@ -22,8 +22,8 @@
 #include <raft/core/math.hpp>
 
 #if _RAFT_HAS_CUDA
-#include <cuda_fp16.h>
 #include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #endif
 
 template <typename OutT, typename OpT, typename... Args>
@@ -128,27 +128,27 @@ struct cos_test_op_device {
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
 #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::cos(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::cos(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::cos(in);
+      return raft::cos(in);
   }
 };
 
 TEST(MathDevice, Cos)
 {
-  ASSERT_TRUE(raft::match(
-    std::cos(12.34f), __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::cos(12.34f), __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::cos(12.34f),
+                          __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::cos(12.34f),
+                __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -167,28 +167,27 @@ struct exp_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::exp(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::exp(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::exp(in);
+      return raft::exp(in);
   }
 };
 
 TEST(MathDevice, Exp)
 {
-  ASSERT_TRUE(raft::match(
-    std::exp(3.4f), __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::exp(3.4f), __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::exp(3.4f),
+                          __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(raft::match(std::exp(3.4f),
+                          __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))),
+                          raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::exp(3.4f), math_eval(exp_test_op{}, 3.4f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
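Each *_test_op_device functor in this file follows the same shape: on architectures without native 16-bit math it widens the input to float, evaluates the float overload, and narrows the result back, and the dangling else guarantees the 16-bit instantiation of raft::cos and friends is never even compiled there. A generic sketch of that pattern with a hypothetical functor name (not part of the patch):

    template <typename MathOp>
    struct widen_on_old_arch {
      template <typename Type>
      constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
      {
    #if (__CUDA_ARCH__ < 530)
        if constexpr (std::is_same_v<Type, __half>) {
          // No native __half math below sm_53: round-trip through float.
          return __float2half(MathOp{}(__half2float(in)));
        } else
    #endif
          return MathOp{}(in);
      }
    };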
@@ -199,19 +198,16 @@ struct log_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::log(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::log(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::log(in);
+      return raft::log(in);
   }
 };
@@ -225,10 +221,13 @@ struct log_test_op {
 
 TEST(MathDevice, Log)
 {
-  ASSERT_TRUE(raft::match(
-    std::log(12.34f), __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::log(12.34f), __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::log(12.34f),
+                          __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::log(12.34f),
+                __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -358,19 +357,16 @@ struct sin_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::sin(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::sin(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::sin(in);
+      return raft::sin(in);
   }
 };
@@ -384,10 +380,13 @@ struct sin_test_op {
 
 TEST(MathDevice, Sin)
 {
-  ASSERT_TRUE(raft::match(
-    std::sin(12.34f), __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.01f)));
-  ASSERT_TRUE(raft::match(
-    std::sin(12.34f), __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.1f)));
+  ASSERT_TRUE(raft::match(std::sin(12.34f),
+                          __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(
+    raft::match(std::sin(12.34f),
+                __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.1f)));
   ASSERT_TRUE(raft::match(
     std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -420,24 +419,20 @@ TEST(MathDevice, SinCos)
   ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox<double>(0.0001f)));
 }
 
-
 struct sqrt_test_op_device {
   template <typename Type>
   constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
   {
-  #if (__CUDA_ARCH__ < 530)
-    if constexpr (std::is_same_v<Type, __half>)
-    {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
       return __float2half(raft::sqrt(__half2float(in)));
     }
-  #elif (__CUDA_ARCH__ < 800)
-    if constexpr (std::is_same_v<Type, nv_bfloat16>)
-    {
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
       return __float2bfloat16(raft::sqrt(__bfloat162float(in)));
-    }
-    else  // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16
+    } else  // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16
 #endif
-    return raft::sqrt(in);
+      return raft::sqrt(in);
   }
 };
@@ -451,10 +446,13 @@ struct sqrt_test_op {
 
 TEST(MathDevice, Sqrt)
 {
-  ASSERT_TRUE(raft::match(
-    std::sqrt(12.34f), __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))), raft::CompareApprox<float>(0.001f)));
-  ASSERT_TRUE(raft::match(
-    std::sqrt(12.34f), __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))), raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(raft::match(std::sqrt(12.34f),
+                          __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::sqrt(12.34f),
+                __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
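The tolerances in these assertions line up with the storage formats rather than with the math library: __half keeps 10 explicit mantissa bits, so one rounding of the result costs about 2^-10, matching CompareApprox<float>(0.001f), while nv_bfloat16 keeps only 7, about 2^-7, matching CompareApprox<float>(0.01f). Sin gets looser bounds (0.01f / 0.1f) because 12.34 itself is rounded before the call, so the function is evaluated at a slightly different point. A back-of-the-envelope check (host-only sketch, not part of the test suite):

    #include <cmath>
    #include <cstdio>

    int main()
    {
      std::printf("__half relative step      ~ %g\n", std::ldexp(1.0, -10));  // ~9.8e-4
      std::printf("nv_bfloat16 relative step ~ %g\n", std::ldexp(1.0, -7));   // ~7.8e-3
    }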
From 39308613ceebe3a2fda2504a8c3f9736970eb4a8 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 06:38:18 -0700
Subject: [PATCH 3/8] fix cmake formating

---
 cpp/CMakeLists.txt              | 8 ++++++--
 python/pylibraft/pyproject.toml | 4 ++--
 python/raft-dask/pyproject.toml | 6 +++---
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3a598c5fad..6fa1b5830e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -602,7 +602,9 @@ target_link_libraries(raft::raft INTERFACE
 # Use `rapids_export` for 22.04 as it will have COMPONENT support
 rapids_export(
   INSTALL raft
-  EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets}
+  EXPORT_SET raft-exports
+  COMPONENTS ${raft_components}
+  COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
   NAMESPACE raft::
   DOCUMENTATION doc_string
@@ -613,7 +615,9 @@ rapids_export(
 # * build export -------------------------------------------------------------
 rapids_export(
   BUILD raft
-  EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets}
+  EXPORT_SET raft-exports
+  COMPONENTS ${raft_components}
+  COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
   DOCUMENTATION doc_string
   NAMESPACE raft::
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index c344128e6a..b53ad43740 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -19,7 +19,7 @@ requires = [
     "cuda-python>=11.7.1,<12.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "rmm==23.8.*",
+    "rmm==23.6.*",
     "scikit-build>=0.13.1,<0.17.2",
     "setuptools",
     "wheel",
@@ -39,7 +39,7 @@ requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "numpy>=1.21",
-    "rmm==23.8.*",
+    "rmm==23.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index bb703238b9..9957bf9d42 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -34,14 +34,14 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "dask-cuda==23.8.*",
+    "dask-cuda==23.6.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
     "joblib>=0.11",
     "numba>=0.49",
     "numpy>=1.21",
-    "pylibraft==23.8.*",
-    "ucx-py==0.33.*",
+    "pylibraft==23.6.*",
+    "ucx-py=0.33.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 42a43c3e1910d8832fe0ead7864439f18cd08a0f Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 07:58:49 -0700
Subject: [PATCH 4/8] fix too long comment issue, std::is_same_v, use
 enable_if_t & CUDA_CONDITION_ELSE_TRUE

---
 cpp/include/raft/core/detail/macros.hpp |   6 +-
 cpp/include/raft/core/math.hpp          | 200 +++++++++---------------
 2 files changed, 76 insertions(+), 130 deletions(-)

diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp
index 6d97b830fb..bb4207938b 100644
--- a/cpp/include/raft/core/detail/macros.hpp
+++ b/cpp/include/raft/core/detail/macros.hpp
@@ -23,9 +23,11 @@
 #endif
 
 #if defined(_RAFT_HAS_CUDA)
-#define CUDA_ONLY_CONDITION(condition) condition
+#define CUDA_CONDITION_ELSE_TRUE(condition) condition
+#define CUDA_CONDITION_ELSE_FALSE(condition) condition
 #else
-#define CUDA_ONLY_CONDITION(condition) true
+#define CUDA_CONDITION_ELSE_TRUE(condition) true
+#define CUDA_CONDITION_ELSE_FALSE(condition) false
 #endif
 
 #ifndef _RAFT_HOST_DEVICE
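CUDA_CONDITION_ELSE_TRUE has to be a macro rather than a constexpr helper: in a build without CUDA, __half is not a type at all, so the guard must vanish during preprocessing, before the compiler ever parses it. The extra parentheses wrapped around the conditions in the hunks below are also load-bearing, since the comma inside std::is_same_v<T, __half> would otherwise split the macro argument in two. A small sketch of the intended use (hypothetical function, assuming the macros and headers from this diff):

    template <typename T,
              std::enable_if_t<CUDA_CONDITION_ELSE_TRUE((!std::is_same_v<T, __half>)), int> = 0>
    RAFT_INLINE_FUNCTION auto twice(T x)
    {
      return x + x;  // without CUDA the guard collapses to plain `true`
    }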
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index e912e5e428..1b425e8e4e 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -95,15 +95,11 @@ RAFT_INLINE_FUNCTION auto atanh(T x)
 }
 
 /** Cosine */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  cos(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto cos(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::cos(x);
@@ -114,15 +110,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-cos(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type cos(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hcos(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -130,15 +124,13 @@ cos(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   cos(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hcos(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -146,15 +138,11 @@ RAFT_DEVICE_INLINE_FUNCTION
 #endif
 
 /** Sine */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  sin(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto sin(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::sin(x);
@@ -165,15 +153,13 @@ RAFT_INLINE_FUNCTION auto sin(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-sin(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sin(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsin(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -181,15 +167,13 @@ sin(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   sin(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsin(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
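With these guards in place the overload set is straightforward at the call site: half-precision arguments select the device-only overloads and dispatch to the hardware intrinsics, everything else keeps going through the generic path. A sketch of what device code can now write (assumes sm_53 or newer for __half):

    __global__ void cos_kernel(__half* h, float* f)
    {
      *h = raft::cos(*h);  // __half overload, ::hcos under the hood
      *f = raft::cos(*f);  // generic overload, ::cos as before
    }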
@@ -226,15 +210,11 @@ RAFT_INLINE_FUNCTION auto tanh(T x)
  * @{
  */
 /** Exponential function */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  exp(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto exp(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::exp(x);
@@ -245,15 +225,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-exp(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type exp(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hexp(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -261,15 +239,13 @@ exp(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   exp(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hexp(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -277,15 +253,11 @@ RAFT_DEVICE_INLINE_FUNCTION
 #endif
 
 /** Natural logarithm */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  log(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto log(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::log(x);
@@ -296,15 +268,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-log(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type log(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hlog(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -312,15 +282,13 @@ log(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   log(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hlog(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -344,7 +312,7 @@ RAFT_DEVICE_INLINE_FUNCTION
 template <
   typename T1,
   typename T2,
-  std::enable_if_t<CUDA_ONLY_CONDITION(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
+  std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
 RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 {
 #ifdef __CUDA_ARCH__
@@ -381,15 +349,13 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-max(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
+  T x, T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmax(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -397,15 +363,13 @@ max(T x, T y)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   max(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmax(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -428,15 +392,13 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-max(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -444,15 +406,13 @@ max(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   max(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
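The same scheme extends to the binary functions: when both arguments are __half (or both nv_bfloat16) the dedicated overload calls the ::__hmax / ::__hmin intrinsics, while mixed or wider argument types fail the is_same_v tests and fall through to the generic promoting overload. A sketch (assumes sm_53 or newer; the comments refer to the overloads above):

    __global__ void max_kernel(__half a, __half b, float x, double y, __half* out_h, double* out_d)
    {
      *out_h = raft::max(a, b);  // __half overload, ::__hmax
      *out_d = raft::max(x, y);  // generic overload, promotes float/double
    }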
@@ -477,7 +437,7 @@ RAFT_DEVICE_INLINE_FUNCTION
 template <
   typename T1,
   typename T2,
-  std::enable_if_t<CUDA_ONLY_CONDITION(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
+  std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) || (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>))), int> = 0>
 RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 {
 #ifdef __CUDA_ARCH__
@@ -514,15 +474,13 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-min(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
+  T x, T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmin(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -530,15 +488,13 @@ min(T x, T y)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   min(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmin(x, y);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -561,15 +517,13 @@ constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-min(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -577,15 +531,13 @@ min(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   min(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif
@@ -609,15 +561,11 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y)
 }
 
 /** Square root */
-template <typename T>
-RAFT_INLINE_FUNCTION
-#if _RAFT_HAS_CUDA
-  typename std::enable_if<!std::is_same<T, __half>::value && !std::is_same<T, nv_bfloat16>::value,
-                          T>::type
-#else
-  auto
-#endif
-  sqrt(T x)
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
+RAFT_INLINE_FUNCTION auto sqrt(T x)
 {
 #ifdef __CUDA_ARCH__
   return ::sqrt(x);
@@ -628,15 +576,13 @@
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same<T, __half>::value, __half>::type
-sqrt(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sqrt(
+  T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsqrt(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
   return T{};
 #endif
@@ -644,15 +590,13 @@ sqrt(T x)
 
 template <typename T>
 RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same<T, nv_bfloat16>::value, nv_bfloat16>::type
+  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
   sqrt(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsqrt(x);
 #else
-  // static_assert(false) would be evaluated during host compilation stage while __CUDA_ARCH__ is at
-  // device compilation stage Using this sizeof(T) != sizeof(T) makes it work as it's only triggered
-  // during template instantiation and thus at device compilation stage
+  // Fail during template instantiation if the compute capability doesn't support this operation
   static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
   return T{};
 #endif

From 464e5b27e9ba8c28e84bcb64c2ad4bfe497478a6 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 08:59:45 -0700
Subject: [PATCH 5/8] switch enable_if to enable_if_t

---
 cpp/include/raft/core/math.hpp | 73 ++++++++++++++--------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index 1b425e8e4e..f251c14e4a 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -110,8 +110,7 @@ RAFT_INLINE_FUNCTION auto cos(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type cos(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> cos(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hcos(x);
@@ -123,9 +122,8 @@ cos(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  cos(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+cos(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hcos(x);
@@ -153,8 +151,7 @@ RAFT_INLINE_FUNCTION auto sin(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sin(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> sin(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsin(x);
@@ -166,9 +163,8 @@ sin(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  sin(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+sin(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsin(x);
@@ -225,8 +221,7 @@ RAFT_INLINE_FUNCTION auto exp(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type exp(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> exp(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hexp(x);
@@ -238,9 +233,8 @@ exp(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  exp(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+exp(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hexp(x);
@@ -268,8 +262,7 @@ RAFT_INLINE_FUNCTION auto log(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type log(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> log(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hlog(x);
@@ -281,9 +274,8 @@ log(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  log(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+log(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hlog(x);
@@ -349,8 +341,8 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
-  T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> max(T x,
+                                                                                             T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmax(x, y);
@@ -362,9 +354,8 @@ max(T x, T y)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  max(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+max(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmax(x, y);
@@ -392,8 +383,7 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type max(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> max(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return x;
@@ -405,9 +395,8 @@ max(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  max(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+max(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
@@ -474,8 +463,8 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
-  T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> min(T x,
+                                                                                             T y)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::__hmin(x, y);
@@ -487,9 +476,8 @@ min(T x, T y)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  min(T x, T y)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+min(T x, T y)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::__hmin(x, y);
@@ -517,7 +505,7 @@ constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type min(
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> min(
   T x)
 {
 #if (__CUDA_ARCH__ >= 530)
@@ -530,9 +518,8 @@ min(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  min(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+min(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return x;
@@ -576,8 +563,7 @@ RAFT_INLINE_FUNCTION auto sqrt(T x)
 
 #if _RAFT_HAS_CUDA
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if<std::is_same_v<T, __half>, __half>::type sqrt(
-  T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> sqrt(T x)
 {
 #if (__CUDA_ARCH__ >= 530)
   return ::hsqrt(x);
@@ -589,9 +575,8 @@ sqrt(T x)
 }
 
 template <typename T>
-RAFT_DEVICE_INLINE_FUNCTION
-  typename std::enable_if<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>::type
-  sqrt(T x)
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+sqrt(T x)
 {
 #if (__CUDA_ARCH__ >= 800)
   return ::hsqrt(x);
From 4cdf6e1b72fd089a139aead589151da0839c6fa8 Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 09:08:52 -0700
Subject: [PATCH 6/8] fix rmm version

---
 python/pylibraft/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index b53ad43740..1e4e1f40e3 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -19,7 +19,7 @@ requires = [
     "cuda-python>=11.7.1,<12.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "rmm==23.6.*",
+    "rmm==23.8.*",
     "scikit-build>=0.13.1,<0.17.2",
     "setuptools",
     "wheel",

From 6fa4c131ce8d9b2992017466daced8568b522bbf Mon Sep 17 00:00:00 2001
From: Nicolas Blin
Date: Tue, 30 May 2023 09:25:20 -0700
Subject: [PATCH 7/8] fix back other toml file

---
 python/raft-dask/pyproject.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 9957bf9d42..bb703238b9 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -34,14 +34,14 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "dask-cuda==23.6.*",
+    "dask-cuda==23.8.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
     "joblib>=0.11",
     "numba>=0.49",
     "numpy>=1.21",
-    "pylibraft==23.6.*",
-    "ucx-py=0.33.*",
+    "pylibraft==23.8.*",
+    "ucx-py==0.33.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From dde8caf3f7a30c78d248da5e3c81754b1194189a Mon Sep 17 00:00:00 2001
From: Nicolas Blin <31096601+Kh4ster@users.noreply.github.com>
Date: Wed, 31 May 2023 08:38:32 +0200
Subject: [PATCH 8/8] Fix pyproject.toml

---
 python/pylibraft/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index 1e4e1f40e3..c344128e6a 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -39,7 +39,7 @@ requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "numpy>=1.21",
-    "rmm==23.6.*",
+    "rmm==23.8.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",