diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3a598c5fad..6fa1b5830e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -602,7 +602,9 @@ target_link_libraries(raft::raft INTERFACE
 # Use `rapids_export` for 22.04 as it will have COMPONENT support
 rapids_export(
   INSTALL raft
-  EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets}
+  EXPORT_SET raft-exports
+  COMPONENTS ${raft_components}
+  COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
   NAMESPACE raft::
   DOCUMENTATION doc_string
@@ -613,7 +615,9 @@ rapids_export(
 # * build export -------------------------------------------------------------
 rapids_export(
   BUILD raft
-  EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets}
+  EXPORT_SET raft-exports
+  COMPONENTS ${raft_components}
+  COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
   DOCUMENTATION doc_string
   NAMESPACE raft::
diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp
index 390acea697..bb4207938b 100644
--- a/cpp/include/raft/core/detail/macros.hpp
+++ b/cpp/include/raft/core/detail/macros.hpp
@@ -22,6 +22,14 @@
 #endif
 #endif
 
+#if defined(_RAFT_HAS_CUDA)
+#define CUDA_CONDITION_ELSE_TRUE(condition)  condition
+#define CUDA_CONDITION_ELSE_FALSE(condition) condition
+#else
+#define CUDA_CONDITION_ELSE_TRUE(condition)  true
+#define CUDA_CONDITION_ELSE_FALSE(condition) false
+#endif
+
 #ifndef _RAFT_HOST_DEVICE
 #if defined(_RAFT_HAS_CUDA)
 #define _RAFT_DEVICE      __device__
@@ -40,6 +48,10 @@
 #define RAFT_INLINE_FUNCTION _RAFT_HOST_DEVICE _RAFT_FORCEINLINE
 #endif
 
+#ifndef RAFT_DEVICE_INLINE_FUNCTION
+#define RAFT_DEVICE_INLINE_FUNCTION _RAFT_DEVICE _RAFT_FORCEINLINE
+#endif
+
 // The RAFT_INLINE_CONDITIONAL is a conditional inline specifier that removes
 // the inline specification when RAFT_COMPILED is defined.
 //
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index c5f08b84b7..f251c14e4a 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -22,6 +22,11 @@
 
 #include <raft/core/detail/macros.hpp>
 
+#if _RAFT_HAS_CUDA
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#endif
+
 namespace raft {
 
 /**
@@ -90,7 +95,10 @@ RAFT_INLINE_FUNCTION auto atanh(T x)
 }
 
 /** Cosine */
-template <typename T>
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
 RAFT_INLINE_FUNCTION auto cos(T x)
 {
 #ifdef __CUDA_ARCH__
@@ -100,8 +108,38 @@ RAFT_INLINE_FUNCTION auto cos(T x)
 #endif
 }
 
-/** Sine */
+#if _RAFT_HAS_CUDA
 template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> cos(T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::hcos(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+cos(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::hcos(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
+
+/** Sine */
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
 RAFT_INLINE_FUNCTION auto sin(T x)
 {
 #ifdef __CUDA_ARCH__
@@ -111,6 +149,33 @@ RAFT_INLINE_FUNCTION auto sin(T x)
 #endif
 }
 
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> sin(T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::hsin(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+sin(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::hsin(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
+
 /** Sine and cosine */
 template <typename T>
 RAFT_INLINE_FUNCTION std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T>> sincos(
@@ -141,7 +206,10 @@ RAFT_INLINE_FUNCTION auto tanh(T x)
  * @{
  */
 /** Exponential function */
-template <typename T>
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
 RAFT_INLINE_FUNCTION auto exp(T x)
 {
 #ifdef __CUDA_ARCH__
@@ -151,8 +219,38 @@ RAFT_INLINE_FUNCTION auto exp(T x)
 #endif
 }
 
-/** Natural logarithm */
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> exp(T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::hexp(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
 template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+exp(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::hexp(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
+
+/** Natural logarithm */
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
 RAFT_INLINE_FUNCTION auto log(T x)
 {
 #ifdef __CUDA_ARCH__
@@ -161,6 +259,33 @@ RAFT_INLINE_FUNCTION auto log(T x)
   return std::log(x);
 #endif
 }
+
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> log(T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::hlog(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+log(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::hlog(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
 /** @} */
 
 /**
@@ -176,7 +301,13 @@ RAFT_INLINE_FUNCTION auto log(T x)
  * same (and that the less-than operator be defined).
  * @{
  */
-template <typename T1, typename T2>
+template <
+  typename T1,
+  typename T2,
+  std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(RAFT_DEPAREN(
+                     ((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) ||
+                      (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>)))),
+                   int> = 0>
 RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 {
 #ifdef __CUDA_ARCH__
@@ -208,6 +339,34 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
 #endif
 }
 
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> max(T x,
+                                                                                             T y)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::__hmax(x, y);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+max(T x, T y)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::__hmax(x, y);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
+
 /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */
 template <typename T1, typename T2, typename... Args>
 RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args)
@@ -221,6 +380,34 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
 {
   return x;
 }
+
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> max(T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return x;
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+max(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return x;
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
+
 /** @} */
 
 /**
@@ -236,7 +423,13 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
  * same (and that the less-than operator be defined).
  * @{
  */
-template <typename T1, typename T2>
+template <
+  typename T1,
+  typename T2,
+  std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(RAFT_DEPAREN(
+                     ((!std::is_same_v<T1, __half> && !std::is_same_v<T2, __half>) ||
+                      (!std::is_same_v<T1, nv_bfloat16> && !std::is_same_v<T2, nv_bfloat16>)))),
+                   int> = 0>
 RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 {
 #ifdef __CUDA_ARCH__
@@ -268,6 +461,34 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
 #endif
 }
 
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> min(T x,
+                                                                                             T y)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::__hmin(x, y);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+min(T x, T y)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::__hmin(x, y);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
+
 /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */
 template <typename T1, typename T2, typename... Args>
 RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... args)
@@ -281,6 +502,34 @@ constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
 {
   return x;
 }
+
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, __half> min(
+  T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return x;
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+min(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return x;
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
 /** @} */
 
 /**
@@ -299,7 +548,10 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y)
 }
 
 /** Square root */
-template <typename T>
+template <typename T,
+          std::enable_if_t<CUDA_CONDITION_ELSE_TRUE(((!std::is_same_v<T, __half> &&
+                                                      (!std::is_same_v<T, nv_bfloat16>)))),
+                           int> = 0>
 RAFT_INLINE_FUNCTION auto sqrt(T x)
 {
 #ifdef __CUDA_ARCH__
@@ -308,6 +560,33 @@ RAFT_INLINE_FUNCTION auto sqrt(T x)
   return std::sqrt(x);
 #endif
 }
+
+#if _RAFT_HAS_CUDA
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, __half>, __half> sqrt(T x)
+{
+#if (__CUDA_ARCH__ >= 530)
+  return ::hsqrt(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530");
+  return T{};
+#endif
+}
+
+template <typename T>
+RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t<std::is_same_v<T, nv_bfloat16>, nv_bfloat16>
+sqrt(T x)
+{
+#if (__CUDA_ARCH__ >= 800)
+  return ::hsqrt(x);
+#else
+  // Fail during template instantiation if the compute capability doesn't support this operation
+  static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800");
+  return T{};
+#endif
+}
+#endif
 /** @} */
 
 /** Sign */
diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu
index ff4b343d9e..15c7b2b33a 100644
--- a/cpp/test/core/math_device.cu
+++ b/cpp/test/core/math_device.cu
@@ -21,6 +21,11 @@
 #include <rmm/cuda_stream.hpp>
 #include <rmm/device_scalar.hpp>
 
+#if _RAFT_HAS_CUDA
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#endif
+
 template <typename OutT, typename OpT, typename... Args>
 __global__ void math_eval_kernel(OutT* out, OpT op, Args... args)
 {
@@ -118,8 +123,32 @@ struct cos_test_op {
   }
 };
 
+struct cos_test_op_device {
+  template <typename Type>
+  constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
+      return __float2half(raft::cos(__half2float(in)));
+    }
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
+      return __float2bfloat16(raft::cos(__bfloat162float(in)));
+    } else  // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16
+#endif
+    return raft::cos(in);
+  }
+};
+
 TEST(MathDevice, Cos)
 {
+  ASSERT_TRUE(raft::match(std::cos(12.34f),
+                          __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::cos(12.34f),
+                __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -134,14 +163,54 @@ struct exp_test_op {
   }
 };
 
+struct exp_test_op_device {
+  template <typename Type>
+  constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
+      return __float2half(raft::exp(__half2float(in)));
+    }
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
+      return __float2bfloat16(raft::exp(__bfloat162float(in)));
+    } else  // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16
+#endif
+    return raft::exp(in);
+  }
+};
+
 TEST(MathDevice, Exp)
 {
+  ASSERT_TRUE(raft::match(std::exp(3.4f),
+                          __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(raft::match(std::exp(3.4f),
+                          __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))),
+                          raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
-    std::exp(12.34f), math_eval(exp_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+    std::exp(3.4f), math_eval(exp_test_op{}, 3.4f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
-    std::exp(12.34), math_eval(exp_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+    std::exp(3.4), math_eval(exp_test_op{}, 3.4), raft::CompareApprox<double>(0.000001)));
 }
 
+struct log_test_op_device {
+  template <typename Type>
+  constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
+      return __float2half(raft::log(__half2float(in)));
+    }
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
+      return __float2bfloat16(raft::log(__bfloat162float(in)));
+    } else  // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16
+#endif
+    return raft::log(in);
+  }
+};
+
 struct log_test_op {
   template <typename Type>
   constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
@@ -152,6 +221,13 @@ struct log_test_op {
 
 TEST(MathDevice, Log)
 {
+  ASSERT_TRUE(raft::match(std::log(12.34f),
+                          __half2float(math_eval(log_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::log(12.34f),
+                __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -277,6 +353,23 @@ TEST(MathDevice, Sgn)
   ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 12.34f), raft::Compare<int>()));
 }
 
+struct sin_test_op_device {
+  template <typename Type>
+  constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
+      return __float2half(raft::sin(__half2float(in)));
+    }
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
+      return __float2bfloat16(raft::sin(__bfloat162float(in)));
+    } else  // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16
+#endif
+    return raft::sin(in);
+  }
+};
+
 struct sin_test_op {
   template <typename Type>
   constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
@@ -287,6 +380,13 @@ struct sin_test_op {
 
 TEST(MathDevice, Sin)
 {
+  ASSERT_TRUE(raft::match(std::sin(12.34f),
+                          __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.01f)));
+  ASSERT_TRUE(
+    raft::match(std::sin(12.34f),
+                __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.1f)));
   ASSERT_TRUE(raft::match(
     std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(
@@ -319,6 +419,23 @@ TEST(MathDevice, SinCos)
   ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox<double>(0.0001f)));
 }
 
+struct sqrt_test_op_device {
+  template <typename Type>
+  constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+#if (__CUDA_ARCH__ < 530)
+    if constexpr (std::is_same_v<Type, __half>) {
+      return __float2half(raft::sqrt(__half2float(in)));
+    }
+#elif (__CUDA_ARCH__ < 800)
+    if constexpr (std::is_same_v<Type, nv_bfloat16>) {
+      return __float2bfloat16(raft::sqrt(__bfloat162float(in)));
+    } else  // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16
+#endif
+    return raft::sqrt(in);
+  }
+};
+
 struct sqrt_test_op {
   template <typename Type>
   constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
@@ -329,6 +446,13 @@ struct sqrt_test_op {
 
 TEST(MathDevice, Sqrt)
 {
+  ASSERT_TRUE(raft::match(std::sqrt(12.34f),
+                          __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))),
+                          raft::CompareApprox<float>(0.001f)));
+  ASSERT_TRUE(
+    raft::match(std::sqrt(12.34f),
+                __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))),
+                raft::CompareApprox<float>(0.01f)));
   ASSERT_TRUE(raft::match(
     std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
   ASSERT_TRUE(raft::match(