From f7ba5220b102c168c0a582210fdb23f225c7dbc0 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 1 Jan 2025 01:41:51 +0000
Subject: [PATCH 01/16] ceil_div common type and optmize

---
 libcudacxx/include/cuda/__cmath/ceil_div.h    | 61 ++++++++++++-------
 .../test/libcudacxx/cuda/cmath.pass.cpp       | 12 ++--
 2 files changed, 44 insertions(+), 29 deletions(-)
diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index 7e67503bdd5..71ac178edff 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -11,6 +11,7 @@
 #ifndef _CUDA___CMATH_CEIL_DIV_H
 #define _CUDA___CMATH_CEIL_DIV_H
 
+d
 #include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
@@ -21,8 +22,9 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/std/__type_traits/common_type.h>
+#include <cuda/std/__algorithm/min.h>
 #include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
 #include <cuda/std/__type_traits/is_enum.h>
 #include <cuda/std/__type_traits/is_integral.h>
 #include <cuda/std/__type_traits/is_signed.h>
@@ -30,23 +32,14 @@
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__type_traits/underlying_type.h>
 
-_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+  _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder
-//! @param __a The dividend
-//! @param __b The divisor
-//! @pre \p __a must be non-negative
-//! @pre \p __b must be positive
-template <class _Tp,
-          class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_unsigned, _Tp), int> = 0,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+  template <class _UCommon>
+  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon
+  __constexpr_unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept
 {
-  _CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
-  using _UCommon   = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
-  const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b);
-  return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a)));
+  const auto __res = __a / __b;
+  return __res + (__res * __b != __a);
 }
 
 //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder
@@ -56,15 +49,37 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(con
 //! @pre \p __b must be positive
 template <class _Tp,
           class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_signed, _Tp), int>   = 0,
+          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0,
           _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 decltype(_Tp{} / _Up{})
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
-  _CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
-  _CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
-  using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
-  // Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing
-  return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b));
+  _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: b must be positive");
+  using _Common  = decltype(_Tp{} / _Up{});
+  using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>;
+  if constexpr (std::is_signed_v<_Tp>)
+  {
+    _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: a must be non negative");
+  }
+  auto __a1 = static_cast<_UCommon>(__a);
+  auto __b1 = static_cast<_UCommon>(__b);
+  if constexpr (std::is_signed_v<_Common>)
+  {
+    return static_cast<_Common>((__a1 + __b1 - 1) / __b1);
+  }
+  else
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                        (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));),
+                        (return __constexpr_unsigned_ceil_div(__a1, __b1);))
+    }
+    else
+    {
+      return __constexpr_unsigned_ceil_div(__a1, __b1);
+    }
+  }
 }
 
 //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
diff --git a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
index bf27a71b688..4db4869adef 100644
--- a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
@@ -25,15 +25,15 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test()
   constexpr T maxv = cuda::std::numeric_limits<T>::max();
 
   // ensure that we return the right type
-  static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), T>::value, "");
-
-  assert(cuda::ceil_div(T(0), U(1)) == T(0));
-  assert(cuda::ceil_div(T(1), U(1)) == T(1));
-  assert(cuda::ceil_div(T(126), U(64)) == T(2));
+  using Common = decltype(T(0) / U(1));
+  static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), Common>::value);
+  assert(cuda::ceil_div(T(0), U(1)) == Common(0));
+  assert(cuda::ceil_div(T(1), U(1)) == Common(1));
+  assert(cuda::ceil_div(T(126), U(64)) == Common(2));
 
   // ensure that we are resilient against overflow
   assert(cuda::ceil_div(maxv, U(1)) == maxv);
-  assert(cuda::ceil_div(maxv, maxv) == T(1));
+  assert(cuda::ceil_div(maxv, maxv) == Common(1));
 }
 
 template <class T>

From 395a91a3be2c157c7227a00c7693d941c62e76e3 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 2 Jan 2025 18:11:25 +0000
Subject: [PATCH 02/16] simplify the implementation

---
 libcudacxx/include/cuda/__cmath/ceil_div.h | 23 ++++++++--------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index 71ac178edff..c21e5269966 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -24,11 +24,9 @@ d
 
 #include <cuda/std/__algorithm/min.h>
 #include <cuda/std/__type_traits/enable_if.h>
-#include <cuda/std/__type_traits/is_constant_evaluated.h>
 #include <cuda/std/__type_traits/is_enum.h>
 #include <cuda/std/__type_traits/is_integral.h>
 #include <cuda/std/__type_traits/is_signed.h>
-#include <cuda/std/__type_traits/is_unsigned.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__type_traits/underlying_type.h>
 
@@ -36,7 +34,7 @@ d
 
   template <class _UCommon>
   _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon
-  __constexpr_unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept
+  __unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept
 {
   const auto __res = __a / __b;
   return __res + (__res * __b != __a);
@@ -57,28 +55,23 @@ ceil_div(const _Tp __a, const _Up __b) noexcept
   _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: b must be positive");
   using _Common  = decltype(_Tp{} / _Up{});
   using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>;
-  if constexpr (std::is_signed_v<_Tp>)
+  if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
   {
     _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: a must be non negative");
   }
   auto __a1 = static_cast<_UCommon>(__a);
   auto __b1 = static_cast<_UCommon>(__b);
-  if constexpr (std::is_signed_v<_Common>)
+  if constexpr (_CUDA_VSTD::is_signed_v<_Common>)
   {
     return static_cast<_Common>((__a1 + __b1 - 1) / __b1);
   }
   else
   {
-    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
-    {
-      NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                        (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));),
-                        (return __constexpr_unsigned_ceil_div(__a1, __b1);))
-    }
-    else
-    {
-      return __constexpr_unsigned_ceil_div(__a1, __b1);
-    }
+    // the ::min method is faster even if __b is a compile-time constant
+    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                      (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));),
+                      (const auto __res = __a1 / __b1; //
+                       return static_cast<_Common>(__res + (__res * __b1 != __a1));))
   }
 }
 

From 5fc964f199d61a2deb183da3b8dee1d4e0542d83 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 2 Jan 2025 18:12:34 +0000
Subject: [PATCH 03/16] remove typo

---
 libcudacxx/include/cuda/__cmath/ceil_div.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index c21e5269966..d32d13e106a 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -11,7 +11,6 @@
 #ifndef _CUDA___CMATH_CEIL_DIV_H
 #define _CUDA___CMATH_CEIL_DIV_H
 
-d
 #include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
@@ -30,11 +29,11 @@ d
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__type_traits/underlying_type.h>
 
-  _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-  template <class _UCommon>
-  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon
-  __unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept
+template <class _UCommon>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon
+__unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept
 {
   const auto __res = __a / __b;
   return __res + (__res * __b != __a);

From 47bacda84f3ba02f979137b57b5475b4071c644a Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 2 Jan 2025 18:46:09 +0000
Subject: [PATCH 04/16] remove unused function

---
 libcudacxx/include/cuda/__cmath/ceil_div.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index d32d13e106a..143dd8a99bf 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -31,14 +31,6 @@
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
-template <class _UCommon>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon
-__unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept
-{
-  const auto __res = __a / __b;
-  return __res + (__res * __b != __a);
-}
-
 //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder
 //! @param __a The dividend
 //! @param __b The divisor

From e5c8a63f7955d9c356e9d1fa7ab09e31c415ccf0 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 2 Jan 2025 18:57:15 +0000
Subject: [PATCH 05/16] more readable static_assert

---
 libcudacxx/include/cuda/__cmath/ceil_div.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index 143dd8a99bf..23416916784 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -43,12 +43,12 @@ template <class _Tp,
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 decltype(_Tp{} / _Up{})
 ceil_div(const _Tp __a, const _Up __b) noexcept
 {
-  _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: b must be positive");
+  _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive");
   using _Common  = decltype(_Tp{} / _Up{});
   using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>;
   if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
   {
-    _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: a must be non negative");
+    _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative");
   }
   auto __a1 = static_cast<_UCommon>(__a);
   auto __b1 = static_cast<_UCommon>(__b);

From 331b7ce67e836f73d68a43b7371765e2ccb7d687 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 8 Jan 2025 00:42:24 +0000
Subject: [PATCH 06/16] add _CCCL_REQUIRES , to_underlying, enumerator support

---
 libcudacxx/include/cuda/__cmath/ceil_div.h | 44 ++++++++++++++++------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index 23416916784..de2b48ff11e 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -27,7 +27,7 @@
 #include <cuda/std/__type_traits/is_integral.h>
 #include <cuda/std/__type_traits/is_signed.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
-#include <cuda/std/__type_traits/underlying_type.h>
+#include <cuda/std/__utility/to_underlying.h>
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
@@ -36,11 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 //! @param __b The divisor
 //! @pre \p __a must be non-negative
 //! @pre \p __b must be positive
-template <class _Tp,
-          class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 decltype(_Tp{} / _Up{})
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _Up{})
 ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive");
@@ -71,13 +69,35 @@ ceil_div(const _Tp __a, const _Up __b) noexcept
 //! @param __b The divisor
 //! @pre \p __a must be non-negative
 //! @pre \p __b must be positive
-template <class _Tp,
-          class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up), int>     = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
 {
-  return ::cuda::ceil_div(__a, static_cast<_CUDA_VSTD::underlying_type_t<_Up>>(__b));
+  return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b));
+}
+
+//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
+//! @param __a The dividend
+//! @param __b The divisor
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+{
+  return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b);
+}
+
+//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
+//! @param __a The dividend
+//! @param __b The divisor
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+{
+  return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA

From e8862d23bf3460b9a35701d290fdfe436c3bcba8 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 8 Jan 2025 00:52:41 +0000
Subject: [PATCH 07/16] improve documentation

---
 docs/libcudacxx/extended_api/math.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst
index 5e9af18aae2..dade27c357d 100644
--- a/docs/libcudacxx/extended_api/math.rst
+++ b/docs/libcudacxx/extended_api/math.rst
@@ -5,15 +5,15 @@ Math
 
 .. code:: cuda
 
-   template <typename T>
-   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept;
+   template <typename T, typename U>
+   [[nodiscard]] __host__ __device__ constexpr decltype(T{} / U{}) ceil_div(T a, U b) noexcept;
 
 ceil_div
 ---------
 
-- _Requires_: `is_integral_v<T>` is true.
-- _Preconditions_: `a >= 0` is true and `b > 0` is true.
-- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value.
+- *Requires*: ``T`` is an integral type or enumerator.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
 
 .. note::
 

From a87276bf081947d8dd7efe91f2065cdd52d6fa91 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 8 Jan 2025 00:52:47 +0000
Subject: [PATCH 08/16] fix return type

---
 libcudacxx/include/cuda/__cmath/ceil_div.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index de2b48ff11e..4e93581febe 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -71,7 +71,8 @@ ceil_div(const _Tp __a, const _Up __b) noexcept
 //! @pre \p __b must be positive
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _CUDA_VSTD::underlying_type_t<_Up>{})
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b));
 }
@@ -83,7 +84,8 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a,
 //! @pre \p __b must be positive
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _Up{})
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b);
 }
@@ -95,7 +97,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a,
 //! @pre \p __b must be positive
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_NODISCARD
+_LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _CUDA_VSTD::underlying_type_t<_Up>{})
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
 }

From 7e6a7d45dc38ac22d122fdaeaed528af706beafe Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 9 Jan 2025 00:52:33 +0000
Subject: [PATCH 09/16] use common_type for return value

---
 libcudacxx/include/cuda/__cmath/ceil_div.h    | 25 +++++++++++--------
 .../test/libcudacxx/cuda/cmath.pass.cpp       |  2 +-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h
index 4e93581febe..e445d8a0ad5 100644
--- a/libcudacxx/include/cuda/__cmath/ceil_div.h
+++ b/libcudacxx/include/cuda/__cmath/ceil_div.h
@@ -22,11 +22,13 @@
 #endif // no system header
 
 #include <cuda/std/__algorithm/min.h>
-#include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/common_type.h>
 #include <cuda/std/__type_traits/is_enum.h>
 #include <cuda/std/__type_traits/is_integral.h>
 #include <cuda/std/__type_traits/is_signed.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
+#include <cuda/std/__type_traits/underlying_type.h>
 #include <cuda/std/__utility/to_underlying.h>
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
@@ -38,19 +40,21 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 //! @pre \p __b must be positive
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _Up{})
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up>
 ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive");
-  using _Common  = decltype(_Tp{} / _Up{});
-  using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>;
+
   if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
   {
     _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative");
   }
-  auto __a1 = static_cast<_UCommon>(__a);
-  auto __b1 = static_cast<_UCommon>(__b);
-  if constexpr (_CUDA_VSTD::is_signed_v<_Common>)
+  using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>;
+  using _Prom   = decltype(_Tp{} / _Up{});
+  using _UProm  = _CUDA_VSTD::make_unsigned_t<_Prom>;
+  auto __a1     = static_cast<_UProm>(__a);
+  auto __b1     = static_cast<_UProm>(__b);
+  if constexpr (_CUDA_VSTD::is_signed_v<_Prom>)
   {
     return static_cast<_Common>((__a1 + __b1 - 1) / __b1);
   }
@@ -71,7 +75,7 @@ ceil_div(const _Tp __a, const _Up __b) noexcept
 //! @pre \p __b must be positive
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _CUDA_VSTD::underlying_type_t<_Up>{})
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>>
 ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b));
@@ -84,7 +88,7 @@ ceil_div(const _Tp __a, const _Up __b) noexcept
 //! @pre \p __b must be positive
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _Up{})
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up>
 ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b);
@@ -98,7 +102,8 @@ ceil_div(const _Tp __a, const _Up __b) noexcept
 _CCCL_TEMPLATE(class _Tp, class _Up)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
 _CCCL_NODISCARD
-_LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _CUDA_VSTD::underlying_type_t<_Up>{})
+_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>,
+                                                              _CUDA_VSTD::underlying_type_t<_Up>>
 ceil_div(const _Tp __a, const _Up __b) noexcept
 {
   return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
diff --git a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
index 4db4869adef..d426c19e83a 100644
--- a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
@@ -25,7 +25,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test()
   constexpr T maxv = cuda::std::numeric_limits<T>::max();
 
   // ensure that we return the right type
-  using Common = decltype(T(0) / U(1));
+  using Common = _CUDA_VSTD::common_type_t<T, U>;
   static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), Common>::value);
   assert(cuda::ceil_div(T(0), U(1)) == Common(0));
   assert(cuda::ceil_div(T(1), U(1)) == Common(1));

From 40700c8f32a3d05bdd1e8274e9bd58b5cd670ec9 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 9 Jan 2025 00:58:08 +0000
Subject: [PATCH 10/16] improve documentation

---
 docs/libcudacxx/extended_api/math.rst | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst
index dade27c357d..167c5fa2b39 100644
--- a/docs/libcudacxx/extended_api/math.rst
+++ b/docs/libcudacxx/extended_api/math.rst
@@ -6,18 +6,20 @@ Math
 .. code:: cuda
 
    template <typename T, typename U>
-   [[nodiscard]] __host__ __device__ constexpr decltype(T{} / U{}) ceil_div(T a, U b) noexcept;
+   [[nodiscard]] __host__ __device__ inline
+   constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
 
 ceil_div
 ---------
 
-- *Requires*: ``T`` is an integral type or enumerator.
+- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
 - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
 - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
 
-.. note::
+**Performance considerations**
 
-   The function is only constexpr from C++14 onwards
+- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
+- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
 
 **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
 

From 2a678b191430b5aebc2cbfefb51da4b7db27e124 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 22 Jan 2025 15:41:30 -0800
Subject: [PATCH 11/16] disable c++11/14

---
 libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
index d426c19e83a..37612764986 100644
--- a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
@@ -6,6 +6,7 @@
 // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
 
 #include <cuda/cmath>
 #include <cuda/std/cassert>

From 3edaad3ab1af5bfe448062d95096e7dee2e87f48 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 22 Jan 2025 15:47:40 -0800
Subject: [PATCH 12/16] sync docs

---
 docs/libcudacxx/extended_api/math.rst         | 64 ++++++-------------
 .../libcudacxx/extended_api/math/ceil_div.rst | 52 +++++++++++++++
 .../extended_api/math/round_down.rst          | 38 +++++++++++
 .../libcudacxx/extended_api/math/round_up.rst | 40 ++++++++++++
 4 files changed, 149 insertions(+), 45 deletions(-)
 create mode 100644 docs/libcudacxx/extended_api/math/ceil_div.rst
 create mode 100644 docs/libcudacxx/extended_api/math/round_down.rst
 create mode 100644 docs/libcudacxx/extended_api/math/round_up.rst

diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst
index 167c5fa2b39..59c6068a09c 100644
--- a/docs/libcudacxx/extended_api/math.rst
+++ b/docs/libcudacxx/extended_api/math.rst
@@ -1,54 +1,28 @@
 .. _libcudacxx-extended-api-math:
 
 Math
-=====
+====
 
-.. code:: cuda
+.. toctree::
+   :hidden:
+   :maxdepth: 1
 
-   template <typename T, typename U>
-   [[nodiscard]] __host__ __device__ inline
-   constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
+   cuda::ceil_div <math/ceil_div>
+   cuda::round_up <math/round_up>
+   cuda::round_down <math/round_down>
 
-ceil_div
----------
+.. list-table::
+   :widths: 25 45 30
+   :header-rows: 0
 
-- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
-- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
-- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
+   * - :ref:`ceil_div <libcudacxx-extended-api-math-ceil-div>`
+     - Ceiling division
+     - CCCL 2.6.0 / CUDA 12.6
 
-**Performance considerations**
+   * - :ref:`round_up <libcudacxx-extended-api-math-round-up>`
+     - Round to the next multiple
+     - CCCL 2.9.0 / CUDA 12.9
 
-- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
-- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
-
-**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
-
-.. code:: cuda
-
-   #include <vector>
-   #include <cuda/cmath>
-
-   __global__ void vscale(int n, float s, float *x) {
-     int i = blockIdx.x * blockDim.x + threadIdx.x;
-     if (i < n) x[i] *= s;
-   }
-
-   int main() {
-     const int n = 100000;
-     const float s = 2.f;
-     std::vector<float> x(n, 1.f);
-
-     // Given a fixed number of threads per block...
-     constexpr int threads_per_block = 256;
-
-     // ...dividing some "n" by "threads_per_block" may lead to a remainder,
-     // requiring the kernel to be launched with an extra thread block to handle it.
-     const int thread_blocks = cuda::ceil_div(n, threads_per_block);
-
-     vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
-     cudaDeviceSynchronize();
-
-     return 0;
-   }
-
-`See it on Godbolt TODO`
+   * - :ref:`round_down <libcudacxx-extended-api-math-round-down>`
+     - Round to the previous multiple
+     - CCCL 2.9.0 / CUDA 12.9
diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst
new file mode 100644
index 00000000000..df6d8c973fa
--- /dev/null
+++ b/docs/libcudacxx/extended_api/math/ceil_div.rst
@@ -0,0 +1,52 @@
+.. _libcudacxx-extended-api-math-ceil-div:
+
+``ceil_div`` Ceiling Division
+=============================
+
+.. code:: cuda
+
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
+
+``value``: The value to be divided.
+``divisor``:  The divisor.
+
+- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
+
+.. note::
+
+   The function is only constexpr from C++14 onwards
+
+**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
+
+.. code:: cuda
+
+   #include <vector>
+   #include <cuda/cmath>
+
+   __global__ void vscale(int n, float s, float *x) {
+     int i = blockIdx.x * blockDim.x + threadIdx.x;
+     if (i < n) x[i] *= s;
+   }
+
+   int main() {
+     const int n = 100000;
+     const float s = 2.f;
+     std::vector<float> x(n, 1.f);
+
+     // Given a fixed number of threads per block...
+     constexpr int threads_per_block = 256;
+
+     // ...dividing some "n" by "threads_per_block" may lead to a remainder,
+     // requiring the kernel to be launched with an extra thread block to handle it.
+     const int thread_blocks = cuda::ceil_div(n, threads_per_block);
+
+     vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
+     cudaDeviceSynchronize();
+
+     return 0;
+   }
+
+`See it on Godbolt TODO`
diff --git a/docs/libcudacxx/extended_api/math/round_down.rst b/docs/libcudacxx/extended_api/math/round_down.rst
new file mode 100644
index 00000000000..20a80998fd3
--- /dev/null
+++ b/docs/libcudacxx/extended_api/math/round_down.rst
@@ -0,0 +1,38 @@
+.. _libcudacxx-extended-api-math-round-down:
+
+``round_down`` Round to the previous multiple
+=============================================
+
+.. code:: cuda
+
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr cuda::std::common_type_t<T, U> round_down(T value, U base_multiple) noexcept;
+
+``value``: The value to be rounded down.
+``base_multiple``:  The base multiple to which the value rounds down.
+
+- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
+
+.. note::
+
+   The function requires C++17 onwards
+
+**Performance considerations**:
+
+- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer
+
+**Example**:
+
+.. code:: cuda
+
+   #include <cuda/cmath>
+
+   __global__ void example_kernel(int a, unsigned b, unsigned* result) {
+     // a = 7, b = 3 -> result = 6
+     *result = cuda::round_down(a, b);
+   }
+
+`See it on Godbolt TODO`
diff --git a/docs/libcudacxx/extended_api/math/round_up.rst b/docs/libcudacxx/extended_api/math/round_up.rst
new file mode 100644
index 00000000000..13c282aaad7
--- /dev/null
+++ b/docs/libcudacxx/extended_api/math/round_up.rst
@@ -0,0 +1,40 @@
+.. _libcudacxx-extended-api-math-round-up:
+
+``round_up`` Round to the next multiple
+=======================================
+
+.. code:: cuda
+
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr cuda::std::common_type_t<T, U> round_up(T value, U base_multiple) noexcept;
+
+``value``: The value to be rounded up.
+``base_multiple``:  The base multiple to which the value rounds up.
+
+- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
+- *Note*: the result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of
+          ``a`` and ``b``. The condition is checked in debug mode.
+
+.. note::
+
+   The function requires C++17 onwards
+
+**Performance considerations**:
+
+- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication
+
+**Example**:
+
+.. code:: cuda
+
+   #include <cuda/cmath>
+
+   __global__ void example_kernel(int a, unsigned b, unsigned* result) {
+     // a = 7, b = 3 -> result = 9
+     *result = cuda::round_up(a, b);
+   }
+
+`See it on Godbolt TODO`

From d4ed421fd19079c4a1cfb6124b9c254860de2353 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 22 Jan 2025 15:48:03 -0800
Subject: [PATCH 13/16] update ceil_div doc

---
 .../libcudacxx/extended_api/math/ceil_div.rst | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst
index df6d8c973fa..167c5fa2b39 100644
--- a/docs/libcudacxx/extended_api/math/ceil_div.rst
+++ b/docs/libcudacxx/extended_api/math/ceil_div.rst
@@ -1,23 +1,25 @@
-.. _libcudacxx-extended-api-math-ceil-div:
+.. _libcudacxx-extended-api-math:
 
-``ceil_div`` Ceiling Division
-=============================
+Math
+=====
 
 .. code:: cuda
 
-   template <typename T, typename = U>
-   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
+   template <typename T, typename U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
 
-``value``: The value to be divided.
-``divisor``:  The divisor.
+ceil_div
+---------
 
-- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
+- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
 - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
 - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
 
-.. note::
+**Performance considerations**
 
-   The function is only constexpr from C++14 onwards
+- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
+- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
 
 **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
 

From cc42e107e1547592c9858ba816633e2018e2ad84 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 22 Jan 2025 15:49:09 -0800
Subject: [PATCH 14/16] restore prev version

---
 .../libcudacxx/extended_api/math/ceil_div.rst | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst
index 167c5fa2b39..df6d8c973fa 100644
--- a/docs/libcudacxx/extended_api/math/ceil_div.rst
+++ b/docs/libcudacxx/extended_api/math/ceil_div.rst
@@ -1,25 +1,23 @@
-.. _libcudacxx-extended-api-math:
+.. _libcudacxx-extended-api-math-ceil-div:
 
-Math
-=====
+``ceil_div`` Ceiling Division
+=============================
 
 .. code:: cuda
 
-   template <typename T, typename U>
-   [[nodiscard]] __host__ __device__ inline
-   constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
 
-ceil_div
----------
+``value``: The value to be divided.
+``divisor``:  The divisor.
 
-- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
+- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
 - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
 - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
 
-**Performance considerations**
+.. note::
 
-- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
-- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
+   The function is only constexpr from C++14 onwards
 
 **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
 

From 647b9fcb0ed73afc9064f1b3b3465c87c105b669 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 22 Jan 2025 15:50:02 -0800
Subject: [PATCH 15/16] update doc

---
 .../libcudacxx/extended_api/math/ceil_div.rst | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst
index df6d8c973fa..167c5fa2b39 100644
--- a/docs/libcudacxx/extended_api/math/ceil_div.rst
+++ b/docs/libcudacxx/extended_api/math/ceil_div.rst
@@ -1,23 +1,25 @@
-.. _libcudacxx-extended-api-math-ceil-div:
+.. _libcudacxx-extended-api-math:
 
-``ceil_div`` Ceiling Division
-=============================
+Math
+=====
 
 .. code:: cuda
 
-   template <typename T, typename = U>
-   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
+   template <typename T, typename U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
 
-``value``: The value to be divided.
-``divisor``:  The divisor.
+ceil_div
+---------
 
-- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
+- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
 - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
 - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
 
-.. note::
+**Performance considerations**
 
-   The function is only constexpr from C++14 onwards
+- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
+- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
 
 **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
 

From 2768814aa032dd82a0f6ed0a5a7a396ca26f9644 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 22 Jan 2025 16:01:29 -0800
Subject: [PATCH 16/16] fix doc label

---
 docs/libcudacxx/extended_api/math/ceil_div.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst
index 167c5fa2b39..3584a992a30 100644
--- a/docs/libcudacxx/extended_api/math/ceil_div.rst
+++ b/docs/libcudacxx/extended_api/math/ceil_div.rst
@@ -1,4 +1,4 @@
-.. _libcudacxx-extended-api-math:
+.. _libcudacxx-extended-api-math-ceil-div:
 
 Math
 =====