NVIDIA · fbusato · Jan 23, 2025 · Jan 1, 2025 · Jan 2, 2025 · Jan 2, 2025
@@ -1,23 +1,25 @@
 .. _libcudacxx-extended-api-math-ceil-div:
 
-``ceil_div`` Ceiling Division
-=============================
+Math
+=====
 
 .. code:: cuda
 
-   template <typename T, typename = U>
-   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
+   template <typename T, typename U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
 
-``value``: The value to be divided.
-``divisor``:  The divisor.
+ceil_div
+---------
 
-- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
+- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
 - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
 - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
 
-.. note::
+**Performance considerations**
 
-   The function is only constexpr from C++14 onwards
+- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
+- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
 
 **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
 

@@ -21,14 +21,15 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__type_traits/common_type.h>
-#include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/is_enum.h>
 #include <cuda/std/__type_traits/is_integral.h>
 #include <cuda/std/__type_traits/is_signed.h>
-#include <cuda/std/__type_traits/is_unsigned.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__type_traits/underlying_type.h>
+#include <cuda/std/__utility/to_underlying.h>
 
 _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
@@ -37,48 +38,75 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 //! @param __b The divisor
 //! @pre \p __a must be non-negative
 //! @pre \p __b must be positive
-template <class _Tp,
-          class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_unsigned, _Tp), int> = 0,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up>
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
-  _CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
-  using _UCommon   = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
-  const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b);
-  return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a)));
+  _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive");
+
+  if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
+  {
+    _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative");
+  }
+  using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>;
+  using _Prom   = decltype(_Tp{} / _Up{});
+  using _UProm  = _CUDA_VSTD::make_unsigned_t<_Prom>;
+  auto __a1     = static_cast<_UProm>(__a);
+  auto __b1     = static_cast<_UProm>(__b);
+  if constexpr (_CUDA_VSTD::is_signed_v<_Prom>)
+  {
+    return static_cast<_Common>((__a1 + __b1 - 1) / __b1);
+  }
+  else
+  {
+    // the ::min method is faster even if __b is a compile-time constant
+    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                      (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));),
+                      (const auto __res = __a1 / __b1; //
+                       return static_cast<_Common>(__res + (__res * __b1 != __a1));))
+  }
 }
 
-//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder
+//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
+//! @param __a The dividend
+//! @param __b The divisor
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>>
+ceil_div(const _Tp __a, const _Up __b) noexcept
+{
+  return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b));
+}
+
+//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
 //! @param __a The dividend
 //! @param __b The divisor
 //! @pre \p __a must be non-negative
 //! @pre \p __b must be positive
-template <class _Tp,
-          class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_signed, _Tp), int>   = 0,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up>
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
-  _CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
-  _CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
-  using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
-  // Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing
-  return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b));
+  return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b);
 }
 
 //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
 //! @param __a The dividend
 //! @param __b The divisor
 //! @pre \p __a must be non-negative
 //! @pre \p __b must be positive
-template <class _Tp,
-          class _Up,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0,
-          _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up), int>     = 0>
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
+_CCCL_NODISCARD
+_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>,
+                                                              _CUDA_VSTD::underlying_type_t<_Up>>
+ceil_div(const _Tp __a, const _Up __b) noexcept
 {
-  return ::cuda::ceil_div(__a, static_cast<_CUDA_VSTD::underlying_type_t<_Up>>(__b));
+  return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
 }
 
 _LIBCUDACXX_END_NAMESPACE_CUDA

@@ -6,6 +6,7 @@
 // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
 
 #include <cuda/cmath>
 #include <cuda/std/cassert>
@@ -25,15 +26,15 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test()
   constexpr T maxv = cuda::std::numeric_limits<T>::max();
 
   // ensure that we return the right type
-  static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), T>::value, "");
-
-  assert(cuda::ceil_div(T(0), U(1)) == T(0));
-  assert(cuda::ceil_div(T(1), U(1)) == T(1));
-  assert(cuda::ceil_div(T(126), U(64)) == T(2));
+  using Common = _CUDA_VSTD::common_type_t<T, U>;
+  static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), Common>::value);
+  assert(cuda::ceil_div(T(0), U(1)) == Common(0));
+  assert(cuda::ceil_div(T(1), U(1)) == Common(1));
+  assert(cuda::ceil_div(T(126), U(64)) == Common(2));
 
   // ensure that we are resilient against overflow
   assert(cuda::ceil_div(maxv, U(1)) == maxv);
-  assert(cuda::ceil_div(maxv, maxv) == T(1));
+  assert(cuda::ceil_div(maxv, maxv) == Common(1));
 }
 
 template <class T>