From f7ba5220b102c168c0a582210fdb23f225c7dbc0 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Jan 2025 01:41:51 +0000 Subject: [PATCH 01/16] ceil_div common type and optmize --- libcudacxx/include/cuda/__cmath/ceil_div.h | 61 ++++++++++++------- .../test/libcudacxx/cuda/cmath.pass.cpp | 12 ++-- 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index 7e67503bdd5..71ac178edff 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -11,6 +11,7 @@ #ifndef _CUDA___CMATH_CEIL_DIV_H #define _CUDA___CMATH_CEIL_DIV_H +d #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ -21,8 +22,9 @@ # pragma system_header #endif // no system header -#include +#include #include +#include #include #include #include @@ -30,23 +32,14 @@ #include #include -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder -//! @param __a The dividend -//! @param __b The divisor -//! @pre \p __a must be non-negative -//! @pre \p __b must be positive -template = 0, - _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept + template + _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon + __constexpr_unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept { - _CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive"); - using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>; - const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b); - return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a))); + const auto __res = __a / __b; + return __res + (__res * __b != __a); } //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder @@ -56,15 +49,37 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(con //! @pre \p __b must be positive template = 0, + _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0, _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 decltype(_Tp{} / _Up{}) +ceil_div(const _Tp __a, const _Up __b) noexcept { - _CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative"); - _CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive"); - using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>; - // Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing - return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b)); + _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: b must be positive"); + using _Common = decltype(_Tp{} / _Up{}); + using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>; + if constexpr (std::is_signed_v<_Tp>) + { + _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: a must be non negative"); + } + auto __a1 = static_cast<_UCommon>(__a); + auto __b1 = static_cast<_UCommon>(__b); + if constexpr (std::is_signed_v<_Common>) + { + return static_cast<_Common>((__a1 + __b1 - 1) / __b1); + } + else + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));), + (return __constexpr_unsigned_ceil_div(__a1, __b1);)) + } + else + { + return __constexpr_unsigned_ceil_div(__a1, __b1); + } + } } //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum diff --git a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp index bf27a71b688..4db4869adef 100644 --- a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp @@ -25,15 +25,15 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test() constexpr T maxv = cuda::std::numeric_limits::max(); // ensure that we return the right type - static_assert(cuda::std::is_same::value, ""); - - assert(cuda::ceil_div(T(0), U(1)) == T(0)); - assert(cuda::ceil_div(T(1), U(1)) == T(1)); - assert(cuda::ceil_div(T(126), U(64)) == T(2)); + using Common = decltype(T(0) / U(1)); + static_assert(cuda::std::is_same::value); + assert(cuda::ceil_div(T(0), U(1)) == Common(0)); + assert(cuda::ceil_div(T(1), U(1)) == Common(1)); + assert(cuda::ceil_div(T(126), U(64)) == Common(2)); // ensure that we are resilient against overflow assert(cuda::ceil_div(maxv, U(1)) == maxv); - assert(cuda::ceil_div(maxv, maxv) == T(1)); + assert(cuda::ceil_div(maxv, maxv) == Common(1)); } template From 395a91a3be2c157c7227a00c7693d941c62e76e3 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Jan 2025 18:11:25 +0000 Subject: [PATCH 02/16] simplify the implementation --- libcudacxx/include/cuda/__cmath/ceil_div.h | 23 ++++++++-------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index 71ac178edff..c21e5269966 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -24,11 +24,9 @@ d #include #include -#include #include #include #include -#include #include #include @@ -36,7 +34,7 @@ d template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon - __constexpr_unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept + __unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept { const auto __res = __a / __b; return __res + (__res * __b != __a); @@ -57,28 +55,23 @@ ceil_div(const _Tp __a, const _Up __b) noexcept _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: b must be positive"); using _Common = decltype(_Tp{} / _Up{}); using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>; - if constexpr (std::is_signed_v<_Tp>) + if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) { _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: a must be non negative"); } auto __a1 = static_cast<_UCommon>(__a); auto __b1 = static_cast<_UCommon>(__b); - if constexpr (std::is_signed_v<_Common>) + if constexpr (_CUDA_VSTD::is_signed_v<_Common>) { return static_cast<_Common>((__a1 + __b1 - 1) / __b1); } else { - if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) - { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));), - (return __constexpr_unsigned_ceil_div(__a1, __b1);)) - } - else - { - return __constexpr_unsigned_ceil_div(__a1, __b1); - } + // the ::min method is faster even if __b is a compile-time constant + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));), + (const auto __res = __a1 / __b1; // + return static_cast<_Common>(__res + (__res * __b1 != __a1));)) } } From 5fc964f199d61a2deb183da3b8dee1d4e0542d83 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Jan 2025 18:12:34 +0000 Subject: [PATCH 03/16] remove typo --- libcudacxx/include/cuda/__cmath/ceil_div.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index c21e5269966..d32d13e106a 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -11,7 +11,6 @@ #ifndef _CUDA___CMATH_CEIL_DIV_H #define _CUDA___CMATH_CEIL_DIV_H -d #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ -30,11 +29,11 @@ d #include #include - _LIBCUDACXX_BEGIN_NAMESPACE_CUDA +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - template - _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon - __unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon +__unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept { const auto __res = __a / __b; return __res + (__res * __b != __a); From 47bacda84f3ba02f979137b57b5475b4071c644a Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Jan 2025 18:46:09 +0000 Subject: [PATCH 04/16] remove unused function --- libcudacxx/include/cuda/__cmath/ceil_div.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index d32d13e106a..143dd8a99bf 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -31,14 +31,6 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -template -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _UCommon -__unsigned_ceil_div(const _UCommon __a, const _UCommon __b) noexcept -{ - const auto __res = __a / __b; - return __res + (__res * __b != __a); -} - //! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder //! @param __a The dividend //! @param __b The divisor From e5c8a63f7955d9c356e9d1fa7ab09e31c415ccf0 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Jan 2025 18:57:15 +0000 Subject: [PATCH 05/16] more readable static_assert --- libcudacxx/include/cuda/__cmath/ceil_div.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index 143dd8a99bf..23416916784 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -43,12 +43,12 @@ template _Up{0}, "cuda::ceil_div: b must be positive"); + _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive"); using _Common = decltype(_Tp{} / _Up{}); using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>; if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) { - _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: a must be non negative"); + _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative"); } auto __a1 = static_cast<_UCommon>(__a); auto __b1 = static_cast<_UCommon>(__b); From 331b7ce67e836f73d68a43b7371765e2ccb7d687 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 8 Jan 2025 00:42:24 +0000 Subject: [PATCH 06/16] add _CCCL_REQUIRES , to_underlying, enumerator support --- libcudacxx/include/cuda/__cmath/ceil_div.h | 44 ++++++++++++++++------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index 23416916784..de2b48ff11e 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA @@ -36,11 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA //! @param __b The divisor //! @pre \p __a must be non-negative //! @pre \p __b must be positive -template = 0, - _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 decltype(_Tp{} / _Up{}) +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _Up{}) ceil_div(const _Tp __a, const _Up __b) noexcept { _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive"); @@ -71,13 +69,35 @@ ceil_div(const _Tp __a, const _Up __b) noexcept //! @param __b The divisor //! @pre \p __a must be non-negative //! @pre \p __b must be positive -template = 0, - _CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up), int> = 0> -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept { - return ::cuda::ceil_div(__a, static_cast<_CUDA_VSTD::underlying_type_t<_Up>>(__b)); + return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b)); +} + +//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum +//! @param __a The dividend +//! @param __b The divisor +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b); +} + +//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum +//! @param __a The dividend +//! @param __b The divisor +//! @pre \p __a must be non-negative +//! @pre \p __b must be positive +_CCCL_TEMPLATE(class _Tp, class _Up) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +{ + return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); } _LIBCUDACXX_END_NAMESPACE_CUDA From e8862d23bf3460b9a35701d290fdfe436c3bcba8 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 8 Jan 2025 00:52:41 +0000 Subject: [PATCH 07/16] improve documentation --- docs/libcudacxx/extended_api/math.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst index 5e9af18aae2..dade27c357d 100644 --- a/docs/libcudacxx/extended_api/math.rst +++ b/docs/libcudacxx/extended_api/math.rst @@ -5,15 +5,15 @@ Math .. code:: cuda - template - [[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept; + template + [[nodiscard]] __host__ __device__ constexpr decltype(T{} / U{}) ceil_div(T a, U b) noexcept; ceil_div --------- -- _Requires_: `is_integral_v` is true. -- _Preconditions_: `a >= 0` is true and `b > 0` is true. -- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value. +- *Requires*: ``T`` is an integral type or enumerator. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. .. note:: From a87276bf081947d8dd7efe91f2065cdd52d6fa91 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 8 Jan 2025 00:52:47 +0000 Subject: [PATCH 08/16] fix return type --- libcudacxx/include/cuda/__cmath/ceil_div.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index de2b48ff11e..4e93581febe 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -71,7 +71,8 @@ ceil_div(const _Tp __a, const _Up __b) noexcept //! @pre \p __b must be positive _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _CUDA_VSTD::underlying_type_t<_Up>{}) +ceil_div(const _Tp __a, const _Up __b) noexcept { return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b)); } @@ -83,7 +84,8 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, //! @pre \p __b must be positive _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _Up{}) +ceil_div(const _Tp __a, const _Up __b) noexcept { return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b); } @@ -95,7 +97,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, //! @pre \p __b must be positive _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp ceil_div(const _Tp __a, const _Up __b) noexcept +_CCCL_NODISCARD +_LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _CUDA_VSTD::underlying_type_t<_Up>{}) +ceil_div(const _Tp __a, const _Up __b) noexcept { return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); } From 7e6a7d45dc38ac22d122fdaeaed528af706beafe Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 9 Jan 2025 00:52:33 +0000 Subject: [PATCH 09/16] use common_type for return value --- libcudacxx/include/cuda/__cmath/ceil_div.h | 25 +++++++++++-------- .../test/libcudacxx/cuda/cmath.pass.cpp | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/libcudacxx/include/cuda/__cmath/ceil_div.h b/libcudacxx/include/cuda/__cmath/ceil_div.h index 4e93581febe..e445d8a0ad5 100644 --- a/libcudacxx/include/cuda/__cmath/ceil_div.h +++ b/libcudacxx/include/cuda/__cmath/ceil_div.h @@ -22,11 +22,13 @@ #endif // no system header #include -#include +#include +#include #include #include #include #include +#include #include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA @@ -38,19 +40,21 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA //! @pre \p __b must be positive _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _Up{}) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(const _Tp __a, const _Up __b) noexcept { _CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive"); - using _Common = decltype(_Tp{} / _Up{}); - using _UCommon = _CUDA_VSTD::make_unsigned_t<_Common>; + if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) { _CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative"); } - auto __a1 = static_cast<_UCommon>(__a); - auto __b1 = static_cast<_UCommon>(__b); - if constexpr (_CUDA_VSTD::is_signed_v<_Common>) + using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>; + using _Prom = decltype(_Tp{} / _Up{}); + using _UProm = _CUDA_VSTD::make_unsigned_t<_Prom>; + auto __a1 = static_cast<_UProm>(__a); + auto __b1 = static_cast<_UProm>(__b); + if constexpr (_CUDA_VSTD::is_signed_v<_Prom>) { return static_cast<_Common>((__a1 + __b1 - 1) / __b1); } @@ -71,7 +75,7 @@ ceil_div(const _Tp __a, const _Up __b) noexcept //! @pre \p __b must be positive _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_Tp{} / _CUDA_VSTD::underlying_type_t<_Up>{}) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>> ceil_div(const _Tp __a, const _Up __b) noexcept { return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b)); @@ -84,7 +88,7 @@ ceil_div(const _Tp __a, const _Up __b) noexcept //! @pre \p __b must be positive _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _Up{}) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up> ceil_div(const _Tp __a, const _Up __b) noexcept { return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b); @@ -98,7 +102,8 @@ ceil_div(const _Tp __a, const _Up __b) noexcept _CCCL_TEMPLATE(class _Tp, class _Up) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) _CCCL_NODISCARD -_LIBCUDACXX_HIDE_FROM_ABI constexpr decltype(_CUDA_VSTD::underlying_type_t<_Tp>{} / _CUDA_VSTD::underlying_type_t<_Up>{}) +_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, + _CUDA_VSTD::underlying_type_t<_Up>> ceil_div(const _Tp __a, const _Up __b) noexcept { return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); diff --git a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp index 4db4869adef..d426c19e83a 100644 --- a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp @@ -25,7 +25,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test() constexpr T maxv = cuda::std::numeric_limits::max(); // ensure that we return the right type - using Common = decltype(T(0) / U(1)); + using Common = _CUDA_VSTD::common_type_t; static_assert(cuda::std::is_same::value); assert(cuda::ceil_div(T(0), U(1)) == Common(0)); assert(cuda::ceil_div(T(1), U(1)) == Common(1)); From 40700c8f32a3d05bdd1e8274e9bd58b5cd670ec9 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 9 Jan 2025 00:58:08 +0000 Subject: [PATCH 10/16] improve documentation --- docs/libcudacxx/extended_api/math.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst index dade27c357d..167c5fa2b39 100644 --- a/docs/libcudacxx/extended_api/math.rst +++ b/docs/libcudacxx/extended_api/math.rst @@ -6,18 +6,20 @@ Math .. code:: cuda template - [[nodiscard]] __host__ __device__ constexpr decltype(T{} / U{}) ceil_div(T a, U b) noexcept; + [[nodiscard]] __host__ __device__ inline + constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept; ceil_div --------- -- *Requires*: ``T`` is an integral type or enumerator. +- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator. - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. -.. note:: +**Performance considerations** - The function is only constexpr from C++14 onwards +- The function computes ``(a + b - 1) / b`` when the common type is a signed integer. +- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types. **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: From 2a678b191430b5aebc2cbfefb51da4b7db27e124 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 22 Jan 2025 15:41:30 -0800 Subject: [PATCH 11/16] disable c++11/14 --- libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp index d426c19e83a..37612764986 100644 --- a/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp @@ -6,6 +6,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14 #include #include From 3edaad3ab1af5bfe448062d95096e7dee2e87f48 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 22 Jan 2025 15:47:40 -0800 Subject: [PATCH 12/16] sync docs --- docs/libcudacxx/extended_api/math.rst | 64 ++++++------------- .../libcudacxx/extended_api/math/ceil_div.rst | 52 +++++++++++++++ .../extended_api/math/round_down.rst | 38 +++++++++++ .../libcudacxx/extended_api/math/round_up.rst | 40 ++++++++++++ 4 files changed, 149 insertions(+), 45 deletions(-) create mode 100644 docs/libcudacxx/extended_api/math/ceil_div.rst create mode 100644 docs/libcudacxx/extended_api/math/round_down.rst create mode 100644 docs/libcudacxx/extended_api/math/round_up.rst diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst index 167c5fa2b39..59c6068a09c 100644 --- a/docs/libcudacxx/extended_api/math.rst +++ b/docs/libcudacxx/extended_api/math.rst @@ -1,54 +1,28 @@ .. _libcudacxx-extended-api-math: Math -===== +==== -.. code:: cuda +.. toctree:: + :hidden: + :maxdepth: 1 - template - [[nodiscard]] __host__ __device__ inline - constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept; + cuda::ceil_div + cuda::round_up + cuda::round_down -ceil_div ---------- +.. list-table:: + :widths: 25 45 30 + :header-rows: 0 -- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator. -- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. -- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. + * - :ref:`ceil_div ` + - Ceiling division + - CCCL 2.6.0 / CUDA 12.6 -**Performance considerations** + * - :ref:`round_up ` + - Round to the next multiple + - CCCL 2.9.0 / CUDA 12.9 -- The function computes ``(a + b - 1) / b`` when the common type is a signed integer. -- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types. - -**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: - -.. code:: cuda - - #include - #include - - __global__ void vscale(int n, float s, float *x) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) x[i] *= s; - } - - int main() { - const int n = 100000; - const float s = 2.f; - std::vector x(n, 1.f); - - // Given a fixed number of threads per block... - constexpr int threads_per_block = 256; - - // ...dividing some "n" by "threads_per_block" may lead to a remainder, - // requiring the kernel to be launched with an extra thread block to handle it. - const int thread_blocks = cuda::ceil_div(n, threads_per_block); - - vscale<<>>(n, s, x.data()); - cudaDeviceSynchronize(); - - return 0; - } - -`See it on Godbolt TODO` + * - :ref:`round_down ` + - Round to the previous multiple + - CCCL 2.9.0 / CUDA 12.9 diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst new file mode 100644 index 00000000000..df6d8c973fa --- /dev/null +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -0,0 +1,52 @@ +.. _libcudacxx-extended-api-math-ceil-div: + +``ceil_div`` Ceiling Division +============================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; + +``value``: The value to be divided. +``divisor``: The divisor. + +- *Requires*: ``is_integral_v`` is true and ``is_integral_v`` is true. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. + +.. note:: + + The function is only constexpr from C++14 onwards + +**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: + +.. code:: cuda + + #include + #include + + __global__ void vscale(int n, float s, float *x) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) x[i] *= s; + } + + int main() { + const int n = 100000; + const float s = 2.f; + std::vector x(n, 1.f); + + // Given a fixed number of threads per block... + constexpr int threads_per_block = 256; + + // ...dividing some "n" by "threads_per_block" may lead to a remainder, + // requiring the kernel to be launched with an extra thread block to handle it. + const int thread_blocks = cuda::ceil_div(n, threads_per_block); + + vscale<<>>(n, s, x.data()); + cudaDeviceSynchronize(); + + return 0; + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/extended_api/math/round_down.rst b/docs/libcudacxx/extended_api/math/round_down.rst new file mode 100644 index 00000000000..20a80998fd3 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/round_down.rst @@ -0,0 +1,38 @@ +.. _libcudacxx-extended-api-math-round-down: + +``round_down`` Round to the previous multiple +============================================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ inline + constexpr cuda::std::common_type_t round_down(T value, U base_multiple) noexcept; + +``value``: The value to be rounded down. +``base_multiple``: The base multiple to which the value rounds down. + +- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``. + +.. note:: + + The function requires C++17 onwards + +**Performance considerations**: + +- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer + +**Example**: + +.. code:: cuda + + #include + + __global__ void example_kernel(int a, unsigned b, unsigned* result) { + // a = 7, b = 3 -> result = 6 + *result = cuda::round_down(a, b); + } + +`See it on Godbolt TODO` diff --git a/docs/libcudacxx/extended_api/math/round_up.rst b/docs/libcudacxx/extended_api/math/round_up.rst new file mode 100644 index 00000000000..13c282aaad7 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/round_up.rst @@ -0,0 +1,40 @@ +.. _libcudacxx-extended-api-math-round-up: + +``round_up`` Round to the next multiple +======================================= + +.. code:: cuda + + template + [[nodiscard]] __host__ __device__ inline + constexpr cuda::std::common_type_t round_up(T value, U base_multiple) noexcept; + +``value``: The value to be rounded up. +``base_multiple``: The base multiple to which the value rounds up. + +- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. +- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. +- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``. +- *Note*: the result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of + ``a`` and ``b``. The condition is checked in debug mode. + +.. note:: + + The function requires C++17 onwards + +**Performance considerations**: + +- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication + +**Example**: + +.. code:: cuda + + #include + + __global__ void example_kernel(int a, unsigned b, unsigned* result) { + // a = 7, b = 3 -> result = 9 + *result = cuda::round_up(a, b); + } + +`See it on Godbolt TODO` From d4ed421fd19079c4a1cfb6124b9c254860de2353 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 22 Jan 2025 15:48:03 -0800 Subject: [PATCH 13/16] update ceil_div doc --- .../libcudacxx/extended_api/math/ceil_div.rst | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst index df6d8c973fa..167c5fa2b39 100644 --- a/docs/libcudacxx/extended_api/math/ceil_div.rst +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -1,23 +1,25 @@ -.. _libcudacxx-extended-api-math-ceil-div: +.. _libcudacxx-extended-api-math: -``ceil_div`` Ceiling Division -============================= +Math +===== .. code:: cuda - template - [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; + template + [[nodiscard]] __host__ __device__ inline + constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept; -``value``: The value to be divided. -``divisor``: The divisor. +ceil_div +--------- -- *Requires*: ``is_integral_v`` is true and ``is_integral_v`` is true. +- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator. - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. -.. note:: +**Performance considerations** - The function is only constexpr from C++14 onwards +- The function computes ``(a + b - 1) / b`` when the common type is a signed integer. +- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types. **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: From cc42e107e1547592c9858ba816633e2018e2ad84 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 22 Jan 2025 15:49:09 -0800 Subject: [PATCH 14/16] restore prev version --- .../libcudacxx/extended_api/math/ceil_div.rst | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst index 167c5fa2b39..df6d8c973fa 100644 --- a/docs/libcudacxx/extended_api/math/ceil_div.rst +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -1,25 +1,23 @@ -.. _libcudacxx-extended-api-math: +.. _libcudacxx-extended-api-math-ceil-div: -Math -===== +``ceil_div`` Ceiling Division +============================= .. code:: cuda - template - [[nodiscard]] __host__ __device__ inline - constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept; + template + [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; -ceil_div ---------- +``value``: The value to be divided. +``divisor``: The divisor. -- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator. +- *Requires*: ``is_integral_v`` is true and ``is_integral_v`` is true. - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. -**Performance considerations** +.. note:: -- The function computes ``(a + b - 1) / b`` when the common type is a signed integer. -- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types. + The function is only constexpr from C++14 onwards **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: From 647b9fcb0ed73afc9064f1b3b3465c87c105b669 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 22 Jan 2025 15:50:02 -0800 Subject: [PATCH 15/16] update doc --- .../libcudacxx/extended_api/math/ceil_div.rst | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst index df6d8c973fa..167c5fa2b39 100644 --- a/docs/libcudacxx/extended_api/math/ceil_div.rst +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -1,23 +1,25 @@ -.. _libcudacxx-extended-api-math-ceil-div: +.. _libcudacxx-extended-api-math: -``ceil_div`` Ceiling Division -============================= +Math +===== .. code:: cuda - template - [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; + template + [[nodiscard]] __host__ __device__ inline + constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept; -``value``: The value to be divided. -``divisor``: The divisor. +ceil_div +--------- -- *Requires*: ``is_integral_v`` is true and ``is_integral_v`` is true. +- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator. - *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. - *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. -.. note:: +**Performance considerations** - The function is only constexpr from C++14 onwards +- The function computes ``(a + b - 1) / b`` when the common type is a signed integer. +- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types. **Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: From 2768814aa032dd82a0f6ed0a5a7a396ca26f9644 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 22 Jan 2025 16:01:29 -0800 Subject: [PATCH 16/16] fix doc label --- docs/libcudacxx/extended_api/math/ceil_div.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst index 167c5fa2b39..3584a992a30 100644 --- a/docs/libcudacxx/extended_api/math/ceil_div.rst +++ b/docs/libcudacxx/extended_api/math/ceil_div.rst @@ -1,4 +1,4 @@ -.. _libcudacxx-extended-api-math: +.. _libcudacxx-extended-api-math-ceil-div: Math =====