Skip to content

Commit 0b5a5d8

Browse files
authored
ceil_div return common type and optmize (#3229)
1 parent 240592f commit 0b5a5d8

File tree

3 files changed

+74
-43
lines changed

3 files changed

+74
-43
lines changed

docs/libcudacxx/extended_api/math/ceil_div.rst

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
.. _libcudacxx-extended-api-math-ceil-div:
22

3-
``ceil_div`` Ceiling Division
4-
=============================
3+
Math
4+
=====
55

66
.. code:: cuda
77
8-
template <typename T, typename = U>
9-
[[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
8+
template <typename T, typename U>
9+
[[nodiscard]] __host__ __device__ inline
10+
constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
1011
11-
``value``: The value to be divided.
12-
``divisor``: The divisor.
12+
ceil_div
13+
---------
1314

14-
- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
15+
- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
1516
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
1617
- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
1718

18-
.. note::
19+
**Performance considerations**
1920

20-
The function is only constexpr from C++14 onwards
21+
- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
22+
- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
2123

2224
**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
2325

libcudacxx/include/cuda/__cmath/ceil_div.h

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@
2121
# pragma system_header
2222
#endif // no system header
2323

24+
#include <cuda/std/__algorithm/min.h>
25+
#include <cuda/std/__concepts/concept_macros.h>
2426
#include <cuda/std/__type_traits/common_type.h>
25-
#include <cuda/std/__type_traits/enable_if.h>
2627
#include <cuda/std/__type_traits/is_enum.h>
2728
#include <cuda/std/__type_traits/is_integral.h>
2829
#include <cuda/std/__type_traits/is_signed.h>
29-
#include <cuda/std/__type_traits/is_unsigned.h>
3030
#include <cuda/std/__type_traits/make_unsigned.h>
3131
#include <cuda/std/__type_traits/underlying_type.h>
32+
#include <cuda/std/__utility/to_underlying.h>
3233

3334
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
3435

@@ -37,48 +38,75 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
3738
//! @param __b The divisor
3839
//! @pre \p __a must be non-negative
3940
//! @pre \p __b must be positive
40-
template <class _Tp,
41-
class _Up,
42-
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_unsigned, _Tp), int> = 0,
43-
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
44-
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
41+
_CCCL_TEMPLATE(class _Tp, class _Up)
42+
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
43+
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up>
44+
ceil_div(const _Tp __a, const _Up __b) noexcept
4545
{
46-
_CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
47-
using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
48-
const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b);
49-
return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a)));
46+
_CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive");
47+
48+
if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
49+
{
50+
_CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative");
51+
}
52+
using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>;
53+
using _Prom = decltype(_Tp{} / _Up{});
54+
using _UProm = _CUDA_VSTD::make_unsigned_t<_Prom>;
55+
auto __a1 = static_cast<_UProm>(__a);
56+
auto __b1 = static_cast<_UProm>(__b);
57+
if constexpr (_CUDA_VSTD::is_signed_v<_Prom>)
58+
{
59+
return static_cast<_Common>((__a1 + __b1 - 1) / __b1);
60+
}
61+
else
62+
{
63+
// the ::min method is faster even if __b is a compile-time constant
64+
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
65+
(return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));),
66+
(const auto __res = __a1 / __b1; //
67+
return static_cast<_Common>(__res + (__res * __b1 != __a1));))
68+
}
5069
}
5170

52-
//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder
71+
//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
72+
//! @param __a The dividend
73+
//! @param __b The divisor
74+
//! @pre \p __a must be non-negative
75+
//! @pre \p __b must be positive
76+
_CCCL_TEMPLATE(class _Tp, class _Up)
77+
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
78+
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>>
79+
ceil_div(const _Tp __a, const _Up __b) noexcept
80+
{
81+
return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b));
82+
}
83+
84+
//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
5385
//! @param __a The dividend
5486
//! @param __b The divisor
5587
//! @pre \p __a must be non-negative
5688
//! @pre \p __b must be positive
57-
template <class _Tp,
58-
class _Up,
59-
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_signed, _Tp), int> = 0,
60-
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
61-
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
89+
_CCCL_TEMPLATE(class _Tp, class _Up)
90+
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
91+
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up>
92+
ceil_div(const _Tp __a, const _Up __b) noexcept
6293
{
63-
_CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
64-
_CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
65-
using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
66-
// Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing
67-
return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b));
94+
return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b);
6895
}
6996

7097
//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
7198
//! @param __a The dividend
7299
//! @param __b The divisor
73100
//! @pre \p __a must be non-negative
74101
//! @pre \p __b must be positive
75-
template <class _Tp,
76-
class _Up,
77-
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0,
78-
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up), int> = 0>
79-
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
102+
_CCCL_TEMPLATE(class _Tp, class _Up)
103+
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
104+
_CCCL_NODISCARD
105+
_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>,
106+
_CUDA_VSTD::underlying_type_t<_Up>>
107+
ceil_div(const _Tp __a, const _Up __b) noexcept
80108
{
81-
return ::cuda::ceil_div(__a, static_cast<_CUDA_VSTD::underlying_type_t<_Up>>(__b));
109+
return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
82110
}
83111

84112
_LIBCUDACXX_END_NAMESPACE_CUDA

libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
77
//
88
//===----------------------------------------------------------------------===//
9+
// UNSUPPORTED: c++03, c++11, c++14
910

1011
#include <cuda/cmath>
1112
#include <cuda/std/cassert>
@@ -25,15 +26,15 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test()
2526
constexpr T maxv = cuda::std::numeric_limits<T>::max();
2627

2728
// ensure that we return the right type
28-
static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), T>::value, "");
29-
30-
assert(cuda::ceil_div(T(0), U(1)) == T(0));
31-
assert(cuda::ceil_div(T(1), U(1)) == T(1));
32-
assert(cuda::ceil_div(T(126), U(64)) == T(2));
29+
using Common = _CUDA_VSTD::common_type_t<T, U>;
30+
static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), Common>::value);
31+
assert(cuda::ceil_div(T(0), U(1)) == Common(0));
32+
assert(cuda::ceil_div(T(1), U(1)) == Common(1));
33+
assert(cuda::ceil_div(T(126), U(64)) == Common(2));
3334

3435
// ensure that we are resilient against overflow
3536
assert(cuda::ceil_div(maxv, U(1)) == maxv);
36-
assert(cuda::ceil_div(maxv, maxv) == T(1));
37+
assert(cuda::ceil_div(maxv, maxv) == Common(1));
3738
}
3839

3940
template <class T>

0 commit comments

Comments
 (0)