Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions docs/libcudacxx/extended_api/math/ceil_div.rst
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
.. _libcudacxx-extended-api-math-ceil-div:

``ceil_div`` Ceiling Division
=============================
Math
=====

.. code:: cuda

template <typename T, typename = U>
[[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;

``value``: The value to be divided.
``divisor``: The divisor.
ceil_div
---------

- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.

.. note::
**Performance considerations**

The function is only constexpr from C++14 onwards
- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.

**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:

Expand Down
84 changes: 56 additions & 28 deletions libcudacxx/include/cuda/__cmath/ceil_div.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
# pragma system_header
#endif // no system header

#include <cuda/std/__algorithm/min.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_enum.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_signed.h>
#include <cuda/std/__type_traits/is_unsigned.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__type_traits/underlying_type.h>
#include <cuda/std/__utility/to_underlying.h>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

Expand All @@ -37,48 +38,75 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
//! @param __b The divisor
//! @pre \p __a must be non-negative
//! @pre \p __b must be positive
template <class _Tp,
class _Up,
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_unsigned, _Tp), int> = 0,
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
_CCCL_TEMPLATE(class _Tp, class _Up)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up>
ceil_div(const _Tp __a, const _Up __b) noexcept
{
_CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
const auto __res = static_cast<_UCommon>(__a) / static_cast<_UCommon>(__b);
return static_cast<_Tp>(__res + (__res * static_cast<_UCommon>(__b) != static_cast<_UCommon>(__a)));
_CCCL_ASSERT(__b > _Up{0}, "cuda::ceil_div: 'b' must be positive");

if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
{
_CCCL_ASSERT(__a >= _Tp{0}, "cuda::ceil_div: 'a' must be non negative");
}
using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>;
using _Prom = decltype(_Tp{} / _Up{});
using _UProm = _CUDA_VSTD::make_unsigned_t<_Prom>;
auto __a1 = static_cast<_UProm>(__a);
auto __b1 = static_cast<_UProm>(__b);
if constexpr (_CUDA_VSTD::is_signed_v<_Prom>)
{
return static_cast<_Common>((__a1 + __b1 - 1) / __b1);
}
else
{
// the ::min method is faster even if __b is a compile-time constant
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
(return static_cast<_Common>(_CUDA_VSTD::min(__a1, 1 + ((__a1 - 1) / __b1)));),
(const auto __res = __a1 / __b1; //
return static_cast<_Common>(__res + (__res * __b1 != __a1));))
}
}

//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder
//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
//! @param __a The dividend
//! @param __b The divisor
//! @pre \p __a must be non-negative
//! @pre \p __b must be positive
_CCCL_TEMPLATE(class _Tp, class _Up)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>>
ceil_div(const _Tp __a, const _Up __b) noexcept
{
return ::cuda::ceil_div(__a, _CUDA_VSTD::to_underlying(__b));
}

//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
//! @param __a The dividend
//! @param __b The divisor
//! @pre \p __a must be non-negative
//! @pre \p __b must be positive
template <class _Tp,
class _Up,
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_signed, _Tp), int> = 0,
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
_CCCL_TEMPLATE(class _Tp, class _Up)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up>
ceil_div(const _Tp __a, const _Up __b) noexcept
{
_CCCL_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
_CCCL_ASSERT(__b > _Up(0), "cuda::ceil_div: b must be positive");
using _UCommon = _CUDA_VSTD::make_unsigned_t<_CUDA_VSTD::common_type_t<_Tp, _Up>>;
// Due to the precondition `__a >= 0` we can safely cast to unsigned without danger of overflowing
return static_cast<_Tp>((static_cast<_UCommon>(__a) + static_cast<_UCommon>(__b) - 1) / static_cast<_UCommon>(__b));
return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), __b);
}

//! @brief Divides two numbers \p __a and \p __b, rounding up if there is a remainder, \p __b is an enum
//! @param __a The dividend
//! @param __b The divisor
//! @pre \p __a must be non-negative
//! @pre \p __b must be positive
template <class _Tp,
class _Up,
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0,
_CUDA_VSTD::enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Up __b) noexcept
_CCCL_TEMPLATE(class _Tp, class _Up)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
_CCCL_NODISCARD
_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>,
_CUDA_VSTD::underlying_type_t<_Up>>
ceil_div(const _Tp __a, const _Up __b) noexcept
{
return ::cuda::ceil_div(__a, static_cast<_CUDA_VSTD::underlying_type_t<_Up>>(__b));
return ::cuda::ceil_div(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
}

_LIBCUDACXX_END_NAMESPACE_CUDA
Expand Down
13 changes: 7 additions & 6 deletions libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14

#include <cuda/cmath>
#include <cuda/std/cassert>
Expand All @@ -25,15 +26,15 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 void test()
constexpr T maxv = cuda::std::numeric_limits<T>::max();

// ensure that we return the right type
static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), T>::value, "");

assert(cuda::ceil_div(T(0), U(1)) == T(0));
assert(cuda::ceil_div(T(1), U(1)) == T(1));
assert(cuda::ceil_div(T(126), U(64)) == T(2));
using Common = _CUDA_VSTD::common_type_t<T, U>;
static_assert(cuda::std::is_same<decltype(cuda::ceil_div(T(0), U(1))), Common>::value);
assert(cuda::ceil_div(T(0), U(1)) == Common(0));
assert(cuda::ceil_div(T(1), U(1)) == Common(1));
assert(cuda::ceil_div(T(126), U(64)) == Common(2));

// ensure that we are resilient against overflow
assert(cuda::ceil_div(maxv, U(1)) == maxv);
assert(cuda::ceil_div(maxv, maxv) == T(1));
assert(cuda::ceil_div(maxv, maxv) == Common(1));
}

template <class T>
Expand Down
Loading