This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Support reduction for more than 2^31 items #589

Merged
67 changes: 44 additions & 23 deletions cub/device/device_reduce.cuh
@@ -38,6 +38,7 @@
#include <limits>

#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
@@ -158,6 +159,8 @@ struct DeviceReduce
* **[inferred]** Data element type that is convertible to the `value` type
* of `InputIteratorT`
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -188,18 +191,19 @@
template <typename InputIteratorT,
typename OutputIteratorT,
typename ReductionOpT,
typename T>
typename T,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
ReductionOpT reduction_op,
T init,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchReduce<InputIteratorT,
OutputIteratorT,
@@ -209,7 +213,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
reduction_op,
init,
stream);
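
The new offset selection hinges on `detail::ChooseOffsetT`, included above from `cub/detail/choose_offset.cuh`. As a rough sketch (the exact definition in that header may differ), a trait of this shape keeps 32-bit offsets for small `num_items` types and widens to 64-bit ones otherwise:

// Sketch only -- the real trait lives in cub/detail/choose_offset.cuh and may differ.
// Integral num_items types of 4 bytes or fewer keep 32-bit offsets (the old behavior);
// wider types such as std::size_t switch the dispatch to 64-bit offsets.
#include <cstdint>
#include <type_traits>

namespace detail
{
template <typename NumItemsT>
struct ChooseOffsetT
{
  static_assert(std::is_integral<NumItemsT>::value,
                "num_items must have an integral type");

  using Type = typename std::conditional<(sizeof(NumItemsT) <= 4),
                                         std::uint32_t,
                                         unsigned long long>::type;
};
} // namespace detail

With a trait like this, the `static_cast<OffsetT>(num_items)` below is lossless whenever the caller's type is no wider than the chosen offset type, and existing `int` callers keep compiling against 32-bit offsets.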
@@ -303,6 +307,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -324,16 +330,18 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0)
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The output value type
using OutputT =
@@ -350,7 +358,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Sum(),
InitT{}, // zero-initialize
stream);
@@ -429,6 +437,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -450,16 +460,18 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
@@ -474,7 +486,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Min(),
// replace with
// std::numeric_limits<T>::max() when
@@ -583,7 +595,8 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
@@ -707,6 +720,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand All @@ -728,16 +743,18 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
@@ -752,7 +769,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Max(),
// replace with
// std::numeric_limits<T>::lowest()
@@ -863,7 +880,8 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
@@ -1054,6 +1072,8 @@ struct DeviceReduce
* **[inferred]** Binary reduction functor type having member
* `T operator()(const T &a, const T &b)`
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -1095,7 +1115,8 @@
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT>
typename ReductionOpT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
ReduceByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
@@ -1105,11 +1126,11 @@
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// FlagT iterator type (not used)

@@ -1134,7 +1155,7 @@
d_num_runs_out,
EqualityOp(),
reduction_op,
num_items,
static_cast<OffsetT>(num_items),
stream);
}
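
From the caller's side, opting into the 64-bit path only requires passing a 64-bit `num_items` type. A minimal usage sketch (assuming `d_in` and `d_out` are valid device pointers sized for more than 2^31 input elements):

// Usage sketch only; d_in and d_out are assumed to be valid device allocations.
std::size_t num_items = std::size_t{1} << 32; // more than INT_MAX items

void        *d_temp_storage     = nullptr;
std::size_t  temp_storage_bytes = 0;

// First call: query the required temporary storage size.
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                       d_in, d_out, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Second call: run the reduction; the offset type is deduced as 64-bit.
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                       d_in, d_out, num_items);
cudaFree(d_temp_storage);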

46 changes: 40 additions & 6 deletions test/test_device_reduce.cu
@@ -33,25 +33,25 @@
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <nv/target>

#include "test_util.h"

#include <cstdio>
#include <limits>
#include <typeinfo>

#include "test_util.h"
#include <nv/target>

using namespace cub;

@@ -1333,6 +1333,39 @@ __global__ void InitializeTestAccumulatorTypes(int num_items,
}
}

template <typename T>
void TestBigIndicesHelper(int magnitude)
{
const std::size_t num_items = 1ll << magnitude;
thrust::constant_iterator<T> const_iter(T{1});
thrust::device_vector<std::size_t> out(1);
std::size_t* d_out = thrust::raw_pointer_cast(out.data());

std::uint8_t *d_temp_storage{};
std::size_t temp_storage_bytes{};

CubDebugExit(
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items));

thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

CubDebugExit(
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items));
std::size_t result = out[0];

AssertEquals(result, num_items);
}

template <typename T>
void TestBigIndices()
{
TestBigIndicesHelper<T>(30);
TestBigIndicesHelper<T>(31);
TestBigIndicesHelper<T>(32);
TestBigIndicesHelper<T>(33);
}

void TestAccumulatorTypes()
{
const int num_items = 2 * 1024 * 1024;
@@ -1491,6 +1524,7 @@ int main(int argc, char** argv)
TestType<TestBar, TestBar>(max_items, max_segments);

TestAccumulatorTypes();
TestBigIndices<std::size_t>();
#endif
printf("\n");
return 0;