This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Support reduction for more than 2^31 items #589

Merged
67 changes: 44 additions & 23 deletions cub/device/device_reduce.cuh
@@ -38,6 +38,7 @@
#include <limits>

#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
@@ -158,6 +159,8 @@ struct DeviceReduce
* **[inferred]** Data element type that is convertible to the `value` type
* of `InputIteratorT`
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -188,18 +191,19 @@
template <typename InputIteratorT,
typename OutputIteratorT,
typename ReductionOpT,
typename T>
typename T,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
ReductionOpT reduction_op,
T init,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchReduce<InputIteratorT,
OutputIteratorT,
@@ -209,7 +213,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
reduction_op,
init,
stream);
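
The new offset selection hinges on `detail::ChooseOffsetT`, included above from `cub/detail/choose_offset.cuh`. As a rough sketch (the exact definition in that header may differ), a trait of this shape keeps 32-bit offsets for small `num_items` types and widens to 64-bit ones otherwise:

// Sketch only -- the real trait lives in cub/detail/choose_offset.cuh and may differ.
// Integral num_items types of 4 bytes or fewer keep 32-bit offsets (the old behavior);
// wider types such as std::size_t switch the dispatch to 64-bit offsets.
#include <cstdint>
#include <type_traits>

namespace detail
{
template <typename NumItemsT>
struct ChooseOffsetT
{
  static_assert(std::is_integral<NumItemsT>::value,
                "num_items must have an integral type");

  using Type = typename std::conditional<(sizeof(NumItemsT) <= 4),
                                         std::uint32_t,
                                         unsigned long long>::type;
};
} // namespace detail

With a trait like this, the `static_cast<OffsetT>(num_items)` below is lossless whenever the caller's type is no wider than the chosen offset type, and existing `int` callers keep compiling against 32-bit offsets.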
@@ -303,6 +307,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -324,16 +330,18 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0)
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The output value type
using OutputT =
@@ -350,7 +358,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Sum(),
InitT{}, // zero-initialize
stream);
@@ -429,6 +437,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -450,16 +460,18 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
@@ -474,7 +486,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Min(),
// replace with
// std::numeric_limits<T>::max() when
@@ -583,7 +595,8 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
@@ -707,6 +720,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand All @@ -728,16 +743,18 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
@@ -752,7 +769,7 @@
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Max(),
// replace with
// std::numeric_limits<T>::lowest()
@@ -863,7 +880,8 @@
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
@@ -1054,6 +1072,8 @@ struct DeviceReduce
* **[inferred]** Binary reduction functor type having member
* `T operator()(const T &a, const T &b)`
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
@@ -1095,7 +1115,8 @@
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT>
typename ReductionOpT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
ReduceByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
@@ -1105,11 +1126,11 @@
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// FlagT iterator type (not used)

@@ -1134,7 +1155,7 @@
d_num_runs_out,
EqualityOp(),
reduction_op,
num_items,
static_cast<OffsetT>(num_items),
stream);
}
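
From the caller's side, opting into the 64-bit path only requires passing a 64-bit `num_items` type. A minimal usage sketch (assuming `d_in` and `d_out` are valid device pointers sized for more than 2^31 input elements):

// Usage sketch only; d_in and d_out are assumed to be valid device allocations.
std::size_t num_items = std::size_t{1} << 32; // more than INT_MAX items

void        *d_temp_storage     = nullptr;
std::size_t  temp_storage_bytes = 0;

// First call: query the required temporary storage size.
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                       d_in, d_out, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Second call: run the reduction; the offset type is deduced as 64-bit.
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                       d_in, d_out, num_items);
cudaFree(d_temp_storage);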

46 changes: 40 additions & 6 deletions test/test_device_reduce.cu
@@ -33,25 +33,25 @@
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <nv/target>

#include "test_util.h"

#include <cstdio>
#include <limits>
#include <typeinfo>

#include "test_util.h"
#include <nv/target>

using namespace cub;

@@ -1333,6 +1333,39 @@ __global__ void InitializeTestAccumulatorTypes(int num_items,
}
}

template <typename T>
void TestBigIndicesHelper(int magnitude)
{
const std::size_t num_items = 1ll << magnitude;
thrust::constant_iterator<T> const_iter(T{1});
thrust::device_vector<std::size_t> out(1);
std::size_t* d_out = thrust::raw_pointer_cast(out.data());

std::uint8_t *d_temp_storage{};
std::size_t temp_storage_bytes{};

CubDebugExit(
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items));

thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

CubDebugExit(
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items));
std::size_t result = out[0];

AssertEquals(result, num_items);
}

template <typename T>
void TestBigIndices()
{
TestBigIndicesHelper<T>(30);
TestBigIndicesHelper<T>(31);
TestBigIndicesHelper<T>(32);
TestBigIndicesHelper<T>(33);
}

void TestAccumulatorTypes()
{
const int num_items = 2 * 1024 * 1024;
@@ -1491,6 +1524,7 @@ int main(int argc, char** argv)
TestType<TestBar, TestBar>(max_items, max_segments);

TestAccumulatorTypes();
TestBigIndices<std::size_t>();
#endif
printf("\n");
return 0;