Adds support for large number of segments to DeviceSegmentedReduce (NVIDIA#3764)

elstehle · davebayer · commit ecbe8f84f6ea · 2025-04-07T10:04:48.000+02:00
* add support for large num segments on device level

* adds support for large number of segments on dispatch

* refactors offset iterator

* add tests for large number of segments

* fixes style

* renames offset iterator to snake case

* rely on ctad instead of factory function

* adds tests for more device interfaces

* use offset_input_iterator where applicable

* [skip-ci] addresses review comments

* fixes msvc implicit conversion warning

* drops debug print utilities

* removes argmin/max wrappers

* fixes style

* fixes include order

* fixes nvrtc

* expects user iterators to be advanceable on the host

* drops redundant include

* adds workaround for c.parallel indirect_arg_t

* adds todo

* uses cuda::std traits

* adds missing exec space specifiers
diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh
@@ -91,7 +91,7 @@ private:
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     ReductionOpT reduction_op,
@@ -112,7 +112,7 @@ private:
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     ReductionOpT reduction_op,
@@ -243,7 +243,7 @@ public:
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     ReductionOpT reduction_op,
@@ -355,7 +355,7 @@ public:
       size_t& temp_storage_bytes,
       InputIteratorT d_in,
       OutputIteratorT d_out,
-      int num_segments,
+      ::cuda::std::int64_t num_segments,
       BeginOffsetIteratorT d_begin_offsets,
       EndOffsetIteratorT d_end_offsets,
       cudaStream_t stream = 0)
@@ -478,7 +478,7 @@ public:
       size_t& temp_storage_bytes,
       InputIteratorT d_in,
       OutputIteratorT d_out,
-      int num_segments,
+      ::cuda::std::int64_t num_segments,
       BeginOffsetIteratorT d_begin_offsets,
       EndOffsetIteratorT d_end_offsets,
       cudaStream_t stream = 0)
@@ -605,7 +605,7 @@ public:
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     cudaStream_t stream = 0)
@@ -744,7 +744,7 @@ public:
       size_t& temp_storage_bytes,
       InputIteratorT d_in,
       OutputIteratorT d_out,
-      int num_segments,
+      ::cuda::std::int64_t num_segments,
       BeginOffsetIteratorT d_begin_offsets,
       EndOffsetIteratorT d_end_offsets,
       cudaStream_t stream = 0)
@@ -869,7 +869,7 @@ public:
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     cudaStream_t stream = 0)
diff --git a/cub/cub/device/dispatch/dispatch_common.cuh b/cub/cub/device/dispatch/dispatch_common.cuh
@@ -5,6 +5,10 @@
 
 #include <cub/config.cuh>
 
+#include <cuda/std/type_traits>
+
+#include "cuda/std/__cccl/execution_space.h"
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -40,4 +44,51 @@ enum class SelectImpl
   Partition
 };
 
+namespace detail
+{
+// Trait reporting whether `Lhs + Rhs` is a well-formed expression; this primary template is the "no" answer.
+template <typename Lhs, typename Rhs, typename = void>
+struct has_plus_operator : ::cuda::std::false_type
+{};
+
+// Selected through void_t whenever `declval<Lhs>() + declval<Rhs>()` compiles; this is the "yes" answer.
+template <typename Lhs, typename Rhs>
+struct has_plus_operator<Lhs,
+                         Rhs,
+                         ::cuda::std::void_t<decltype(::cuda::std::declval<Lhs>() + ::cuda::std::declval<Rhs>())>>
+    : ::cuda::std::true_type
+{};
+
+// Shorthand for has_plus_operator<Lhs, Rhs>::value.
+template <typename Lhs, typename Rhs>
+constexpr bool has_plus_operator_v = has_plus_operator<Lhs, Rhs>::value;
+
+// Moves `iter` forward by `offset` when `iter + offset` is a valid expression; iterators without such an operator+
+// are handed back untouched so that the call still compiles for them.
+template <typename IteratorT, typename OffsetT>
+_CCCL_HOST_DEVICE IteratorT advance_iterators_if_supported(IteratorT iter, OffsetT offset)
+{
+  if constexpr (!has_plus_operator_v<IteratorT, OffsetT>)
+  {
+    // No usable operator+: mark `offset` as intentionally unused and return the iterator as-is.
+    static_cast<void>(offset);
+    return iter;
+  }
+  else
+  {
+    // operator+ exists: return the advanced iterator.
+    return iter + offset;
+  }
+}
+
+// Reports whether every iterator type in the pack can be advanced with operator+ by an OffsetT. Only the argument
+// types matter, the values are never read.
+template <typename OffsetT, typename... Iterators>
+_CCCL_HOST_DEVICE bool all_iterators_support_plus_operator(OffsetT /*offset*/, Iterators... /*iters*/)
+{
+  return (has_plus_operator_v<Iterators, OffsetT> && ...);
+}
+
+} // namespace detail
+
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -46,6 +46,7 @@
 
 #include <cub/detail/launcher/cuda_runtime.cuh>
 #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
+#include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/device/dispatch/kernels/reduce.cuh>
 #include <cub/device/dispatch/kernels/segmented_reduce.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
@@ -706,7 +707,7 @@ struct DispatchSegmentedReduce
   OutputIteratorT d_out;
 
   /// The number of segments that comprise the input data
-  int num_segments;
+  ::cuda::std::int64_t num_segments;
 
   /// Random-access input iterator to the sequence of beginning offsets of
   /// length `num_segments`, such that `d_begin_offsets[i]` is the first
@@ -747,7 +748,7 @@ struct DispatchSegmentedReduce
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     ReductionOpT reduction_op,
@@ -813,33 +814,61 @@ struct DispatchSegmentedReduce
         break;
       }
 
-// Log device_reduce_sweep_kernel configuration
-#ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), "
-              "%d items per thread, %d SM occupancy\n",
-              num_segments,
-              policy.SegmentedReduce().BlockThreads(),
-              (long long) stream,
-              policy.SegmentedReduce().ItemsPerThread(),
-              segmented_reduce_config.sm_occupancy);
-#endif // CUB_DEBUG_LOG
-
-      // Invoke DeviceReduceKernel
-      launcher_factory(num_segments, policy.SegmentedReduce().BlockThreads(), 0, stream)
-        .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init);
+      const auto num_segments_per_invocation =
+        static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
+      const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
 
-      // Check for failure to launch
-      error = CubDebug(cudaPeekAtLastError());
-      if (cudaSuccess != error)
+      // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
+      // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
+      // indirect_arg_t as the iterator type, which does not support the + operator.
+      // TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
+      if (num_invocations > 1
+          && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
       {
-        break;
+        return cudaErrorInvalidValue;
       }
 
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
+      for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
       {
-        break;
+        const auto current_seg_offset = invocation_index * num_segments_per_invocation;
+        const auto num_current_segments =
+          ::cuda::std::min(num_segments_per_invocation, num_segments - current_seg_offset);
+
+// Log the segmented reduce kernel launch configuration for the current invocation
+#ifdef CUB_DEBUG_LOG
+        _CubLog("Invoking SegmentedDeviceReduceKernel<<<%lld, %d, 0, %lld>>>(), "
+                "%d items per thread, %d SM occupancy\n",
+                static_cast<long long>(num_current_segments), // int64_t is not `long` on every platform; match %lld
+                policy.SegmentedReduce().BlockThreads(),
+                (long long) stream,
+                policy.SegmentedReduce().ItemsPerThread(),
+                segmented_reduce_config.sm_occupancy);
+#endif // CUB_DEBUG_LOG
+
+        // Invoke DeviceReduceKernel
+        launcher_factory(
+          static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream)
+          .doit(segmented_reduce_kernel,
+                d_in,
+                detail::advance_iterators_if_supported(d_out, current_seg_offset),
+                detail::advance_iterators_if_supported(d_begin_offsets, current_seg_offset),
+                detail::advance_iterators_if_supported(d_end_offsets, current_seg_offset),
+                reduction_op,
+                init);
+
+        // Check for failure to launch
+        error = CubDebug(cudaPeekAtLastError());
+        if (cudaSuccess != error)
+        {
+          break;
+        }
+
+        // Sync the stream if specified to flush runtime errors
+        error = CubDebug(detail::DebugSyncStream(stream));
+        if (cudaSuccess != error)
+        {
+          break;
+        }
       }
     } while (0);
 
@@ -908,7 +937,7 @@ struct DispatchSegmentedReduce
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_segments,
+    ::cuda::std::int64_t num_segments,
     BeginOffsetIteratorT d_begin_offsets,
     EndOffsetIteratorT d_end_offsets,
     ReductionOpT reduction_op,
diff --git a/cub/cub/device/dispatch/kernels/segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/segmented_reduce.cuh
@@ -132,7 +132,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   OutputIteratorT d_out,
   BeginOffsetIteratorT d_begin_offsets,
   EndOffsetIteratorT d_end_offsets,
-  int /*num_segments*/,
   ReductionOpT reduction_op,
   InitT init)
 {
diff --git a/cub/test/catch2_test_device_segmented_reduce_large_offsets.cu b/cub/test/catch2_test_device_segmented_reduce_large_offsets.cu

Original file line number	Diff line number	Diff line change
`@@ -132,7 +132,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)`
`132`	`132`	`OutputIteratorT d_out,`
`133`	`133`	`BeginOffsetIteratorT d_begin_offsets,`
`134`	`134`	`EndOffsetIteratorT d_end_offsets,`
`135`		`- int /*num_segments*/,`
`136`	`135`	`ReductionOpT reduction_op,`
`137`	`136`	`InitT init)`
`138`	`137`	`{`