davebayer
diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh
Lines changed: 13 additions & 13 deletions
diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh
Lines changed: 4 additions & 3 deletions
diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh
Lines changed: 4 additions & 3 deletions
diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh
Lines changed: 3 additions & 2 deletions
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
Lines changed: 13 additions & 12 deletions
diff --git a/cub/cub/device/dispatch/kernels/radix_sort.cuh b/cub/cub/device/dispatch/kernels/radix_sort.cuh
Lines changed: 6 additions & 4 deletions
diff --git a/cub/examples/device/example_device_partition_flagged.cu b/cub/examples/device/example_device_partition_flagged.cu
Lines changed: 7 additions & 7 deletions
diff --git a/cub/examples/device/example_device_partition_if.cu b/cub/examples/device/example_device_partition_if.cu
Lines changed: 7 additions & 6 deletions
diff --git a/cub/examples/device/example_device_select_flagged.cu b/cub/examples/device/example_device_select_flagged.cu
Lines changed: 7 additions & 7 deletions
@@ -51,6 +51,7 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/ptx>
+#include <cuda/std/__algorithm_>
 
 CUB_NAMESPACE_BEGIN
 
@@ -66,7 +67,7 @@ struct AgentRadixSortHistogramPolicy
      * ID. However, lanes with the same ID in different warp use the same private
      * histogram. This arrangement helps reduce the degree of conflicts in atomic
      * operations. */
-    NUM_PARTS  = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)),
+    NUM_PARTS  = _CUDA_VSTD::max(1, NOMINAL_4B_NUM_PARTS * 4 / _CUDA_VSTD::max(int{sizeof(ComputeT)}, 4)),
     RADIX_BITS = _RADIX_BITS,
   };
 };
@@ -94,16 +95,13 @@ template <typename AgentRadixSortHistogramPolicy,
 struct AgentRadixSortHistogram
 {
   // constants
-  enum
-  {
-    ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD,
-    BLOCK_THREADS    = AgentRadixSortHistogramPolicy::BLOCK_THREADS,
-    TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
-    RADIX_BITS       = AgentRadixSortHistogramPolicy::RADIX_BITS,
-    RADIX_DIGITS     = 1 << RADIX_BITS,
-    MAX_NUM_PASSES   = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS,
-    NUM_PARTS        = AgentRadixSortHistogramPolicy::NUM_PARTS,
-  };
+  static constexpr int ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD;
+  static constexpr int BLOCK_THREADS    = AgentRadixSortHistogramPolicy::BLOCK_THREADS;
+  static constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;
+  static constexpr int RADIX_BITS       = AgentRadixSortHistogramPolicy::RADIX_BITS;
+  static constexpr int RADIX_DIGITS     = 1 << RADIX_BITS;
+  static constexpr int MAX_NUM_PASSES   = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS;
+  static constexpr int NUM_PARTS        = AgentRadixSortHistogramPolicy::NUM_PARTS;
 
   using traits                 = radix::traits_t<KeyT>;
   using bit_ordered_type       = typename traits::bit_ordered_type;
@@ -210,7 +208,9 @@ struct AgentRadixSortHistogram
 #pragma unroll
     for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
     {
-      int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit);
+      // FIXME(bgruber): the following replacement changes SASS for cub.test.device_radix_sort_pairs.lid_0
+      // const int num_bits = _CUDA_VSTD::min(+RADIX_BITS, end_bit - current_bit);
+      const int num_bits = CUB_MIN(+RADIX_BITS, end_bit - current_bit);
 #pragma unroll
       for (int u = 0; u < ITEMS_PER_THREAD; ++u)
       {
@@ -258,7 +258,7 @@ struct AgentRadixSortHistogram
 
       // Process the tiles.
       OffsetT portion_offset = portion * MAX_PORTION_SIZE;
-      OffsetT portion_size   = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset);
+      OffsetT portion_size   = _CUDA_VSTD::min(MAX_PORTION_SIZE, num_items - portion_offset);
       for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
       {
         OffsetT tile_offset = portion_offset + offset;
 
@@ -53,6 +53,7 @@
 #include <cub/warp/warp_reduce.cuh>
 
 #include <cuda/ptx>
+#include <cuda/std/__algorithm_>
 
 CUB_NAMESPACE_BEGIN
 
@@ -160,17 +161,17 @@ struct AgentRadixSortUpsweep
     PACKING_RATIO     = sizeof(PackedCounter) / sizeof(DigitCounter),
     LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
 
-    LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
+    LOG_COUNTER_LANES = _CUDA_VSTD::max(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
     COUNTER_LANES     = 1 << LOG_COUNTER_LANES,
 
     // To prevent counter overflow, we must periodically unpack and aggregate the
     // digit counters back into registers.  Each counter lane is assigned to a
     // warp for aggregation.
 
-    LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+    LANES_PER_WARP = _CUDA_VSTD::max(1, (COUNTER_LANES + WARPS - 1) / WARPS),
 
     // Unroll tiles in batches without risk of counter overflow
-    UNROLL_COUNT      = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+    UNROLL_COUNT      = _CUDA_VSTD::min(64, 255 / KEYS_PER_THREAD),
     UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
   };
 
 
@@ -49,6 +49,7 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/ptx>
+#include <cuda/std/__algorithm_>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
@@ -242,7 +243,7 @@ private:
     LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
 
     // Always at least one lane
-    LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0),
+    LOG_COUNTER_LANES = _CUDA_VSTD::max(RADIX_BITS - LOG_PACKING_RATIO, 0),
     COUNTER_LANES     = 1 << LOG_COUNTER_LANES,
 
     // The number of packed counters per thread (plus one for padding)
@@ -254,7 +255,7 @@ public:
   enum
   {
     /// Number of bin-starting offsets tracked per thread
-    BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
   };
 
 private:
@@ -587,7 +588,7 @@ public:
   enum
   {
     /// Number of bin-starting offsets tracked per thread
-    BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
   };
 
 private:
 
@@ -50,6 +50,7 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/type_traits>
 
 CUB_NAMESPACE_BEGIN
@@ -431,7 +432,7 @@ private:
     // Radix sorting passes
     while (true)
     {
-      int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+      int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
       auto digit_extractor =
         traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
 
@@ -510,7 +511,7 @@ public:
     // Radix sorting passes
     while (true)
     {
-      int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+      int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
       auto digit_extractor =
         traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
 
 
@@ -53,6 +53,7 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/type_traits>
 
 #include <iterator>
@@ -275,7 +276,7 @@ struct DispatchRadixSort
     cudaError error = cudaSuccess;
     do
     {
-      int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+      int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, end_bit - current_bit);
 
 // Log upsweep_kernel configuration
 #ifdef CUB_DEBUG_LOG
@@ -447,7 +448,7 @@ struct DispatchRadixSort
         max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0);
 
         even_share.DispatchInit(
-          num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+          num_items, max_downsweep_grid_size, _CUDA_VSTD::max(downsweep_config.tile_size, upsweep_config.tile_size));
 
       } while (0);
       return error;
@@ -472,8 +473,8 @@ struct DispatchRadixSort
     constexpr PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
     int num_passes                        = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS);
     OffsetT num_portions                  = static_cast<OffsetT>(::cuda::ceil_div(num_items, PORTION_SIZE));
-    PortionOffsetT max_num_blocks =
-      ::cuda::ceil_div(static_cast<int>(CUB_MIN(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
+    PortionOffsetT max_num_blocks         = ::cuda::ceil_div(
+      static_cast<int>(_CUDA_VSTD::min(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
 
     size_t value_size         = KEYS_ONLY ? 0 : sizeof(ValueT);
     size_t allocation_sizes[] = {
@@ -611,11 +612,11 @@ struct DispatchRadixSort
 
       for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
       {
-        int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS);
+        int num_bits = _CUDA_VSTD::min(end_bit - current_bit, RADIX_BITS);
         for (OffsetT portion = 0; portion < num_portions; ++portion)
         {
-          PortionOffsetT portion_num_items = static_cast<PortionOffsetT>(
-            CUB_MIN(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));
+          PortionOffsetT portion_num_items =
+            static_cast<PortionOffsetT>(_CUDA_VSTD::min(num_items - portion * PORTION_SIZE, OffsetT{PORTION_SIZE}));
 
           PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS);
 
@@ -777,7 +778,7 @@ struct DispatchRadixSort
       }
 
       // Get maximum spine length
-      int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+      int max_grid_size = _CUDA_VSTD::max(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
       int spine_length  = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
 
       // Temporary storage allocation requirements
@@ -812,7 +813,7 @@ struct DispatchRadixSort
       int num_passes         = ::cuda::ceil_div(num_bits, pass_config.radix_bits);
       bool is_num_passes_odd = num_passes & 1;
       int max_alt_passes     = (num_passes * pass_config.radix_bits) - num_bits;
-      int alt_end_bit        = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+      int alt_end_bit        = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
 
       // Alias the temporary storage allocations
       OffsetT* d_spine = static_cast<OffsetT*>(allocations[0]);
@@ -1241,7 +1242,7 @@ struct DispatchSegmentedRadixSort
     cudaError error = cudaSuccess;
     do
     {
-      int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+      int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit));
 
 // Log kernel configuration
 #ifdef CUB_DEBUG_LOG
@@ -1381,10 +1382,10 @@ struct DispatchSegmentedRadixSort
       int radix_bits         = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
       int alt_radix_bits     = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
       int num_bits           = end_bit - begin_bit;
-      int num_passes         = CUB_MAX(::cuda::ceil_div(num_bits, radix_bits), 1);
+      int num_passes         = _CUDA_VSTD::max(::cuda::ceil_div(num_bits, radix_bits), 1); // num_bits may be zero
       bool is_num_passes_odd = num_passes & 1;
       int max_alt_passes     = (num_passes * radix_bits) - num_bits;
-      int alt_end_bit        = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+      int alt_end_bit        = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
 
       DoubleBuffer<KeyT> d_keys_remaining_passes(
         (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
 
@@ -22,6 +22,8 @@
 #include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/grid/grid_even_share.cuh>
 
+#include <cuda/std/__algorithm_>
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -98,8 +100,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
 
   enum
   {
-    TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
-                         ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
+    TILE_ITEMS = _CUDA_VSTD::max(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
+                                 ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
   };
 
   // Parameterize AgentRadixSortUpsweep type for the current configuration
@@ -258,8 +260,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo
 
   enum
   {
-    TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
-                         ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
+    TILE_ITEMS = _CUDA_VSTD::max(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
+                                 ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
   };
 
   // Parameterize AgentRadixSortDownsweep type for the current configuration
 
@@ -43,6 +43,8 @@
 #include <cub/device/device_partition.cuh>
 #include <cub/util_allocator.cuh>
 
+#include <cuda/std/limits>
+
 #include "../../test/test_util.h"
 #include <stdio.h>
 
@@ -65,20 +67,18 @@ CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
  */
 void Initialize(int* h_in, unsigned char* h_flags, int num_items, int max_segment)
 {
-  unsigned short max_short = (unsigned short) -1;
-
   int key = 0;
   int i   = 0;
   while (i < num_items)
   {
     // Select number of repeating occurrences
-    unsigned short repeat;
-    RandomBits(repeat);
-    repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-    repeat = CUB_MAX(1, repeat);
+    unsigned short bits;
+    RandomBits(bits);
+    const int repeat = cuda::std::max(
+      1, static_cast<int>(bits * (static_cast<float>(max_segment) / cuda::std::numeric_limits<unsigned short>::max())));
 
     int j = i;
-    while (j < CUB_MIN(i + repeat, num_items))
+    while (j < cuda::std::min(i + repeat, num_items))
     {
       h_flags[j] = 0;
       h_in[j]    = key;
 
@@ -43,6 +43,8 @@
 #include <cub/device/device_partition.cuh>
 #include <cub/util_allocator.cuh>
 
+#include <cuda/std/limits>
+
 #include "../../test/test_util.h"
 #include <stdio.h>
 
@@ -84,14 +86,13 @@ void Initialize(int* h_in, int num_items, int max_segment)
   while (i < num_items)
   {
     // Randomly select number of repeating occurrences uniformly from [1..max_segment]
-    unsigned short max_short = (unsigned short) -1;
-    unsigned short repeat;
-    RandomBits(repeat);
-    repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-    repeat = CUB_MAX(1, repeat);
+    unsigned short bits;
+    RandomBits(bits);
+    const int repeat = cuda::std::max(
+      1, static_cast<int>(bits * (static_cast<float>(max_segment) / cuda::std::numeric_limits<unsigned short>::max())));
 
     int j = i;
-    while (j < CUB_MIN(i + repeat, num_items))
+    while (j < cuda::std::min(i + repeat, num_items))
     {
       h_in[j] = key;
       j++;
 
@@ -43,6 +43,8 @@
 #include <cub/device/device_select.cuh>
 #include <cub/util_allocator.cuh>
 
+#include <cuda/std/limits>
+
 #include "../../test/test_util.h"
 #include <stdio.h>
 
@@ -65,20 +67,18 @@ CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
  */
 void Initialize(int* h_in, unsigned char* h_flags, int num_items, int max_segment)
 {
-  unsigned short max_short = (unsigned short) -1;
-
   int key = 0;
   int i   = 0;
   while (i < num_items)
   {
     // Select number of repeating occurrences
-    unsigned short repeat;
-    RandomBits(repeat);
-    repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-    repeat = CUB_MAX(1, repeat);
+    unsigned short bits;
+    RandomBits(bits);
+    const int repeat = cuda::std::max(
+      1, static_cast<int>(bits * (static_cast<float>(max_segment) / cuda::std::numeric_limits<unsigned short>::max())));
 
     int j = i;
-    while (j < CUB_MIN(i + repeat, num_items))
+    while (j < cuda::std::min(i + repeat, num_items))
     {
       h_flags[j] = 0;
       h_in[j]    = key;