Skip to content

Commit a716d18

Browse files
bernhardmgruberdavebayer
authored andcommitted
Replace CUB macros in more places (NVIDIA#3930)
No SASS change on cub.test.device_radix_sort_pairs.lid_0 for SM86
1 parent 1ab0406 commit a716d18

18 files changed

+112
-98
lines changed

cub/cub/agent/agent_radix_sort_histogram.cuh

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include <cub/util_type.cuh>
5252

5353
#include <cuda/ptx>
54+
#include <cuda/std/__algorithm_>
5455

5556
CUB_NAMESPACE_BEGIN
5657

@@ -66,7 +67,7 @@ struct AgentRadixSortHistogramPolicy
6667
* ID. However, lanes with the same ID in different warp use the same private
6768
* histogram. This arrangement helps reduce the degree of conflicts in atomic
6869
* operations. */
69-
NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)),
70+
NUM_PARTS = _CUDA_VSTD::max(1, NOMINAL_4B_NUM_PARTS * 4 / _CUDA_VSTD::max(int{sizeof(ComputeT)}, 4)),
7071
RADIX_BITS = _RADIX_BITS,
7172
};
7273
};
@@ -94,16 +95,13 @@ template <typename AgentRadixSortHistogramPolicy,
9495
struct AgentRadixSortHistogram
9596
{
9697
// constants
97-
enum
98-
{
99-
ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD,
100-
BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS,
101-
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
102-
RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS,
103-
RADIX_DIGITS = 1 << RADIX_BITS,
104-
MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS,
105-
NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS,
106-
};
98+
static constexpr int ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD;
99+
static constexpr int BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS;
100+
static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
101+
static constexpr int RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS;
102+
static constexpr int RADIX_DIGITS = 1 << RADIX_BITS;
103+
static constexpr int MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS;
104+
static constexpr int NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS;
107105

108106
using traits = radix::traits_t<KeyT>;
109107
using bit_ordered_type = typename traits::bit_ordered_type;
@@ -210,7 +208,9 @@ struct AgentRadixSortHistogram
210208
#pragma unroll
211209
for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
212210
{
213-
int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit);
211+
// FIXME(bgruber): the following replacement changes SASS for cub.test.device_radix_sort_pairs.lid_0
212+
// const int num_bits = _CUDA_VSTD::min(+RADIX_BITS, end_bit - current_bit);
213+
const int num_bits = CUB_MIN(+RADIX_BITS, end_bit - current_bit);
214214
#pragma unroll
215215
for (int u = 0; u < ITEMS_PER_THREAD; ++u)
216216
{
@@ -258,7 +258,7 @@ struct AgentRadixSortHistogram
258258

259259
// Process the tiles.
260260
OffsetT portion_offset = portion * MAX_PORTION_SIZE;
261-
OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset);
261+
OffsetT portion_size = _CUDA_VSTD::min(MAX_PORTION_SIZE, num_items - portion_offset);
262262
for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
263263
{
264264
OffsetT tile_offset = portion_offset + offset;

cub/cub/agent/agent_radix_sort_upsweep.cuh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#include <cub/warp/warp_reduce.cuh>
5454

5555
#include <cuda/ptx>
56+
#include <cuda/std/__algorithm_>
5657

5758
CUB_NAMESPACE_BEGIN
5859

@@ -160,17 +161,17 @@ struct AgentRadixSortUpsweep
160161
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
161162
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
162163

163-
LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
164+
LOG_COUNTER_LANES = _CUDA_VSTD::max(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
164165
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
165166

166167
// To prevent counter overflow, we must periodically unpack and aggregate the
167168
// digit counters back into registers. Each counter lane is assigned to a
168169
// warp for aggregation.
169170

170-
LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
171+
LANES_PER_WARP = _CUDA_VSTD::max(1, (COUNTER_LANES + WARPS - 1) / WARPS),
171172

172173
// Unroll tiles in batches without risk of counter overflow
173-
UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
174+
UNROLL_COUNT = _CUDA_VSTD::min(64, 255 / KEYS_PER_THREAD),
174175
UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
175176
};
176177

cub/cub/block/block_radix_rank.cuh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include <cub/util_type.cuh>
5050

5151
#include <cuda/ptx>
52+
#include <cuda/std/__algorithm_>
5253
#include <cuda/std/cstdint>
5354
#include <cuda/std/limits>
5455
#include <cuda/std/type_traits>
@@ -242,7 +243,7 @@ private:
242243
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
243244

244245
// Always at least one lane
245-
LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0),
246+
LOG_COUNTER_LANES = _CUDA_VSTD::max(RADIX_BITS - LOG_PACKING_RATIO, 0),
246247
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
247248

248249
// The number of packed counters per thread (plus one for padding)
@@ -254,7 +255,7 @@ public:
254255
enum
255256
{
256257
/// Number of bin-starting offsets tracked per thread
257-
BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
258+
BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
258259
};
259260

260261
private:
@@ -587,7 +588,7 @@ public:
587588
enum
588589
{
589590
/// Number of bin-starting offsets tracked per thread
590-
BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
591+
BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
591592
};
592593

593594
private:

cub/cub/block/block_radix_sort.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
#include <cub/util_ptx.cuh>
5151
#include <cub/util_type.cuh>
5252

53+
#include <cuda/std/__algorithm_>
5354
#include <cuda/std/type_traits>
5455

5556
CUB_NAMESPACE_BEGIN
@@ -431,7 +432,7 @@ private:
431432
// Radix sorting passes
432433
while (true)
433434
{
434-
int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
435+
int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
435436
auto digit_extractor =
436437
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
437438

@@ -510,7 +511,7 @@ public:
510511
// Radix sorting passes
511512
while (true)
512513
{
513-
int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
514+
int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
514515
auto digit_extractor =
515516
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
516517

cub/cub/device/dispatch/dispatch_radix_sort.cuh

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353

5454
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
5555

56+
#include <cuda/std/__algorithm_>
5657
#include <cuda/std/type_traits>
5758

5859
#include <iterator>
@@ -275,7 +276,7 @@ struct DispatchRadixSort
275276
cudaError error = cudaSuccess;
276277
do
277278
{
278-
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
279+
int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, end_bit - current_bit);
279280

280281
// Log upsweep_kernel configuration
281282
#ifdef CUB_DEBUG_LOG
@@ -447,7 +448,7 @@ struct DispatchRadixSort
447448
max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0);
448449

449450
even_share.DispatchInit(
450-
num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
451+
num_items, max_downsweep_grid_size, _CUDA_VSTD::max(downsweep_config.tile_size, upsweep_config.tile_size));
451452

452453
} while (0);
453454
return error;
@@ -472,8 +473,8 @@ struct DispatchRadixSort
472473
constexpr PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
473474
int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS);
474475
OffsetT num_portions = static_cast<OffsetT>(::cuda::ceil_div(num_items, PORTION_SIZE));
475-
PortionOffsetT max_num_blocks =
476-
::cuda::ceil_div(static_cast<int>(CUB_MIN(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
476+
PortionOffsetT max_num_blocks = ::cuda::ceil_div(
477+
static_cast<int>(_CUDA_VSTD::min(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
477478

478479
size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT);
479480
size_t allocation_sizes[] = {
@@ -611,11 +612,11 @@ struct DispatchRadixSort
611612

612613
for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
613614
{
614-
int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS);
615+
int num_bits = _CUDA_VSTD::min(end_bit - current_bit, RADIX_BITS);
615616
for (OffsetT portion = 0; portion < num_portions; ++portion)
616617
{
617-
PortionOffsetT portion_num_items = static_cast<PortionOffsetT>(
618-
CUB_MIN(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));
618+
PortionOffsetT portion_num_items =
619+
static_cast<PortionOffsetT>(_CUDA_VSTD::min(num_items - portion * PORTION_SIZE, OffsetT{PORTION_SIZE}));
619620

620621
PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS);
621622

@@ -777,7 +778,7 @@ struct DispatchRadixSort
777778
}
778779

779780
// Get maximum spine length
780-
int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
781+
int max_grid_size = _CUDA_VSTD::max(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
781782
int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
782783

783784
// Temporary storage allocation requirements
@@ -812,7 +813,7 @@ struct DispatchRadixSort
812813
int num_passes = ::cuda::ceil_div(num_bits, pass_config.radix_bits);
813814
bool is_num_passes_odd = num_passes & 1;
814815
int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
815-
int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
816+
int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
816817

817818
// Alias the temporary storage allocations
818819
OffsetT* d_spine = static_cast<OffsetT*>(allocations[0]);
@@ -1241,7 +1242,7 @@ struct DispatchSegmentedRadixSort
12411242
cudaError error = cudaSuccess;
12421243
do
12431244
{
1244-
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
1245+
int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit));
12451246

12461247
// Log kernel configuration
12471248
#ifdef CUB_DEBUG_LOG
@@ -1381,10 +1382,10 @@ struct DispatchSegmentedRadixSort
13811382
int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
13821383
int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
13831384
int num_bits = end_bit - begin_bit;
1384-
int num_passes = CUB_MAX(::cuda::ceil_div(num_bits, radix_bits), 1);
1385+
int num_passes = _CUDA_VSTD::max(::cuda::ceil_div(num_bits, radix_bits), 1); // num_bits may be zero
13851386
bool is_num_passes_odd = num_passes & 1;
13861387
int max_alt_passes = (num_passes * radix_bits) - num_bits;
1387-
int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
1388+
int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
13881389

13891390
DoubleBuffer<KeyT> d_keys_remaining_passes(
13901391
(is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),

cub/cub/device/dispatch/kernels/radix_sort.cuh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <cub/device/dispatch/dispatch_common.cuh>
2323
#include <cub/grid/grid_even_share.cuh>
2424

25+
#include <cuda/std/__algorithm_>
26+
2527
CUB_NAMESPACE_BEGIN
2628

2729
/******************************************************************************
@@ -98,8 +100,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
98100

99101
enum
100102
{
101-
TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
102-
ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
103+
TILE_ITEMS = _CUDA_VSTD::max(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
104+
ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
103105
};
104106

105107
// Parameterize AgentRadixSortUpsweep type for the current configuration
@@ -258,8 +260,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo
258260

259261
enum
260262
{
261-
TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
262-
ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
263+
TILE_ITEMS = _CUDA_VSTD::max(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD,
264+
ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD)
263265
};
264266

265267
// Parameterize AgentRadixSortDownsweep type for the current configuration

cub/examples/device/example_device_partition_flagged.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
#include <cub/device/device_partition.cuh>
4444
#include <cub/util_allocator.cuh>
4545

46+
#include <cuda/std/limits>
47+
4648
#include "../../test/test_util.h"
4749
#include <stdio.h>
4850

@@ -65,20 +67,18 @@ CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
6567
*/
6668
void Initialize(int* h_in, unsigned char* h_flags, int num_items, int max_segment)
6769
{
68-
unsigned short max_short = (unsigned short) -1;
69-
7070
int key = 0;
7171
int i = 0;
7272
while (i < num_items)
7373
{
7474
// Select number of repeating occurrences
75-
unsigned short repeat;
76-
RandomBits(repeat);
77-
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
78-
repeat = CUB_MAX(1, repeat);
75+
unsigned short bits;
76+
RandomBits(bits);
77+
const int repeat = cuda::std::max(
78+
1, static_cast<int>(bits * (static_cast<float>(max_segment) / cuda::std::numeric_limits<unsigned short>::max())));
7979

8080
int j = i;
81-
while (j < CUB_MIN(i + repeat, num_items))
81+
while (j < cuda::std::min(i + repeat, num_items))
8282
{
8383
h_flags[j] = 0;
8484
h_in[j] = key;

cub/examples/device/example_device_partition_if.cu

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
#include <cub/device/device_partition.cuh>
4444
#include <cub/util_allocator.cuh>
4545

46+
#include <cuda/std/limits>
47+
4648
#include "../../test/test_util.h"
4749
#include <stdio.h>
4850

@@ -84,14 +86,13 @@ void Initialize(int* h_in, int num_items, int max_segment)
8486
while (i < num_items)
8587
{
8688
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
87-
unsigned short max_short = (unsigned short) -1;
88-
unsigned short repeat;
89-
RandomBits(repeat);
90-
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
91-
repeat = CUB_MAX(1, repeat);
89+
unsigned short bits;
90+
RandomBits(bits);
91+
const int repeat = cuda::std::max(
92+
1, static_cast<int>(bits * (static_cast<float>(max_segment) / cuda::std::numeric_limits<unsigned short>::max())));
9293

9394
int j = i;
94-
while (j < CUB_MIN(i + repeat, num_items))
95+
while (j < cuda::std::min(i + repeat, num_items))
9596
{
9697
h_in[j] = key;
9798
j++;

cub/examples/device/example_device_select_flagged.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
#include <cub/device/device_select.cuh>
4444
#include <cub/util_allocator.cuh>
4545

46+
#include <cuda/std/limits>
47+
4648
#include "../../test/test_util.h"
4749
#include <stdio.h>
4850

@@ -65,20 +67,18 @@ CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
6567
*/
6668
void Initialize(int* h_in, unsigned char* h_flags, int num_items, int max_segment)
6769
{
68-
unsigned short max_short = (unsigned short) -1;
69-
7070
int key = 0;
7171
int i = 0;
7272
while (i < num_items)
7373
{
7474
// Select number of repeating occurrences
75-
unsigned short repeat;
76-
RandomBits(repeat);
77-
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
78-
repeat = CUB_MAX(1, repeat);
75+
unsigned short bits;
76+
RandomBits(bits);
77+
const int repeat = cuda::std::max(
78+
1, static_cast<int>(bits * (static_cast<float>(max_segment) / cuda::std::numeric_limits<unsigned short>::max())));
7979

8080
int j = i;
81-
while (j < CUB_MIN(i + repeat, num_items))
81+
while (j < cuda::std::min(i + repeat, num_items))
8282
{
8383
h_flags[j] = 0;
8484
h_in[j] = key;

0 commit comments

Comments
 (0)