Skip to content

Commit 6ad5a2a

Browse files
elstehledavebayer
authored andcommitted
Adds support for large number of items and large number of segments to DeviceSegmentedSort (NVIDIA#3308)
* fixes segment offset generation * switches to analytical verification * switches to analytical verification for pairs * addresses review comments * introduces segment offset type * adds tests for large number of segments * adds support for large number of segments * drops segment offset type * fixes thrust namespace * removes about-to-be-deprecated cub iterators * no exec specifier on defaulted ctor * fixes gcc7 linker error * uses local_segment_index_t throughout * determine offset type based on type returned by segment iterator begin/end iterators * minor style improvements
1 parent 002744d commit 6ad5a2a

File tree

5 files changed

+285
-174
lines changed

5 files changed

+285
-174
lines changed

cub/cub/device/device_segmented_sort.cuh

Lines changed: 85 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,13 @@
4141
# pragma system_header
4242
#endif // no system header
4343

44+
#include <cub/detail/choose_offset.cuh>
4445
#include <cub/detail/nvtx.cuh>
4546
#include <cub/device/dispatch/dispatch_segmented_sort.cuh>
4647
#include <cub/util_namespace.cuh>
4748

49+
#include <cuda/std/cstdint>
50+
4851
CUB_NAMESPACE_BEGIN
4952

5053
//! @rst
@@ -140,16 +143,19 @@ private:
140143
std::size_t& temp_storage_bytes,
141144
const KeyT* d_keys_in,
142145
KeyT* d_keys_out,
143-
int num_items,
144-
int num_segments,
146+
::cuda::std::int64_t num_items,
147+
::cuda::std::int64_t num_segments,
145148
BeginOffsetIteratorT d_begin_offsets,
146149
EndOffsetIteratorT d_end_offsets,
147150
cudaStream_t stream = 0)
148151
{
149152
constexpr bool is_descending = false;
150153
constexpr bool is_overwrite_okay = false;
154+
155+
using OffsetT =
156+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
151157
using DispatchT =
152-
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
158+
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
153159

154160
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
155161
DoubleBuffer<NullType> d_values;
@@ -286,8 +292,8 @@ public:
286292
std::size_t& temp_storage_bytes,
287293
const KeyT* d_keys_in,
288294
KeyT* d_keys_out,
289-
int num_items,
290-
int num_segments,
295+
::cuda::std::int64_t num_items,
296+
::cuda::std::int64_t num_segments,
291297
BeginOffsetIteratorT d_begin_offsets,
292298
EndOffsetIteratorT d_end_offsets,
293299
cudaStream_t stream = 0)
@@ -313,16 +319,19 @@ private:
313319
std::size_t& temp_storage_bytes,
314320
const KeyT* d_keys_in,
315321
KeyT* d_keys_out,
316-
int num_items,
317-
int num_segments,
322+
::cuda::std::int64_t num_items,
323+
::cuda::std::int64_t num_segments,
318324
BeginOffsetIteratorT d_begin_offsets,
319325
EndOffsetIteratorT d_end_offsets,
320326
cudaStream_t stream = 0)
321327
{
322328
constexpr bool is_descending = true;
323329
constexpr bool is_overwrite_okay = false;
330+
331+
using OffsetT =
332+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
324333
using DispatchT =
325-
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
334+
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
326335

327336
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
328337
DoubleBuffer<NullType> d_values;
@@ -454,8 +463,8 @@ public:
454463
std::size_t& temp_storage_bytes,
455464
const KeyT* d_keys_in,
456465
KeyT* d_keys_out,
457-
int num_items,
458-
int num_segments,
466+
::cuda::std::int64_t num_items,
467+
::cuda::std::int64_t num_segments,
459468
BeginOffsetIteratorT d_begin_offsets,
460469
EndOffsetIteratorT d_end_offsets,
461470
cudaStream_t stream = 0)
@@ -480,17 +489,18 @@ private:
480489
void* d_temp_storage,
481490
std::size_t& temp_storage_bytes,
482491
DoubleBuffer<KeyT>& d_keys,
483-
int num_items,
484-
int num_segments,
492+
::cuda::std::int64_t num_items,
493+
::cuda::std::int64_t num_segments,
485494
BeginOffsetIteratorT d_begin_offsets,
486495
EndOffsetIteratorT d_end_offsets,
487496
cudaStream_t stream = 0)
488497
{
489498
constexpr bool is_descending = false;
490499
constexpr bool is_overwrite_okay = true;
491-
500+
using OffsetT =
501+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
492502
using DispatchT =
493-
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
503+
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
494504

495505
DoubleBuffer<NullType> d_values;
496506

@@ -632,8 +642,8 @@ public:
632642
void* d_temp_storage,
633643
std::size_t& temp_storage_bytes,
634644
DoubleBuffer<KeyT>& d_keys,
635-
int num_items,
636-
int num_segments,
645+
::cuda::std::int64_t num_items,
646+
::cuda::std::int64_t num_segments,
637647
BeginOffsetIteratorT d_begin_offsets,
638648
EndOffsetIteratorT d_end_offsets,
639649
cudaStream_t stream = 0)
@@ -650,17 +660,18 @@ private:
650660
void* d_temp_storage,
651661
std::size_t& temp_storage_bytes,
652662
DoubleBuffer<KeyT>& d_keys,
653-
int num_items,
654-
int num_segments,
663+
::cuda::std::int64_t num_items,
664+
::cuda::std::int64_t num_segments,
655665
BeginOffsetIteratorT d_begin_offsets,
656666
EndOffsetIteratorT d_end_offsets,
657667
cudaStream_t stream = 0)
658668
{
659669
constexpr bool is_descending = true;
660670
constexpr bool is_overwrite_okay = true;
661-
671+
using OffsetT =
672+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
662673
using DispatchT =
663-
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
674+
DispatchSegmentedSort<is_descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
664675

665676
DoubleBuffer<NullType> d_values;
666677

@@ -803,8 +814,8 @@ public:
803814
void* d_temp_storage,
804815
std::size_t& temp_storage_bytes,
805816
DoubleBuffer<KeyT>& d_keys,
806-
int num_items,
807-
int num_segments,
817+
::cuda::std::int64_t num_items,
818+
::cuda::std::int64_t num_segments,
808819
BeginOffsetIteratorT d_begin_offsets,
809820
EndOffsetIteratorT d_end_offsets,
810821
cudaStream_t stream = 0)
@@ -931,8 +942,8 @@ public:
931942
std::size_t& temp_storage_bytes,
932943
const KeyT* d_keys_in,
933944
KeyT* d_keys_out,
934-
int num_items,
935-
int num_segments,
945+
::cuda::std::int64_t num_items,
946+
::cuda::std::int64_t num_segments,
936947
BeginOffsetIteratorT d_begin_offsets,
937948
EndOffsetIteratorT d_end_offsets,
938949
cudaStream_t stream = 0)
@@ -1067,8 +1078,8 @@ public:
10671078
std::size_t& temp_storage_bytes,
10681079
const KeyT* d_keys_in,
10691080
KeyT* d_keys_out,
1070-
int num_items,
1071-
int num_segments,
1081+
::cuda::std::int64_t num_items,
1082+
::cuda::std::int64_t num_segments,
10721083
BeginOffsetIteratorT d_begin_offsets,
10731084
EndOffsetIteratorT d_end_offsets,
10741085
cudaStream_t stream = 0)
@@ -1213,8 +1224,8 @@ public:
12131224
void* d_temp_storage,
12141225
std::size_t& temp_storage_bytes,
12151226
DoubleBuffer<KeyT>& d_keys,
1216-
int num_items,
1217-
int num_segments,
1227+
::cuda::std::int64_t num_items,
1228+
::cuda::std::int64_t num_segments,
12181229
BeginOffsetIteratorT d_begin_offsets,
12191230
EndOffsetIteratorT d_end_offsets,
12201231
cudaStream_t stream = 0)
@@ -1350,8 +1361,8 @@ public:
13501361
void* d_temp_storage,
13511362
std::size_t& temp_storage_bytes,
13521363
DoubleBuffer<KeyT>& d_keys,
1353-
int num_items,
1354-
int num_segments,
1364+
::cuda::std::int64_t num_items,
1365+
::cuda::std::int64_t num_segments,
13551366
BeginOffsetIteratorT d_begin_offsets,
13561367
EndOffsetIteratorT d_end_offsets,
13571368
cudaStream_t stream = 0)
@@ -1371,15 +1382,19 @@ private:
13711382
KeyT* d_keys_out,
13721383
const ValueT* d_values_in,
13731384
ValueT* d_values_out,
1374-
int num_items,
1375-
int num_segments,
1385+
::cuda::std::int64_t num_items,
1386+
::cuda::std::int64_t num_segments,
13761387
BeginOffsetIteratorT d_begin_offsets,
13771388
EndOffsetIteratorT d_end_offsets,
13781389
cudaStream_t stream = 0)
13791390
{
13801391
constexpr bool is_descending = false;
13811392
constexpr bool is_overwrite_okay = false;
1382-
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
1393+
1394+
using OffsetT =
1395+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1396+
using DispatchT =
1397+
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
13831398

13841399
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
13851400
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
@@ -1539,8 +1554,8 @@ public:
15391554
KeyT* d_keys_out,
15401555
const ValueT* d_values_in,
15411556
ValueT* d_values_out,
1542-
int num_items,
1543-
int num_segments,
1557+
::cuda::std::int64_t num_items,
1558+
::cuda::std::int64_t num_segments,
15441559
BeginOffsetIteratorT d_begin_offsets,
15451560
EndOffsetIteratorT d_end_offsets,
15461561
cudaStream_t stream = 0)
@@ -1570,15 +1585,19 @@ private:
15701585
KeyT* d_keys_out,
15711586
const ValueT* d_values_in,
15721587
ValueT* d_values_out,
1573-
int num_items,
1574-
int num_segments,
1588+
::cuda::std::int64_t num_items,
1589+
::cuda::std::int64_t num_segments,
15751590
BeginOffsetIteratorT d_begin_offsets,
15761591
EndOffsetIteratorT d_end_offsets,
15771592
cudaStream_t stream = 0)
15781593
{
15791594
constexpr bool is_descending = true;
15801595
constexpr bool is_overwrite_okay = false;
1581-
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
1596+
1597+
using OffsetT =
1598+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1599+
using DispatchT =
1600+
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
15821601

15831602
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
15841603
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
@@ -1734,8 +1753,8 @@ public:
17341753
KeyT* d_keys_out,
17351754
const ValueT* d_values_in,
17361755
ValueT* d_values_out,
1737-
int num_items,
1738-
int num_segments,
1756+
::cuda::std::int64_t num_items,
1757+
::cuda::std::int64_t num_segments,
17391758
BeginOffsetIteratorT d_begin_offsets,
17401759
EndOffsetIteratorT d_end_offsets,
17411760
cudaStream_t stream = 0)
@@ -1763,15 +1782,19 @@ private:
17631782
std::size_t& temp_storage_bytes,
17641783
DoubleBuffer<KeyT>& d_keys,
17651784
DoubleBuffer<ValueT>& d_values,
1766-
int num_items,
1767-
int num_segments,
1785+
::cuda::std::int64_t num_items,
1786+
::cuda::std::int64_t num_segments,
17681787
BeginOffsetIteratorT d_begin_offsets,
17691788
EndOffsetIteratorT d_end_offsets,
17701789
cudaStream_t stream = 0)
17711790
{
17721791
constexpr bool is_descending = false;
17731792
constexpr bool is_overwrite_okay = true;
1774-
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
1793+
1794+
using OffsetT =
1795+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1796+
using DispatchT =
1797+
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
17751798

17761799
return DispatchT::Dispatch(
17771800
d_temp_storage,
@@ -1931,8 +1954,8 @@ public:
19311954
std::size_t& temp_storage_bytes,
19321955
DoubleBuffer<KeyT>& d_keys,
19331956
DoubleBuffer<ValueT>& d_values,
1934-
int num_items,
1935-
int num_segments,
1957+
::cuda::std::int64_t num_items,
1958+
::cuda::std::int64_t num_segments,
19361959
BeginOffsetIteratorT d_begin_offsets,
19371960
EndOffsetIteratorT d_end_offsets,
19381961
cudaStream_t stream = 0)
@@ -1958,15 +1981,19 @@ private:
19581981
std::size_t& temp_storage_bytes,
19591982
DoubleBuffer<KeyT>& d_keys,
19601983
DoubleBuffer<ValueT>& d_values,
1961-
int num_items,
1962-
int num_segments,
1984+
::cuda::std::int64_t num_items,
1985+
::cuda::std::int64_t num_segments,
19631986
BeginOffsetIteratorT d_begin_offsets,
19641987
EndOffsetIteratorT d_end_offsets,
19651988
cudaStream_t stream = 0)
19661989
{
19671990
constexpr bool is_descending = true;
19681991
constexpr bool is_overwrite_okay = true;
1969-
using DispatchT = DispatchSegmentedSort<is_descending, KeyT, ValueT, int, BeginOffsetIteratorT, EndOffsetIteratorT>;
1992+
1993+
using OffsetT =
1994+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1995+
using DispatchT =
1996+
DispatchSegmentedSort<is_descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
19701997

19711998
return DispatchT::Dispatch(
19721999
d_temp_storage,
@@ -2125,8 +2152,8 @@ public:
21252152
std::size_t& temp_storage_bytes,
21262153
DoubleBuffer<KeyT>& d_keys,
21272154
DoubleBuffer<ValueT>& d_values,
2128-
int num_items,
2129-
int num_segments,
2155+
::cuda::std::int64_t num_items,
2156+
::cuda::std::int64_t num_segments,
21302157
BeginOffsetIteratorT d_begin_offsets,
21312158
EndOffsetIteratorT d_end_offsets,
21322159
cudaStream_t stream = 0)
@@ -2281,8 +2308,8 @@ public:
22812308
KeyT* d_keys_out,
22822309
const ValueT* d_values_in,
22832310
ValueT* d_values_out,
2284-
int num_items,
2285-
int num_segments,
2311+
::cuda::std::int64_t num_items,
2312+
::cuda::std::int64_t num_segments,
22862313
BeginOffsetIteratorT d_begin_offsets,
22872314
EndOffsetIteratorT d_end_offsets,
22882315
cudaStream_t stream = 0)
@@ -2439,8 +2466,8 @@ public:
24392466
KeyT* d_keys_out,
24402467
const ValueT* d_values_in,
24412468
ValueT* d_values_out,
2442-
int num_items,
2443-
int num_segments,
2469+
::cuda::std::int64_t num_items,
2470+
::cuda::std::int64_t num_segments,
24442471
BeginOffsetIteratorT d_begin_offsets,
24452472
EndOffsetIteratorT d_end_offsets,
24462473
cudaStream_t stream = 0)
@@ -2605,8 +2632,8 @@ public:
26052632
std::size_t& temp_storage_bytes,
26062633
DoubleBuffer<KeyT>& d_keys,
26072634
DoubleBuffer<ValueT>& d_values,
2608-
int num_items,
2609-
int num_segments,
2635+
::cuda::std::int64_t num_items,
2636+
::cuda::std::int64_t num_segments,
26102637
BeginOffsetIteratorT d_begin_offsets,
26112638
EndOffsetIteratorT d_end_offsets,
26122639
cudaStream_t stream = 0)
@@ -2768,8 +2795,8 @@ public:
27682795
std::size_t& temp_storage_bytes,
27692796
DoubleBuffer<KeyT>& d_keys,
27702797
DoubleBuffer<ValueT>& d_values,
2771-
int num_items,
2772-
int num_segments,
2798+
::cuda::std::int64_t num_items,
2799+
::cuda::std::int64_t num_segments,
27732800
BeginOffsetIteratorT d_begin_offsets,
27742801
EndOffsetIteratorT d_end_offsets,
27752802
cudaStream_t stream = 0)

0 commit comments

Comments
 (0)