Skip to content

Commit 96cf032

Browse files
committed
replace voting instructions
1 parent 8d44adb commit 96cf032

15 files changed

+56
-52
lines changed

cub/cub/agent/agent_radix_sort_onesweep.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ struct AgentRadixSortOnesweep
280280
} while (value_j == 0);
281281

282282
inc_sum += value_j & LOOKBACK_VALUE_MASK;
283-
want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
283+
// Vote intrinsics take the participant mask first and the predicate second.
want_mask = __ballot_sync(want_mask, (value_j & LOOKBACK_GLOBAL_MASK) == 0);
284284
if (value_j & LOOKBACK_GLOBAL_MASK)
285285
{
286286
break;
@@ -484,7 +484,7 @@ struct AgentRadixSortOnesweep
484484
{
485485
d_keys_out[global_idx] = Twiddle::Out(key, decomposer);
486486
}
487-
WARP_SYNC(WARP_MASK);
487+
__syncwarp(WARP_MASK);
488488
}
489489
}
490490

@@ -502,7 +502,7 @@ struct AgentRadixSortOnesweep
502502
{
503503
d_values_out[global_idx] = value;
504504
}
505-
WARP_SYNC(WARP_MASK);
505+
__syncwarp(WARP_MASK);
506506
}
507507
}
508508

cub/cub/agent/agent_rle.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,7 @@ struct AgentRle
625625
WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id])
626626
.ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp);
627627

628-
WARP_SYNC(0xffffffff);
628+
__syncwarp(0xffffffff);
629629

630630
WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id])
631631
.ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp);

cub/cub/agent/agent_sub_warp_merge_sort.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -233,23 +233,23 @@ public:
233233
KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type<std::is_same<bool, KeyT>::value>{});
234234

235235
WarpLoadKeysT(storage.load_keys).Load(keys_input, keys, segment_size, oob_default);
236-
WARP_SYNC(warp_merge_sort.get_member_mask());
236+
__syncwarp(warp_merge_sort.get_member_mask());
237237

238238
if (!KEYS_ONLY)
239239
{
240240
WarpLoadItemsT(storage.load_items).Load(values_input, values, segment_size);
241241

242-
WARP_SYNC(warp_merge_sort.get_member_mask());
242+
__syncwarp(warp_merge_sort.get_member_mask());
243243
}
244244

245245
warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default);
246-
WARP_SYNC(warp_merge_sort.get_member_mask());
246+
__syncwarp(warp_merge_sort.get_member_mask());
247247

248248
WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size);
249249

250250
if (!KEYS_ONLY)
251251
{
252-
WARP_SYNC(warp_merge_sort.get_member_mask());
252+
__syncwarp(warp_merge_sort.get_member_mask());
253253
WarpStoreItemsT(storage.store_items).Store(values_output, values, segment_size);
254254
}
255255
}

cub/cub/agent/single_pass_scan_operators.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ public:
733733
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
734734
}
735735

736-
while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff))
736+
// Loop while any lane of the full warp still sees an invalid descriptor (mask first, predicate second).
while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID)))
737737
{
738738
delay_or_prevent_hoisting();
739739
TxnWord alias = LoadStatus<Order>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
@@ -918,7 +918,7 @@ struct ScanTileState<T, false>
918918
delay();
919919
status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx);
920920
__threadfence();
921-
} while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff));
921+
} while (__any_sync(0xffffffff, (status == SCAN_TILE_INVALID))); // mask first, predicate second
922922

923923
if (status == StatusWord(SCAN_TILE_PARTIAL))
924924
{
@@ -1145,7 +1145,7 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
11451145
TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
11461146
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
11471147

1148-
} while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
1148+
} while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID))); // mask first, predicate second
11491149

11501150
status = tile_descriptor.status;
11511151
value.value = tile_descriptor.value;
@@ -1268,7 +1268,7 @@ struct TilePrefixCallbackOp
12681268
exclusive_prefix = window_aggregate;
12691269

12701270
// Keep sliding the window back until we come across a tile whose inclusive prefix is known
1271-
while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
1271+
while (__all_sync(0xffffffff, (predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))) // mask first, predicate second
12721272
{
12731273
predecessor_idx -= CUB_PTX_WARP_THREADS;
12741274

cub/cub/block/block_exchange.cuh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ private:
324324
detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]);
325325
}
326326

327-
WARP_SYNC(0xffffffff);
327+
__syncwarp(0xffffffff);
328328

329329
#pragma unroll
330330
for (int i = 0; i < ITEMS_PER_THREAD; i++)
@@ -363,7 +363,7 @@ private:
363363
detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]);
364364
}
365365

366-
WARP_SYNC(0xffffffff);
366+
__syncwarp(0xffffffff);
367367

368368
#pragma unroll
369369
for (int i = 0; i < ITEMS_PER_THREAD; i++)
@@ -395,7 +395,7 @@ private:
395395
detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]);
396396
}
397397

398-
WARP_SYNC(0xffffffff);
398+
__syncwarp(0xffffffff);
399399

400400
#pragma unroll
401401
for (int i = 0; i < ITEMS_PER_THREAD; i++)
@@ -545,7 +545,7 @@ private:
545545
detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]);
546546
}
547547

548-
WARP_SYNC(0xffffffff);
548+
__syncwarp(0xffffffff);
549549

550550
#pragma unroll
551551
for (int i = 0; i < ITEMS_PER_THREAD; i++)
@@ -589,7 +589,7 @@ private:
589589
detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]);
590590
}
591591

592-
WARP_SYNC(0xffffffff);
592+
__syncwarp(0xffffffff);
593593

594594
#pragma unroll
595595
for (int i = 0; i < ITEMS_PER_THREAD; i++)

cub/cub/block/block_radix_rank.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -741,7 +741,7 @@ public:
741741
DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
742742

743743
// Warp-sync
744-
WARP_SYNC(0xFFFFFFFF);
744+
__syncwarp(0xFFFFFFFF);
745745

746746
// Number of peers having same digit as me
747747
int32_t digit_count = __popc(peer_mask);
@@ -756,7 +756,7 @@ public:
756756
}
757757

758758
// Warp-sync
759-
WARP_SYNC(0xFFFFFFFF);
759+
__syncwarp(0xFFFFFFFF);
760760

761761
// Number of prior keys having same digit
762762
ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
@@ -978,7 +978,7 @@ struct BlockRadixRankMatchEarlyCounts
978978
match_masks[bin] = 0;
979979
}
980980
}
981-
WARP_SYNC(WARP_MASK);
981+
__syncwarp(WARP_MASK);
982982

983983
// compute private per-part histograms
984984
int part = lane % NUM_PARTS;
@@ -992,7 +992,7 @@ struct BlockRadixRankMatchEarlyCounts
992992
// no extra work is necessary if NUM_PARTS == 1
993993
if (NUM_PARTS > 1)
994994
{
995-
WARP_SYNC(WARP_MASK);
995+
__syncwarp(WARP_MASK);
996996
// TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary
997997
constexpr int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS;
998998
int bins[WARP_BINS_PER_THREAD];
@@ -1067,7 +1067,7 @@ struct BlockRadixRankMatchEarlyCounts
10671067
::cuda::std::uint32_t bin = Digit(keys[u]);
10681068
int* p_match_mask = &match_masks[bin];
10691069
atomicOr(p_match_mask, lane_mask);
1070-
WARP_SYNC(WARP_MASK);
1070+
__syncwarp(WARP_MASK);
10711071
int bin_mask = *p_match_mask;
10721072
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
10731073
int warp_offset = 0;
@@ -1082,7 +1082,7 @@ struct BlockRadixRankMatchEarlyCounts
10821082
{
10831083
*p_match_mask = 0;
10841084
}
1085-
WARP_SYNC(WARP_MASK);
1085+
__syncwarp(WARP_MASK);
10861086
ranks[u] = warp_offset + popc - 1;
10871087
}
10881088
}

cub/cub/block/specializations/block_reduce_raking.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ struct BlockReduceRaking
228228
// sync before re-using shmem (warp_storage/raking_grid are aliased)
229229
static_assert(RAKING_THREADS <= CUB_PTX_WARP_THREADS, "RAKING_THREADS must be <= warp size.");
230230
unsigned int mask = static_cast<unsigned int>((1ull << RAKING_THREADS) - 1);
231-
WARP_SYNC(mask);
231+
__syncwarp(mask);
232232

233233
partial = WarpReduce(temp_storage.warp_storage)
234234
.template Reduce<(IS_FULL_TILE && RAKING_UNGUARDED)>(partial, valid_raking_threads, reduction_op);

cub/cub/util_ptx.cuh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p)
217217
/**
218218
* Warp barrier
219219
*/
220+
CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead")
220221
_CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask)
221222
{
222223
__syncwarp(member_mask);
@@ -225,6 +226,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask)
225226
/**
226227
* Warp any
227228
*/
229+
CCCL_DEPRECATED_BECAUSE("use __any_sync() instead")
228230
_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask)
229231
{
230232
return __any_sync(member_mask, predicate);
@@ -233,6 +235,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_m
233235
/**
234236
* Warp all
235237
*/
238+
CCCL_DEPRECATED_BECAUSE("use __all_sync() instead")
236239
_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask)
237240
{
238241
return __all_sync(member_mask, predicate);
@@ -241,6 +244,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_m
241244
/**
242245
* Warp ballot
243246
*/
247+
CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead")
244248
_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask)
245249
{
246250
return __ballot_sync(member_mask, predicate);

cub/cub/warp/specializations/warp_exchange_smem.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ public:
104104
const int idx = ITEMS_PER_THREAD * lane_id + item;
105105
temp_storage.items_shared[idx] = input_items[item];
106106
}
107-
WARP_SYNC(member_mask);
107+
__syncwarp(member_mask);
108108

109109
for (int item = 0; item < ITEMS_PER_THREAD; item++)
110110
{
@@ -122,7 +122,7 @@ public:
122122
const int idx = LOGICAL_WARP_THREADS * item + lane_id;
123123
temp_storage.items_shared[idx] = input_items[item];
124124
}
125-
WARP_SYNC(member_mask);
125+
__syncwarp(member_mask);
126126

127127
for (int item = 0; item < ITEMS_PER_THREAD; item++)
128128
{
@@ -155,7 +155,7 @@ public:
155155
temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM];
156156
}
157157

158-
WARP_SYNC(member_mask);
158+
__syncwarp(member_mask);
159159

160160
#pragma unroll
161161
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)

cub/cub/warp/specializations/warp_reduce_shfl.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@ struct WarpReduceShfl
700700
_CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
701701
{
702702
// Get the start flags for each thread in the warp.
703-
int warp_flags = WARP_BALLOT(flag, member_mask);
703+
// __ballot_sync takes the member mask first and the predicate second.
int warp_flags = __ballot_sync(member_mask, flag);
704704

705705
// Convert to tail-segmented
706706
if (HEAD_SEGMENTED)

0 commit comments

Comments
 (0)