Skip to content

Commit 43fb061

Browse files
authored
Review/Deprecate CUB util.ptx for CCCL 2.x (#3342)
1 parent 1d426b6 commit 43fb061

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed: 434 additions (+434) and 386 deletions (-386)

cub/cub/agent/agent_adjacent_difference.cuh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,7 @@ struct AgentDifference
138138
BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
139139
}
140140

141-
CTA_SYNC();
141+
__syncthreads();
142142

143143
if (ReadLeft)
144144
{
@@ -186,7 +186,7 @@ struct AgentDifference
186186
}
187187
}
188188

189-
CTA_SYNC();
189+
__syncthreads();
190190

191191
if (IS_LAST_TILE)
192192
{

cub/cub/agent/agent_batch_memcpy.cuh

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -834,7 +834,7 @@ private:
834834
BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage)
835835
.ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op);
836836
}
837-
CTA_SYNC();
837+
__syncthreads();
838838

839839
// Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration)
840840
blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD;
@@ -996,7 +996,7 @@ private:
996996

997997
// Ensure all threads finished collaborative BlockExchange so temporary storage can be reused
998998
// with next iteration
999-
CTA_SYNC();
999+
__syncthreads();
10001000
}
10011001
}
10021002

@@ -1026,7 +1026,7 @@ public:
10261026
}
10271027

10281028
// Ensure we can repurpose the BlockLoad's temporary storage
1029-
CTA_SYNC();
1029+
__syncthreads();
10301030

10311031
// Count how many buffers fall into each size-class
10321032
VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes);
@@ -1037,7 +1037,7 @@ public:
10371037
.ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg);
10381038

10391039
// Ensure we can repurpose the scan's temporary storage for scattering the buffer ids
1040-
CTA_SYNC();
1040+
__syncthreads();
10411041

10421042
// Factor in the per-size-class counts / offsets
10431043
// That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset
@@ -1077,15 +1077,15 @@ public:
10771077

10781078
// Ensure the prefix callback has finished using its temporary storage and that it can be reused
10791079
// in the next stage
1080-
CTA_SYNC();
1080+
__syncthreads();
10811081

10821082
// Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their
10831083
// size
10841084
PartitionBuffersBySize(buffer_sizes, size_class_histogram, temp_storage.staged.buffers_by_size_class);
10851085

10861086
// Ensure all buffers have been partitioned by their size class AND
10871087
// ensure that blev_buffer_offset has been written to shared memory
1088-
CTA_SYNC();
1088+
__syncthreads();
10891089

10901090
// TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem
10911091
InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset;
@@ -1104,7 +1104,7 @@ public:
11041104
tile_id);
11051105

11061106
// Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers
1107-
CTA_SYNC();
1107+
__syncthreads();
11081108

11091109
// Copy warp-level buffers
11101110
BatchMemcpyWLEVBuffers(

cub/cub/agent/agent_histogram.cuh

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -320,7 +320,7 @@ struct AgentHistogram
320320
}
321321

322322
// Barrier to make sure all threads are done updating counters
323-
CTA_SYNC();
323+
__syncthreads();
324324
}
325325

326326
// Initialize privatized bin counters. Specialized for privatized shared-memory counters
@@ -350,7 +350,7 @@ struct AgentHistogram
350350
_CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
351351
{
352352
// Barrier to make sure all threads are done updating counters
353-
CTA_SYNC();
353+
__syncthreads();
354354

355355
// Apply privatized bin counts to output bin counts
356356
#pragma unroll
@@ -690,15 +690,15 @@ struct AgentHistogram
690690
ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
691691
}
692692

693-
CTA_SYNC();
693+
__syncthreads();
694694

695695
// Get next tile
696696
if (threadIdx.x == 0)
697697
{
698698
temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
699699
}
700700

701-
CTA_SYNC();
701+
__syncthreads();
702702

703703
tile_idx = temp_storage.tile_idx;
704704
}

cub/cub/agent/agent_merge.cuh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ struct agent_t
132132
gmem_to_reg<threads_per_block, IsFullTile>(
133133
keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
134134
reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
135-
CTA_SYNC();
135+
__syncthreads();
136136

137137
// use binary search in shared memory to find merge path for each of thread.
138138
// we can use int type here, because the number of items in shared memory is limited
@@ -158,7 +158,7 @@ struct agent_t
158158
keys_loc,
159159
indices,
160160
compare_op);
161-
CTA_SYNC();
161+
__syncthreads();
162162

163163
// write keys
164164
if (IsFullTile)
@@ -182,17 +182,18 @@ struct agent_t
182182
item_type items_loc[items_per_thread];
183183
gmem_to_reg<threads_per_block, IsFullTile>(
184184
items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
185-
CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it
185+
__syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
186+
// to it
186187
reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
187-
CTA_SYNC();
188+
__syncthreads();
188189

189190
// gather items from shared mem
190191
#pragma unroll
191192
for (int i = 0; i < items_per_thread; ++i)
192193
{
193194
items_loc[i] = storage.items_shared[indices[i]];
194195
}
195-
CTA_SYNC();
196+
__syncthreads();
196197

197198
// write from reg to gmem
198199
if (IsFullTile)

cub/cub/agent/agent_merge_sort.cuh

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,7 @@ struct AgentBlockSort
187187
BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local);
188188
}
189189

190-
CTA_SYNC();
190+
__syncthreads();
191191
}
192192

193193
KeyT keys_local[ITEMS_PER_THREAD];
@@ -200,7 +200,7 @@ struct AgentBlockSort
200200
BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local);
201201
}
202202

203-
CTA_SYNC();
203+
__syncthreads();
204204
_CCCL_PDL_TRIGGER_NEXT_LAUNCH();
205205

206206
_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
@@ -212,7 +212,7 @@ struct AgentBlockSort
212212
BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op);
213213
}
214214

215-
CTA_SYNC();
215+
__syncthreads();
216216

217217
if (ping)
218218
{
@@ -227,7 +227,7 @@ struct AgentBlockSort
227227

228228
_CCCL_IF_CONSTEXPR (!KEYS_ONLY)
229229
{
230-
CTA_SYNC();
230+
__syncthreads();
231231

232232
_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
233233
{
@@ -252,7 +252,7 @@ struct AgentBlockSort
252252

253253
_CCCL_IF_CONSTEXPR (!KEYS_ONLY)
254254
{
255-
CTA_SYNC();
255+
__syncthreads();
256256

257257
_CCCL_IF_CONSTEXPR (IS_LAST_TILE)
258258
{
@@ -583,7 +583,7 @@ struct AgentMerge
583583
}
584584
}
585585

586-
CTA_SYNC();
586+
__syncthreads();
587587
_CCCL_PDL_TRIGGER_NEXT_LAUNCH();
588588

589589
// use binary search in shared memory
@@ -616,7 +616,7 @@ struct AgentMerge
616616
indices,
617617
compare_op);
618618

619-
CTA_SYNC();
619+
__syncthreads();
620620

621621
// write keys
622622
if (ping)
@@ -650,11 +650,11 @@ struct AgentMerge
650650
_CCCL_IF_CONSTEXPR (!KEYS_ONLY)
651651
#endif // _CCCL_CUDACC_AT_LEAST(11, 8)
652652
{
653-
CTA_SYNC();
653+
__syncthreads();
654654

655655
detail::reg_to_shared<BLOCK_THREADS>(&storage.items_shared[0], items_local);
656656

657-
CTA_SYNC();
657+
__syncthreads();
658658

659659
// gather items from shared mem
660660
//
@@ -664,7 +664,7 @@ struct AgentMerge
664664
items_local[item] = storage.items_shared[indices[item]];
665665
}
666666

667-
CTA_SYNC();
667+
__syncthreads();
668668

669669
// write from reg to gmem
670670
//

cub/cub/agent/agent_radix_sort_downsweep.cuh

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -277,7 +277,7 @@ struct AgentRadixSortDownsweep
277277
temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
278278
}
279279

280-
CTA_SYNC();
280+
__syncthreads();
281281

282282
#pragma unroll
283283
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
@@ -305,7 +305,7 @@ struct AgentRadixSortDownsweep
305305
int (&ranks)[ITEMS_PER_THREAD],
306306
OffsetT valid_items)
307307
{
308-
CTA_SYNC();
308+
__syncthreads();
309309

310310
ValueExchangeT& exchange_values = temp_storage.exchange_values.Alias();
311311

@@ -315,7 +315,7 @@ struct AgentRadixSortDownsweep
315315
exchange_values[ranks[ITEM]] = values[ITEM];
316316
}
317317

318-
CTA_SYNC();
318+
__syncthreads();
319319

320320
#pragma unroll
321321
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
@@ -342,7 +342,7 @@ struct AgentRadixSortDownsweep
342342
{
343343
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
344344

345-
CTA_SYNC();
345+
__syncthreads();
346346
}
347347

348348
/**
@@ -362,7 +362,7 @@ struct AgentRadixSortDownsweep
362362

363363
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items, oob_item);
364364

365-
CTA_SYNC();
365+
__syncthreads();
366366
}
367367

368368
/**
@@ -409,7 +409,7 @@ struct AgentRadixSortDownsweep
409409
{
410410
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values);
411411

412-
CTA_SYNC();
412+
__syncthreads();
413413
}
414414

415415
/**
@@ -428,7 +428,7 @@ struct AgentRadixSortDownsweep
428428

429429
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items);
430430

431-
CTA_SYNC();
431+
__syncthreads();
432432
}
433433

434434
/**
@@ -474,7 +474,7 @@ struct AgentRadixSortDownsweep
474474
{
475475
ValueT values[ITEMS_PER_THREAD];
476476

477-
CTA_SYNC();
477+
__syncthreads();
478478

479479
LoadValues(values, block_offset, valid_items, Int2Type<FULL_TILE>(), Int2Type<LOAD_WARP_STRIPED>());
480480

@@ -520,7 +520,7 @@ struct AgentRadixSortDownsweep
520520
int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
521521
BlockRadixRankT(temp_storage.radix_rank).RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix);
522522

523-
CTA_SYNC();
523+
__syncthreads();
524524

525525
// Share exclusive digit prefix
526526
#pragma unroll
@@ -534,7 +534,7 @@ struct AgentRadixSortDownsweep
534534
}
535535
}
536536

537-
CTA_SYNC();
537+
__syncthreads();
538538

539539
// Get inclusive digit prefix
540540
int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
@@ -562,7 +562,7 @@ struct AgentRadixSortDownsweep
562562
}
563563
}
564564

565-
CTA_SYNC();
565+
__syncthreads();
566566

567567
// Update global scatter base offsets for each digit
568568
#pragma unroll
@@ -577,7 +577,7 @@ struct AgentRadixSortDownsweep
577577
}
578578
}
579579

580-
CTA_SYNC();
580+
__syncthreads();
581581

582582
// Scatter keys
583583
ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
@@ -602,7 +602,7 @@ struct AgentRadixSortDownsweep
602602
T items[ITEMS_PER_THREAD];
603603

604604
LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
605-
CTA_SYNC();
605+
__syncthreads();
606606
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
607607

608608
block_offset += TILE_ITEMS;
@@ -616,7 +616,7 @@ struct AgentRadixSortDownsweep
616616
T items[ITEMS_PER_THREAD];
617617

618618
LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
619-
CTA_SYNC();
619+
__syncthreads();
620620
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
621621
}
622622
}
@@ -670,7 +670,7 @@ struct AgentRadixSortDownsweep
670670
}
671671
}
672672

673-
short_circuit = CTA_SYNC_AND(short_circuit);
673+
short_circuit = __syncthreads_and(short_circuit);
674674
}
675675

676676
/**
@@ -719,7 +719,7 @@ struct AgentRadixSortDownsweep
719719
}
720720
}
721721

722-
short_circuit = CTA_SYNC_AND(short_circuit);
722+
short_circuit = __syncthreads_and(short_circuit);
723723
}
724724

725725
/**
@@ -744,7 +744,7 @@ struct AgentRadixSortDownsweep
744744
ProcessTile<true>(block_offset);
745745
block_offset += TILE_ITEMS;
746746

747-
CTA_SYNC();
747+
__syncthreads();
748748
}
749749

750750
// Clean up last partial tile with guarded-I/O

0 commit comments

Comments
 (0)