Skip to content

Commit 837e1e7

Browse files
Deprecate block/warp algo specializations
Fixes: #3409
1 parent 32f41dd commit 837e1e7

17 files changed

+108
-29
lines changed

cub/cub/block/block_histogram.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,8 @@ private:
202202
/// Internal specialization.
203203
using InternalBlockHistogram =
204204
::cuda::std::_If<ALGORITHM == BLOCK_HISTO_SORT,
205-
BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
206-
BlockHistogramAtomic<BINS>>;
205+
detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
206+
detail::BlockHistogramAtomic<BINS>>;
207207

208208
/// Shared memory storage layout type for BlockHistogram
209209
using _TempStorage = typename InternalBlockHistogram::TempStorage;

cub/cub/block/block_reduce.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,9 @@ private:
250250
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
251251
};
252252

253-
using WarpReductions = BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
254-
using RakingCommutativeOnly = BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
255-
using Raking = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
253+
using WarpReductions = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
254+
using RakingCommutativeOnly = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
255+
using Raking = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
256256

257257
/// Internal specialization type
258258
using InternalBlockReduce =

cub/cub/block/block_scan.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,9 @@ private:
250250
? BLOCK_SCAN_RAKING
251251
: ALGORITHM;
252252

253-
using WarpScans = BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
253+
using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
254254
using Raking =
255-
BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
255+
detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
256256

257257
/// Define the delegate type for the desired algorithm
258258
using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;

cub/cub/block/specializations/block_histogram_atomic.cuh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@
4545
#endif // no system header
4646

4747
CUB_NAMESPACE_BEGIN
48-
48+
namespace detail
49+
{
4950
/**
5051
* @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide
5152
* histograms from data samples partitioned across a CUDA thread block.
@@ -72,13 +73,19 @@ struct BlockHistogramAtomic
7273
template <typename T, typename CounterT, int ITEMS_PER_THREAD>
7374
_CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS])
7475
{
75-
// Update histogram
76+
// Update histogram
7677
#pragma unroll
7778
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
7879
{
7980
atomicAdd(histogram + items[i], 1);
8081
}
8182
}
8283
};
84+
} // namespace detail
85+
86+
template <int BINS>
87+
using BlockHistogramAtomic CCCL_DEPRECATED_BECAUSE(
88+
"This class is considered an implementation detail and the public interface will be "
89+
"removed.") = detail::BlockHistogramAtomic<BINS>;
8390

8491
CUB_NAMESPACE_END

cub/cub/block/specializations/block_histogram_sort.cuh

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@
4949
#include <cub/util_ptx.cuh>
5050

5151
CUB_NAMESPACE_BEGIN
52-
52+
namespace detail
53+
{
5354
/**
5455
* @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide
5556
* histograms from data samples partitioned across a CUDA thread block.
@@ -243,5 +244,18 @@ struct BlockHistogramSort
243244
}
244245
}
245246
};
247+
} // namespace detail
248+
249+
template <typename T,
250+
int BLOCK_DIM_X,
251+
int ITEMS_PER_THREAD,
252+
int BINS,
253+
int BLOCK_DIM_Y,
254+
int BLOCK_DIM_Z,
255+
int LEGACY_PTX_ARCH = 0>
256+
using BlockHistogramSort CCCL_DEPRECATED_BECAUSE(
257+
"This class is considered an implementation detail and the public interface will be "
258+
"removed.") =
259+
detail::BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
246260

247261
CUB_NAMESPACE_END

cub/cub/block/specializations/block_reduce_raking.cuh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
#include <cub/warp/warp_reduce.cuh>
5151

5252
CUB_NAMESPACE_BEGIN
53-
53+
namespace detail
54+
{
5455
/**
5556
* @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread
5657
* block. Supports non-commutative reduction operators.
@@ -257,5 +258,11 @@ struct BlockReduceRaking
257258
return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
258259
}
259260
};
261+
} // namespace detail
262+
263+
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
264+
using BlockReduceRaking CCCL_DEPRECATED_BECAUSE(
265+
"This class is considered an implementation detail and the public interface will be "
266+
"removed.") = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
260267

261268
CUB_NAMESPACE_END

cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
#include <cub/warp/warp_reduce.cuh>
5151

5252
CUB_NAMESPACE_BEGIN
53-
53+
namespace detail
54+
{
5455
/**
5556
* @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction
5657
* across a CUDA thread block. Does not support non-commutative reduction operators. Does not
@@ -83,7 +84,7 @@ struct BlockReduceRakingCommutativeOnly
8384

8485
// The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have
8586
// valid values
86-
using FallBack = BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
87+
using FallBack = detail::BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
8788

8889
/// Constants
8990
enum
@@ -231,5 +232,11 @@ struct BlockReduceRakingCommutativeOnly
231232
return partial;
232233
}
233234
};
235+
} // namespace detail
236+
237+
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
238+
using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE(
239+
"This class is considered an implementation detail and the public interface will be "
240+
"removed.") = detail::BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
234241

235242
CUB_NAMESPACE_END

cub/cub/block/specializations/block_reduce_warp_reductions.cuh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@
5151
#include <cuda/ptx>
5252

5353
CUB_NAMESPACE_BEGIN
54-
54+
namespace detail
55+
{
5556
/**
5657
* @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction
5758
* across a CUDA thread block. Supports non-commutative reduction operators.
@@ -256,5 +257,11 @@ struct BlockReduceWarpReductions
256257
return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
257258
}
258259
};
260+
} // namespace detail
261+
262+
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
263+
using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE(
264+
"This class is considered an implementation detail and the public interface will be "
265+
"removed.") = detail::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
259266

260267
CUB_NAMESPACE_END

cub/cub/block/specializations/block_scan_raking.cuh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@
5252
#include <cub/warp/warp_scan.cuh>
5353

5454
CUB_NAMESPACE_BEGIN
55-
55+
namespace detail
56+
{
5657
/**
5758
* @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA
5859
* thread block.
@@ -794,5 +795,11 @@ struct BlockScanRaking
794795
}
795796
}
796797
};
798+
} // namespace detail
799+
800+
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int LEGACY_PTX_ARCH = 0>
801+
using BlockScanRaking CCCL_DEPRECATED_BECAUSE(
802+
"This class is considered an implementation detail and the public interface will be "
803+
"removed.") = detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, LEGACY_PTX_ARCH>;
797804

798805
CUB_NAMESPACE_END

cub/cub/block/specializations/block_scan_warp_scans.cuh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
#include <cuda/ptx>
5151

5252
CUB_NAMESPACE_BEGIN
53-
53+
namespace detail
54+
{
5455
/**
5556
* @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA
5657
* thread block.
@@ -537,5 +538,10 @@ struct BlockScanWarpScans
537538
exclusive_output = scan_op(block_prefix, exclusive_output);
538539
}
539540
};
541+
} // namespace detail
542+
template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int LEGACY_PTX_ARCH = 0>
543+
using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE(
544+
"This class is considered an implementation detail and the public interface will be "
545+
"removed.") = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>;
540546

541547
CUB_NAMESPACE_END

0 commit comments

Comments
 (0)