Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions c/parallel/src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ std::string get_single_tile_kernel_name(
check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t));

return std::format(
"cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>",
"cub::detail::reduce::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>",
chained_policy_t,
input_iterator_t,
output_iterator_t,
Expand Down Expand Up @@ -192,7 +192,7 @@ std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it
check(nvrtcGetTypeName<cuda::std::__identity>(&transform_op_t));

return std::format(
"cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>",
"cub::detail::reduce::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>",
chained_policy_t,
input_iterator_t,
offset_t,
Expand Down
51 changes: 25 additions & 26 deletions cub/cub/agent/agent_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,26 @@ CUB_NAMESPACE_BEGIN
* Tuning policy types
******************************************************************************/

template <int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
BlockScanAlgorithm _SCAN_ALGORITHM,
class DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentThreeWayPartitionPolicy
{
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;

struct detail
{
using delay_constructor_t = DelayConstructorT;
};
};

namespace detail
{

Expand Down Expand Up @@ -135,30 +155,6 @@ struct accumulator_pack_t : accumulator_pack_base_t<OffsetT>
}
};

} // namespace three_way_partition

} // namespace detail

template <int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
BlockScanAlgorithm _SCAN_ALGORITHM,
class DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentThreeWayPartitionPolicy
{
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;

struct detail
{
using delay_constructor_t = DelayConstructorT;
};
};

/**
* \brief Implements a device-wide three-way partitioning
*
Expand All @@ -184,9 +180,9 @@ struct AgentThreeWayPartition
//---------------------------------------------------------------------

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
using InputT = value_t<InputIteratorT>;

using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t<OffsetT>;
using AccumPackHelperT = accumulator_pack_t<OffsetT>;
using AccumPackT = typename AccumPackHelperT::pack_t;

// Tile status descriptor interface type
Expand Down Expand Up @@ -593,4 +589,7 @@ struct AgentThreeWayPartition
}
};

} // namespace three_way_partition
} // namespace detail

CUB_NAMESPACE_END
34 changes: 20 additions & 14 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@

CUB_NAMESPACE_BEGIN

namespace detail::adjacent_difference
{

template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile)
Expand Down Expand Up @@ -78,18 +81,18 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(

// It is OK to introspect the return type or parameter types of the
// `operator()` function of `__device__` extended lambda within device code.
using OutputT = detail::invoke_result_t<DifferenceOpT, InputT, InputT>;

using Agent = detail::adjacent_difference::AgentDifference<
ActivePolicyT,
InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
InputT,
OutputT,
MayAlias,
ReadLeft>;
using OutputT = invoke_result_t<DifferenceOpT, InputT, InputT>;

using Agent =
AgentDifference<ActivePolicyT,
InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
InputT,
OutputT,
MayAlias,
ReadLeft>;

__shared__ typename Agent::TempStorage storage;

Expand All @@ -101,6 +104,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
agent.Process(tile_idx, tile_base);
}

} // namespace detail::adjacent_difference

template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT,
Expand Down Expand Up @@ -199,7 +204,8 @@ struct DispatchAdjacentDifference
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
.doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
.doit(detail::adjacent_difference::
DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
d_input,
first_tile_previous,
num_tiles,
Expand Down Expand Up @@ -230,7 +236,7 @@ struct DispatchAdjacentDifference

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
.doit(DeviceAdjacentDifferenceDifferenceKernel<
.doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel<
typename PolicyHub::MaxPolicy,
InputIteratorT,
OutputIteratorT,
Expand Down
32 changes: 16 additions & 16 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ CUB_NAMESPACE_BEGIN

namespace detail
{
namespace batch_memcpy
{
/**
* Initialization kernel for tile status initialization (multi-block)
*/
Expand Down Expand Up @@ -102,15 +104,13 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO
{
using StatusWord = typename TileT::StatusWord;
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;
using BufferSizeT = value_t<BufferSizeIteratorT>;
/// Internal load/store type. For byte-wise memcpy, a single-byte type
using AliasT =
typename ::cuda::std::conditional<IsMemcpy,
std::iterator_traits<char*>,
std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;
using AliasT = typename ::cuda::std::
conditional<IsMemcpy, std::iterator_traits<char*>, std::iterator_traits<value_t<InputBufferIt>>>::type::value_type;
/// Types of the input and output buffers
using InputBufferT = cub::detail::value_t<InputBufferIt>;
using OutputBufferT = cub::detail::value_t<OutputBufferIt>;
using InputBufferT = value_t<InputBufferIt>;
using OutputBufferT = value_t<OutputBufferIt>;

constexpr uint32_t BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS;
constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD;
Expand Down Expand Up @@ -164,16 +164,15 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO
{
if (thread_offset < buffer_sizes[buffer_id])
{
const auto value =
batch_memcpy::read_item<IsMemcpy, AliasT, InputBufferT>(input_buffer_it[buffer_id], thread_offset);
batch_memcpy::write_item<IsMemcpy, AliasT, OutputBufferT>(output_buffer_it[buffer_id], thread_offset, value);
const auto value = read_item<IsMemcpy, AliasT, InputBufferT>(input_buffer_it[buffer_id], thread_offset);
write_item<IsMemcpy, AliasT, OutputBufferT>(output_buffer_it[buffer_id], thread_offset, value);
}
thread_offset += BLOCK_THREADS;
}
}
else
{
batch_memcpy::copy_items<IsMemcpy, BLOCK_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
copy_items<IsMemcpy, BLOCK_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
input_buffer_it[buffer_id],
output_buffer_it[buffer_id],
(::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE),
Expand Down Expand Up @@ -232,13 +231,13 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO
BLevBlockOffsetTileState blev_block_scan_state)
{
// Internal type used for storing a buffer's size
using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;
using BufferSizeT = value_t<BufferSizeIteratorT>;

// Alias the correct tuning policy for the current compilation pass' architecture
using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT;

// Block-level specialization
using AgentBatchMemcpyT = batch_memcpy::AgentBatchMemcpy<
using AgentBatchMemcpyT = AgentBatchMemcpy<
AgentBatchMemcpyPolicyT,
InputBufferIt,
OutputBufferIt,
Expand Down Expand Up @@ -271,6 +270,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO
blev_block_scan_state)
.ConsumeTile(blockIdx.x);
}
} // namespace batch_memcpy

/**
* @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers
Expand Down Expand Up @@ -465,8 +465,8 @@ struct DispatchBatchMemcpy

// Kernels
auto init_scan_states_kernel =
InitTileStateKernel<BLevBufferOffsetTileState, BLevBlockOffsetTileState, BlockOffsetT>;
auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel<
detail::batch_memcpy::InitTileStateKernel<BLevBufferOffsetTileState, BLevBlockOffsetTileState, BlockOffsetT>;
auto batch_memcpy_non_blev_kernel = detail::batch_memcpy::BatchMemcpyKernel<
typename PolicyHub::MaxPolicy,
InputBufferIt,
OutputBufferIt,
Expand All @@ -481,7 +481,7 @@ struct DispatchBatchMemcpy
BLevBlockOffsetTileState,
IsMemcpy>;

auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel<
auto multi_block_memcpy_kernel = detail::batch_memcpy::MultiBlockBatchMemcpyKernel<
typename PolicyHub::MaxPolicy,
BufferOffsetT,
BlevBufferSrcsOutItT,
Expand Down
9 changes: 2 additions & 7 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,7 @@

CUB_NAMESPACE_BEGIN

namespace detail
{

namespace for_each
namespace detail::for_each
{

// The dispatch layer is in the detail namespace until we figure out tuning API
Expand Down Expand Up @@ -195,8 +192,6 @@ struct dispatch_t
}
};

} // namespace for_each

} // namespace detail
} // namespace detail::for_each

CUB_NAMESPACE_END
7 changes: 2 additions & 5 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,7 @@

CUB_NAMESPACE_BEGIN

namespace detail
{
namespace for_each_in_extents
namespace detail::for_each_in_extents
{

// The dispatch layer is in the detail namespace until we figure out the tuning API
Expand Down Expand Up @@ -203,8 +201,7 @@ private:
unsigned_index_type _size;
};

} // namespace for_each_in_extents
} // namespace detail
} // namespace detail::for_each_in_extents

CUB_NAMESPACE_END

Expand Down
Loading
Loading