Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Fix adjacent difference op calls count
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Jul 18, 2022
1 parent ce8d531 commit 6dda1c2
Show file tree
Hide file tree
Showing 4 changed files with 443 additions and 422 deletions.
38 changes: 31 additions & 7 deletions cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -143,16 +143,40 @@ struct AgentDifference
{
if (IS_FIRST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op);
if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeftPartialTile(input,
output,
difference_op,
num_remaining);
}
else
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op);
}
}
else
{
InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op, tile_prev_input);
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeftPartialTile(input,
output,
difference_op,
num_remaining,
tile_prev_input);
}
else
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op, tile_prev_input);
}
}
}
else
Expand Down
174 changes: 154 additions & 20 deletions cub/block/block_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -490,15 +490,16 @@ public:
}

/**
* @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
* @brief Subtracts the left element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference to
* compute the left difference between adjacent elements.
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the left difference between adjacent elements.
*
* @par
* @code
Expand All @@ -516,30 +517,152 @@ public:
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
* int valid_items = 9;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
* thread_data,
* thread_data,
* CustomDifference(),
* valid_items);
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
* @param[out] output
* Calling thread's adjacent difference result
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
* thread_data,
* thread_data,
* CustomDifference());
* @param[in] input
* Calling thread's input items (may be aliased to \p output)
*
* @param[in] difference_op
* Binary difference operator
*
* @param[in] valid_items
* Number of valid items in thread block
*/
template <int ITEMS_PER_THREAD,
typename OutputType,
typename DifferenceOpT>
__device__ __forceinline__ void
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

CTA_SYNC();

if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items)
{
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
output[item] = difference_op(input[item], input[item - 1]);
}
}
else
{
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
const int idx = linear_tid * ITEMS_PER_THREAD + item;

if (idx < valid_items)
{
output[item] = difference_op(input[item], input[item - 1]);
}
else
{
output[item] = input[item];
}
}
}

if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
{
output[0] = input[0];
}
else
{
output[0] = difference_op(input[0],
temp_storage.last_items[linear_tid - 1]);
}
}

/**
* @brief Subtracts the left element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the left difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
* int valid_items = 9;
* int tile_predecessor_item = 4;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
* thread_data,
* thread_data,
* CustomDifference(),
* valid_items,
* tile_predecessor_item);
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
* `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
*
* @param[out] output
* Calling thread's adjacent difference result
Expand All @@ -552,6 +675,11 @@ public:
*
* @param[in] valid_items
* Number of valid items in thread block
*
* @param[in] tile_predecessor_item
* **[<em>thread</em><sub>0</sub> only]** item which is going to be
* subtracted from the first tile item (<tt>input<sub>0</sub></tt> from
* <em>thread</em><sub>0</sub>).
*/
template <int ITEMS_PER_THREAD,
typename OutputType,
Expand All @@ -560,7 +688,8 @@ public:
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
int valid_items,
T tile_predecessor_item)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
Expand Down Expand Up @@ -593,10 +722,15 @@ public:
}
}

if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
if (valid_items <= linear_tid * ITEMS_PER_THREAD)
{
output[0] = input[0];
}
else if (linear_tid == 0)
{
output[0] = difference_op(input[0],
tile_predecessor_item);
}
else
{
output[0] = difference_op(input[0],
Expand Down
Loading

0 comments on commit 6dda1c2

Please sign in to comment.