This repository has been archived by the owner on Mar 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 447
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #354 from elstehle/feature/block-run-length-decode
Adds BlockRunLengthDecode algorithm and tests
- Loading branch information
Showing
2 changed files
with
986 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,392 @@ | ||
/****************************************************************************** | ||
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. | ||
* | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions are met: | ||
* * Redistributions of source code must retain the above copyright | ||
* notice, this list of conditions and the following disclaimer. | ||
* * Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in the | ||
* documentation and/or other materials provided with the distribution. | ||
* * Neither the name of the NVIDIA CORPORATION nor the | ||
* names of its contributors may be used to endorse or promote products | ||
* derived from this software without specific prior written permission. | ||
* | ||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | ||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | ||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
* | ||
******************************************************************************/ | ||
|
||
#pragma once | ||
|
||
#include "../config.cuh" | ||
#include "../thread/thread_search.cuh" | ||
#include "../util_math.cuh" | ||
#include "../util_namespace.cuh" | ||
#include "../util_ptx.cuh" | ||
#include "../util_type.cuh" | ||
#include "block_scan.cuh" | ||
#include <limits> | ||
#include <type_traits> | ||
|
||
CUB_NAMESPACE_BEGIN | ||
|
||
/** | ||
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given | ||
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output | ||
* array. | ||
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded | ||
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows | ||
* retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * | ||
* DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned. | ||
* | ||
* \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). | ||
* A run of length zero may not be followed by a run length that is not zero. | ||
* | ||
* \par | ||
* \code | ||
* __global__ void ExampleKernel(...) | ||
* { | ||
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t | ||
* using RunItemT = uint64_t; | ||
* // Type large enough to index into the run-length decoded array | ||
* using RunLengthT = uint32_t; | ||
* | ||
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads | ||
* constexpr int BLOCK_DIM_X = 128; | ||
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs | ||
* constexpr int RUNS_PER_THREAD = 2; | ||
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items | ||
* constexpr int DECODED_ITEMS_PER_THREAD = 4; | ||
* | ||
* // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each | ||
* using BlockRunLengthDecodeT = | ||
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>; | ||
* | ||
* // Allocate shared memory for BlockRunLengthDecode | ||
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage; | ||
* | ||
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output | ||
* RunItemT run_values[RUNS_PER_THREAD]; | ||
* RunLengthT run_lengths[RUNS_PER_THREAD]; | ||
* ... | ||
* | ||
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode | ||
* uint32_t total_decoded_size = 0; | ||
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size); | ||
* | ||
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs | ||
* // have been decoded. | ||
* uint32_t decoded_window_offset = 0U; | ||
* while (decoded_window_offset < total_decoded_size) | ||
* { | ||
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD]; | ||
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD]; | ||
* | ||
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding | ||
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset; | ||
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset); | ||
* | ||
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD; | ||
* | ||
* ... | ||
* } | ||
* } | ||
* \endcode | ||
* \par | ||
* Suppose the set of input \p run_values across the block of threads is | ||
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and | ||
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>. | ||
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], | ||
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2, | ||
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop. | ||
* | ||
* \tparam ItemT The data type of the items being run-length decoded | ||
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension | ||
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes | ||
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds | ||
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the | ||
* runs' lengths) | ||
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension | ||
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension | ||
*/ | ||
template <typename ItemT, | ||
int BLOCK_DIM_X, | ||
int RUNS_PER_THREAD, | ||
int DECODED_ITEMS_PER_THREAD, | ||
typename DecodedOffsetT = uint32_t, | ||
int BLOCK_DIM_Y = 1, | ||
int BLOCK_DIM_Z = 1> | ||
class BlockRunLengthDecode | ||
{ | ||
//--------------------------------------------------------------------- | ||
// CONFIGS & TYPE ALIASES | ||
//--------------------------------------------------------------------- | ||
private: | ||
/// The thread block size in threads | ||
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; | ||
|
||
/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0') | ||
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD; | ||
|
||
/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length) | ||
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_RAKING_MEMOIZE, BLOCK_DIM_Y, BLOCK_DIM_Z>; | ||
|
||
/// Type used to index into the block's runs | ||
using RunOffsetT = uint32_t; | ||
|
||
/// Shared memory type required by this thread block | ||
union _TempStorage | ||
{ | ||
typename RunOffsetScanT::TempStorage offset_scan; | ||
struct | ||
{ | ||
ItemT run_values[BLOCK_RUNS]; | ||
DecodedOffsetT run_offsets[BLOCK_RUNS]; | ||
} runs; | ||
}; // union TempStorage | ||
|
||
/// Internal storage allocator (used when the user does not provide pre-allocated shared memory) | ||
__device__ __forceinline__ _TempStorage &PrivateStorage() | ||
{ | ||
__shared__ _TempStorage private_storage; | ||
return private_storage; | ||
} | ||
|
||
/// Shared storage reference | ||
_TempStorage &temp_storage; | ||
|
||
/// Linear thread-id | ||
uint32_t linear_tid; | ||
|
||
public: | ||
struct TempStorage : Uninitialized<_TempStorage> | ||
{}; | ||
|
||
//--------------------------------------------------------------------- | ||
// CONSTRUCTOR | ||
//--------------------------------------------------------------------- | ||
|
||
/** | ||
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The | ||
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent | ||
* <b>RunLengthDecode</b> calls. | ||
*/ | ||
template <typename RunLengthT, typename TotalDecodedSizeT> | ||
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage, | ||
ItemT (&run_values)[RUNS_PER_THREAD], | ||
RunLengthT (&run_lengths)[RUNS_PER_THREAD], | ||
TotalDecodedSizeT &total_decoded_size) | ||
: temp_storage(temp_storage.Alias()) | ||
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) | ||
{ | ||
InitWithRunLengths(run_values, run_lengths, total_decoded_size); | ||
} | ||
|
||
/** | ||
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The | ||
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent | ||
* <b>RunLengthDecode</b> calls. | ||
*/ | ||
template <typename UserRunOffsetT> | ||
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage, | ||
ItemT (&run_values)[RUNS_PER_THREAD], | ||
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) | ||
: temp_storage(temp_storage.Alias()) | ||
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) | ||
{ | ||
InitWithRunOffsets(run_values, run_offsets); | ||
} | ||
|
||
/** | ||
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths. | ||
*/ | ||
template <typename RunLengthT, typename TotalDecodedSizeT> | ||
__device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD], | ||
RunLengthT (&run_lengths)[RUNS_PER_THREAD], | ||
TotalDecodedSizeT &total_decoded_size) | ||
: temp_storage(PrivateStorage()) | ||
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) | ||
{ | ||
InitWithRunLengths(run_values, run_lengths, total_decoded_size); | ||
} | ||
|
||
/** | ||
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets. | ||
*/ | ||
template <typename UserRunOffsetT> | ||
__device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD], | ||
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) | ||
: temp_storage(PrivateStorage()) | ||
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) | ||
{ | ||
InitWithRunOffsets(run_values, run_offsets); | ||
} | ||
|
||
private: | ||
/** | ||
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes | ||
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search | ||
* iterations at compile time. | ||
*/ | ||
template <int MAX_NUM_ITEMS, | ||
typename InputIteratorT, | ||
typename OffsetT, | ||
typename T> | ||
__device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence | ||
OffsetT num_items, ///< [in] Input sequence length | ||
T val) ///< [in] Search key | ||
{ | ||
OffsetT lower_bound = 0; | ||
OffsetT upper_bound = num_items; | ||
#pragma unroll | ||
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++) | ||
{ | ||
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound); | ||
mid = (cub::min)(mid, num_items - 1); | ||
|
||
if (val < input[mid]) | ||
{ | ||
upper_bound = mid; | ||
} | ||
else | ||
{ | ||
lower_bound = mid + 1; | ||
} | ||
} | ||
|
||
return lower_bound; | ||
} | ||
|
||
template <typename RunOffsetT> | ||
__device__ __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD], | ||
RunOffsetT (&run_offsets)[RUNS_PER_THREAD]) | ||
{ | ||
// Keep the runs' items and the offsets of each run's beginning in the temporary storage | ||
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD); | ||
#pragma unroll | ||
for (int i = 0; i < RUNS_PER_THREAD; i++) | ||
{ | ||
temp_storage.runs.run_values[thread_dst_offset] = run_values[i]; | ||
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i]; | ||
thread_dst_offset++; | ||
} | ||
|
||
// Ensure run offsets and run values have been writen to shared memory | ||
CTA_SYNC(); | ||
} | ||
|
||
template <typename RunLengthT, typename TotalDecodedSizeT> | ||
__device__ __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD], | ||
RunLengthT (&run_lengths)[RUNS_PER_THREAD], | ||
TotalDecodedSizeT &total_decoded_size) | ||
{ | ||
// Compute the offset for the beginning of each run | ||
DecodedOffsetT run_offsets[RUNS_PER_THREAD]; | ||
#pragma unroll | ||
for (int i = 0; i < RUNS_PER_THREAD; i++) | ||
{ | ||
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]); | ||
} | ||
DecodedOffsetT decoded_size_aggregate; | ||
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate); | ||
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate); | ||
|
||
// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) | ||
CTA_SYNC(); | ||
|
||
InitWithRunOffsets(run_values, run_offsets); | ||
} | ||
|
||
public: | ||
/** | ||
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded | ||
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the | ||
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within | ||
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be | ||
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to | ||
* <b>RunLengthDecode</b> is not required. | ||
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the | ||
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length | ||
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`. | ||
* \smemreuse | ||
* | ||
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement | ||
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to | ||
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results | ||
* in undefined behavior. | ||
*/ | ||
template <typename RelativeOffsetT> | ||
__device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], | ||
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD], | ||
DecodedOffsetT from_decoded_offset = 0) | ||
{ | ||
// The (global) offset of the first item decoded by this thread | ||
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD; | ||
|
||
// The run that the first decoded item of this thread belongs to | ||
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the | ||
// last run | ||
RunOffsetT assigned_run = | ||
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) - | ||
static_cast<RunOffsetT>(1U); | ||
|
||
DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run]; | ||
|
||
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this | ||
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1) | ||
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD | ||
: temp_storage.runs.run_offsets[assigned_run + 1]; | ||
|
||
ItemT val = temp_storage.runs.run_values[assigned_run]; | ||
|
||
#pragma unroll | ||
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++) | ||
{ | ||
decoded_items[i] = val; | ||
item_offsets[i] = thread_decoded_offset - assigned_run_begin; | ||
if (thread_decoded_offset == assigned_run_end - 1) | ||
{ | ||
// We make sure that a thread is not re-entering this conditional when being assigned to the last run already by | ||
// extending the last run's length to all the thread's item | ||
assigned_run++; | ||
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run]; | ||
|
||
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this | ||
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD | ||
: temp_storage.runs.run_offsets[assigned_run + 1]; | ||
val = temp_storage.runs.run_values[assigned_run]; | ||
} | ||
thread_decoded_offset++; | ||
} | ||
} | ||
|
||
/** | ||
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded | ||
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the | ||
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within | ||
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be | ||
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to | ||
* <b>RunLengthDecode</b> is not required. | ||
* | ||
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement | ||
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results | ||
* in undefined behavior. | ||
*/ | ||
__device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], | ||
DecodedOffsetT from_decoded_offset = 0) | ||
{ | ||
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD]; | ||
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset); | ||
} | ||
}; | ||
|
||
CUB_NAMESPACE_END |
Oops, something went wrong.