diff --git a/cub/block/block_run_length_decode.cuh b/cub/block/block_run_length_decode.cuh
new file mode 100644
index 0000000000..41a3ab22b1
--- /dev/null
+++ b/cub/block/block_run_length_decode.cuh
@@ -0,0 +1,392 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "../config.cuh"
+#include "../thread/thread_search.cuh"
+#include "../util_math.cuh"
+#include "../util_namespace.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "block_scan.cuh"
+
+#include <limits>
+#include <type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+/**
+ * \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
+ * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
+ * array.
+ * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
+ * array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
+ * retrieving a "window" from the run-length decoded array. The window's offset can be specified, and BLOCK_THREADS *
+ * DECODED_ITEMS_PER_THREAD decoded items (referred to as the window_size) starting at that offset will be returned.
+ *
+ * \note Trailing runs of length zero are supported (i.e., they may only appear at the end of the run_lengths array).
+ * A run of length zero must not be followed by a run of non-zero length.
+ *
+ * \par
+ * \code
+ * __global__ void ExampleKernel(...)
+ * {
+ *   // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
+ *   using RunItemT = uint64_t;
+ *   // Type large enough to index into the run-length decoded array
+ *   using RunLengthT = uint32_t;
+ *
+ *   // Specialising BlockRunLengthDecode for a 1D block of 128 threads
+ *   constexpr int BLOCK_DIM_X = 128;
+ *   // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
+ *   constexpr int RUNS_PER_THREAD = 2;
+ *   // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
+ *   constexpr int DECODED_ITEMS_PER_THREAD = 4;
+ *
+ *   // Specialise BlockRunLengthDecode using the parameters above
+ *   using BlockRunLengthDecodeT =
+ *     cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
+ *
+ *   // Allocate shared memory for BlockRunLengthDecode
+ *   __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
+ *
+ *   // The run-length encoded items and how often they shall be repeated in the run-length decoded output
+ *   RunItemT run_values[RUNS_PER_THREAD];
+ *   RunLengthT run_lengths[RUNS_PER_THREAD];
+ *   ...
+ *
+ *   // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
+ *   uint32_t total_decoded_size = 0;
+ *   BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
+ *
+ *   // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all
+ *   // runs have been decoded.
+ *   uint32_t decoded_window_offset = 0U;
+ *   while (decoded_window_offset < total_decoded_size)
+ *   {
+ *     RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
+ *     RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
+ *
+ *     // The number of decoded items that are valid within this window (aka pass) of run-length decoding
+ *     uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
+ *     block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
+ *
+ *     decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
+ *
+ *     ...
+ *   }
+ * }
+ * \endcode
+ * \par
+ * Suppose the set of input \p run_values across the block of threads is
+ * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and
+ * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }.
+ * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
+ * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2,
+ * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop.
+ *
+ * \tparam ItemT The data type of the items being run-length decoded
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
+ * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
+ * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
+ * runs' lengths)
+ * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
+ * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
+ */
+template <typename ItemT,
+          int BLOCK_DIM_X,
+          int RUNS_PER_THREAD,
+          int DECODED_ITEMS_PER_THREAD,
+          typename DecodedOffsetT = uint32_t,
+          int BLOCK_DIM_Y         = 1,
+          int BLOCK_DIM_Z         = 1>
+class BlockRunLengthDecode
+{
+  //---------------------------------------------------------------------
+  // CONFIGS & TYPE ALIASES
+  //---------------------------------------------------------------------
+private:
+  /// The thread block size in threads
+  static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+
+  /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
+  static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
+
+  /// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' lengths)
+  using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_RAKING_MEMOIZE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+
+  /// Type used to index into the block's runs
+  using RunOffsetT = uint32_t;
+
+  /// Shared memory type required by this thread block
+  union _TempStorage
+  {
+    typename RunOffsetScanT::TempStorage offset_scan;
+    struct
+    {
+      ItemT run_values[BLOCK_RUNS];
+      DecodedOffsetT run_offsets[BLOCK_RUNS];
+    } runs;
+  }; // union TempStorage
+
+  /// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
+  __device__ __forceinline__ _TempStorage &PrivateStorage()
+  {
+    __shared__ _TempStorage private_storage;
+    return private_storage;
+  }
+
+  /// Shared storage reference
+  _TempStorage &temp_storage;
+
+  /// Linear thread-id
+  uint32_t linear_tid;
+
+public:
+  struct TempStorage : Uninitialized<_TempStorage>
+  {};
+
+  //---------------------------------------------------------------------
+  // CONSTRUCTOR
+  //---------------------------------------------------------------------
+
+  /**
+   * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
+   * algorithm's temporary storage may not be repurposed between the constructor call and subsequent
+   * RunLengthDecode calls.
+   */
+  template <typename RunLengthT, typename TotalDecodedSizeT>
+  __device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
+                                                  ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                                  TotalDecodedSizeT &total_decoded_size)
+      : temp_storage(temp_storage.Alias())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunLengths(run_values, run_lengths, total_decoded_size);
+  }
+
+  /**
+   * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
+   * algorithm's temporary storage may not be repurposed between the constructor call and subsequent
+   * RunLengthDecode calls.
+   */
+  template <typename UserRunOffsetT>
+  __device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
+                                                  ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+      : temp_storage(temp_storage.Alias())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunOffsets(run_values, run_offsets);
+  }
+
+  /**
+   * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
+   */
+  template <typename RunLengthT, typename TotalDecodedSizeT>
+  __device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                                  TotalDecodedSizeT &total_decoded_size)
+      : temp_storage(PrivateStorage())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunLengths(run_values, run_lengths, total_decoded_size);
+  }
+
+  /**
+   * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
+   */
+  template <typename UserRunOffsetT>
+  __device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+      : temp_storage(PrivateStorage())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunOffsets(run_values, run_offsets);
+  }
+
+private:
+  /**
+   * \brief Returns the offset of the first value within \p input which compares greater than \p val. This version
+   * takes \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary
+   * search iterations at compile time.
+   */
+  template <int MAX_NUM_ITEMS, typename InputIteratorT, typename OffsetT, typename T>
+  __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
+                                                      OffsetT num_items,    ///< [in] Input sequence length
+                                                      T val)                ///< [in] Search key
+  {
+    OffsetT lower_bound = 0;
+    OffsetT upper_bound = num_items;
+#pragma unroll
+    for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
+    {
+      OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
+      mid         = (cub::min)(mid, num_items - 1);
+
+      if (val < input[mid])
+      {
+        upper_bound = mid;
+      }
+      else
+      {
+        lower_bound = mid + 1;
+      }
+    }
+
+    return lower_bound;
+  }
+
+  template <typename RunOffsetT>
+  __device__ __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                     RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+  {
+    // Keep the runs' items and the offsets of each run's beginning in the temporary storage
+    RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
+#pragma unroll
+    for (int i = 0; i < RUNS_PER_THREAD; i++)
+    {
+      temp_storage.runs.run_values[thread_dst_offset]  = run_values[i];
+      temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
+      thread_dst_offset++;
+    }
+
+    // Ensure run offsets and run values have been written to shared memory
+    CTA_SYNC();
+  }
+
+  template <typename RunLengthT, typename TotalDecodedSizeT>
+  __device__ __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                     RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                                     TotalDecodedSizeT &total_decoded_size)
+  {
+    // Compute the offset for the beginning of each run
+    DecodedOffsetT run_offsets[RUNS_PER_THREAD];
+#pragma unroll
+    for (int i = 0; i < RUNS_PER_THREAD; i++)
+    {
+      run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
+    }
+    DecodedOffsetT decoded_size_aggregate;
+    RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
+    total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
+
+    // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
+    CTA_SYNC();
+
+    InitWithRunOffsets(run_values, run_offsets);
+  }
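+
+  // Illustration (descriptive comment only, not part of the interface): if a single thread contributes the runs
+  // values = {3, 1, 4} with lengths = {2, 1, 3}, InitWithRunLengths stores the exclusive prefix sum {0, 2, 3} as
+  // run offsets and reports a total decoded size of 6. RunLengthDecode later maps a decoded offset such as 4 back
+  // to its run via StaticUpperBound<BLOCK_RUNS>(run_offsets, BLOCK_RUNS, 4) - 1 = 2, i.e., value 4 with relative
+  // offset 4 - run_offsets[2] = 1 within the decoded array {3, 3, 1, 4, 4, 4}.
+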
+public:
+  /**
+   * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
+   * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
+   * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within
+   * the buffer are returned. Subsequent calls to RunLengthDecode adjusting \p from_decoded_offset can be
+   * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
+   * RunLengthDecode is not required.
+   * \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
+   * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
+   * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
+   * \smemreuse
+   *
+   * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
+   * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
+   * \param[in] from_decoded_offset The offset at which the decoded window begins. Invoking this member function with
+   * a \p from_decoded_offset that is larger than \p total_decoded_size results in undefined behavior.
+   */
+  template <typename RelativeOffsetT>
+  __device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
+                                                  RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
+                                                  DecodedOffsetT from_decoded_offset = 0)
+  {
+    // The (global) offset of the first item decoded by this thread
+    DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
+
+    // The run that the first decoded item of this thread belongs to. If this thread's offset is already beyond the
+    // total decoded size, it will be assigned to the last run
+    RunOffsetT assigned_run =
+      StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
+      static_cast<RunOffsetT>(1U);
+
+    DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
+
+    // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
+    DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
+                                        ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
+                                        : temp_storage.runs.run_offsets[assigned_run + 1];
+
+    ItemT val = temp_storage.runs.run_values[assigned_run];
+
+#pragma unroll
+    for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
+    {
+      decoded_items[i] = val;
+      item_offsets[i]  = thread_decoded_offset - assigned_run_begin;
+      if (thread_decoded_offset == assigned_run_end - 1)
+      {
+        // We make sure that a thread is not re-entering this conditional when it has already been assigned to the
+        // last run by extending the last run's length to cover all of the thread's items
+        assigned_run++;
+        assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
+
+        // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
+        assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
+                                                            : temp_storage.runs.run_offsets[assigned_run + 1];
+        val = temp_storage.runs.run_values[assigned_run];
+      }
+      thread_decoded_offset++;
+    }
+  }
+
+  /**
+   * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
+   * items in a blocked arrangement to \p decoded_items.
+   * If the number of run-length decoded items exceeds the run-length decode buffer
+   * (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within the buffer are returned.
+   * Subsequent calls to RunLengthDecode adjusting \p from_decoded_offset can be used to retrieve the remaining
+   * run-length decoded items. Calling __syncthreads() between any two calls to RunLengthDecode is not required.
+   *
+   * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
+   * \param[in] from_decoded_offset The offset at which the decoded window begins. Invoking this member function with
+   * a \p from_decoded_offset that is larger than \p total_decoded_size results in undefined behavior.
+   */
+  __device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
+                                                  DecodedOffsetT from_decoded_offset = 0)
+  {
+    DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
+    RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
+  }
+};
+
+CUB_NAMESPACE_END
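
Before the test harness below, here is a minimal, self-contained sketch of the intended usage pattern. It is not part of the patch; the kernel name, the hard-coded runs, and the bounds-checked store are illustrative assumptions layered on the API introduced above.

#include <cstdint>

#include <cub/block/block_run_length_decode.cuh>

// Decodes each thread's runs into g_out, one window at a time
__global__ void ExampleDecodeKernel(uint64_t *g_out)
{
  constexpr int BLOCK_DIM_X              = 128;
  constexpr int RUNS_PER_THREAD          = 2;
  constexpr int DECODED_ITEMS_PER_THREAD = 4;
  using BlockRunLengthDecodeT =
    cub::BlockRunLengthDecode<uint64_t, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;

  __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;

  // Each thread contributes two fixed runs; a real kernel would load these from memory
  uint64_t run_values[RUNS_PER_THREAD]  = {2 * threadIdx.x, 2 * threadIdx.x + 1};
  uint32_t run_lengths[RUNS_PER_THREAD] = {3, 1};

  uint32_t total_decoded_size = 0;
  BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);

  // Decode window by window; each window yields BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD items
  uint32_t decoded_window_offset = 0;
  while (decoded_window_offset < total_decoded_size)
  {
    uint64_t decoded_items[DECODED_ITEMS_PER_THREAD];
    block_rld.RunLengthDecode(decoded_items, decoded_window_offset);

    // Write this thread's blocked segment, guarding against a partially filled last window
    uint32_t thread_offset = decoded_window_offset + threadIdx.x * DECODED_ITEMS_PER_THREAD;
    for (int i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
    {
      if (thread_offset + i < total_decoded_size)
      {
        g_out[thread_offset + i] = decoded_items[i];
      }
    }
    decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
  }
}
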
diff --git a/test/test_block_run_length_decode.cu b/test/test_block_run_length_decode.cu
new file mode 100644
index 0000000000..5affe3dd5c
--- /dev/null
+++ b/test/test_block_run_length_decode.cu
@@ -0,0 +1,594 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_run_length_decode.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/device/device_scan.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+/******************************************************************************
+ * HELPER CLASS FOR RUN-LENGTH DECODING TESTS
+ ******************************************************************************/
+
+/**
+ * \brief Class template to facilitate testing the BlockRunLengthDecode algorithm for all its template parameter
+ * specialisations.
+ *
+ * \tparam ItemItT The item type being run-length decoded
+ * \tparam RunLengthsItT Iterator type providing the runs' lengths
+ * \tparam RUNS_PER_THREAD The number of runs that each thread gets assigned
+ * \tparam DECODED_ITEMS_PER_THREAD The number of run-length decoded items that each thread is decoding
+ * \tparam TEST_RELATIVE_OFFSETS_ Whether to also retrieve each decoded item's relative offset within its run
+ * \tparam TEST_RUN_OFFSETS_ Whether to pass in each run's offset instead of each run's length
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
+ * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
+ */
+template <typename ItemItT,
+          typename RunLengthsItT,
+          uint32_t RUNS_PER_THREAD,
+          uint32_t DECODED_ITEMS_PER_THREAD,
+          bool TEST_RELATIVE_OFFSETS_,
+          bool TEST_RUN_OFFSETS_,
+          uint32_t BLOCK_DIM_X,
+          uint32_t BLOCK_DIM_Y = 1,
+          uint32_t BLOCK_DIM_Z = 1>
+class AgentTestBlockRunLengthDecode
+{
+public:
+  constexpr static uint32_t BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+  constexpr static uint32_t RUNS_PER_BLOCK    = RUNS_PER_THREAD * BLOCK_THREADS;
+  constexpr static bool TEST_RELATIVE_OFFSETS = TEST_RELATIVE_OFFSETS_;
+
+private:
+  using RunItemT   = typename std::iterator_traits<ItemItT>::value_type;
+  using RunLengthT = typename std::iterator_traits<RunLengthsItT>::value_type;
+
+  using BlockRunOffsetScanT = cub::BlockScan<RunLengthT, BLOCK_DIM_X, cub::BLOCK_SCAN_RAKING, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+
+  using BlockRunLengthDecodeT = cub::BlockRunLengthDecode<RunItemT,
+                                                          BLOCK_DIM_X,
+                                                          RUNS_PER_THREAD,
+                                                          DECODED_ITEMS_PER_THREAD,
+                                                          RunLengthT,
+                                                          BLOCK_DIM_Y,
+                                                          BLOCK_DIM_Z>;
+  using BlockLoadRunItemT =
+    cub::BlockLoad<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using BlockLoadRunLengthsT =
+    cub::BlockLoad<RunLengthT, BLOCK_DIM_X, RUNS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using BlockStoreDecodedItemT = cub::BlockStore<RunItemT,
+                                                 BLOCK_DIM_X,
+                                                 DECODED_ITEMS_PER_THREAD,
+                                                 cub::BLOCK_STORE_WARP_TRANSPOSE,
+                                                 BLOCK_DIM_Y,
+                                                 BLOCK_DIM_Z>;
+  using BlockStoreRelativeOffsetT = cub::BlockStore<RunLengthT,
+                                                    BLOCK_DIM_X,
+                                                    DECODED_ITEMS_PER_THREAD,
+                                                    cub::BLOCK_STORE_WARP_TRANSPOSE,
+                                                    BLOCK_DIM_Y,
+                                                    BLOCK_DIM_Z>;
+
+  __device__ __forceinline__ BlockRunLengthDecodeT
+  InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
+                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                           RunLengthT &decoded_size,
+                           cub::Int2Type<true> /*test_run_offsets*/)
+  {
+    RunLengthT run_offsets[RUNS_PER_THREAD];
+    BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, decoded_size);
+
+    // Ensure temporary shared memory can be repurposed
+    CTA_SYNC();
+
+    // Construct BlockRunLengthDecode and initialize with the run offsets
+    return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets);
+  }
+
+  __device__ __forceinline__ BlockRunLengthDecodeT
+  InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
+                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                           RunLengthT &decoded_size,
+                           cub::Int2Type<false> /*test_run_offsets*/)
+  {
+    // Construct BlockRunLengthDecode and initialize with the run lengths
+    return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_lengths, decoded_size);
+  }
+
+  __device__ __forceinline__ void LoadRuns(ItemItT d_block_unique_items,
+                                           RunLengthsItT d_block_run_lengths,
+                                           RunItemT (&unique_items)[RUNS_PER_THREAD],
+                                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                           size_t num_valid_items)
+  {
+    // Load this block's tile of run values
+    if (num_valid_items < RUNS_PER_BLOCK)
+    {
+      BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items, num_valid_items);
+    }
+    else
+    {
+      BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items);
+    }
+
+    // Ensure BlockLoad's temporary shared memory can be repurposed
+    CTA_SYNC();
+
+    // Load this block's tile of run lengths (out-of-bounds run lengths are padded with '0')
+    if (num_valid_items < RUNS_PER_BLOCK)
+      BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
+        .Load(d_block_run_lengths, run_lengths, num_valid_items, static_cast<RunLengthT>(0));
+    else
+      BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage).Load(d_block_run_lengths, run_lengths);
+
+    // Ensure temporary shared memory can be repurposed
+    CTA_SYNC();
+  }
+
+public:
+  union TempStorage
+  {
+    typename BlockLoadRunItemT::TempStorage load_uniques_storage;
+    typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage;
+    typename std::conditional<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>::type
+      run_offsets_scan_storage;
+    struct
+    {
+      typename BlockRunLengthDecodeT::TempStorage run_length_decode_storage;
+      typename BlockStoreDecodedItemT::TempStorage store_decoded_runs_storage;
+      typename BlockStoreRelativeOffsetT::TempStorage store_relative_offsets;
+    } decode;
+  };
+
+  TempStorage &temp_storage;
+
+  __device__ __forceinline__ AgentTestBlockRunLengthDecode(TempStorage &temp_storage)
+      : temp_storage(temp_storage)
+  {}
+
+  /**
+   * \brief Loads the given block (or tile) of runs, and computes their "decompressed" (run-length decoded) size.
+   */
+  __device__ __forceinline__ uint32_t GetDecodedSize(ItemItT d_block_unique_items,
+                                                     RunLengthsItT d_block_run_lengths,
+                                                     size_t num_valid_runs)
+  {
+    // Load this block's tile of encoded runs
+    RunItemT unique_items[RUNS_PER_THREAD];
+    RunLengthT run_lengths[RUNS_PER_THREAD];
+    LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
+
+    // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the "decompressed" size)
+    uint32_t decoded_size = 0U;
+    BlockRunLengthDecodeT run_length_decode =
+      InitBlockRunLengthDecode(unique_items, run_lengths, decoded_size, cub::Int2Type<TEST_RUN_OFFSETS_>());
+    return decoded_size;
+  }
+
+  /**
+   * \brief Loads the given block (or tile) of runs, run-length decodes them, and writes the results to \p
+   * d_block_decoded_out.
+   */
+  template <typename UniqueItemOutItT, typename RelativeOffsetOutItT>
+  __device__ __forceinline__ uint32_t WriteDecodedRuns(ItemItT d_block_unique_items,
+                                                       RunLengthsItT d_block_run_lengths,
+                                                       UniqueItemOutItT d_block_decoded_out,
+                                                       RelativeOffsetOutItT d_block_rel_out,
+                                                       size_t num_valid_runs)
+  {
+    // Load this block's tile of encoded runs
+    RunItemT unique_items[RUNS_PER_THREAD];
+    RunLengthT run_lengths[RUNS_PER_THREAD];
+    LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
+
+    // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the "decompressed" size)
+    uint32_t decoded_size = 0U;
+    BlockRunLengthDecodeT run_length_decode =
+      InitBlockRunLengthDecode(unique_items, run_lengths, decoded_size, cub::Int2Type<TEST_RUN_OFFSETS_>());
+
+    // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
+    // have been decoded.
+    uint32_t decoded_window_offset = 0U;
+    while (decoded_window_offset < decoded_size)
+    {
+      RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
+      RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
+
+      // The number of decoded items that are valid within this window (aka pass) of run-length decoding
+      uint32_t num_valid_items = decoded_size - decoded_window_offset;
+      run_length_decode.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
+      BlockStoreDecodedItemT(temp_storage.decode.store_decoded_runs_storage)
+        .Store(d_block_decoded_out + decoded_window_offset, decoded_items, num_valid_items);
+
+      if (TEST_RELATIVE_OFFSETS)
+      {
+        BlockStoreRelativeOffsetT(temp_storage.decode.store_relative_offsets)
+          .Store(d_block_rel_out + decoded_window_offset, relative_offsets, num_valid_items);
+      }
+
+      decoded_window_offset += DECODED_ITEMS_PER_THREAD * BLOCK_THREADS;
+    }
+    return decoded_size;
+  }
+};
+
+/******************************************************************************
+ * [STAGE 1] RUN-LENGTH DECODING TEST KERNEL
+ ******************************************************************************/
+template <typename AgentTestBlockRunLengthDecode,
+          typename ItemItT,
+          typename RunLengthsItT,
+          typename OffsetT,
+          typename DecodedSizesOutT>
+__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
+  void BlockRunLengthDecodeGetSizeKernel(const ItemItT d_unique_items,
+                                         const RunLengthsItT d_run_lengths,
+                                         const OffsetT num_runs,
+                                         DecodedSizesOutT d_decoded_sizes)
+{
+  constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
+
+  __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
+
+  OffsetT block_offset   = blockIdx.x * RUNS_PER_BLOCK;
+  OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset) : RUNS_PER_BLOCK;
+
+  AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
+  uint64_t num_decoded_items =
+    run_length_decode_agent.GetDecodedSize(d_unique_items + block_offset, d_run_lengths + block_offset, num_valid_runs);
+
+  d_decoded_sizes[blockIdx.x] = num_decoded_items;
+}
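+
+// Note on the two-stage structure (an illustrative summary): stage 1 only computes each block's decoded size, a
+// device-wide prefix sum over those sizes then yields each block's output offset, and stage 2 decodes the runs
+// again and writes them at the scanned offsets. E.g., with RUNS_PER_BLOCK = 1024 and num_runs = 10000, both
+// kernels launch 10 blocks, the last of which covers the remaining 784 valid runs.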
+
+/******************************************************************************
+ * [STAGE 2] RUN-LENGTH DECODING TEST KERNEL
+ ******************************************************************************/
+template <typename AgentTestBlockRunLengthDecode,
+          typename ItemItT,
+          typename RunLengthsItT,
+          typename DecodedSizesOutT,
+          typename OffsetT,
+          typename DecodedItemsOutItT,
+          typename RelativeOffsetOutItT>
+__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
+  void BlockRunLengthDecodeTestKernel(const ItemItT d_unique_items,
+                                      const RunLengthsItT d_run_lengths,
+                                      const DecodedSizesOutT d_decoded_offsets,
+                                      const OffsetT num_runs,
+                                      DecodedItemsOutItT d_decoded_items,
+                                      RelativeOffsetOutItT d_relative_offsets)
+{
+  constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
+
+  __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
+
+  OffsetT block_offset   = blockIdx.x * RUNS_PER_BLOCK;
+  OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset) : RUNS_PER_BLOCK;
+
+  AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
+  run_length_decode_agent.WriteDecodedRuns(d_unique_items + block_offset,
+                                           d_run_lengths + block_offset,
+                                           d_decoded_items + d_decoded_offsets[blockIdx.x],
+                                           d_relative_offsets + d_decoded_offsets[blockIdx.x],
+                                           num_valid_runs);
+}
+
+struct ModOp
+{
+  using T = uint32_t;
+  __host__ __device__ __forceinline__ T operator()(const T &x) const { return 1 + (x % 100); }
+};
+
+template <uint32_t RUNS_PER_THREAD,
+          uint32_t DECODED_ITEMS_PER_THREAD,
+          uint32_t BLOCK_DIM_X,
+          uint32_t BLOCK_DIM_Y,
+          uint32_t BLOCK_DIM_Z,
+          bool TEST_RELATIVE_OFFSETS,
+          bool TEST_RUN_OFFSETS>
+void TestAlgorithmSpecialisation()
+{
+  constexpr uint32_t THREADS_PER_BLOCK = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+  constexpr uint32_t RUNS_PER_BLOCK    = RUNS_PER_THREAD * THREADS_PER_BLOCK;
+
+  using RunItemT      = float;
+  using RunLengthT    = uint32_t;
+  using ItemItT       = cub::CountingInputIterator<RunItemT>;
+  using RunLengthsItT = cub::TransformInputIterator<RunLengthT, ModOp, cub::CountingInputIterator<RunLengthT>>;
+
+  ItemItT d_unique_items(1000U);
+  RunLengthsItT d_run_lengths(cub::CountingInputIterator<RunLengthT>(0), ModOp{});
+
+  constexpr uint32_t num_runs   = 10000;
+  constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK;
+
+  size_t temp_storage_bytes      = 0ULL;
+  void *temp_storage             = nullptr;
+  uint32_t *h_num_decoded_total  = nullptr;
+  uint32_t *d_decoded_sizes      = nullptr;
+  uint32_t *d_decoded_offsets    = nullptr;
+  RunItemT *d_decoded_out        = nullptr;
+  RunLengthT *d_relative_offsets = nullptr;
+  RunItemT *h_decoded_out        = nullptr;
+  RunLengthT *h_relative_offsets = nullptr;
+
+  using AgentTestBlockRunLengthDecodeT = AgentTestBlockRunLengthDecode<ItemItT,
+                                                                       RunLengthsItT,
+                                                                       RUNS_PER_THREAD,
+                                                                       DECODED_ITEMS_PER_THREAD,
+                                                                       TEST_RELATIVE_OFFSETS,
+                                                                       TEST_RUN_OFFSETS,
+                                                                       BLOCK_DIM_X,
+                                                                       BLOCK_DIM_Y,
+                                                                       BLOCK_DIM_Z>;
+
+  enum : uint32_t
+  {
+    TIMER_SIZE_BEGIN = 0,
+    TIMER_SIZE_END,
+    TIMER_DECODE_BEGIN,
+    TIMER_DECODE_END,
+    NUM_TIMERS,
+  };
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  cudaEvent_t cuda_evt_timers[NUM_TIMERS];
+  for (uint32_t i = 0; i < NUM_TIMERS; i++)
+  {
+    cudaEventCreate(&cuda_evt_timers[i]);
+  }
+
+  // Get temporary storage requirements for the scan (for computing offsets for the per-block run-length decoded items)
+  cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, d_decoded_sizes, d_decoded_offsets, num_blocks, stream);
+
+  // Allocate device memory
+  CubDebugExit(cudaMalloc(&temp_storage, temp_storage_bytes));
+  CubDebugExit(cudaMalloc(&d_decoded_sizes, num_blocks * sizeof(*d_decoded_sizes)));
+  // Allocate for the exclusive sum PLUS the overall aggregate
+  CubDebugExit(cudaMalloc(&d_decoded_offsets, (num_blocks + 1) * sizeof(*d_decoded_offsets)));
+  CubDebugExit(cudaMallocHost(&h_num_decoded_total, sizeof(*h_num_decoded_total)));
+
+  // Get the per-block number of items being decoded (i-th thread block writing size to d_decoded_sizes[i])
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_SIZE_BEGIN], stream));
+  BlockRunLengthDecodeGetSizeKernel<AgentTestBlockRunLengthDecodeT>
+    <<<num_blocks, dim3(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z), 0U, stream>>>(d_unique_items,
+                                                                              d_run_lengths,
+                                                                              num_runs,
+                                                                              d_decoded_sizes);
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_SIZE_END], stream));
+
+  // Compute the offsets at which each block's decoded items will be written: memsetting the first element to zero and
+  // computing an inclusive sum into d_decoded_offsets[1..num_blocks] yields an exclusive sum over the block sizes,
+  // with the overall aggregate landing in d_decoded_offsets[num_blocks]
+  CubDebugExit(cudaMemsetAsync(d_decoded_offsets, 0, sizeof(d_decoded_offsets[0]), stream));
+  CubDebugExit(cub::DeviceScan::InclusiveSum(temp_storage,
+                                             temp_storage_bytes,
+                                             d_decoded_sizes,
+                                             &d_decoded_offsets[1],
+                                             num_blocks,
+                                             stream));
+
+  // Copy the total decoded size to CPU in order to allocate just the right amount of device memory
+  CubDebugExit(cudaMemcpyAsync(h_num_decoded_total,
+                               &d_decoded_offsets[num_blocks],
+                               sizeof(*h_num_decoded_total),
+                               cudaMemcpyDeviceToHost,
+                               stream));
+
+  // Ensure the total decoded size has been copied from GPU to CPU
+  CubDebugExit(cudaStreamSynchronize(stream));
+
+  // Allocate host and device memory for the run-length decoded output
+  CubDebugExit(cudaMallocHost(&h_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
+  CubDebugExit(cudaMalloc(&d_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    CubDebugExit(cudaMalloc(&d_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
+    CubDebugExit(cudaMallocHost(&h_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
+  }
+
+  // Perform the block-wise run-length decoding (each block taking its offset from d_decoded_offsets)
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_DECODE_BEGIN], stream));
+  BlockRunLengthDecodeTestKernel<AgentTestBlockRunLengthDecodeT>
+    <<<num_blocks, dim3(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z), 0U, stream>>>(d_unique_items,
+                                                                              d_run_lengths,
+                                                                              d_decoded_offsets,
+                                                                              num_runs,
+                                                                              d_decoded_out,
+                                                                              d_relative_offsets);
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_DECODE_END], stream));
+
+  // Copy back results for verification
+  CubDebugExit(cudaMemcpyAsync(h_decoded_out,
+                               d_decoded_out,
+                               (*h_num_decoded_total) * sizeof(*h_decoded_out),
+                               cudaMemcpyDeviceToHost,
+                               stream));
+
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    // Copy back the relative offsets
+    CubDebugExit(cudaMemcpyAsync(h_relative_offsets,
+                                 d_relative_offsets,
+                                 (*h_num_decoded_total) * sizeof(*h_relative_offsets),
+                                 cudaMemcpyDeviceToHost,
+                                 stream));
+  }
+
+  // Generate host-side run-length decoded data for verification
+  std::vector<std::pair<RunItemT, RunLengthT>> host_golden;
+  host_golden.reserve(*h_num_decoded_total);
+  for (uint32_t run = 0; run < num_runs; run++)
+  {
+    for (RunLengthT i = 0; i < d_run_lengths[run]; i++)
+    {
+      host_golden.push_back({d_unique_items[run], i});
+    }
+  }
+
+  // Ensure the run-length decoded result has been copied to the host
+  CubDebugExit(cudaStreamSynchronize(stream));
+
+  // Verify the total run-length decoded size is correct
+  AssertEquals(host_golden.size(), h_num_decoded_total[0]);
+
+  float duration_size   = 0.0f;
+  float duration_decode = 0.0f;
+  cudaEventElapsedTime(&duration_size, cuda_evt_timers[TIMER_SIZE_BEGIN], cuda_evt_timers[TIMER_SIZE_END]);
+  cudaEventElapsedTime(&duration_decode, cuda_evt_timers[TIMER_DECODE_BEGIN], cuda_evt_timers[TIMER_DECODE_END]);
+
+  size_t decoded_bytes          = host_golden.size() * sizeof(RunItemT);
+  size_t relative_offsets_bytes = TEST_RELATIVE_OFFSETS ? host_golden.size() * sizeof(RunLengthT) : 0ULL;
+  size_t total_bytes_written    = decoded_bytes + relative_offsets_bytes;
+
+  std::cout << "MODE: " << (TEST_RELATIVE_OFFSETS ? "offsets, " : "normal, ")     //
+            << "INIT: " << (TEST_RUN_OFFSETS ? "run offsets, " : "run lengths, ") //
"run offsets, " : "run lengths, ") // + << "RUNS_PER_THREAD: " << RUNS_PER_THREAD // + << ", DECODED_ITEMS_PER_THREAD: " << DECODED_ITEMS_PER_THREAD // + << ", THREADS_PER_BLOCK: " << THREADS_PER_BLOCK // + << ", decoded size (bytes): " << decoded_bytes // + << ", relative offsets (bytes): " << relative_offsets_bytes // + << ", time_size (ms): " << duration_size // + << ", time_decode (ms): " << duration_decode // + << ", achieved decode BW (GB/s): " + << ((static_cast(total_bytes_written) / 1.0e9) * (1000.0 / duration_decode)) << "\n"; + + // Verify the run-length decoded data is correct + bool cmp_eq = true; + for (uint32_t i = 0; i < host_golden.size(); i++) + { + if (host_golden[i].first != h_decoded_out[i]) + { + std::cout << "Mismatch at #" << i << ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i] + << "\n"; + cmp_eq = false; + } + if (TEST_RELATIVE_OFFSETS) + { + if (host_golden[i].second != h_relative_offsets[i]) + { + std::cout << "Mismatch of relative offset at #" << i << ": CPU item: " << host_golden[i].first + << ", GPU: " << h_decoded_out[i] << "; relative offsets: CPU: " << host_golden[i].second + << ", GPU: " << h_relative_offsets[i] << "\n"; + cmp_eq = false; + break; + } + } + } + AssertEquals(cmp_eq, true); + + // Clean up memory allocations + CubDebugExit(cudaFree(temp_storage)); + CubDebugExit(cudaFree(d_decoded_sizes)); + CubDebugExit(cudaFree(d_decoded_offsets)); + CubDebugExit(cudaFree(d_decoded_out)); + CubDebugExit(cudaFreeHost(h_num_decoded_total)); + CubDebugExit(cudaFreeHost(h_decoded_out)); + if (TEST_RELATIVE_OFFSETS) + { + CubDebugExit(cudaFree(d_relative_offsets)); + CubDebugExit(cudaFreeHost(h_relative_offsets)); + } + + // Clean up events + for (uint32_t i = 0; i < NUM_TIMERS; i++) + { + CubDebugExit(cudaEventDestroy(cuda_evt_timers[i])); + } + + // Clean up streams + CubDebugExit(cudaStreamDestroy(stream)); +} + +template +void TestForTuningParameters() +{ + constexpr bool DO_TEST_RELATIVE_OFFSETS = true; + constexpr bool DO_NOT_TEST_RELATIVE_OFFSETS = false; + + constexpr bool TEST_WITH_RUN_OFFSETS = true; + constexpr bool TEST_WITH_RUN_LENGTHS = false; + // Run BlockRunLengthDecode that uses run lengths and generates offsets relative to each run + TestAlgorithmSpecialisation(); + + // Run BlockRunLengthDecode that uses run lengths and performs normal run-length decoding + TestAlgorithmSpecialisation(); + + // Run BlockRunLengthDecode that uses run offsets and generates offsets relative to each run + TestAlgorithmSpecialisation(); + + // Run BlockRunLengthDecode that uses run offsets and performs normal run-length decoding + TestAlgorithmSpecialisation(); +} + +int main(int argc, char **argv) +{ + CommandLineArgs args(argc, argv); + + // Initialize device + CubDebugExit(args.DeviceInit()); + + // Instantiate test template instances for various configurations (tuning parameter dimensions) + // + TestForTuningParameters<1U, 1U, 64U>(); + TestForTuningParameters<1U, 3U, 32U, 2U, 3U>(); + TestForTuningParameters<1U, 1U, 128U>(); + TestForTuningParameters<1U, 8U, 128U>(); + TestForTuningParameters<2U, 8U, 128U>(); + TestForTuningParameters<3U, 1U, 256U>(); + TestForTuningParameters<1U, 8U, 256U>(); + TestForTuningParameters<8U, 1U, 256U>(); + TestForTuningParameters<1U, 1U, 256U>(); + TestForTuningParameters<2U, 2U, 384U>(); + + return 0; +}