Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #354 from elstehle/feature/block-run-length-decode
Browse files Browse the repository at this point in the history
Adds BlockRunLengthDecode algorithm and tests
  • Loading branch information
alliepiper authored Oct 19, 2021
2 parents 5712619 + 5ead406 commit da50ae9
Show file tree
Hide file tree
Showing 2 changed files with 986 additions and 0 deletions.
392 changes: 392 additions & 0 deletions cub/block/block_run_length_decode.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,392 @@
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include "../config.cuh"
#include "../thread/thread_search.cuh"
#include "../util_math.cuh"
#include "../util_namespace.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "block_scan.cuh"
#include <limits>
#include <type_traits>

CUB_NAMESPACE_BEGIN

/**
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
* array.
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
* retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS *
* DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned.
*
* \note: Trailing runs of length 0 are supported; i.e., runs of length zero may only appear
* at the end of the run_lengths array, and a run of length zero must not be followed by a run
* of non-zero length.
*
* \par
* \code
* __global__ void ExampleKernel(...)
* {
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
* using RunItemT = uint64_t;
* // Type large enough to index into the run-length decoded array
* using RunLengthT = uint32_t;
*
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads
* constexpr int BLOCK_DIM_X = 128;
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
* constexpr int RUNS_PER_THREAD = 2;
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
* constexpr int DECODED_ITEMS_PER_THREAD = 4;
*
* // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
* using BlockRunLengthDecodeT =
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
*
* // Allocate shared memory for BlockRunLengthDecode
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
*
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output
* RunItemT run_values[RUNS_PER_THREAD];
* RunLengthT run_lengths[RUNS_PER_THREAD];
* ...
*
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
* uint32_t total_decoded_size = 0;
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
*
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
* // have been decoded.
* uint32_t decoded_window_offset = 0U;
* while (decoded_window_offset < total_decoded_size)
* {
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
*
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
*
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
*
* ...
* }
* }
* \endcode
* \par
* Suppose the set of input \p run_values across the block of threads is
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
*
* \tparam ItemT The data type of the items being run-length decoded
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
* runs' lengths)
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemT,
int BLOCK_DIM_X,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
typename DecodedOffsetT = uint32_t,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockRunLengthDecode
{
//---------------------------------------------------------------------
// CONFIGS & TYPE ALIASES
//---------------------------------------------------------------------
private:
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;

/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;

/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_RAKING_MEMOIZE, BLOCK_DIM_Y, BLOCK_DIM_Z>;

/// Type used to index into the block's runs
using RunOffsetT = uint32_t;

/// Shared memory type required by this thread block.
/// NOTE: this is a union — the scan's scratch space is only needed during construction
/// (InitWithRunLengths), after which the same shared memory is reused to hold the runs'
/// values and begin-offsets; a CTA_SYNC() separates the two uses.
union _TempStorage
{
typename RunOffsetScanT::TempStorage offset_scan;
struct
{
ItemT run_values[BLOCK_RUNS];
DecodedOffsetT run_offsets[BLOCK_RUNS];
} runs;
}; // union TempStorage

/// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
/// Function-scope __shared__ yields one instance per thread block.
__device__ __forceinline__ _TempStorage &PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}

/// Shared storage reference
_TempStorage &temp_storage;

/// Linear thread-id
uint32_t linear_tid;

public:
struct TempStorage : Uninitialized<_TempStorage>
{};

//---------------------------------------------------------------------
// CONSTRUCTOR
//---------------------------------------------------------------------

/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}

/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename UserRunOffsetT>
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}

/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
__device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}

/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
*/
template <typename UserRunOffsetT>
__device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}

private:
/**
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search
* iterations at compile time.
*
* Because the iteration count is fixed at compile time (Log2<MAX_NUM_ITEMS> + 1 iterations),
* the loop can be fully unrolled; it always runs the full number of iterations regardless of
* the actual \p num_items.
*/
template <int MAX_NUM_ITEMS,
typename InputIteratorT,
typename OffsetT,
typename T>
__device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
OffsetT num_items, ///< [in] Input sequence length
T val) ///< [in] Search key
{
OffsetT lower_bound = 0;
OffsetT upper_bound = num_items;
#pragma unroll
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
// Clamp the probe index: since the loop always runs a fixed number of iterations,
// mid may otherwise reach num_items once the search interval has collapsed
mid = (cub::min)(mid, num_items - 1);

if (val < input[mid])
{
upper_bound = mid;
}
else
{
lower_bound = mid + 1;
}
}

return lower_bound;
}

/// Stores each thread's run values and run begin-offsets to shared memory in a blocked
/// arrangement (thread i owns runs [i*RUNS_PER_THREAD, (i+1)*RUNS_PER_THREAD)).
template <typename RunOffsetT>
__device__ __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
{
// Keep the runs' items and the offsets of each run's beginning in the temporary storage
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
thread_dst_offset++;
}

// Ensure run offsets and run values have been written to shared memory
CTA_SYNC();
}

/// Computes each run's begin-offset via an exclusive prefix sum over the run lengths, then
/// delegates to InitWithRunOffsets. Also returns the block-wide total decoded size.
template <typename RunLengthT, typename TotalDecodedSizeT>
__device__ __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
{
// Compute the offset for the beginning of each run
DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
}
DecodedOffsetT decoded_size_aggregate;
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);

// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
// (the union aliases offset_scan with runs, so the scan must be fully done before InitWithRunOffsets writes runs)
CTA_SYNC();

InitWithRunOffsets(run_values, run_offsets);
}

public:
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
* \smemreuse
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
template <typename RelativeOffsetT>
__device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// The (global) offset of the first item decoded by this thread
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;

// The run that the first decoded item of this thread belongs to
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
// last run
RunOffsetT assigned_run =
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
static_cast<RunOffsetT>(1U);

DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];

// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
// (the sentinel end lies past this thread's final item, so the run-advance branch below never triggers again)
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];

ItemT val = temp_storage.runs.run_values[assigned_run];

#pragma unroll
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
{
decoded_items[i] = val;
item_offsets[i] = thread_decoded_offset - assigned_run_begin;
// The item just emitted is the last one of the currently assigned run: advance to the
// next run so the next iteration reads that run's value and begin-offset
if (thread_decoded_offset == assigned_run_end - 1)
{
// We make sure that a thread is not re-entering this conditional when being assigned to the last run already by
// extending the last run's length to all the thread's item
assigned_run++;
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];

// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
val = temp_storage.runs.run_values[assigned_run];
}
thread_decoded_offset++;
}
}

/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
__device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// Convenience overload: discards the relative item offsets
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
}
};

CUB_NAMESPACE_END
Loading

0 comments on commit da50ae9

Please sign in to comment.