diff --git a/cub/block/block_run_length_decode.cuh b/cub/block/block_run_length_decode.cuh
new file mode 100644
index 0000000000..41a3ab22b1
--- /dev/null
+++ b/cub/block/block_run_length_decode.cuh
@@ -0,0 +1,392 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include "../config.cuh"
+#include "../thread/thread_search.cuh"
+#include "../util_math.cuh"
+#include "../util_namespace.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "block_scan.cuh"
+
+#include <limits>
+#include <type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+/**
+ * \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
+ * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
+ * array.
+ * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
+ * array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
+ * retrieving a "window" from the run-length decoded array. The window's offset can be specified, and BLOCK_THREADS *
+ * DECODED_ITEMS_PER_THREAD decoded items (referred to as the window_size) starting at that offset will be returned.
+ *
+ * \note Trailing runs of length zero are supported (i.e., they may only appear at the end of the run_lengths array).
+ * A run of length zero must not be followed by a run of non-zero length.
+ *
+ * \par
+ * \code
+ * __global__ void ExampleKernel(...)
+ * {
+ *   // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
+ *   using RunItemT = uint64_t;
+ *   // Type large enough to index into the run-length decoded array
+ *   using RunLengthT = uint32_t;
+ *
+ *   // Specialising BlockRunLengthDecode for a 1D block of 128 threads
+ *   constexpr int BLOCK_DIM_X = 128;
+ *   // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
+ *   constexpr int RUNS_PER_THREAD = 2;
+ *   // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
+ *   constexpr int DECODED_ITEMS_PER_THREAD = 4;
+ *
+ *   // Specialise BlockRunLengthDecode using the parameters above
+ *   using BlockRunLengthDecodeT =
+ *     cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
+ *
+ *   // Allocate shared memory for BlockRunLengthDecode
+ *   __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
+ *
+ *   // The run-length encoded items and how often they shall be repeated in the run-length decoded output
+ *   RunItemT run_values[RUNS_PER_THREAD];
+ *   RunLengthT run_lengths[RUNS_PER_THREAD];
+ *   ...
+ *
+ *   // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
+ *   uint32_t total_decoded_size = 0;
+ *   BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
+ *
+ *   // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all
+ *   // runs have been decoded.
+ *   uint32_t decoded_window_offset = 0U;
+ *   while (decoded_window_offset < total_decoded_size)
+ *   {
+ *     RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
+ *     RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
+ *
+ *     // The number of decoded items that are valid within this window (aka pass) of run-length decoding
+ *     uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
+ *     block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
+ *
+ *     decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
+ *
+ *     ...
+ *   }
+ * }
+ * \endcode
+ * \par
+ * Suppose the set of input \p run_values across the block of threads is
+ * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and
+ * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }.
+ * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
+ * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2,
+ * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop.
+ *
+ * \tparam ItemT The data type of the items being run-length decoded
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
+ * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
+ * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
+ * runs' lengths)
+ * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
+ * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
+ */
+template <typename ItemT,
+          int BLOCK_DIM_X,
+          int RUNS_PER_THREAD,
+          int DECODED_ITEMS_PER_THREAD,
+          typename DecodedOffsetT = uint32_t,
+          int BLOCK_DIM_Y         = 1,
+          int BLOCK_DIM_Z         = 1>
+class BlockRunLengthDecode
+{
+  //---------------------------------------------------------------------
+  // CONFIGS & TYPE ALIASES
+  //---------------------------------------------------------------------
+private:
+  /// The thread block size in threads
+  static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+
+  /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
+  static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
+
+  /// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' lengths)
+  using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_RAKING_MEMOIZE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+
+  /// Type used to index into the block's runs
+  using RunOffsetT = uint32_t;
+
+  /// Shared memory type required by this thread block
+  union _TempStorage
+  {
+    typename RunOffsetScanT::TempStorage offset_scan;
+    struct
+    {
+      ItemT run_values[BLOCK_RUNS];
+      DecodedOffsetT run_offsets[BLOCK_RUNS];
+    } runs;
+  }; // union TempStorage
+
+  /// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
+  __device__ __forceinline__ _TempStorage &PrivateStorage()
+  {
+    __shared__ _TempStorage private_storage;
+    return private_storage;
+  }
+
+  /// Shared storage reference
+  _TempStorage &temp_storage;
+
+  /// Linear thread-id
+  uint32_t linear_tid;
+
+public:
+  struct TempStorage : Uninitialized<_TempStorage>
+  {};
+
+  //---------------------------------------------------------------------
+  // CONSTRUCTOR
+  //---------------------------------------------------------------------
+
+  /**
+   * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
+   * algorithm's temporary storage may not be repurposed between the constructor call and subsequent
+   * RunLengthDecode calls.
+   */
+  template <typename RunLengthT, typename TotalDecodedSizeT>
+  __device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
+                                                  ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                                  TotalDecodedSizeT &total_decoded_size)
+      : temp_storage(temp_storage.Alias())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunLengths(run_values, run_lengths, total_decoded_size);
+  }
+
+  /**
+   * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
+   * algorithm's temporary storage may not be repurposed between the constructor call and subsequent
+   * RunLengthDecode calls.
+   */
+  template <typename UserRunOffsetT>
+  __device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
+                                                  ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+      : temp_storage(temp_storage.Alias())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunOffsets(run_values, run_offsets);
+  }
+
+  /**
+   * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
+   */
+  template <typename RunLengthT, typename TotalDecodedSizeT>
+  __device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                                  TotalDecodedSizeT &total_decoded_size)
+      : temp_storage(PrivateStorage())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunLengths(run_values, run_lengths, total_decoded_size);
+  }
+
+  /**
+   * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
+   */
+  template <typename UserRunOffsetT>
+  __device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                  UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+      : temp_storage(PrivateStorage())
+      , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+  {
+    InitWithRunOffsets(run_values, run_offsets);
+  }
+
+private:
+  /**
+   * \brief Returns the offset of the first value within \p input which compares greater than \p val. This version
+   * takes \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary
+   * search iterations at compile time.
+   */
+  template <int MAX_NUM_ITEMS, typename InputIteratorT, typename OffsetT, typename T>
+  __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
+                                                      OffsetT num_items,    ///< [in] Input sequence length
+                                                      T val)                ///< [in] Search key
+  {
+    OffsetT lower_bound = 0;
+    OffsetT upper_bound = num_items;
+#pragma unroll
+    for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
+    {
+      OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
+      mid         = (cub::min)(mid, num_items - 1);
+
+      if (val < input[mid])
+      {
+        upper_bound = mid;
+      }
+      else
+      {
+        lower_bound = mid + 1;
+      }
+    }
+
+    return lower_bound;
+  }
+
+  template <typename RunOffsetT>
+  __device__ __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                     RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+  {
+    // Keep the runs' items and the offsets of each run's beginning in the temporary storage
+    RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
+#pragma unroll
+    for (int i = 0; i < RUNS_PER_THREAD; i++)
+    {
+      temp_storage.runs.run_values[thread_dst_offset]  = run_values[i];
+      temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
+      thread_dst_offset++;
+    }
+
+    // Ensure run offsets and run values have been written to shared memory
+    CTA_SYNC();
+  }
+
+  template <typename RunLengthT, typename TotalDecodedSizeT>
+  __device__ __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
+                                                     RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                                     TotalDecodedSizeT &total_decoded_size)
+  {
+    // Compute the offset for the beginning of each run
+    DecodedOffsetT run_offsets[RUNS_PER_THREAD];
+#pragma unroll
+    for (int i = 0; i < RUNS_PER_THREAD; i++)
+    {
+      run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
+    }
+    DecodedOffsetT decoded_size_aggregate;
+    RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
+    total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
+
+    // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
+    CTA_SYNC();
+
+    InitWithRunOffsets(run_values, run_offsets);
+  }
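+
+  // Illustration (descriptive comment only, not part of the interface): if a single thread contributes the runs
+  // values = {3, 1, 4} with lengths = {2, 1, 3}, InitWithRunLengths stores the exclusive prefix sum {0, 2, 3} as
+  // run offsets and reports a total decoded size of 6. RunLengthDecode later maps a decoded offset such as 4 back
+  // to its run via StaticUpperBound<BLOCK_RUNS>(run_offsets, BLOCK_RUNS, 4) - 1 = 2, i.e., value 4 with relative
+  // offset 4 - run_offsets[2] = 1 within the decoded array {3, 3, 1, 4, 4, 4}.
+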
+public:
+  /**
+   * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
+   * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
+   * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within
+   * the buffer are returned. Subsequent calls to RunLengthDecode adjusting \p from_decoded_offset can be
+   * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
+   * RunLengthDecode is not required.
+   * \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
+   * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
+   * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
+   * \smemreuse
+   *
+   * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
+   * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
+   * \param[in] from_decoded_offset The offset at which the decoded window begins. Invoking this member function with
+   * a \p from_decoded_offset that is larger than \p total_decoded_size results in undefined behavior.
+   */
+  template <typename RelativeOffsetT>
+  __device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
+                                                  RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
+                                                  DecodedOffsetT from_decoded_offset = 0)
+  {
+    // The (global) offset of the first item decoded by this thread
+    DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
+
+    // The run that the first decoded item of this thread belongs to. If this thread's offset is already beyond the
+    // total decoded size, it will be assigned to the last run
+    RunOffsetT assigned_run =
+      StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
+      static_cast<RunOffsetT>(1U);
+
+    DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
+
+    // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
+    DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
+                                        ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
+                                        : temp_storage.runs.run_offsets[assigned_run + 1];
+
+    ItemT val = temp_storage.runs.run_values[assigned_run];
+
+#pragma unroll
+    for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
+    {
+      decoded_items[i] = val;
+      item_offsets[i]  = thread_decoded_offset - assigned_run_begin;
+      if (thread_decoded_offset == assigned_run_end - 1)
+      {
+        // We make sure that a thread is not re-entering this conditional when it has already been assigned to the
+        // last run by extending the last run's length to cover all of the thread's items
+        assigned_run++;
+        assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
+
+        // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
+        assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
+                                                            : temp_storage.runs.run_offsets[assigned_run + 1];
+        val = temp_storage.runs.run_values[assigned_run];
+      }
+      thread_decoded_offset++;
+    }
+  }
+
+  /**
+   * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
+   * items in a blocked arrangement to \p decoded_items.
+   * If the number of run-length decoded items exceeds the run-length decode buffer
+   * (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within the buffer are returned.
+   * Subsequent calls to RunLengthDecode adjusting \p from_decoded_offset can be used to retrieve the remaining
+   * run-length decoded items. Calling __syncthreads() between any two calls to RunLengthDecode is not required.
+   *
+   * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
+   * \param[in] from_decoded_offset The offset at which the decoded window begins. Invoking this member function with
+   * a \p from_decoded_offset that is larger than \p total_decoded_size results in undefined behavior.
+   */
+  __device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
+                                                  DecodedOffsetT from_decoded_offset = 0)
+  {
+    DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
+    RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
+  }
+};
+
+CUB_NAMESPACE_END
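
Before the test harness below, here is a minimal, self-contained sketch of the intended usage pattern. It is not part of the patch; the kernel name, the hard-coded runs, and the bounds-checked store are illustrative assumptions layered on the API introduced above.

#include <cstdint>

#include <cub/block/block_run_length_decode.cuh>

// Decodes each thread's runs into g_out, one window at a time
__global__ void ExampleDecodeKernel(uint64_t *g_out)
{
  constexpr int BLOCK_DIM_X              = 128;
  constexpr int RUNS_PER_THREAD          = 2;
  constexpr int DECODED_ITEMS_PER_THREAD = 4;
  using BlockRunLengthDecodeT =
    cub::BlockRunLengthDecode<uint64_t, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;

  __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;

  // Each thread contributes two fixed runs; a real kernel would load these from memory
  uint64_t run_values[RUNS_PER_THREAD]  = {2 * threadIdx.x, 2 * threadIdx.x + 1};
  uint32_t run_lengths[RUNS_PER_THREAD] = {3, 1};

  uint32_t total_decoded_size = 0;
  BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);

  // Decode window by window; each window yields BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD items
  uint32_t decoded_window_offset = 0;
  while (decoded_window_offset < total_decoded_size)
  {
    uint64_t decoded_items[DECODED_ITEMS_PER_THREAD];
    block_rld.RunLengthDecode(decoded_items, decoded_window_offset);

    // Write this thread's blocked segment, guarding against a partially filled last window
    uint32_t thread_offset = decoded_window_offset + threadIdx.x * DECODED_ITEMS_PER_THREAD;
    for (int i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
    {
      if (thread_offset + i < total_decoded_size)
      {
        g_out[thread_offset + i] = decoded_items[i];
      }
    }
    decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
  }
}
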
diff --git a/test/test_block_run_length_decode.cu b/test/test_block_run_length_decode.cu
new file mode 100644
index 0000000000..5affe3dd5c
--- /dev/null
+++ b/test/test_block_run_length_decode.cu
@@ -0,0 +1,594 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_run_length_decode.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/device/device_scan.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+/******************************************************************************
+ * HELPER CLASS FOR RUN-LENGTH DECODING TESTS
+ ******************************************************************************/
+
+/**
+ * \brief Class template to facilitate testing the BlockRunLengthDecode algorithm for all its template parameter
+ * specialisations.
+ *
+ * \tparam ItemItT The item type being run-length decoded
+ * \tparam RunLengthsItT Iterator type providing the runs' lengths
+ * \tparam RUNS_PER_THREAD The number of runs that each thread gets assigned
+ * \tparam DECODED_ITEMS_PER_THREAD The number of run-length decoded items that each thread is decoding
+ * \tparam TEST_RELATIVE_OFFSETS_ Whether to also retrieve each decoded item's relative offset within its run
+ * \tparam TEST_RUN_OFFSETS_ Whether to pass in each run's offset instead of each run's length
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
+ * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
+ */
+template <typename ItemItT,
+          typename RunLengthsItT,
+          uint32_t RUNS_PER_THREAD,
+          uint32_t DECODED_ITEMS_PER_THREAD,
+          bool TEST_RELATIVE_OFFSETS_,
+          bool TEST_RUN_OFFSETS_,
+          uint32_t BLOCK_DIM_X,
+          uint32_t BLOCK_DIM_Y = 1,
+          uint32_t BLOCK_DIM_Z = 1>
+class AgentTestBlockRunLengthDecode
+{
+public:
+  constexpr static uint32_t BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+  constexpr static uint32_t RUNS_PER_BLOCK    = RUNS_PER_THREAD * BLOCK_THREADS;
+  constexpr static bool TEST_RELATIVE_OFFSETS = TEST_RELATIVE_OFFSETS_;
+
+private:
+  using RunItemT   = typename std::iterator_traits<ItemItT>::value_type;
+  using RunLengthT = typename std::iterator_traits<RunLengthsItT>::value_type;
+
+  using BlockRunOffsetScanT = cub::BlockScan<RunLengthT, BLOCK_DIM_X, cub::BLOCK_SCAN_RAKING, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+
+  using BlockRunLengthDecodeT = cub::BlockRunLengthDecode<RunItemT,
+                                                          BLOCK_DIM_X,
+                                                          RUNS_PER_THREAD,
+                                                          DECODED_ITEMS_PER_THREAD,
+                                                          RunLengthT,
+                                                          BLOCK_DIM_Y,
+                                                          BLOCK_DIM_Z>;
+  using BlockLoadRunItemT =
+    cub::BlockLoad<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using BlockLoadRunLengthsT =
+    cub::BlockLoad<RunLengthT, BLOCK_DIM_X, RUNS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  using BlockStoreDecodedItemT = cub::BlockStore<RunItemT,
+                                                 BLOCK_DIM_X,
+                                                 DECODED_ITEMS_PER_THREAD,
+                                                 cub::BLOCK_STORE_WARP_TRANSPOSE,
+                                                 BLOCK_DIM_Y,
+                                                 BLOCK_DIM_Z>;
+  using BlockStoreRelativeOffsetT = cub::BlockStore<RunLengthT,
+                                                    BLOCK_DIM_X,
+                                                    DECODED_ITEMS_PER_THREAD,
+                                                    cub::BLOCK_STORE_WARP_TRANSPOSE,
+                                                    BLOCK_DIM_Y,
+                                                    BLOCK_DIM_Z>;
+
+  __device__ __forceinline__ BlockRunLengthDecodeT
+  InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
+                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                           RunLengthT &decoded_size,
+                           cub::Int2Type<true> /*test_run_offsets*/)
+  {
+    RunLengthT run_offsets[RUNS_PER_THREAD];
+    BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage).ExclusiveSum(run_lengths, run_offsets, decoded_size);
+
+    // Ensure temporary shared memory can be repurposed
+    CTA_SYNC();
+
+    // Construct BlockRunLengthDecode and initialize with the run offsets
+    return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_offsets);
+  }
+
+  __device__ __forceinline__ BlockRunLengthDecodeT
+  InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
+                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                           RunLengthT &decoded_size,
+                           cub::Int2Type<false> /*test_run_offsets*/)
+  {
+    // Construct BlockRunLengthDecode and initialize with the run lengths
+    return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage, unique_items, run_lengths, decoded_size);
+  }
+
+  __device__ __forceinline__ void LoadRuns(ItemItT d_block_unique_items,
+                                           RunLengthsItT d_block_run_lengths,
+                                           RunItemT (&unique_items)[RUNS_PER_THREAD],
+                                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                           size_t num_valid_items)
+  {
+    // Load this block's tile of run values
+    if (num_valid_items < RUNS_PER_BLOCK)
+    {
+      BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items, num_valid_items);
+    }
+    else
+    {
+      BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items);
+    }
+
+    // Ensure BlockLoad's temporary shared memory can be repurposed
+    CTA_SYNC();
+
+    // Load this block's tile of run lengths (out-of-bounds run lengths are padded with '0')
+    if (num_valid_items < RUNS_PER_BLOCK)
+      BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
+        .Load(d_block_run_lengths, run_lengths, num_valid_items, static_cast<RunLengthT>(0));
+    else
+      BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage).Load(d_block_run_lengths, run_lengths);
+
+    // Ensure temporary shared memory can be repurposed
+    CTA_SYNC();
+  }
+
+public:
+  union TempStorage
+  {
+    typename BlockLoadRunItemT::TempStorage load_uniques_storage;
+    typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage;
+    typename std::conditional<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>::type
+      run_offsets_scan_storage;
+    struct
+    {
+      typename BlockRunLengthDecodeT::TempStorage run_length_decode_storage;
+      typename BlockStoreDecodedItemT::TempStorage store_decoded_runs_storage;
+      typename BlockStoreRelativeOffsetT::TempStorage store_relative_offsets;
+    } decode;
+  };
+
+  TempStorage &temp_storage;
+
+  __device__ __forceinline__ AgentTestBlockRunLengthDecode(TempStorage &temp_storage)
+      : temp_storage(temp_storage)
+  {}
+
+  /**
+   * \brief Loads the given block (or tile) of runs, and computes their "decompressed" (run-length decoded) size.
+   */
+  __device__ __forceinline__ uint32_t GetDecodedSize(ItemItT d_block_unique_items,
+                                                     RunLengthsItT d_block_run_lengths,
+                                                     size_t num_valid_runs)
+  {
+    // Load this block's tile of encoded runs
+    RunItemT unique_items[RUNS_PER_THREAD];
+    RunLengthT run_lengths[RUNS_PER_THREAD];
+    LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
+
+    // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the "decompressed" size)
+    uint32_t decoded_size = 0U;
+    BlockRunLengthDecodeT run_length_decode =
+      InitBlockRunLengthDecode(unique_items, run_lengths, decoded_size, cub::Int2Type<TEST_RUN_OFFSETS_>());
+    return decoded_size;
+  }
+
+  /**
+   * \brief Loads the given block (or tile) of runs, run-length decodes them, and writes the results to \p
+   * d_block_decoded_out.
+   */
+  template <typename UniqueItemOutItT, typename RelativeOffsetOutItT>
+  __device__ __forceinline__ uint32_t WriteDecodedRuns(ItemItT d_block_unique_items,
+                                                       RunLengthsItT d_block_run_lengths,
+                                                       UniqueItemOutItT d_block_decoded_out,
+                                                       RelativeOffsetOutItT d_block_rel_out,
+                                                       size_t num_valid_runs)
+  {
+    // Load this block's tile of encoded runs
+    RunItemT unique_items[RUNS_PER_THREAD];
+    RunLengthT run_lengths[RUNS_PER_THREAD];
+    LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
+
+    // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the "decompressed" size)
+    uint32_t decoded_size = 0U;
+    BlockRunLengthDecodeT run_length_decode =
+      InitBlockRunLengthDecode(unique_items, run_lengths, decoded_size, cub::Int2Type<TEST_RUN_OFFSETS_>());
+
+    // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
+    // have been decoded.
+    uint32_t decoded_window_offset = 0U;
+    while (decoded_window_offset < decoded_size)
+    {
+      RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
+      RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
+
+      // The number of decoded items that are valid within this window (aka pass) of run-length decoding
+      uint32_t num_valid_items = decoded_size - decoded_window_offset;
+      run_length_decode.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
+      BlockStoreDecodedItemT(temp_storage.decode.store_decoded_runs_storage)
+        .Store(d_block_decoded_out + decoded_window_offset, decoded_items, num_valid_items);
+
+      if (TEST_RELATIVE_OFFSETS)
+      {
+        BlockStoreRelativeOffsetT(temp_storage.decode.store_relative_offsets)
+          .Store(d_block_rel_out + decoded_window_offset, relative_offsets, num_valid_items);
+      }
+
+      decoded_window_offset += DECODED_ITEMS_PER_THREAD * BLOCK_THREADS;
+    }
+    return decoded_size;
+  }
+};
+
+/******************************************************************************
+ * [STAGE 1] RUN-LENGTH DECODING TEST KERNEL
+ ******************************************************************************/
+template <typename AgentTestBlockRunLengthDecode,
+          typename ItemItT,
+          typename RunLengthsItT,
+          typename OffsetT,
+          typename DecodedSizesOutT>
+__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
+  void BlockRunLengthDecodeGetSizeKernel(const ItemItT d_unique_items,
+                                         const RunLengthsItT d_run_lengths,
+                                         const OffsetT num_runs,
+                                         DecodedSizesOutT d_decoded_sizes)
+{
+  constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
+
+  __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
+
+  OffsetT block_offset   = blockIdx.x * RUNS_PER_BLOCK;
+  OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset) : RUNS_PER_BLOCK;
+
+  AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
+  uint64_t num_decoded_items =
+    run_length_decode_agent.GetDecodedSize(d_unique_items + block_offset, d_run_lengths + block_offset, num_valid_runs);
+
+  d_decoded_sizes[blockIdx.x] = num_decoded_items;
+}
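+
+// Note on the two-stage structure (an illustrative summary): stage 1 only computes each block's decoded size, a
+// device-wide prefix sum over those sizes then yields each block's output offset, and stage 2 decodes the runs
+// again and writes them at the scanned offsets. E.g., with RUNS_PER_BLOCK = 1024 and num_runs = 10000, both
+// kernels launch 10 blocks, the last of which covers the remaining 784 valid runs.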
+
+/******************************************************************************
+ * [STAGE 2] RUN-LENGTH DECODING TEST KERNEL
+ ******************************************************************************/
+template <typename AgentTestBlockRunLengthDecode,
+          typename ItemItT,
+          typename RunLengthsItT,
+          typename DecodedSizesOutT,
+          typename OffsetT,
+          typename DecodedItemsOutItT,
+          typename RelativeOffsetOutItT>
+__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
+  void BlockRunLengthDecodeTestKernel(const ItemItT d_unique_items,
+                                      const RunLengthsItT d_run_lengths,
+                                      const DecodedSizesOutT d_decoded_offsets,
+                                      const OffsetT num_runs,
+                                      DecodedItemsOutItT d_decoded_items,
+                                      RelativeOffsetOutItT d_relative_offsets)
+{
+  constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
+
+  __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
+
+  OffsetT block_offset   = blockIdx.x * RUNS_PER_BLOCK;
+  OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset) : RUNS_PER_BLOCK;
+
+  AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
+  run_length_decode_agent.WriteDecodedRuns(d_unique_items + block_offset,
+                                           d_run_lengths + block_offset,
+                                           d_decoded_items + d_decoded_offsets[blockIdx.x],
+                                           d_relative_offsets + d_decoded_offsets[blockIdx.x],
+                                           num_valid_runs);
+}
+
+struct ModOp
+{
+  using T = uint32_t;
+  __host__ __device__ __forceinline__ T operator()(const T &x) const { return 1 + (x % 100); }
+};
+
+template <uint32_t RUNS_PER_THREAD,
+          uint32_t DECODED_ITEMS_PER_THREAD,
+          uint32_t BLOCK_DIM_X,
+          uint32_t BLOCK_DIM_Y,
+          uint32_t BLOCK_DIM_Z,
+          bool TEST_RELATIVE_OFFSETS,
+          bool TEST_RUN_OFFSETS>
+void TestAlgorithmSpecialisation()
+{
+  constexpr uint32_t THREADS_PER_BLOCK = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+  constexpr uint32_t RUNS_PER_BLOCK    = RUNS_PER_THREAD * THREADS_PER_BLOCK;
+
+  using RunItemT      = float;
+  using RunLengthT    = uint32_t;
+  using ItemItT       = cub::CountingInputIterator<RunItemT>;
+  using RunLengthsItT = cub::TransformInputIterator<RunLengthT, ModOp, cub::CountingInputIterator<RunLengthT>>;
+
+  ItemItT d_unique_items(1000U);
+  RunLengthsItT d_run_lengths(cub::CountingInputIterator<RunLengthT>(0), ModOp{});
+
+  constexpr uint32_t num_runs   = 10000;
+  constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK;
+
+  size_t temp_storage_bytes      = 0ULL;
+  void *temp_storage             = nullptr;
+  uint32_t *h_num_decoded_total  = nullptr;
+  uint32_t *d_decoded_sizes      = nullptr;
+  uint32_t *d_decoded_offsets    = nullptr;
+  RunItemT *d_decoded_out        = nullptr;
+  RunLengthT *d_relative_offsets = nullptr;
+  RunItemT *h_decoded_out        = nullptr;
+  RunLengthT *h_relative_offsets = nullptr;
+
+  using AgentTestBlockRunLengthDecodeT = AgentTestBlockRunLengthDecode<ItemItT,
+                                                                       RunLengthsItT,
+                                                                       RUNS_PER_THREAD,
+                                                                       DECODED_ITEMS_PER_THREAD,
+                                                                       TEST_RELATIVE_OFFSETS,
+                                                                       TEST_RUN_OFFSETS,
+                                                                       BLOCK_DIM_X,
+                                                                       BLOCK_DIM_Y,
+                                                                       BLOCK_DIM_Z>;
+
+  enum : uint32_t
+  {
+    TIMER_SIZE_BEGIN = 0,
+    TIMER_SIZE_END,
+    TIMER_DECODE_BEGIN,
+    TIMER_DECODE_END,
+    NUM_TIMERS,
+  };
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  cudaEvent_t cuda_evt_timers[NUM_TIMERS];
+  for (uint32_t i = 0; i < NUM_TIMERS; i++)
+  {
+    cudaEventCreate(&cuda_evt_timers[i]);
+  }
+
+  // Get temporary storage requirements for the scan (for computing offsets for the per-block run-length decoded items)
+  cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, d_decoded_sizes, d_decoded_offsets, num_blocks, stream);
+
+  // Allocate device memory
+  CubDebugExit(cudaMalloc(&temp_storage, temp_storage_bytes));
+  CubDebugExit(cudaMalloc(&d_decoded_sizes, num_blocks * sizeof(*d_decoded_sizes)));
+  // Allocate for the exclusive sum PLUS the overall aggregate
+  CubDebugExit(cudaMalloc(&d_decoded_offsets, (num_blocks + 1) * sizeof(*d_decoded_offsets)));
+  CubDebugExit(cudaMallocHost(&h_num_decoded_total, sizeof(*h_num_decoded_total)));
+
+  // Get the per-block number of items being decoded (i-th thread block writing size to d_decoded_sizes[i])
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_SIZE_BEGIN], stream));
+  BlockRunLengthDecodeGetSizeKernel<AgentTestBlockRunLengthDecodeT>
+    <<<num_blocks, dim3(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z), 0U, stream>>>(d_unique_items,
+                                                                              d_run_lengths,
+                                                                              num_runs,
+                                                                              d_decoded_sizes);
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_SIZE_END], stream));
+
+  // Compute the offsets at which each block's decoded items will be written: memsetting the first element to zero and
+  // computing an inclusive sum into d_decoded_offsets[1..num_blocks] yields an exclusive sum over the block sizes,
+  // with the overall aggregate landing in d_decoded_offsets[num_blocks]
+  CubDebugExit(cudaMemsetAsync(d_decoded_offsets, 0, sizeof(d_decoded_offsets[0]), stream));
+  CubDebugExit(cub::DeviceScan::InclusiveSum(temp_storage,
+                                             temp_storage_bytes,
+                                             d_decoded_sizes,
+                                             &d_decoded_offsets[1],
+                                             num_blocks,
+                                             stream));
+
+  // Copy the total decoded size to CPU in order to allocate just the right amount of device memory
+  CubDebugExit(cudaMemcpyAsync(h_num_decoded_total,
+                               &d_decoded_offsets[num_blocks],
+                               sizeof(*h_num_decoded_total),
+                               cudaMemcpyDeviceToHost,
+                               stream));
+
+  // Ensure the total decoded size has been copied from GPU to CPU
+  CubDebugExit(cudaStreamSynchronize(stream));
+
+  // Allocate host and device memory for the run-length decoded output
+  CubDebugExit(cudaMallocHost(&h_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
+  CubDebugExit(cudaMalloc(&d_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    CubDebugExit(cudaMalloc(&d_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
+    CubDebugExit(cudaMallocHost(&h_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
+  }
+
+  // Perform the block-wise run-length decoding (each block taking its offset from d_decoded_offsets)
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_DECODE_BEGIN], stream));
+  BlockRunLengthDecodeTestKernel<AgentTestBlockRunLengthDecodeT>
+    <<<num_blocks, dim3(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z), 0U, stream>>>(d_unique_items,
+                                                                              d_run_lengths,
+                                                                              d_decoded_offsets,
+                                                                              num_runs,
+                                                                              d_decoded_out,
+                                                                              d_relative_offsets);
+  CubDebugExit(cudaEventRecord(cuda_evt_timers[TIMER_DECODE_END], stream));
+
+  // Copy back results for verification
+  CubDebugExit(cudaMemcpyAsync(h_decoded_out,
+                               d_decoded_out,
+                               (*h_num_decoded_total) * sizeof(*h_decoded_out),
+                               cudaMemcpyDeviceToHost,
+                               stream));
+
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    // Copy back the relative offsets
+    CubDebugExit(cudaMemcpyAsync(h_relative_offsets,
+                                 d_relative_offsets,
+                                 (*h_num_decoded_total) * sizeof(*h_relative_offsets),
+                                 cudaMemcpyDeviceToHost,
+                                 stream));
+  }
+
+  // Generate host-side run-length decoded data for verification
+  std::vector<std::pair<RunItemT, RunLengthT>> host_golden;
+  host_golden.reserve(*h_num_decoded_total);
+  for (uint32_t run = 0; run < num_runs; run++)
+  {
+    for (RunLengthT i = 0; i < d_run_lengths[run]; i++)
+    {
+      host_golden.push_back({d_unique_items[run], i});
+    }
+  }
+
+  // Ensure the run-length decoded result has been copied to the host
+  CubDebugExit(cudaStreamSynchronize(stream));
+
+  // Verify the total run-length decoded size is correct
+  AssertEquals(host_golden.size(), h_num_decoded_total[0]);
+
+  float duration_size   = 0.0f;
+  float duration_decode = 0.0f;
+  cudaEventElapsedTime(&duration_size, cuda_evt_timers[TIMER_SIZE_BEGIN], cuda_evt_timers[TIMER_SIZE_END]);
+  cudaEventElapsedTime(&duration_decode, cuda_evt_timers[TIMER_DECODE_BEGIN], cuda_evt_timers[TIMER_DECODE_END]);
+
+  size_t decoded_bytes          = host_golden.size() * sizeof(RunItemT);
+  size_t relative_offsets_bytes = TEST_RELATIVE_OFFSETS ? host_golden.size() * sizeof(RunLengthT) : 0ULL;
+  size_t total_bytes_written    = decoded_bytes + relative_offsets_bytes;
+
+  std::cout << "MODE: " << (TEST_RELATIVE_OFFSETS ? "offsets, " : "normal, ")     //
+            << "INIT: " << (TEST_RUN_OFFSETS ? "run offsets, " : "run lengths, ") //
"run offsets, " : "run lengths, ") // + << "RUNS_PER_THREAD: " << RUNS_PER_THREAD // + << ", DECODED_ITEMS_PER_THREAD: " << DECODED_ITEMS_PER_THREAD // + << ", THREADS_PER_BLOCK: " << THREADS_PER_BLOCK // + << ", decoded size (bytes): " << decoded_bytes // + << ", relative offsets (bytes): " << relative_offsets_bytes // + << ", time_size (ms): " << duration_size // + << ", time_decode (ms): " << duration_decode // + << ", achieved decode BW (GB/s): " + << ((static_cast(total_bytes_written) / 1.0e9) * (1000.0 / duration_decode)) << "\n"; + + // Verify the run-length decoded data is correct + bool cmp_eq = true; + for (uint32_t i = 0; i < host_golden.size(); i++) + { + if (host_golden[i].first != h_decoded_out[i]) + { + std::cout << "Mismatch at #" << i << ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i] + << "\n"; + cmp_eq = false; + } + if (TEST_RELATIVE_OFFSETS) + { + if (host_golden[i].second != h_relative_offsets[i]) + { + std::cout << "Mismatch of relative offset at #" << i << ": CPU item: " << host_golden[i].first + << ", GPU: " << h_decoded_out[i] << "; relative offsets: CPU: " << host_golden[i].second + << ", GPU: " << h_relative_offsets[i] << "\n"; + cmp_eq = false; + break; + } + } + } + AssertEquals(cmp_eq, true); + + // Clean up memory allocations + CubDebugExit(cudaFree(temp_storage)); + CubDebugExit(cudaFree(d_decoded_sizes)); + CubDebugExit(cudaFree(d_decoded_offsets)); + CubDebugExit(cudaFree(d_decoded_out)); + CubDebugExit(cudaFreeHost(h_num_decoded_total)); + CubDebugExit(cudaFreeHost(h_decoded_out)); + if (TEST_RELATIVE_OFFSETS) + { + CubDebugExit(cudaFree(d_relative_offsets)); + CubDebugExit(cudaFreeHost(h_relative_offsets)); + } + + // Clean up events + for (uint32_t i = 0; i < NUM_TIMERS; i++) + { + CubDebugExit(cudaEventDestroy(cuda_evt_timers[i])); + } + + // Clean up streams + CubDebugExit(cudaStreamDestroy(stream)); +} + +template +void TestForTuningParameters() +{ + constexpr bool DO_TEST_RELATIVE_OFFSETS = true; + constexpr bool DO_NOT_TEST_RELATIVE_OFFSETS = false; + + constexpr bool TEST_WITH_RUN_OFFSETS = true; + constexpr bool TEST_WITH_RUN_LENGTHS = false; + // Run BlockRunLengthDecode that uses run lengths and generates offsets relative to each run + TestAlgorithmSpecialisation(); + + // Run BlockRunLengthDecode that uses run lengths and performs normal run-length decoding + TestAlgorithmSpecialisation(); + + // Run BlockRunLengthDecode that uses run offsets and generates offsets relative to each run + TestAlgorithmSpecialisation(); + + // Run BlockRunLengthDecode that uses run offsets and performs normal run-length decoding + TestAlgorithmSpecialisation(); +} + +int main(int argc, char **argv) +{ + CommandLineArgs args(argc, argv); + + // Initialize device + CubDebugExit(args.DeviceInit()); + + // Instantiate test template instances for various configurations (tuning parameter dimensions) + // + TestForTuningParameters<1U, 1U, 64U>(); + TestForTuningParameters<1U, 3U, 32U, 2U, 3U>(); + TestForTuningParameters<1U, 1U, 128U>(); + TestForTuningParameters<1U, 8U, 128U>(); + TestForTuningParameters<2U, 8U, 128U>(); + TestForTuningParameters<3U, 1U, 256U>(); + TestForTuningParameters<1U, 8U, 256U>(); + TestForTuningParameters<8U, 1U, 256U>(); + TestForTuningParameters<1U, 1U, 256U>(); + TestForTuningParameters<2U, 2U, 384U>(); + + return 0; +}