Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add smem-based polyphase channelizer kernel #613

Merged
merged 1 commit into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions include/matx/kernels/channelize_poly.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,148 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
}
}

// This kernel works in cases where the full filter (with potentially some zero padding) and
// the inputs required to compute elems_per_channel_per_cta outputs all fit into shared memory.
//
// Expected launch configuration:
//   blockDim.x = num_channels (threadIdx.x selects the channel)
//   blockDim.y = output samples per channel computed per iteration
//   gridDim.x  = CTAs partitioning the per-channel output samples
//   gridDim.z  = batch index (mapped via BlockToIdx)
// Dynamic shared memory holds the zero-padded filter followed by a circular
// input staging buffer, with alignment padding between the two regions as needed.
template <typename OutType, typename InType, typename FilterType>
__global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType filter, index_t elems_per_channel_per_cta)
{
  using output_t = typename OutType::scalar_type;
  using input_t = typename InType::scalar_type;
  using filter_t = typename FilterType::scalar_type;

  extern __shared__ uint8_t __attribute((aligned(16))) smem_dyn_align16[];

  constexpr int InRank = InType::Rank();
  constexpr int OutRank = OutType::Rank();
  // Output is [..., sample, channel]: the channel dimension is innermost
  constexpr int ChannelRank = OutRank-1;
  constexpr int OutElemRank = OutRank-2;

  const index_t input_len = input.Size(InRank-1);
  const index_t output_len_per_channel = output.Size(OutElemRank);
  // If the filter fits into shared memory, then a 32-bit index is sufficient. One
  // edge case exception would be num_channels > 2^32-1, but with a small filter
  // implicitly padded with zeros. We assume that the kernel selection logic
  // considers the size of the zero-padded filter since that is what we actually
  // store in shared memory.
  const uint32_t num_channels = static_cast<uint32_t>(output.Size(ChannelRank));
  const uint32_t filter_full_len = static_cast<uint32_t>(filter.Size(0));
  const uint32_t filter_phase_len = static_cast<uint32_t>((filter_full_len + num_channels - 1) / num_channels);

  // Shared memory layout: [zero-padded filter][alignment pad][input staging buffer]
  filter_t *smem_h = reinterpret_cast<filter_t *>(smem_dyn_align16);
  size_t smem_input_offset = sizeof(filter_t) * filter_phase_len * num_channels;
  if (smem_input_offset % sizeof(input_t)) {
    smem_input_offset += sizeof(input_t) - smem_input_offset % sizeof(input_t);
  }
  input_t *smem_input = reinterpret_cast<input_t *>(smem_dyn_align16 + smem_input_offset);

  const uint32_t tid = threadIdx.y * blockDim.x + threadIdx.x;
  const uint32_t nthreads = blockDim.x * blockDim.y;
  const uint32_t chan = threadIdx.x;
  const uint32_t ty = threadIdx.y;
  const uint32_t by = blockDim.y;

  // Cooperatively stage the filter taps into shared memory...
  for (uint32_t t = tid; t < filter_full_len; t += nthreads) {
    smem_h[t] = filter.operator()(t);
  }

  // ...and zero-pad it out to filter_phase_len taps per channel
  for (uint32_t t = filter_full_len+tid; t < filter_phase_len * num_channels; t += nthreads) {
    smem_h[t] = static_cast<filter_t>(0);
  }

  // The input stored in shared memory is logically [smem_input_height, num_channels] where
  // smem_input_height is the number of samples at the output sample rate stored in smem.
  const uint32_t smem_input_height = filter_phase_len + by - 1;

  const index_t start_elem = blockIdx.x * elems_per_channel_per_cta;
  const index_t last_elem = std::min(output_len_per_channel-1, (blockIdx.x+1) * elems_per_channel_per_cta - 1);
  auto indims = BlockToIdx(input, blockIdx.z, 1);
  auto outdims = BlockToIdx(output, blockIdx.z, 2);
  outdims[ChannelRank] = chan;

  // Pre-load the filter_phase_len-1 samples of history (per channel) that precede
  // this CTA's first output sample; positions outside the input read as zero.
  for (uint32_t t = ty; t < filter_phase_len-1; t += by) {
    const index_t out_sample_ind = start_elem - (filter_phase_len-1) + t;
    const uint32_t smem_ind = t * num_channels + chan;
    const index_t input_ind = out_sample_ind * num_channels + chan;
    if (input_ind >= 0 && input_ind < input_len) {
      indims[InRank-1] = input_ind;
      detail::mapply([smem_input, smem_ind, &input](auto &&...args) {
        smem_input[smem_ind] = input.operator()(args...);
      }, indims);
    } else {
      // Zero-fill history that falls before the start of the input. Note the zero
      // is an input_t since it is stored in the input staging buffer.
      smem_input[smem_ind] = static_cast<input_t>(0);
    }
  }

  index_t next_start_elem = start_elem;
  const index_t num_elem_iters = (last_elem - start_elem + 1 + by - 1) / by;

  // Per-thread row index into the circular smem input buffer. After each
  // iteration's advance-and-wrap below it marks the oldest sample in this
  // thread's convolution footprint.
  uint32_t cached_input_ind_tail = filter_phase_len - 1 + ty;
  // Last tap of this channel's filter phase; taps are walked backward with
  // stride -num_channels during the convolution.
  const filter_t *h_start = smem_h + num_channels * filter_phase_len - (num_channels - chan);
  for (index_t iter = 0; iter < num_elem_iters; iter++) {

    __syncthreads();

    // Load next elems_per_channel_per_cta elements for each channel
    const index_t next_last_elem = std::min(next_start_elem + by - 1, last_elem);
    const uint32_t out_samples_this_iter = static_cast<uint32_t>(next_last_elem - next_start_elem + 1);
    if (ty < out_samples_this_iter) {
      indims[InRank-1] = (next_start_elem + ty) * num_channels + chan;
      const uint32_t smem_ind = cached_input_ind_tail * num_channels + chan;
      if (indims[InRank-1] < input_len) {
        detail::mapply([smem_input, smem_ind, &input](auto &&...args) {
          smem_input[smem_ind] = input.operator()(args...);
        }, indims);
      } else {
        // Zero-fill reads past the end of the input; the zero is an input_t since
        // it is stored in the input staging buffer.
        smem_input[smem_ind] = static_cast<input_t>(0);
      }
    }

    cached_input_ind_tail += by;
    // The below effectively mods cached_input_ind_tail by smem_input_height. Since
    // smem_input_height is >= by, adding by means that we will need to subtract
    // smem_input_height at most once for cached_input_ind_tail to be in the range
    // [0, smem_input_height-1]. The conditional is cheaper than the mod, unless
    // smem_input_height is known at compile time.
    if (cached_input_ind_tail >= smem_input_height) {
      cached_input_ind_tail -= smem_input_height;
    }

    __syncthreads();

    outdims[OutElemRank] = next_start_elem + ty;
    if (outdims[OutElemRank] <= last_elem) {
      const filter_t *h = h_start;
      output_t accum { 0 };
      const int first_end = std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
      // The footprint of samples involved in the convolution may wrap from the end
      // to the beginning of smem_input. The prologue below handles the samples from
      // the current tail to the end of smem_input and the epilogue starts back at the
      // beginning of smem_input.
      const int prologue_count = (first_end - cached_input_ind_tail + 1);
      const int epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
      const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan);
      // Apply the filter h in reverse order below to flip the filter for convolution
      for (int k = 0; k < prologue_count; k++) {
        accum += (*h) * (*sample);
        sample += num_channels;
        h -= num_channels;
      }
      sample = smem_input + (num_channels - 1 - chan);
      for (int k = 0; k < epilogue_count; k++) {
        accum += (*h) * (*sample);
        sample += num_channels;
        h -= num_channels;
      }

      detail::mapply([accum, &output](auto &&...args) {
        output.operator()(args...) = accum;
      }, outdims);
    }

    next_start_elem += out_samples_this_iter;
  }
}

template <int THREADS, int NUM_CHAN, typename OutType, typename InType, typename FilterType>
__launch_bounds__(THREADS)
__global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterType filter)
Expand Down
81 changes: 79 additions & 2 deletions include/matx/transforms/channelize_poly.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ namespace detail {
// channel counts.
constexpr index_t MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD = 6;

// Number of output samples per channel per iteration for the kernel that stores
// the input data in shared memory. Ideally, this value would be determined dynamically
// to balance occupancy and CTA size. For now, we choose a reasonable default.
constexpr index_t MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER = 4;

template <typename OutType, typename InType, typename FilterType>
inline void matxChannelizePoly1DInternal(OutType o, const InType &i,
const FilterType &filter, cudaStream_t stream)
Expand All @@ -78,6 +83,70 @@ inline void matxChannelizePoly1DInternal(OutType o, const InType &i,
#endif
}

// Returns the number of dynamic shared memory bytes required by the
// ChannelizePoly1D_Smem kernel for the given output and filter operators.
// The total covers the zero-padded filter (num_channels * filter_phase_len
// filter_t elements) followed by the input staging buffer
// ((filter_phase_len + NOUT_PER_ITER - 1) * num_channels input_t elements),
// rounded up to a multiple of the larger element size so the kernel can insert
// alignment padding between the two regions.
template <typename OutType, typename InType, typename FilterType>
inline size_t matxChannelizePoly1DInternal_SmemSizeBytes(const OutType &o, const InType &, const FilterType &filter)
{
  using input_t = typename InType::scalar_type;
  using filter_t = typename FilterType::scalar_type;

  const index_t filter_len = filter.Size(FilterType::Rank()-1);
  const index_t num_channels = o.Size(OutType::Rank()-1);
  // Taps per polyphase branch after the filter is zero-padded to a multiple of num_channels
  const index_t filter_phase_len = (filter_len + num_channels - 1) / num_channels;

  size_t smem_size = sizeof(filter_t)*(num_channels)*(filter_phase_len) +
    sizeof(input_t)*(num_channels)*(filter_phase_len + MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER - 1);
  // Round up so the input region can be placed at a sizeof(input_t)-aligned
  // offset following the filter region.
  const size_t max_sizeof = std::max(sizeof(filter_t), sizeof(input_t));
  if (smem_size % max_sizeof) {
    smem_size += max_sizeof - (smem_size % max_sizeof);
  }
  return smem_size;
}

// Returns true if the shared-memory channelizer kernel can be used for the
// given operators: the required dynamic shared memory must fit within the
// no-opt-in limit and the (num_channels x NOUT_PER_ITER) thread block must not
// exceed the maximum threads per block.
template <typename OutType, typename InType, typename FilterType>
inline bool matxChannelizePoly1DInternal_ShouldUseSmemKernel(const OutType &out, const InType &in, const FilterType &filter)
{
  // 48 KB is the largest shared memory allocation that does not require
  // explicit opt-in via cudaFuncSetAttribute()
  constexpr size_t MAX_SMEM_BYTES = 48 * 1024;
  // The full shared memory kernel uses blocks of size
  // (num_channels, detail::MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER), so ensure
  // that the resulting thread per block count will not exceed MAX_NUM_THREADS_PER_BLOCK
  constexpr int MAX_NUM_THREADS_PER_BLOCK = 1024;
  const index_t num_channels = out.Size(OutType::Rank()-1);
  return (
    matxChannelizePoly1DInternal_SmemSizeBytes(out, in, filter) <= MAX_SMEM_BYTES &&
    num_channels <= (MAX_NUM_THREADS_PER_BLOCK/detail::MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER));
}

// Launches the shared-memory polyphase channelizer kernel. Each CTA spans all
// channels (blockDim.x = num_channels) and computes
// MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER output samples per
// channel per iteration; gridDim.x partitions the per-channel output samples
// and gridDim.z indexes the batches. The caller is expected to have checked
// matxChannelizePoly1DInternal_ShouldUseSmemKernel() first so that the shared
// memory and thread-count limits are respected.
template <typename OutType, typename InType, typename FilterType>
inline void matxChannelizePoly1DInternal_Smem(OutType o, const InType &i, const FilterType &filter, cudaStream_t stream)
{
#ifdef __CUDACC__
  MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)

  const index_t num_channels = o.Size(OutType::Rank()-1);
  const index_t nout_per_channel = o.Size(OutType::Rank()-2);
  // Batches are all dimensions other than the innermost input dimension
  const int num_batches = static_cast<int>(TotalSize(i)/i.Size(i.Rank() - 1));

  // Target enough CTAs to fill the device; each CTA produces elem_per_block
  // output samples per channel.
  const int target_num_blocks = 1024;
  const int elem_per_block = static_cast<int>(
    (nout_per_channel + target_num_blocks - 1) / target_num_blocks);
  dim3 block(static_cast<int>(num_channels), MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER);
  const uint32_t num_blocks = static_cast<uint32_t>((nout_per_channel + elem_per_block - 1) / elem_per_block);
  dim3 grid(num_blocks, 1, num_batches);
  const size_t smem_size = matxChannelizePoly1DInternal_SmemSizeBytes(o, i, filter);
  ChannelizePoly1D_Smem<OutType, InType, FilterType><<<grid, block, smem_size, stream>>>(
    o, i, filter, elem_per_block);
#endif
}

template <typename OutType, typename InType, typename FilterType>
inline void matxChannelizePoly1DInternal_FusedChan(OutType o, const InType &i,
const FilterType &filter, cudaStream_t stream)
Expand Down Expand Up @@ -237,7 +306,11 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType
}
}();

matxChannelizePoly1DInternal(fft_in_slice, in, f, stream);
if (matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) {
matxChannelizePoly1DInternal_Smem(fft_in_slice, in, f, stream);
} else {
matxChannelizePoly1DInternal(fft_in_slice, in, f, stream);
}
stop_dims[OUT_RANK-1] = (num_channels/2) + 1;
auto out_packed = slice<OUT_RANK>(out, start_dims, stop_dims);
(out_packed = fft(fft_in_slice, num_channels)).run(stream);
Expand All @@ -247,7 +320,11 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType
if (num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) {
matxChannelizePoly1DInternal_FusedChan(out, in, f, stream);
} else {
matxChannelizePoly1DInternal(out, in, f, stream);
if (matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) {
matxChannelizePoly1DInternal_Smem(out, in, f, stream);
} else {
matxChannelizePoly1DInternal(out, in, f, stream);
}
// Specify FORWARD here to prevent any normalization after the ifft. We do not
// want any extra scaling on the output values.
(out = ifft(out, num_channels, FFTNorm::FORWARD)).run(stream);
Expand Down
2 changes: 2 additions & 0 deletions test/00_transform/ChannelizePoly.cu
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple)
{ 271374, 31*14+4, 14 },
{ 27137, 301*13+3, 13 },
{ 27138, 301*14+4, 14 },
{ 1000000, 32*16, 32 },
{ 1000000, 40*16, 40 }
};

cudaStream_t stream = 0;
Expand Down