From 1fa441e2d2e1048b5f35b6d92afb5917ad523884 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 19 Aug 2024 12:04:07 -0700
Subject: [PATCH 001/135] Update docs

---
 cpp/src/groupby/hash/groupby.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 35161eada28..741a20f72a3 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -505,7 +505,7 @@ void compute_single_pass_aggs(table_view const& keys,
 
 /**
  * @brief Computes and returns a device vector containing all populated keys in
- * `map`.
+ * `key_set`.
  */
 template <typename SetType>
 rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,

From 65e1b5a5a33a5e8be36692fbd7829dbcd7c12e12 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 19 Aug 2024 12:05:13 -0700
Subject: [PATCH 002/135] Minor improvement

---
 cpp/src/groupby/hash/groupby.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 741a20f72a3..f8a3563a4ad 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -570,7 +570,7 @@ std::unique_ptr<table> groupby(table_view const& keys,
   auto const comparator_helper = [&](auto const d_key_equal) {
     auto const set = cuco::static_set{
       num_keys,
-      0.5,  // desired load factor
+      cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% occupancy
       cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
       d_key_equal,
       probing_scheme_type{d_row_hash},

From c58ddeff1bffdf3054a119b8afa3cd118507f463 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Tue, 20 Aug 2024 15:17:52 -0700
Subject: [PATCH 003/135] Migrate the GQE shared memory groupby to cudf

---
 cpp/src/groupby/hash/groupby.cu           | 590 ++++++++++++--
 cpp/src/groupby/hash/groupby_functors.cuh | 908 ++++++++++++++++++++++
 2 files changed, 1438 insertions(+), 60 deletions(-)
 create mode 100644 cpp/src/groupby/hash/groupby_functors.cuh

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index f8a3563a4ad..a536c48143e 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -14,8 +14,11 @@
 * limitations under the License.
*/ +#include "cuco/types.cuh" +#include "cudf/utilities/error.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "groupby_functors.cuh" +#include "groupby_kernels.cuh" #include #include @@ -30,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -61,8 +65,11 @@ namespace { // TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested // types and `cg_size = 1`for flat data to improve performance +auto constexpr window_size = 1; +auto constexpr cg_size = 1; + using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key + cg_size, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher>; @@ -420,80 +427,550 @@ void sparse_to_dense_results(table_view const& keys, } } +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + // make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, +template +auto create_sparse_results_table(cudf::table_view const& flattened_values, + const cudf::aggregation::Kind* d_aggs, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream) { // TODO single allocation - room for performance improvement - std::vector> sparse_columns; + std::vector> sparse_columns; std::transform( flattened_values.begin(), flattened_values.end(), aggs.begin(), std::back_inserter(sparse_columns), [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - + bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type()
                        : col.type();
      return make_fixed_width_column(
        cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
    });
  cudf::table sparse_table(std::move(sparse_columns));
  // If there are no direct aggregations, initialize the sparse table
  // only for the keys that were inserted into the global hash set
  if (!direct_aggregations) {
    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
    extract_populated_keys(global_set, populated_keys, stream);
    thrust::for_each_n(rmm::exec_policy(stream),
                       thrust::make_counting_iterator(0),
                       populated_keys.size(),
                       initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs});
  }
  // Otherwise, initialize the whole table
  else {
    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
    cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
  }
  return sparse_table;
}

template <typename SetType>
__device__ void find_local_mapping(cudf::size_type cur_idx,
                                   cudf::size_type num_input_rows,
                                   cudf::size_type* cardinality,
                                   SetType shared_set,
                                   cudf::size_type* local_mapping_index,
                                   cudf::size_type* shared_set_indices)
{
  cudf::size_type result_idx;
  bool inserted;
  if (cur_idx < num_input_rows) {
    auto const result = shared_set.insert_and_find(cur_idx);
    result_idx        = *result.first;
    inserted          = result.second;
    // inserted a new element
    if (result.second) {
      auto shared_set_index                = atomicAdd(cardinality, 1);
      shared_set_indices[shared_set_index] = cur_idx;
      local_mapping_index[cur_idx]         = shared_set_index;
    }
  }
  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to
  // all threads in the thread block.
  __syncthreads();
  if (cur_idx < num_input_rows) {
    // element was already in the set
    if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; }
  }
}

template <typename SetType>
__device__ void find_global_mapping(cudf::size_type cur_idx,
                                    SetType global_set,
                                    cudf::size_type* shared_set_indices,
                                    cudf::size_type* global_mapping_index,
                                    cudf::size_type shared_set_num_elements)
{
  auto input_idx = shared_set_indices[cur_idx];
  auto result    = global_set.insert_and_find(input_idx);
  global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first;
}

/*
 * Inserts keys into the shared memory hash set, and stores the row index of the local
 * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a
 * thread block exceeds `cardinality_threshold`, the threads in that block exit without
 * updating `global_set` or setting `global_mapping_index`. Otherwise, the unique keys found
 * are inserted into the global hash set, and the row index of the global sparse table is
 * saved in `global_mapping_index`.
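 *
 * As an illustrative example, using the constants chosen in `compute_single_pass_set_aggs`
 * below (`block_size` = 128, `cardinality_threshold` = 128): a thread block that encounters
 * at most 128 distinct keys performs its pre-aggregation entirely in shared memory, while a
 * higher-cardinality block falls back to direct atomic aggregation on the global table (see
 * `compute_direct_aggregates`).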
 */
template <class SetRef,
          cudf::size_type shared_set_num_elements,
          cudf::size_type cardinality_threshold,
          class GlobalSetType,
          class WindowExtent,
          class KeyEqual,
          class RowHasher>
__global__ void compute_mapping_indices(GlobalSetType global_set,
                                        cudf::size_type num_input_rows,
                                        WindowExtent window_extent,
                                        KeyEqual d_key_equal,
                                        RowHasher d_row_hash,
                                        cudf::size_type* local_mapping_index,
                                        cudf::size_type* global_mapping_index,
                                        cudf::size_type* block_cardinality,
                                        bool* direct_aggregations)
{
  __shared__ cudf::size_type shared_set_indices[shared_set_num_elements];

  // Shared set initialization
  __shared__ typename SetRef::window_type windows[window_extent.value()];
  auto storage    = SetRef::storage_ref_type(window_extent, windows);
  auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
                           d_key_equal,
                           probing_scheme_type{d_row_hash},
                           {},
                           storage);
  auto const block = cooperative_groups::this_thread_block();
  shared_set.initialize(block);
  block.sync();

  auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find);

  __shared__ cudf::size_type cardinality;

  if (threadIdx.x == 0) { cardinality = 0; }

  __syncthreads();

  int num_loops = cudf::util::div_rounding_up_safe(
    num_input_rows, static_cast<cudf::size_type>(blockDim.x * gridDim.x));
  auto end_idx = num_loops * blockDim.x * gridDim.x;

  for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx;
       cur_idx += blockDim.x * gridDim.x) {
    find_local_mapping(cur_idx,
                       num_input_rows,
                       &cardinality,
                       shared_insert_ref,
                       local_mapping_index,
                       shared_set_indices);

    __syncthreads();

    if (cardinality >= cardinality_threshold) {
      if (threadIdx.x == 0) { *direct_aggregations = true; }
      break;
    }

    __syncthreads();
  }

  // Insert unique keys from shared to global hash set
  if (cardinality < cardinality_threshold) {
    for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) {
      find_global_mapping(
        cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements);
    }
  }

  if (threadIdx.x == 0) { block_cardinality[blockIdx.x] = cardinality; }
}

int find_num_sms()
{
  int dev_id{-1};
  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
  int num_sms{-1};
  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
  return num_sms;
}

template <typename FuncType>
int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows, int num_sms)
{
  int max_active_blocks{-1};
  CUDF_CUDA_TRY(
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, block_size, 0));
  auto max_grid_size       = max_active_blocks * num_sms;
  int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, block_size);
  return std::min(max_grid_size, needed_active_blocks);
}

__device__ __host__ size_t round_to_multiple_of_8(size_t num)
{
  size_t constexpr multiple_of = 8;
  return cudf::util::div_rounding_up_safe(num, multiple_of) * multiple_of;
}

__device__ void calculate_columns_to_aggregate(int& col_start,
                                               int& col_end,
                                               cudf::mutable_table_device_view output_values,
                                               int num_input_cols,
                                               std::byte** s_aggregates_pointer,
                                               bool** s_aggregates_valid_pointer,
                                               std::byte* shared_set_aggregates,
                                               cudf::size_type cardinality,
                                               int total_agg_size)
{
  if (threadIdx.x == 0) {
    col_start           = col_end;
    int bytes_allocated = 0;
    int valid_col_size  = round_to_multiple_of_8(sizeof(bool) * cardinality);
    while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) {
      int next_col_size =
        round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality);
      int next_col_total_size = valid_col_size +
next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* aggs) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + aggs[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } +} + +template +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + aggs[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +template +__global__ void compute_aggregates(cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) +{ + cudf::size_type cardinality = block_cardinality[blockIdx.x]; + if (cardinality >= cardinality_threshold) { return; } + int num_input_cols = output_values.num_columns(); + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** 
s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + __shared__ int col_start; + __shared__ int col_end; + if (threadIdx.x == 0) { + col_start = 0; + col_end = 0; + } + __syncthreads(); + while (col_end < num_input_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_input_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + __syncthreads(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + aggs); + __syncthreads(); + compute_pre_aggregrates(col_start, + col_end, + input_values, + num_input_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + } +} + +size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +size_t find_shmem_size(FuncType func, int block_size, int grid_size, int num_sms) +{ + auto active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, num_sms); + + size_t dynamic_smem_size; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_smem_size, func, active_blocks_per_sm, block_size)); + return get_previous_multiple_of_8(0.5 * dynamic_smem_size); +} + +template +void launch_compute_aggregates(int block_size, + int grid_size, + int num_sms, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) +{ + auto compute_aggregates_fn_ptr = + compute_aggregates; + size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, block_size, grid_size, num_sms); + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; + compute_aggregates + <<>>(local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + num_input_rows, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); +} + /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) +template +rmm::device_uvector compute_single_pass_set_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + rmm::cuda_stream_view stream, + KeyEqual d_key_equal, + RowHasher d_row_hash) { + auto constexpr block_size = 128; + auto constexpr cardinality_threshold = 128; + + auto const num_input_rows = keys.num_rows(); + + // We add additional `block_size`, because after the 
number of elements in the local hash set + // exceeds the threshold, all threads in the thread block can still insert one more element. + auto constexpr shared_set_num_elements = cardinality_threshold + block_size; + // shared_set_num_elements with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(shared_set_num_elements) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + int num_sms = find_num_sms(); + auto compute_mapping_indices_fn_ptr = compute_mapping_indices; + int grid_size = + find_grid_size(compute_mapping_indices_fn_ptr, block_size, num_input_rows, num_sms); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * shared_set_num_elements, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + d_key_equal, + d_row_hash, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + // flatten the aggs to a table that can be operated on by aggregate_row auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_aggs.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); // prepare to launch kernel to do the actual aggregation auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - auto row_bitmask = - skip_key_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + auto d_values = table_device_view::create(flattened_values, stream); + + launch_compute_aggregates( + block_size, + grid_size, + num_sms, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + num_input_rows, + d_aggs.data(), + stream); + + if (direct_aggregations.value(stream)) { + int stride = block_size * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_aggs.data(), + block_cardinality.data(), + stride, + block_size, + cardinality_threshold}); + extract_populated_keys(global_set, populated_keys, stream); + } + // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { @@ -501,6 +978,8 @@ void compute_single_pass_aggs(table_view const& keys, sparse_results->add_result( flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); } + + return populated_keys; } /** @@ -580,17 +1059,8 @@ std::unique_ptr
groupby(table_view const& keys, stream.value()}; // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + auto gather_map = compute_single_pass_set_aggs( + keys, requests, &sparse_results, set, stream, d_key_equal, d_row_hash); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, diff --git a/cpp/src/groupby/hash/groupby_functors.cuh b/cpp/src/groupby/hash/groupby_functors.cuh new file mode 100644 index 00000000000..5630e838272 --- /dev/null +++ b/cpp/src/groupby/hash/groupby_functors.cuh @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if 
(target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem()>> { + __device__ 
void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +// TODO: VALID and ALL have same code +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), 
cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + update_target_element_gmem{}( + target, target_index, source_column, source, source_index, source_null); + } +}; + +template +struct update_target_element_shmem { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + using DeviceTarget = cudf::device_storage_type_t; + using 
DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { 
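      // note: the shared memory validity flag uses null == true semantics, so clearing
      // it here marks the slot as holding a valid aggregate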
target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + 
source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct shmem_element_aggregator { + template + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + update_target_element_shmem{}( + target, target_index, target_null, source, source_index); + } +}; + +template +__device__ constexpr bool is_supported() +{ + return cudf::is_fixed_width() and + ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::MIN) or + (k == cudf::aggregation::MAX) or (k == cudf::aggregation::COUNT_VALID) or + (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::ARGMAX) or + (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::SUM_OF_SQUARES) or + (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or + (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported()); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + using DeviceType = cudf::device_storage_type_t; + return cudf::detail::corresponding_operator_t::template identity(); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator"); +} + +template +__device__ T get_identity() +{ + if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) { + if constexpr (cudf::is_timestamp()) + return k == cudf::aggregation::ARGMAX + ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} + : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)}; + else { + using DeviceType = cudf::device_storage_type_t; + return k == cudf::aggregation::ARGMAX + ? static_cast(cudf::detail::ARGMAX_SENTINEL) + : static_cast(cudf::detail::ARGMIN_SENTINEL); + } + } + return identity_from_operator(); +} + +template +struct initialize_target_element { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +// TODO: are the conditions correctly checked? 
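// Initializes a shared memory aggregate slot to the identity of its operator: ARGMIN/ARGMAX
// slots get their sentinel indices, and the remaining operators get the identity from
// `corresponding_operator_t` (see `get_identity` above). COUNT_VALID and COUNT_ALL outputs
// start out valid (their identity is zero); every other aggregate starts out "null" until a
// value is aggregated into it.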
+template +struct initialize_target_element()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); + target_casted[target_index] = get_identity(); + + if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) { + target_null[target_index] = false; + } else { + target_null[target_index] = true; + } + } +}; + +struct initialize_shmem { + template + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null) const noexcept + { + // TODO: typecasting work for every datatype + + initialize_target_element{}(target, target_index, target_null); + } +}; + +template +struct initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct compute_direct_aggregates { + SetType set; + cudf::table_device_view input_values; + cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + int stride; + int block_size; + cudf::size_type cardinality_threshold; + compute_direct_aggregates(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + int stride, + int block_size, + cudf::size_type cardinality_threshold) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + block_cardinality(block_cardinality), + stride(stride), + block_size(block_size), + cardinality_threshold(cardinality_threshold) + { + } + 
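  // Fallback path for high-cardinality blocks: rows whose thread block exceeded the
  // cardinality threshold skip the shared memory pre-aggregation and are aggregated
  // directly into the global (sparse) output table through the global hash set.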
__device__ void operator()(cudf::size_type i) + { + int block_id = (i % stride) / block_size; + if (block_cardinality[block_id] >= cardinality_threshold) { + auto const result = set.insert_and_find(i); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; + +} // namespace cudf::groupby::detail::hash From d604d0a75fc55e49a1167c1606f83aa5e470c31f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Aug 2024 17:52:46 -0700 Subject: [PATCH 004/135] Many cleanups --- cpp/src/groupby/hash/groupby.cu | 328 +----------------- cpp/src/groupby/hash/groupby_kernels.cuh | 111 ------ cpp/src/groupby/hash/helpers.cuh | 45 +++ cpp/src/groupby/hash/kernels.cuh | 324 +++++++++++++++++ ...ss_kernels.cuh => multi_pass_functors.cuh} | 0 ..._functors.cuh => single_pass_functors.cuh} | 77 ++++ 6 files changed, 453 insertions(+), 432 deletions(-) delete mode 100644 cpp/src/groupby/hash/groupby_kernels.cuh create mode 100644 cpp/src/groupby/hash/helpers.cuh create mode 100644 cpp/src/groupby/hash/kernels.cuh rename cpp/src/groupby/hash/{multi_pass_kernels.cuh => multi_pass_functors.cuh} (100%) rename cpp/src/groupby/hash/{groupby_functors.cuh => single_pass_functors.cuh} (91%) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index a536c48143e..92e8f4a6f8b 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "cuco/types.cuh" -#include "cudf/utilities/error.hpp" #include "groupby/common/utils.hpp" -#include "groupby_functors.cuh" -#include "groupby_kernels.cuh" +#include "helpers.cuh" +#include "kernels.cuh" +#include "multi_pass_functors.cuh" +#include "single_pass_functors.cuh" #include #include @@ -28,12 +28,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -49,7 +47,6 @@ #include #include -#include #include #include @@ -63,16 +60,6 @@ namespace detail { namespace hash { namespace { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -auto constexpr window_size = 1; -auto constexpr cg_size = 1; - -using probing_scheme_type = cuco::linear_probing< - cg_size, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -485,128 +472,6 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, return sparse_table; } -template -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - cudf::size_type* cardinality, - SetType shared_set, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - bool inserted; - if (cur_idx < num_input_rows) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. 
- __syncthreads(); - if (cur_idx < num_input_rows) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index, - cudf::size_type shared_set_num_elements) -{ - auto input_idx = shared_set_indices[cur_idx]; - auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `cardinality_threshold`, the threads in that block will exit without updating - * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the - * global hash set, and save the row index of the global sparse table in `global_mapping_index`. - */ -template -__global__ void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - KeyEqual d_key_equal, - RowHasher d_row_hash, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - __shared__ cudf::size_type shared_set_indices[shared_set_num_elements]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - block.sync(); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - - if (threadIdx.x == 0) { cardinality = 0; } - - __syncthreads(); - - int num_loops = - cudf::util::div_rounding_up_safe(num_input_rows, (cudf::size_type)(blockDim.x * gridDim.x)); - auto end_idx = num_loops * blockDim.x * gridDim.x; - - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx; - cur_idx += blockDim.x * gridDim.x) { - find_local_mapping(cur_idx, - num_input_rows, - &cardinality, - shared_insert_ref, - local_mapping_index, - shared_set_indices); - - __syncthreads(); - - if (cardinality >= cardinality_threshold) { - if (threadIdx.x == 0) { *direct_aggregations = true; } - break; - } - - __syncthreads(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < cardinality_threshold) { - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - find_global_mapping( - cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements); - } - } - - if (threadIdx.x == 0) block_cardinality[blockIdx.x] = cardinality; -} - int find_num_sms() { int dev_id{-1}; @@ -615,6 +480,7 @@ int find_num_sms() CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); return num_sms; } + template int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows, int num_sms) { @@ -626,186 +492,6 @@ int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows return std::min(max_grid_size, needed_active_blocks); } -__device__ 
__host__ size_t round_to_multiple_of_8(size_t num) -{ - size_t constexpr multiple_of = 8; - return cudf::util::div_rounding_up_safe(num, multiple_of) * multiple_of; -} - -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* aggs) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - aggs[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); - } - } -} - -template -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - aggs[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - 
s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -template -__global__ void compute_aggregates(cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) -{ - cudf::size_type cardinality = block_cardinality[blockIdx.x]; - if (cardinality >= cardinality_threshold) { return; } - int num_input_cols = output_values.num_columns(); - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - __shared__ int col_start; - __shared__ int col_end; - if (threadIdx.x == 0) { - col_start = 0; - col_end = 0; - } - __syncthreads(); - while (col_end < num_input_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_input_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - __syncthreads(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - aggs); - __syncthreads(); - compute_pre_aggregrates(col_start, - col_end, - input_values, - num_input_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - __syncthreads(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - __syncthreads(); - } -} - size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template @@ -885,7 +571,7 @@ rmm::device_uvector compute_single_pass_set_aggs( typename SetType::key_equal, probing_scheme_type, cuco::cuda_allocator, - cuco::storage>; + cuco::storage>; using shared_set_ref_type = typename shared_set_type::ref_type<>; auto constexpr window_extent = cuco::make_window_extent(extent_type{}); @@ -1054,7 +740,7 @@ std::unique_ptr
groupby(table_view const& keys, d_key_equal, probing_scheme_type{d_row_hash}, cuco::thread_scope_device, - cuco::storage<1>{}, + cuco::storage{}, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh deleted file mode 100644 index 9abfe22950a..00000000000 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "multi_pass_kernels.cuh" - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { -/** - * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, - * and populate `set` with indices of unique keys - * - * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If - * the index was not present in the set, insert they index and then copy it to the output. If the - * key was already present in the set, then the inserted index is aggregated with the existing row. - * This aggregation is done for every element `j` in the row by applying aggregation operation `j` - * between the new and existing element. - * - * Instead of storing the entire rows from `input_keys` and `input_values` in - * the hashset, we instead store the row indices. For example, when inserting - * row at index `i` from `input_keys` into the hash set, the value `i` is what - * gets stored for the hash set's "key". It is assumed the `set` was constructed - * with a custom comparator that uses these row indices to check for equality - * between key rows. For example, comparing two keys `k0` and `k1` will compare - * the two rows `input_keys[k0] ?= input_keys[k1]` - * - * The exact size of the result is not known a priori, but can be upper bounded - * by the number of rows in `input_keys` & `input_values`. Therefore, it is - * assumed `output_values` has sufficient storage for an equivalent number of - * rows. In this way, after all rows are aggregated, `output_values` will likely - * be "sparse", meaning that not all rows contain the result of an aggregation. - * - * @tparam SetType The type of the hash set device ref - */ -template -struct compute_single_pass_aggs_fn { - SetType set; - table_device_view input_values; - mutable_table_device_view output_values; - aggregation::Kind const* __restrict__ aggs; - bitmask_type const* __restrict__ row_bitmask; - bool skip_rows_with_nulls; - - /** - * @brief Construct a new compute_single_pass_aggs_fn functor object - * - * @param set_ref Hash set object to insert key,value pairs into. - * @param input_values The table whose rows will be aggregated in the values - * of the hash set - * @param output_values Table that stores the results of aggregating rows of - * `input_values`. 
- * @param aggs The set of aggregation operations to perform across the
- * columns of the `input_values` rows
- * @param row_bitmask Bitmask where bit `i` indicates the presence of a null
- * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true`
- * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing
- * null values should be skipped. It `true`, it is assumed `row_bitmask` is a
- * bitmask where bit `i` indicates the presence of a null value in row `i`.
- */
-  compute_single_pass_aggs_fn(SetType set,
-                              table_device_view input_values,
-                              mutable_table_device_view output_values,
-                              aggregation::Kind const* aggs,
-                              bitmask_type const* row_bitmask,
-                              bool skip_rows_with_nulls)
-    : set(set),
-      input_values(input_values),
-      output_values(output_values),
-      aggs(aggs),
-      row_bitmask(row_bitmask),
-      skip_rows_with_nulls(skip_rows_with_nulls)
-  {
-  }
-
-  __device__ void operator()(size_type i)
-  {
-    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
-      auto const result = set.insert_and_find(i);
-
-      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
-    }
-  }
-};
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
new file mode 100644
index 00000000000..32aca69accf
--- /dev/null
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace cudf::groupby::detail::hash {
+
+CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num)
+{
+  std::size_t constexpr base = 8;
+  return cudf::util::div_rounding_up_safe(num, base) * base;
+}
+
+// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested
+// types and `cg_size = 1` for flat data to improve performance
+/// Number of threads to handle each input element
+CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
+/// Number of slots per thread
+CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
+
+using probing_scheme_type = cuco::linear_probing<
+  GROUPBY_CG_SIZE,
+  cudf::experimental::row::hash::device_row_hasher>;
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh
new file mode 100644
index 00000000000..a50083f2082
--- /dev/null
+++ b/cpp/src/groupby/hash/kernels.cuh
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cudf/types.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include + +namespace cudf::groupby::detail::hash { + +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* aggs) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + aggs[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } +} + +template +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto 
out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + aggs[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +template +CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) +{ + cudf::size_type cardinality = block_cardinality[blockIdx.x]; + if (cardinality >= cardinality_threshold) { return; } + int num_input_cols = output_values.num_columns(); + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + __shared__ int col_start; + __shared__ int col_end; + if (threadIdx.x == 0) { + col_start = 0; + col_end = 0; + } + __syncthreads(); + while (col_end < num_input_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_input_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + __syncthreads(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + aggs); + __syncthreads(); + compute_pre_aggregrates(col_start, + col_end, + input_values, + num_input_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + } +} + +template +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + cudf::size_type* cardinality, + SetType shared_set, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + bool inserted; + if (cur_idx < num_input_rows) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. 
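+  // (Descriptive note, added for clarity: only the thread that wins the insertion writes the
+  // `local_mapping_index` entry for its row, so threads that found a duplicate may read that
+  // entry only after this barrier.)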
+ __syncthreads(); + if (cur_idx < num_input_rows) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index, + cudf::size_type shared_set_num_elements) +{ + auto input_idx = shared_set_indices[cur_idx]; + auto result = global_set.insert_and_find(input_idx); + global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a + * threadblock exceeds `cardinality_threshold`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. + */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + KeyEqual d_key_equal, + RowHasher d_row_hash, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + __shared__ cudf::size_type shared_set_indices[shared_set_num_elements]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_key_equal, + probing_scheme_type{d_row_hash}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + block.sync(); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + + if (threadIdx.x == 0) { cardinality = 0; } + + __syncthreads(); + + int num_loops = + cudf::util::div_rounding_up_safe(num_input_rows, (cudf::size_type)(blockDim.x * gridDim.x)); + auto end_idx = num_loops * blockDim.x * gridDim.x; + + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx; + cur_idx += blockDim.x * gridDim.x) { + find_local_mapping(cur_idx, + num_input_rows, + &cardinality, + shared_insert_ref, + local_mapping_index, + shared_set_indices); + + __syncthreads(); + + if (cardinality >= cardinality_threshold) { + if (threadIdx.x == 0) { *direct_aggregations = true; } + break; + } + + __syncthreads(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < cardinality_threshold) { + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + find_global_mapping( + cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements); + } + } + + if (threadIdx.x == 0) block_cardinality[blockIdx.x] = cardinality; +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_functors.cuh similarity index 100% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/multi_pass_functors.cuh diff --git a/cpp/src/groupby/hash/groupby_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh similarity index 91% rename from cpp/src/groupby/hash/groupby_functors.cuh rename 
to cpp/src/groupby/hash/single_pass_functors.cuh
index 5630e838272..2b92ed63098 100644
--- a/cpp/src/groupby/hash/groupby_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -21,6 +21,7 @@
 #include 
 
 namespace cudf::groupby::detail::hash {
+
 template 
+struct compute_single_pass_aggs_fn {
+  SetType set;
+  table_device_view input_values;
+  mutable_table_device_view output_values;
+  aggregation::Kind const* __restrict__ aggs;
+  bitmask_type const* __restrict__ row_bitmask;
+  bool skip_rows_with_nulls;
+
+  /**
+   * @brief Construct a new compute_single_pass_aggs_fn functor object
+   *
+   * @param set Hash set object to insert key,value pairs into.
+   * @param input_values The table whose rows will be aggregated in the values
+   * of the hash set
+   * @param output_values Table that stores the results of aggregating rows of
+   * `input_values`.
+   * @param aggs The set of aggregation operations to perform across the
+   * columns of the `input_values` rows
+   * @param row_bitmask Bitmask where bit `i` indicates the presence of a null
+   * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true`
+   * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing
+   * null values should be skipped. If `true`, it is assumed `row_bitmask` is a
+   * bitmask where bit `i` indicates the presence of a null value in row `i`.
+   */
+  compute_single_pass_aggs_fn(SetType set,
+                              table_device_view input_values,
+                              mutable_table_device_view output_values,
+                              aggregation::Kind const* aggs,
+                              bitmask_type const* row_bitmask,
+                              bool skip_rows_with_nulls)
+    : set(set),
+      input_values(input_values),
+      output_values(output_values),
+      aggs(aggs),
+      row_bitmask(row_bitmask),
+      skip_rows_with_nulls(skip_rows_with_nulls)
+  {
+  }
+
+  __device__ void operator()(size_type i)
+  {
+    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
+      auto const result = set.insert_and_find(i);
+
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
+    }
+  }
+};
+
 }  // namespace cudf::groupby::detail::hash

From 9ab1c0229fdb0e3f0014ee089f84d723be017fef Mon Sep 17 00:00:00 2001
From: Yunsong Wang 
Date: Tue, 20 Aug 2024 17:56:52 -0700
Subject: [PATCH 005/135] Minor cleanups: use CCCL traits in device APIs

---
 cpp/src/groupby/hash/multi_pass_functors.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/groupby/hash/multi_pass_functors.cuh b/cpp/src/groupby/hash/multi_pass_functors.cuh
index 7043eafdc10..6fbec5fe19e 100644
--- a/cpp/src/groupby/hash/multi_pass_functors.cuh
+++ b/cpp/src/groupby/hash/multi_pass_functors.cuh
@@ -25,6 +25,7 @@
 #include 
 #include 
+#include 
 
 #include 
 
@@ -64,17 +65,15 @@ struct var_hash_functor {
   }
 
   template 
-  __device__ std::enable_if_t()> operator()(column_device_view const& source,
-                                            size_type source_index,
-                                            size_type target_index) noexcept
+  __device__ cuda::std::enable_if_t()> operator()(
+    column_device_view const& source, size_type source_index, size_type target_index) noexcept
  {
     CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination.");
   }
 
   template 
-  __device__ std::enable_if_t()> operator()(column_device_view const& source,
-                                            size_type source_index,
-                                            size_type target_index) noexcept
+  __device__ cuda::std::enable_if_t()> operator()(
+    column_device_view const& source, size_type source_index, size_type target_index) noexcept
   {
     using Target  = target_type_t;
     using SumType = target_type_t;
@@ -93,6 +92,7 @@ struct var_hash_functor {
     if (target_has_nulls and
target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { From db1b26ab3585a956a35a4a5e74ff56885dcaaa96 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Aug 2024 18:27:49 -0700 Subject: [PATCH 006/135] Move more constexpr to the helper --- cpp/src/groupby/hash/groupby.cu | 91 ++++++++----------- cpp/src/groupby/hash/helpers.cuh | 27 ++++-- cpp/src/groupby/hash/kernels.cuh | 46 ++++------ cpp/src/groupby/hash/single_pass_functors.cuh | 9 +- 4 files changed, 83 insertions(+), 90 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 92e8f4a6f8b..36f7bd08a37 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -505,7 +505,6 @@ size_t find_shmem_size(FuncType func, int block_size, int grid_size, int num_sms return get_previous_multiple_of_8(0.5 * dynamic_smem_size); } -template void launch_compute_aggregates(int block_size, int grid_size, int num_sms, @@ -518,8 +517,7 @@ void launch_compute_aggregates(int block_size, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { - auto compute_aggregates_fn_ptr = - compute_aggregates; + auto compute_aggregates_fn_ptr = compute_aggregates; size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, block_size, grid_size, num_sms); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -527,16 +525,15 @@ void launch_compute_aggregates(int block_size, round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; - compute_aggregates - <<>>(local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - num_input_rows, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); + compute_aggregates<<>>(local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + num_input_rows, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); } /** @@ -553,17 +550,9 @@ rmm::device_uvector compute_single_pass_set_aggs( KeyEqual d_key_equal, RowHasher d_row_hash) { - auto constexpr block_size = 128; - auto constexpr cardinality_threshold = 128; - - auto const num_input_rows = keys.num_rows(); - - // We add additional `block_size`, because after the number of elements in the local hash set - // exceeds the threshold, all threads in the thread block can still insert one more element. 
- auto constexpr shared_set_num_elements = cardinality_threshold + block_size; - // shared_set_num_elements with 0.7 occupancy + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy auto constexpr shared_set_capacity = - static_cast(static_cast(shared_set_num_elements) * 1.43); + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); using extent_type = cuco::extent; using shared_set_type = cuco::static_set compute_single_pass_set_aggs( using shared_set_ref_type = typename shared_set_type::ref_type<>; auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + auto const num_input_rows = keys.num_rows(); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); int num_sms = find_num_sms(); auto compute_mapping_indices_fn_ptr = compute_mapping_indices; int grid_size = - find_grid_size(compute_mapping_indices_fn_ptr, block_size, num_input_rows, num_sms); + find_grid_size(compute_mapping_indices_fn_ptr, GROUPBY_BLOCK_SIZE, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); // 'global_mapping_index' maps from the local pre-aggregate table to the row index of // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * shared_set_num_elements, + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - d_key_equal, - d_row_hash, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + d_key_equal, + d_row_hash, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); stream.synchronize(); // 'populated_keys' contains inserted row_indices (keys) of global hash set @@ -628,21 +617,20 @@ rmm::device_uvector compute_single_pass_set_aggs( auto d_values = table_device_view::create(flattened_values, stream); - launch_compute_aggregates( - block_size, - grid_size, - num_sms, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - num_input_rows, - d_aggs.data(), - stream); + launch_compute_aggregates(GROUPBY_BLOCK_SIZE, + grid_size, + num_sms, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + num_input_rows, + d_aggs.data(), + stream); if (direct_aggregations.value(stream)) { - int stride = block_size * grid_size; + int stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), keys.num_rows(), @@ -652,8 +640,7 @@ rmm::device_uvector compute_single_pass_set_aggs( d_aggs.data(), block_cardinality.data(), stride, - block_size, - cardinality_threshold}); + GROUPBY_BLOCK_SIZE}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 32aca69accf..9e5e628966c 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -24,22 +24,37 @@ namespace cudf::groupby::detail::hash { -CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) 
-{ - std::size_t constexpr base = 8; - return cudf::util::div_rounding_up_safe(num, base) * base; -} - // TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested // types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + /// Number of slots per thread CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; +/// Probing scheme type used by groupby hash table using probing_scheme_type = cuco::linear_probing< GROUPBY_CG_SIZE, cudf::experimental::row::hash::device_row_hasher>; +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add additional `block_size`, because after the number of elements in the local hash set +// exceeds the threshold, all threads in the thread block can still insert one more element. +/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) +{ + std::size_t constexpr base = 8; + return cudf::util::div_rounding_up_safe(num, base) * base; +} + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index a50083f2082..aee3b416b2d 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -101,7 +101,6 @@ __device__ void compute_pre_aggregrates(int col_start, } } -template __device__ void compute_final_aggregates(int col_start, int col_end, cudf::table_device_view input_values, @@ -113,7 +112,7 @@ __device__ void compute_final_aggregates(int col_start, cudf::aggregation::Kind const* aggs) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; + auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto output_col = output_values.column(col_idx); @@ -132,7 +131,6 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -template CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -144,7 +142,7 @@ CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, int pointer_size) { cudf::size_type cardinality = block_cardinality[blockIdx.x]; - if (cardinality >= cardinality_threshold) { return; } + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } int num_input_cols = output_values.num_columns(); extern __shared__ std::byte shared_set_aggregates[]; std::byte** s_aggregates_pointer = @@ -186,15 +184,15 @@ CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, s_aggregates_valid_pointer, aggs); __syncthreads(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + 
global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); __syncthreads(); } } @@ -233,24 +231,21 @@ template __device__ void find_global_mapping(cudf::size_type cur_idx, SetType global_set, cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index, - cudf::size_type shared_set_num_elements) + cudf::size_type* global_mapping_index) { auto input_idx = shared_set_indices[cur_idx]; auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; } /* * Inserts keys into the shared memory hash set, and stores the row index of the local * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `cardinality_threshold`, the threads in that block will exit without updating - * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the - * global hash set, and save the row index of the global sparse table in `global_mapping_index`. + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. */ template = cardinality_threshold) { + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { if (threadIdx.x == 0) { *direct_aggregations = true; } break; } @@ -311,10 +306,9 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, } // Insert unique keys from shared to global hash set - if (cardinality < cardinality_threshold) { + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - find_global_mapping( - cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements); + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); } } diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 2b92ed63098..324a2286a3e 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -877,29 +877,26 @@ struct compute_direct_aggregates { cudf::size_type* block_cardinality; int stride; int block_size; - cudf::size_type cardinality_threshold; compute_direct_aggregates(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, int stride, - int block_size, - cudf::size_type cardinality_threshold) + int block_size) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), stride(stride), - block_size(block_size), - cardinality_threshold(cardinality_threshold) + block_size(block_size) { } __device__ void operator()(cudf::size_type i) { int block_id = (i % stride) / block_size; - if (block_cardinality[block_id] >= cardinality_threshold) { + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } From 9993283b9ef3805e08a890e9d9d0c755f8a7ad05 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Aug 2024 18:58:59 -0700 Subject: [PATCH 
007/135] More cleanups with constexprs --- cpp/src/groupby/hash/groupby.cu | 43 +++++++++---------- cpp/src/groupby/hash/single_pass_functors.cuh | 9 ++-- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 36f7bd08a37..41dff4e5d0d 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -482,31 +482,30 @@ int find_num_sms() } template -int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows, int num_sms) +int find_grid_size(FuncType func, cudf::size_type num_input_rows, int num_sms) { int max_active_blocks{-1}; CUDF_CUDA_TRY( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, block_size, 0)); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, GROUPBY_BLOCK_SIZE, 0)); auto max_grid_size = max_active_blocks * num_sms; - int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, block_size); + int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, GROUPBY_BLOCK_SIZE); return std::min(max_grid_size, needed_active_blocks); } size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template -size_t find_shmem_size(FuncType func, int block_size, int grid_size, int num_sms) +size_t find_shmem_size(FuncType func, int grid_size, int num_sms) { auto active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, num_sms); size_t dynamic_smem_size; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_smem_size, func, active_blocks_per_sm, block_size)); + &dynamic_smem_size, func, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_smem_size); } -void launch_compute_aggregates(int block_size, - int grid_size, +void launch_compute_aggregates(int grid_size, int num_sms, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, @@ -518,22 +517,23 @@ void launch_compute_aggregates(int block_size, rmm::cuda_stream_view stream) { auto compute_aggregates_fn_ptr = compute_aggregates; - size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, block_size, grid_size, num_sms); + size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, grid_size, num_sms); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; - compute_aggregates<<>>(local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - num_input_rows, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); + compute_aggregates<<>>( + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + num_input_rows, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); } /** @@ -574,8 +574,7 @@ rmm::device_uvector compute_single_pass_set_aggs( KeyEqual, RowHasher, decltype(window_extent)>; - int grid_size = - find_grid_size(compute_mapping_indices_fn_ptr, GROUPBY_BLOCK_SIZE, num_input_rows, num_sms); + int grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector 
local_mapping_index(num_input_rows, stream); @@ -617,8 +616,7 @@ rmm::device_uvector compute_single_pass_set_aggs( auto d_values = table_device_view::create(flattened_values, stream); - launch_compute_aggregates(GROUPBY_BLOCK_SIZE, - grid_size, + launch_compute_aggregates(grid_size, num_sms, local_mapping_index.data(), global_mapping_index.data(), @@ -639,8 +637,7 @@ rmm::device_uvector compute_single_pass_set_aggs( *d_sparse_table, d_aggs.data(), block_cardinality.data(), - stride, - GROUPBY_BLOCK_SIZE}); + stride}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 324a2286a3e..170539576d1 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -876,26 +876,23 @@ struct compute_direct_aggregates { cudf::aggregation::Kind const* __restrict__ aggs; cudf::size_type* block_cardinality; int stride; - int block_size; compute_direct_aggregates(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, - int stride, - int block_size) + int stride) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), - stride(stride), - block_size(block_size) + stride(stride) { } __device__ void operator()(cudf::size_type i) { - int block_id = (i % stride) / block_size; + int block_id = (i % stride) / GROUPBY_BLOCK_SIZE; if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); From c96d02cb8168cc7b9ff2997d2febc2a02c613aa4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 09:42:56 -0700 Subject: [PATCH 008/135] Add doc --- cpp/src/groupby/hash/helpers.cuh | 3 +++ cpp/src/groupby/hash/single_pass_functors.cuh | 1 + 2 files changed, 4 insertions(+) diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 9e5e628966c..9287325c3fb 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -51,6 +51,9 @@ CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; +/** + * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. 
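+ *
+ * For example, `round_to_multiple_of_8(13)` returns 16, while a value that is already a
+ * multiple of 8, such as 16, is returned unchanged.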
+ */ CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) { std::size_t constexpr base = 8; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 170539576d1..a8cc7492c52 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -890,6 +890,7 @@ struct compute_direct_aggregates { stride(stride) { } + __device__ void operator()(cudf::size_type i) { int block_id = (i % stride) / GROUPBY_BLOCK_SIZE; From 7cd14d6c7e13a9d94bd5b365f06a47ceea395d66 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 10:00:35 -0700 Subject: [PATCH 009/135] Renaming --- cpp/src/groupby/hash/groupby.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 41dff4e5d0d..1a13bcde8fa 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -541,7 +541,7 @@ void launch_compute_aggregates(int grid_size, * over the data and stores the results in `sparse_results` */ template -rmm::device_uvector compute_single_pass_set_aggs( +rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, cudf::detail::result_cache* sparse_results, @@ -729,7 +729,7 @@ std::unique_ptr
groupby(table_view const& keys, stream.value()}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_set_aggs( + auto gather_map = compute_single_pass_aggs( keys, requests, &sparse_results, set, stream, d_key_equal, d_row_hash); // Compact all results from sparse_results and insert into cache From 1e04c10ccc99b28f10956dee2ae56ef3344d2a7f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 15:51:52 -0700 Subject: [PATCH 010/135] Fix cardinality bench --- cpp/benchmarks/groupby/group_max.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index f41285008c4..b9a701a71f4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -101,4 +101,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") + .add_int64_axis("num_aggregations", {1}) .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); From 47aee18270590c357d927b1c605c9b50792659ec Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Aug 2024 13:20:47 -0700 Subject: [PATCH 011/135] More cleanups with CG --- cpp/src/groupby/hash/groupby.cu | 2 +- cpp/src/groupby/hash/kernels.cuh | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 1a13bcde8fa..730b03bee2a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -574,7 +574,7 @@ rmm::device_uvector compute_single_pass_aggs( KeyEqual, RowHasher, decltype(window_extent)>; - int grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); + auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index aee3b416b2d..6299a3b2acf 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -21,6 +21,7 @@ #include "single_pass_functors.cuh" #include +#include #include namespace cudf::groupby::detail::hash { @@ -278,16 +279,15 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, __shared__ cudf::size_type cardinality; - if (threadIdx.x == 0) { cardinality = 0; } + if (block.thread_rank() == 0) { cardinality = 0; } - __syncthreads(); + block.sync(); - int num_loops = - cudf::util::div_rounding_up_safe(num_input_rows, (cudf::size_type)(blockDim.x * gridDim.x)); - auto end_idx = num_loops * blockDim.x * gridDim.x; + auto const stride = cudf::detail::grid_1d::grid_stride(); - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx; - cur_idx += blockDim.x * gridDim.x) { + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { find_local_mapping(cur_idx, num_input_rows, &cardinality, @@ -295,24 +295,25 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, local_mapping_index, shared_set_indices); - __syncthreads(); + block.sync(); if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (threadIdx.x == 0) { *direct_aggregations = true; } + if 
(block.thread_rank() == 0) { *direct_aggregations = true; } break; } - __syncthreads(); + block.sync(); } // Insert unique keys from shared to global hash set if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); } } - if (threadIdx.x == 0) block_cardinality[blockIdx.x] = cardinality; + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } } } // namespace cudf::groupby::detail::hash From 6eb34598f6304c42a1a2e5ba01efd7a8a5ee69d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 11:39:49 -0700 Subject: [PATCH 012/135] Use custom cuco --- rapids_config.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rapids_config.cmake b/rapids_config.cmake index 3a88769f6e7..96df5adedac 100644 --- a/rapids_config.cmake +++ b/rapids_config.cmake @@ -11,6 +11,10 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= + +set(rapids-cmake-repo PointKernel/rapids-cmake) +set(rapids-cmake-branch cuco-hash-function) + file(READ "${CMAKE_CURRENT_LIST_DIR}/VERSION" _rapids_version) if(_rapids_version MATCHES [[^([0-9][0-9])\.([0-9][0-9])\.([0-9][0-9])]]) set(RAPIDS_VERSION_MAJOR "${CMAKE_MATCH_1}") From ee5f7fa2eb121ba32525bf2b0e5614fed32903f9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 13:34:14 -0700 Subject: [PATCH 013/135] Cleanups with new key_eq and hash_function --- cpp/src/groupby/hash/groupby.cu | 23 +++++++---------------- cpp/src/groupby/hash/kernels.cuh | 12 +++--------- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 730b03bee2a..0d53a0f46ea 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -516,8 +516,7 @@ void launch_compute_aggregates(int grid_size, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { - auto compute_aggregates_fn_ptr = compute_aggregates; - size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, grid_size, num_sms); + size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size, num_sms); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = @@ -540,15 +539,13 @@ void launch_compute_aggregates(int grid_size, * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template +template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, - rmm::cuda_stream_view stream, - KeyEqual d_key_equal, - RowHasher d_row_hash) + rmm::cuda_stream_view stream) { // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy auto constexpr shared_set_capacity = @@ -568,12 +565,9 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - int num_sms = find_num_sms(); - auto compute_mapping_indices_fn_ptr = compute_mapping_indices; + int num_sms = find_num_sms(); + auto compute_mapping_indices_fn_ptr = + 
compute_mapping_indices; auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table @@ -588,8 +582,6 @@ rmm::device_uvector compute_single_pass_aggs( <<>>(global_set_ref, num_input_rows, window_extent, - d_key_equal, - d_row_hash, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), @@ -729,8 +721,7 @@ std::unique_ptr
groupby(table_view const& keys, stream.value()}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, stream, d_key_equal, d_row_hash); + auto gather_map = compute_single_pass_aggs(keys, requests, &sparse_results, set, stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 6299a3b2acf..1b7add50024 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -246,16 +246,10 @@ __device__ void find_global_mapping(cudf::size_type cur_idx, * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. */ -template +template CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cudf::size_type num_input_rows, WindowExtent window_extent, - KeyEqual d_key_equal, - RowHasher d_row_hash, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -267,8 +261,8 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, __shared__ typename SetRef::window_type windows[window_extent.value()]; auto storage = SetRef::storage_ref_type(window_extent, windows); auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, + global_set.key_eq(), + probing_scheme_type{global_set.hash_function()}, {}, storage); auto const block = cooperative_groups::this_thread_block(); From aa4e9570d232feb7bcc8f5af5e0774ec82d762d0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 13:45:44 -0700 Subject: [PATCH 014/135] Remove the redundant num_sms function --- cpp/src/groupby/hash/groupby.cu | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0d53a0f46ea..c8f0a816b77 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -472,22 +472,13 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, return sparse_table; } -int find_num_sms() -{ - int dev_id{-1}; - CUDF_CUDA_TRY(cudaGetDevice(&dev_id)); - int num_sms{-1}; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - return num_sms; -} - template -int find_grid_size(FuncType func, cudf::size_type num_input_rows, int num_sms) +int find_grid_size(FuncType func, cudf::size_type num_input_rows) { int max_active_blocks{-1}; CUDF_CUDA_TRY( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, GROUPBY_BLOCK_SIZE, 0)); - auto max_grid_size = max_active_blocks * num_sms; + auto max_grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, GROUPBY_BLOCK_SIZE); return std::min(max_grid_size, needed_active_blocks); } @@ -495,9 +486,10 @@ int find_grid_size(FuncType func, cudf::size_type num_input_rows, int num_sms) size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template -size_t find_shmem_size(FuncType func, int grid_size, int num_sms) +size_t find_shmem_size(FuncType func, int grid_size) { - auto active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, num_sms); + auto active_blocks_per_sm = + 
cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); size_t dynamic_smem_size; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( @@ -506,7 +498,6 @@ size_t find_shmem_size(FuncType func, int grid_size, int num_sms) } void launch_compute_aggregates(int grid_size, - int num_sms, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -516,7 +507,7 @@ void launch_compute_aggregates(int grid_size, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { - size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size, num_sms); + size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = @@ -565,10 +556,10 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - int num_sms = find_num_sms(); auto compute_mapping_indices_fn_ptr = compute_mapping_indices; - auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); + auto const grid_size = find_grid_size( + compute_mapping_indices_fn_ptr, num_input_rows, cudf::detail::num_multiprocessors()); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); @@ -609,7 +600,6 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); launch_compute_aggregates(grid_size, - num_sms, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), From 4fdb4b87ea51f35694045b6e0cb9deabdd3782e1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 13:52:20 -0700 Subject: [PATCH 015/135] Add missing header + minor cleanup --- cpp/src/groupby/hash/groupby.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c8f0a816b77..aeb506efc9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -558,8 +559,7 @@ rmm::device_uvector compute_single_pass_aggs( auto compute_mapping_indices_fn_ptr = compute_mapping_indices; - auto const grid_size = find_grid_size( - compute_mapping_indices_fn_ptr, num_input_rows, cudf::detail::num_multiprocessors()); + auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); From 4049aeb25d4070a2609b05a237beb0ff94ce87b3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 14:23:46 -0700 Subject: [PATCH 016/135] Clean up grid_size and shmem_size utilities --- cpp/src/groupby/hash/groupby.cu | 70 ++++++++++++++++---------------- cpp/src/groupby/hash/kernels.cuh | 18 ++++---- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index aeb506efc9a..eb4f856e289 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -473,49 +473,49 @@ auto create_sparse_results_table(cudf::table_view const& 
flattened_values, return sparse_table; } -template -int find_grid_size(FuncType func, cudf::size_type num_input_rows) +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) { int max_active_blocks{-1}; - CUDF_CUDA_TRY( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, GROUPBY_BLOCK_SIZE, 0)); - auto max_grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, GROUPBY_BLOCK_SIZE); - return std::min(max_grid_size, needed_active_blocks); + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); } size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } -template -size_t find_shmem_size(FuncType func, int grid_size) +template +size_t compute_shared_memory_size(Kernel kernel, int grid_size) { - auto active_blocks_per_sm = + auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - size_t dynamic_smem_size; + size_t dynamic_shmem_size; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_smem_size, func, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_smem_size); + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -void launch_compute_aggregates(int grid_size, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) +void compute_aggregations(int grid_size, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) { - size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size); + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; - compute_aggregates<<>>( + auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( local_mapping_index, global_mapping_index, block_cardinality, @@ -559,7 +559,7 @@ rmm::device_uvector compute_single_pass_aggs( auto compute_mapping_indices_fn_ptr = compute_mapping_indices; - auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); + auto const grid_size = max_occupancy_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table 
rmm::device_uvector local_mapping_index(num_input_rows, stream); @@ -599,15 +599,15 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); - launch_compute_aggregates(grid_size, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - num_input_rows, - d_aggs.data(), - stream); + compute_aggregations(grid_size, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + num_input_rows, + d_aggs.data(), + stream); if (direct_aggregations.value(stream)) { int stride = GROUPBY_BLOCK_SIZE * grid_size; diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 1b7add50024..b8aa7304725 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -132,15 +132,15 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) { cudf::size_type cardinality = block_cardinality[blockIdx.x]; if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } From 690fceea020a979f458d3f526ff3d3fbc7cc8a7d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 15:26:04 -0700 Subject: [PATCH 017/135] Minor cleanups with CG --- cpp/src/groupby/hash/groupby.cu | 6 +++--- cpp/src/groupby/hash/kernels.cuh | 35 ++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index eb4f856e289..3f73e12adab 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -499,12 +499,12 @@ size_t compute_shared_memory_size(Kernel kernel, int grid_size) } void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { @@ -516,12 +516,12 @@ void compute_aggregations(int grid_size, // The rest of shmem is utilized for the actual arrays in shmem auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; compute_aggs_kernel<<>>( + num_input_rows, local_mapping_index, global_mapping_index, block_cardinality, input_values, output_values, - num_input_rows, aggs, shmem_agg_size, shmem_agg_pointer_size); @@ -600,12 +600,12 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); compute_aggregations(grid_size, + num_input_rows, local_mapping_index.data(), 
global_mapping_index.data(), block_cardinality.data(), *d_values, *d_sparse_table, - num_input_rows, d_aggs.data(), stream); diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index b8aa7304725..3051901fb37 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -132,42 +132,47 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, cudf::aggregation::Kind const* aggs, int total_agg_size, int pointer_size) { - cudf::size_type cardinality = block_cardinality[blockIdx.x]; + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - int num_input_cols = output_values.num_columns(); + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; extern __shared__ std::byte shared_set_aggregates[]; std::byte** s_aggregates_pointer = reinterpret_cast(shared_set_aggregates + total_agg_size); bool** s_aggregates_valid_pointer = reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - __shared__ int col_start; - __shared__ int col_end; - if (threadIdx.x == 0) { + + if (block.thread_rank() == 0) { col_start = 0; col_end = 0; } - __syncthreads(); - while (col_end < num_input_cols) { + block.sync(); + + while (col_end < num_cols) { calculate_columns_to_aggregate(col_start, col_end, output_values, - num_input_cols, + num_cols, s_aggregates_pointer, s_aggregates_valid_pointer, shared_set_aggregates, cardinality, total_agg_size); - __syncthreads(); + block.sync(); initialize_shared_memory_aggregates(col_start, col_end, output_values, @@ -175,16 +180,16 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, s_aggregates_valid_pointer, cardinality, aggs); - __syncthreads(); + block.sync(); compute_pre_aggregrates(col_start, col_end, input_values, - num_input_rows, + num_rows, local_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, aggs); - __syncthreads(); + block.sync(); compute_final_aggregates(col_start, col_end, input_values, @@ -194,7 +199,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, aggs); - __syncthreads(); + block.sync(); } } From 716a73c9e87e609bac8c29df813dfb418232fe59 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 16:40:57 -0700 Subject: [PATCH 018/135] Improve docs for aggregation details --- cpp/include/cudf/detail/aggregation/aggregation.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index b257eef1e9e..78d9951670d 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1497,7 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); * * @tparam F Type of callable * @param k The `aggregation::Kind` 
value to dispatch - * aram f The callable that accepts an `aggregation::Kind` non-type template + * @param f The callable that accepts an `aggregation::Kind` non-type template * argument. * @param args Parameter pack forwarded to the `operator()` invocation * @return Forwards the return value of the callable. @@ -1626,6 +1626,8 @@ struct dispatch_source { * parameter of the callable `F` * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind` * non-type template parameter for the second template parameter of the callable + * @param f The callable that accepts `data_type` and `aggregation::Kind` non-type template + * arguments. * @param args Parameter pack forwarded to the `operator()` invocation * `F`. */ @@ -1644,8 +1646,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d * @brief Returns the target `data_type` for the specified aggregation k * performed on elements of type source_type. * - * aram source_type The element type to be aggregated - * aram k The aggregation + * @param source_type The element type to be aggregated + * @param k The aggregation * @return data_type The target_type of k performed on source_type * elements */ From 3c8403ddde6da2bfaf5694b2f2a3789a0513baa1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 17:21:37 -0700 Subject: [PATCH 019/135] Minor cleanup --- cpp/src/groupby/hash/groupby.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 3f73e12adab..2833ca55522 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -555,11 +555,10 @@ rmm::device_uvector compute_single_pass_aggs( auto const num_input_rows = keys.num_rows(); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - - auto compute_mapping_indices_fn_ptr = - compute_mapping_indices; - auto const grid_size = max_occupancy_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); From e7224cbdc67418e0d9b7c93547e6a4faef59ad35 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 18:10:44 -0700 Subject: [PATCH 020/135] Update device operator overloads to agg identity_initializer --- .../cudf/detail/aggregation/aggregation.cuh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index ecf2f610697..bc2d0edbeba 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -636,6 +636,25 @@ struct identity_initializer { } public: + template + __device__ std::enable_if_t(), void> operator()( + cudf::mutable_column_device_view target, cudf::size_type target_index) + { + using DeviceType = device_storage_type_t; + using ElementType = + cuda::std::conditional_t() && !cudf::is_fixed_point(), + Target, + DeviceType>; + target.element(target_index) = get_identity(); + } + + template + __device__ std::enable_if_t(), void> operator()( + cudf::mutable_column_device_view target, cudf::size_type target_index) + { + CUDF_UNREACHABLE("Unsupported aggregation for 
initializing values"); + } + template std::enable_if_t(), void> operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) From 124aac0994af7f3f204c5150b20101cdc053867f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 18:12:23 -0700 Subject: [PATCH 021/135] Clean up groupby details for ODR --- .../groupby/hash/compute_single_pass_aggs.cuh | 480 ++++++++++++++++++ .../groupby/hash/compute_single_pass_aggs.hpp | 45 ++ cpp/src/groupby/hash/groupby.cu | 341 +------------ cpp/src/groupby/hash/kernels.cuh | 112 ---- cpp/src/groupby/hash/single_pass_functors.cuh | 2 +- 5 files changed, 528 insertions(+), 452 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.hpp diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh new file mode 100644 index 00000000000..3b36f8a1f81 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "kernels.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace hash { + +template +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + cudf::size_type* cardinality, + SetType shared_set, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + bool inserted; + if (cur_idx < num_input_rows) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + __syncthreads(); + if (cur_idx < num_input_rows) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + auto input_idx = shared_set_indices[cur_idx]; + auto result = global_set.insert_and_find(input_idx); + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. 
If the number of unique keys found in a + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. + */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_type{global_set.hash_function()}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + block.sync(); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + + if (block.thread_rank() == 0) { cardinality = 0; } + + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { + find_local_mapping(cur_idx, + num_input_rows, + &cardinality, + shared_insert_ref, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + + block.sync(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); + } + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +size_t compute_shared_memory_size(Kernel kernel, int grid_size) +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); +} + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) +{ + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); + 
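// --------------------------------------------------------------------------------
// [editor's sketch, not part of the patch] How the occupancy-driven sizing above
// works end to end: cudaOccupancyMaxActiveBlocksPerMultiprocessor caps the grid at
// what the device can keep resident, and cudaOccupancyAvailableDynamicSMemPerBlock
// reports the dynamic shared memory each resident block may claim. The kernel name
// `example_kernel` and the block size 128 are illustrative assumptions; assumes
// <cuda_runtime.h> and <algorithm>.
__global__ void example_kernel(int* out) { out[blockIdx.x * blockDim.x + threadIdx.x] = 1; }

inline void example_launch_budget(int num_rows, int& grid_size, size_t& shmem_per_block)
{
  int max_active_blocks{-1};
  // Resident blocks per SM for this kernel at 128 threads and no dynamic shmem
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, example_kernel, 128, 0);
  int dev_id{-1};
  cudaGetDevice(&dev_id);
  int num_sms{-1};
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id);
  // Grid: the smaller of "what stays resident" and "what the input needs"
  grid_size = std::min(max_active_blocks * num_sms, (num_rows + 127) / 128);
  // Dynamic shared memory per block at that residency, rounded down to a multiple
  // of 8 bytes so the pointer carving below stays aligned
  size_t dynamic_shmem{0};
  cudaOccupancyAvailableDynamicSMemPerBlock(
    &dynamic_shmem, example_kernel, max_active_blocks, 128);
  shmem_per_block = dynamic_shmem / 8 * 8;
}
// --------------------------------------------------------------------------------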
// For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( + num_input_rows, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); +} + +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +auto create_sparse_results_table(cudf::table_view const& flattened_values, + const cudf::aggregation::Kind* d_aggs, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform( + flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); + } + // Else initialise the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + } + return sparse_table; +} + +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + rmm::cuda_stream_view stream) +{ + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto const num_input_rows = keys.num_rows(); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_aggs.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + + 
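// --------------------------------------------------------------------------------
// [editor's sketch, not part of the patch] The launch below performs shared-memory
// pre-aggregation: rows are folded into per-block shared-memory bins through the
// local mapping (cheap shared-memory atomics), and only one update per (block, bin)
// reaches the global sparse table through the global mapping. A toy SUM-only
// version with illustrative names and an int payload; `local_idx` is assumed to be
// built by the same grid-stride partition that compute_mapping_indices uses, and
// the high-cardinality fallback handled by `direct_aggregations` is omitted.
__global__ void toy_shmem_sum(int const* values,
                              int const* local_idx,          // row -> dense bin within its block
                              int const* global_idx,         // (block, bin) -> global table slot
                              int const* block_cardinality,  // bins actually used per block
                              int* global_sums,
                              int num_rows,
                              int bins_per_block)
{
  extern __shared__ int partial[];  // one accumulator per bin of this block
  int const num_bins = block_cardinality[blockIdx.x];
  for (int b = threadIdx.x; b < num_bins; b += blockDim.x) { partial[b] = 0; }
  __syncthreads();
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_rows;
       i += gridDim.x * blockDim.x) {
    atomicAdd(&partial[local_idx[i]], values[i]);  // contended, but stays in shared memory
  }
  __syncthreads();
  for (int b = threadIdx.x; b < num_bins; b += blockDim.x) {
    // one global atomic per (block, bin) instead of one per input row
    atomicAdd(&global_sums[global_idx[blockIdx.x * bins_per_block + b]], partial[b]);
  }
}
// --------------------------------------------------------------------------------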
compute_aggregations(grid_size, + num_input_rows, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_aggs.data(), + stream); + + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_aggs.data(), + block_cardinality.data(), + stride}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} + +} // namespace hash +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp new file mode 100644 index 00000000000..73a85d67627 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +CUDF_EXPORT rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + rmm::cuda_stream_view stream); + +} // namespace hash +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ceda6c5a4cb..e93c8b46613 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,9 +14,10 @@ * limitations under the License. 
*/ +#include "compute_single_pass_aggs.cuh" +#include "compute_single_pass_aggs.hpp" #include "groupby/common/utils.hpp" #include "helpers.cuh" -#include "kernels.cuh" #include "multi_pass_functors.cuh" #include "single_pass_functors.cuh" @@ -52,7 +53,6 @@ #include #include -#include #include namespace cudf { @@ -105,76 +105,6 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; @@ -342,40 +272,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final dense_results->add_result(col, agg, std::move(result)); } }; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) -{ - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? 
cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} /** * @brief Gather sparse results into dense using `gather_map` and add to @@ -415,239 +311,6 @@ void sparse_to_dense_results(table_view const& keys, } } -template -void extract_populated_keys(SetType const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); -} - -// make table that will hold sparse results -template -auto create_sparse_results_table(cudf::table_view const& flattened_values, - const cudf::aggregation::Kind* d_aggs, - std::vector aggs, - bool direct_aggregations, - GlobalSetType const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - cudf::table sparse_table(std::move(sparse_columns)); - // If no direct aggregations, initialize the sparse table - // only for the keys inserted in global hash set - if (!direct_aggregations) { - auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); - extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); - } - // Else initialise the whole table - else { - cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); - } - return sparse_table; -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} - -size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -size_t compute_shared_memory_size(Kernel kernel, int grid_size) -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); -} - -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) -{ - auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( - num_input_rows, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - SetType& global_set, - rmm::cuda_stream_view stream) -{ - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - 
using shared_set_ref_type = typename shared_set_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - auto const num_input_rows = keys.num_rows(); - - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table - rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, - stream); - rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); - - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_aggs.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - - compute_aggregations(grid_size, - num_input_rows, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_aggs.data(), - stream); - - if (direct_aggregations.value(stream)) { - int stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_aggs.data(), - block_cardinality.data(), - stride}); - extract_populated_keys(global_set, populated_keys, stream); - } - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `key_set`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - /** * @brief Computes groupby using hash table. 
* diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 3051901fb37..7db66d0f526 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -203,116 +203,4 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, } } -template -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - cudf::size_type* cardinality, - SetType shared_set, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - bool inserted; - if (cur_idx < num_input_rows) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. - __syncthreads(); - if (cur_idx < num_input_rows) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - auto input_idx = shared_set_indices[cur_idx]; - auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
- */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_type{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - block.sync(); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - - if (block.thread_rank() == 0) { cardinality = 0; } - - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, - num_input_rows, - &cardinality, - shared_insert_ref, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - - block.sync(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index a8cc7492c52..b6bf7a9d500 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -861,7 +861,7 @@ struct initialize_sparse_table { for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), aggs[col_idx], - initialize_gmem{}, + cudf::detail::identity_initializer{}, sparse_table.column(col_idx), key_idx); } From 50094f7b2f817a346e502879fea7f6f50e89dd2a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 19:18:12 -0700 Subject: [PATCH 022/135] Revert back to GQE init --- cpp/include/cudf/detail/aggregation/aggregation.cuh | 2 +- cpp/src/groupby/hash/single_pass_functors.cuh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index bc2d0edbeba..82383023ef1 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -620,7 +620,7 @@ struct identity_initializer { } template - T get_identity() + constexpr T get_identity() { if (k == aggregation::ARGMAX || k == aggregation::ARGMIN) { if constexpr (cudf::is_timestamp()) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index b6bf7a9d500..5aff267fd6f 100644 --- 
a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -861,7 +861,8 @@ struct initialize_sparse_table { for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), aggs[col_idx], - cudf::detail::identity_initializer{}, + // cudf::detail::identity_initializer{}, + initialize_gmem{}, sparse_table.column(col_idx), key_idx); } From 13620c7056159daac575dcd8c6f02d3abd3beed2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 11:29:24 -0700 Subject: [PATCH 023/135] Pass null policies to agg kernels --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 15 ++++++++++++++- cpp/src/groupby/hash/compute_single_pass_aggs.hpp | 2 ++ cpp/src/groupby/hash/groupby.cu | 13 ++++++------- cpp/src/groupby/hash/single_pass_functors.cuh | 14 +++++++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 3b36f8a1f81..70fda95bc8e 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -92,6 +93,8 @@ template CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cudf::size_type num_input_rows, WindowExtent window_extent, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -379,6 +382,7 @@ rmm::device_uvector compute_single_pass_aggs( cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, + bool skip_key_rows_with_nulls, rmm::cuda_stream_view stream) { // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy @@ -397,6 +401,11 @@ rmm::device_uvector compute_single_pass_aggs( auto const num_input_rows = keys.num_rows(); + auto row_bitmask = + skip_key_rows_with_nulls + ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
+      : rmm::device_buffer{};
+
   auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
   auto const grid_size = max_occupancy_grid_size(
     compute_mapping_indices,
     num_input_rows);
@@ -414,6 +423,8 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
     <<>>(global_set_ref,
                          num_input_rows,
                          window_extent,
+                         static_cast<bitmask_type const*>(row_bitmask.data()),
+                         skip_key_rows_with_nulls,
                          local_mapping_index.data(),
                          global_mapping_index.data(),
                          block_cardinality.data(),
@@ -459,7 +470,9 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
                                              *d_sparse_table,
                                              d_aggs.data(),
                                              block_cardinality.data(),
-                                             stride});
+                                             stride,
+                                             static_cast<bitmask_type const*>(row_bitmask.data()),
+                                             skip_key_rows_with_nulls});
     extract_populated_keys(global_set, populated_keys, stream);
   }
 
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
index 73a85d67627..70eb7bb0c89 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
@@ -37,6 +37,7 @@ CUDF_EXPORT rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   SetType& global_set,
+  bool skip_key_rows_with_nulls,
   rmm::cuda_stream_view stream);
 
 }  // namespace hash
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index e93c8b46613..2c32930c061 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -19,7 +19,6 @@
 #include "groupby/common/utils.hpp"
 #include "helpers.cuh"
 #include "multi_pass_functors.cuh"
-#include "single_pass_functors.cuh"
 
 #include
 #include
@@ -286,14 +285,12 @@ void sparse_to_dense_results(table_view const& keys,
                              cudf::detail::result_cache* dense_results,
                              device_span<size_type const> gather_map,
                              SetType set,
-                             bool keys_have_nulls,
-                             null_policy include_null_keys,
+                             bool skip_key_rows_with_nulls,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
   auto row_bitmask =
     cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
-  bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
   bitmask_type const* row_bitmask_ptr =
     skip_key_rows_with_nulls ? static_cast<bitmask_type const*>(row_bitmask.data()) : nullptr;
@@ -349,6 +346,8 @@ std::unique_ptr<table>
groupby(table_view const& keys,
   auto const num_keys = static_cast(keys.num_rows());
   auto const null_keys_are_equal = null_equality::EQUAL;
   auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
+  auto const skip_key_rows_with_nulls =
+    keys_have_nulls and include_null_keys == null_policy::EXCLUDE;

   auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream);
   auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys};
@@ -372,7 +371,8 @@ std::unique_ptr groupby(table_view const& keys,
     stream.value()};

   // Compute all single pass aggs first
-  auto gather_map = compute_single_pass_aggs(keys, requests, &sparse_results, set, stream);
+  auto gather_map = compute_single_pass_aggs(
+    keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream);

   // Compact all results from sparse_results and insert into cache
   sparse_to_dense_results(keys,
@@ -381,8 +381,7 @@ std::unique_ptr
groupby(table_view const& keys, cache, gather_map, set.ref(cuco::find), - keys_have_nulls, - include_null_keys, + skip_key_rows_with_nulls, stream, mr); diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 5aff267fd6f..9ab774e5fe1 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -877,25 +877,33 @@ struct compute_direct_aggregates { cudf::aggregation::Kind const* __restrict__ aggs; cudf::size_type* block_cardinality; int stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + compute_direct_aggregates(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, - int stride) + int stride, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), - stride(stride) + stride(stride), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) { } __device__ void operator()(cudf::size_type i) { int block_id = (i % stride) / GROUPBY_BLOCK_SIZE; - if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } From 47de4b3305cd2db6dc9d1b0deceeeb9282fdd377 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 13:27:26 -0700 Subject: [PATCH 024/135] Add notes + cleanups --- .../groupby/hash/compute_single_pass_aggs.cuh | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 70fda95bc8e..8ee4aecfc10 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -42,22 +42,28 @@ namespace detail { namespace hash { template +// TODO pass block __device__ void find_local_mapping(cudf::size_type cur_idx, cudf::size_type num_input_rows, - cudf::size_type* cardinality, SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { cudf::size_type result_idx; + // TODO: un-init bool inserted; - if (cur_idx < num_input_rows) { + if (cur_idx < num_input_rows + // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) + ) { auto const result = shared_set.insert_and_find(cur_idx); result_idx = *result.first; inserted = result.second; // inserted a new element if (result.second) { - auto shared_set_index = atomicAdd(cardinality, 1); + auto const shared_set_index = atomicAdd(cardinality, 1); shared_set_indices[shared_set_index] = cur_idx; local_mapping_index[cur_idx] = shared_set_index; } @@ -65,7 +71,9 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. 
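// As a minimal sketch of that visibility rule, assuming a toy 256-thread block
// (`toy_kernel` and `s_slot` are illustrative names, not identifiers from this series):
//
//   __global__ void toy_kernel(int* out)
//   {
//     __shared__ int s_slot[256];
//     s_slot[threadIdx.x] = threadIdx.x;   // each thread publishes into its own slot
//     __syncthreads();                     // barrier: shared-memory writes become visible
//     out[threadIdx.x] = s_slot[(threadIdx.x + 1) % 256];  // now safe to read a peer's slot
//   }
//
// Without the barrier, reading a neighboring slot races with that neighbor's write;
// that is exactly the hazard on `local_mapping_index` here.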
__syncthreads(); - if (cur_idx < num_input_rows) { + if (cur_idx < num_input_rows + // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) + ) { // element was already in set if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } } @@ -77,9 +85,9 @@ __device__ void find_global_mapping(cudf::size_type cur_idx, cudf::size_type* shared_set_indices, cudf::size_type* global_mapping_index) { - auto input_idx = shared_set_indices[cur_idx]; - auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; + auto const input_idx = shared_set_indices[cur_idx]; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = + *global_set.insert_and_find(input_idx).first; } /* @@ -100,6 +108,7 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cudf::size_type* block_cardinality, bool* direct_aggregations) { + // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; // Shared set initialization @@ -112,14 +121,11 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, storage); auto const block = cooperative_groups::this_thread_block(); shared_set.initialize(block); - block.sync(); auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); auto const stride = cudf::detail::grid_1d::grid_stride(); @@ -129,8 +135,10 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cur_idx += stride) { find_local_mapping(cur_idx, num_input_rows, - &cardinality, shared_insert_ref, + row_bitmask, + skip_rows_with_nulls, + &cardinality, local_mapping_index, shared_set_indices); @@ -382,7 +390,7 @@ rmm::device_uvector compute_single_pass_aggs( cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream) { // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy @@ -402,7 +410,7 @@ rmm::device_uvector compute_single_pass_aggs( auto const num_input_rows = keys.num_rows(); auto row_bitmask = - skip_key_rows_with_nulls + skip_rows_with_nulls ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; @@ -424,7 +432,7 @@ rmm::device_uvector compute_single_pass_aggs( num_input_rows, window_extent, static_cast(row_bitmask.data()), - skip_key_rows_with_nulls, + skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), @@ -472,7 +480,7 @@ rmm::device_uvector compute_single_pass_aggs( block_cardinality.data(), stride, static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); } From 2f04781da3f3d078915151ad429e874e3913321e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 16:30:13 -0700 Subject: [PATCH 025/135] Fix null bugs --- .../groupby/hash/compute_single_pass_aggs.cuh | 16 +++++---- cpp/src/groupby/hash/kernels.cuh | 35 ++++++++++++------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 8ee4aecfc10..96bc851b9e6 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -55,9 +55,8 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, cudf::size_type result_idx; // TODO: un-init bool inserted; - if (cur_idx < num_input_rows - // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) - ) { + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { auto const result = shared_set.insert_and_find(cur_idx); result_idx = *result.first; inserted = result.second; @@ -71,9 +70,8 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. 
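// The gating enabled just below follows one convention throughout this series
// (a sketch; `mask` stands for the bitmask produced by cudf::detail::bitmask_and over
// the key columns, in which bit i is 1 iff row i has no null key):
//
//   bool const participates =
//     not skip_rows_with_nulls or cudf::bit_is_set(mask, cur_idx);
//   if (participates) { /* insert cur_idx into the set and aggregate it */ }
//
// bitmask_type packs 32 rows per word, so bit_is_set(mask, i) tests bit (i % 32) of
// word (i / 32). When skip_rows_with_nulls is false, `mask` may be an empty buffer,
// so the short-circuit order matters: bit_is_set must not be evaluated in that case.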
__syncthreads(); - if (cur_idx < num_input_rows - // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) - ) { + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { // element was already in set if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } } @@ -295,6 +293,8 @@ size_t compute_shared_memory_size(Kernel kernel, int grid_size) void compute_aggregations(int grid_size, cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -312,6 +312,8 @@ void compute_aggregations(int grid_size, auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; compute_aggs_kernel<<>>( num_input_rows, + row_bitmask, + skip_rows_with_nulls, local_mapping_index, global_mapping_index, block_cardinality, @@ -460,6 +462,8 @@ rmm::device_uvector compute_single_pass_aggs( compute_aggregations(grid_size, num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 7db66d0f526..ecbb013902f 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -76,6 +76,8 @@ __device__ void initialize_shared_memory_aggregates(int col_start, __device__ void compute_pre_aggregrates(int col_start, int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::table_device_view input_values, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, @@ -83,21 +85,24 @@ __device__ void compute_pre_aggregrates(int col_start, bool** s_aggregates_valid_pointer, cudf::aggregation::Kind const* aggs) { + // TODO grid_1d utility for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; cur_idx += blockDim.x * gridDim.x) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } } } } @@ -133,6 +138,8 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -183,6 +190,8 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, block.sync(); compute_pre_aggregrates(col_start, col_end, + row_bitmask, + skip_rows_with_nulls, input_values, num_rows, local_mapping_index, From 
4a0d7a05000f6d48cc87e34c23cd95da2755f4d6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 17:27:30 -0700 Subject: [PATCH 026/135] Make var const --- .../groupby/hash/compute_single_pass_aggs.cuh | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 96bc851b9e6..c059c2dd3cf 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -346,23 +346,24 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, { // TODO single allocation - room for performance improvement std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); + std::transform(flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); cudf::table sparse_table(std::move(sparse_columns)); // If no direct aggregations, initialize the sparse table // only for the keys inserted in global hash set From 398c9f4009891b6dd1acbcb62aaac65650178b47 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 10:48:49 -0700 Subject: [PATCH 027/135] Make vars const --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index c059c2dd3cf..7bdef7e6d2b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -306,10 +306,10 @@ void compute_aggregations(int grid_size, auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation - auto shmem_agg_pointer_size = + auto const shmem_agg_pointer_size = round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; compute_aggs_kernel<<>>( num_input_rows, row_bitmask, From c1c53a3a25de296ca70bfa14f063405d47a95682 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 11:46:43 -0700 Subject: [PATCH 028/135] Cleanups for ODR --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/hash/compute_aggregations.cu | 272 ++++++++++++++++++ cpp/src/groupby/hash/compute_aggregations.hpp | 41 +++ .../groupby/hash/compute_single_pass_aggs.cuh | 51 +--- .../groupby/hash/compute_single_pass_aggs.hpp | 2 +- cpp/src/groupby/hash/single_pass_functors.cuh | 3 + 6 files changed, 322 insertions(+), 48 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_aggregations.cu create mode 100644 cpp/src/groupby/hash/compute_aggregations.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..0f096bd7e4f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -314,6 +314,7 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu src/groupby/hash/groupby.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..218f513e964 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_aggregations.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* aggs) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + aggs[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + // TODO grid_1d utility + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } + } +} + +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = 
output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + aggs[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + block.sync(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + aggs); + block.sync(); + compute_pre_aggregrates(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + block.sync(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + block.sync(); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); +} + +} // namespace + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) +{ + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of 
the aggregation + auto const shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp new file mode 100644 index 00000000000..87c37158cd0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 7bdef7e6d2b..78af61639c2 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#pragma once + +#include "compute_aggregations.hpp" #include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "kernels.cuh" @@ -32,6 +35,7 @@ #include #include +#include #include #include @@ -277,53 +281,6 @@ int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) return std::min(grid_size, num_blocks); } -size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -size_t compute_shared_memory_size(Kernel kernel, int grid_size) -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); -} - -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) -{ - auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( - num_input_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); -} - template void extract_populated_keys(SetType const& key_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 70eb7bb0c89..12e5ff459e9 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -32,7 +32,7 @@ namespace hash { * over the data and stores the results in `sparse_results` */ template -CUDF_EXPORT rmm::device_uvector compute_single_pass_aggs( +rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 9ab774e5fe1..bb8cbaf4b46 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -16,7 +16,10 @@ #pragma once +#include "helpers.cuh" + #include +#include #include #include From 367d698c615468dc77375ba85f4a60cddaaa3012 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 12:01:52 -0700 Subject: [PATCH 029/135] Fix a typo --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 78af61639c2..58baefbe1ab 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh 
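The hunk below lands in `create_sparse_results_table`, which picks between two
initialization strategies for the sparse output table; in outline (a sketch assembled
from the surrounding patches, with the per-type dispatch elided):

    if (!direct_aggregations) {
      // Shared-memory path: only slots belonging to keys already inserted into the
      // global set are ever written, so initializing just those rows suffices.
      extract_populated_keys(global_set, populated_keys, stream);
      thrust::for_each_n(
        rmm::exec_policy(stream),
        thrust::make_counting_iterator(0),
        populated_keys.size(),
        initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
    } else {
      // Direct-aggregation path: any input row may update any slot, so every row must
      // start from its aggregation's identity value.
      cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
      cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
    }

The spelling fix in this hunk touches the comment that introduces the second branch.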
@@ -332,7 +332,7 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, populated_keys.size(), initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); } - // Else initialise the whole table + // Else initialize the whole table else { cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); From df15519bedba15bf30c6be2077cd5422f93583ee Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 12:29:04 -0700 Subject: [PATCH 030/135] Renaming for clarity --- .../groupby/hash/compute_single_pass_aggs.cuh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 58baefbe1ab..8d91c382dbe 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -294,7 +294,7 @@ void extract_populated_keys(SetType const& key_set, // make table that will hold sparse results template auto create_sparse_results_table(cudf::table_view const& flattened_values, - const cudf::aggregation::Kind* d_aggs, + cudf::aggregation::Kind const* d_agg_kinds, std::vector aggs, bool direct_aggregations, GlobalSetType const& global_set, @@ -327,10 +327,11 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, if (!direct_aggregations) { auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); } // Else initialize the whole table else { @@ -404,11 +405,11 @@ rmm::device_uvector compute_single_pass_aggs( // flatten the aggs to a table that can be operated on by aggregate_row auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_aggs = cudf::detail::make_device_uvector_async( + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_aggs.data(), + d_agg_kinds.data(), agg_kinds, direct_aggregations.value(stream), global_set, @@ -427,7 +428,7 @@ rmm::device_uvector compute_single_pass_aggs( block_cardinality.data(), *d_values, *d_sparse_table, - d_aggs.data(), + d_agg_kinds.data(), stream); if (direct_aggregations.value(stream)) { @@ -438,7 +439,7 @@ rmm::device_uvector compute_single_pass_aggs( compute_direct_aggregates{global_set_ref, *d_values, *d_sparse_table, - d_aggs.data(), + d_agg_kinds.data(), block_cardinality.data(), stride, static_cast(row_bitmask.data()), From 2a39f8fcb17917733c195b6042d0036370d9f588 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 14:38:22 -0700 Subject: [PATCH 031/135] Remove unused file --- cpp/src/groupby/hash/kernels.cuh | 215 ------------------------------- 1 file changed, 215 deletions(-) delete mode 100644 cpp/src/groupby/hash/kernels.cuh diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh deleted file mode 
100644 index ecbb013902f..00000000000 --- a/cpp/src/groupby/hash/kernels.cuh +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "cudf/types.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include - -namespace cudf::groupby::detail::hash { - -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* aggs) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - aggs[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, 
- cur_idx); - } - } - } -} - -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - aggs[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) -{ - auto const block = cooperative_groups::this_thread_block(); - auto const cardinality = block_cardinality[block.group_index().x]; - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - - auto const num_cols = output_values.num_columns(); - - __shared__ int col_start; - __shared__ int col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - - if (block.thread_rank() == 0) { - col_start = 0; - col_end = 0; - } - block.sync(); - - while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - aggs); - block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - block.sync(); - } -} - -} // namespace cudf::groupby::detail::hash From 890ef4561f548e75d4c0fbe7ce8ab24b8882bf97 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 14:53:58 -0700 Subject: [PATCH 032/135] Add missing pragma once for header --- cpp/src/groupby/hash/compute_single_pass_aggs.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 12e5ff459e9..848ace94ff9 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ 
b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include From 57bdf2c5fcbcf7d72847cc650dc2ab775fa51b7e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 15:18:52 -0700 Subject: [PATCH 033/135] Minor fixes --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 8d91c382dbe..617a4411243 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -19,12 +19,12 @@ #include "compute_aggregations.hpp" #include "compute_single_pass_aggs.hpp" #include "helpers.cuh" -#include "kernels.cuh" #include "single_pass_functors.cuh" #include #include #include +#include #include #include #include From d5856784133a9a3de92adf05fa4b952b28ca55c0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 17:07:21 -0700 Subject: [PATCH 034/135] Fix dictionary test failures --- cpp/src/groupby/hash/compute_aggregations.cu | 2 + .../groupby/hash/global_memory_aggregator.cuh | 460 ++++++++++++ .../groupby/hash/shared_memory_aggregator.cuh | 416 +++++++++++ cpp/src/groupby/hash/single_pass_functors.cuh | 692 ------------------ 4 files changed, 878 insertions(+), 692 deletions(-) create mode 100644 cpp/src/groupby/hash/global_memory_aggregator.cuh create mode 100644 cpp/src/groupby/hash/shared_memory_aggregator.cuh diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 218f513e964..73dca45edf7 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -15,7 +15,9 @@ */ #include "compute_aggregations.hpp" +#include "global_memory_aggregator.cuh" #include "helpers.cuh" +#include "shared_memory_aggregator.cuh" #include "single_pass_functors.cuh" #include diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..4dd39e640e0 --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = 
reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column using + * the dictionary key addressed by the specific index. + * + * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a + * dictionary. + * + */ +template +struct update_target_from_dictionary_gmem { + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + update_target_element_gmem{}( + target, target_index, source_column, source, source_index, source_null); + } + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + } +}; + +/** + * @brief Specialization function for dictionary type and aggregations. + * + * The `source` column is a dictionary type. This functor de-references the + * dictionary's keys child column and maps the input source index through + * the dictionary's indices child column to pass to the `update_target_element` + * in the above `update_target_from_dictionary` using the type-dispatcher to + * resolve the keys column type. + * + * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` + * + * @tparam target_has_nulls Indicates presence of null elements in `target` + * @tparam source_has_nulls Indicates presence of null elements in `source`. 
+ */ +template +struct update_target_element_gmem< + dictionary32, + k, + target_has_nulls, + source_has_nulls, + std::enable_if_t> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + + dispatch_type_and_aggregation( + source_column.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary_gmem{}, + target, + target_index, + source_column, + source, + source_index, + source_null); + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +// TODO: VALID and ALL have same code +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for 
COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + update_target_element_gmem{}( + target, target_index, source_column, source, source_index, source_null); + } +}; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..c2d72d84b5b --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +struct update_target_element_shmem { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = 
reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_from_dictionary_shmem { + template ()>* = nullptr> + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + update_target_element_shmem{}( + target, target_index, target_null, source, source_index); + } + template ()>* = nullptr> + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + } +}; + +template +struct update_target_element_shmem< + dictionary32, + k, + target_has_nulls, + source_has_nulls, + std::enable_if_t> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + dispatch_type_and_aggregation( + source.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary_shmem{}, + target, + target_index, + target_null, + source.child(cudf::dictionary_column_view::keys_column_index), + static_cast(source.element(source_index))); + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* 
target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = 
reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct shmem_element_aggregator { + template + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + update_target_element_shmem{}( + target, target_index, target_null, source, source_index); + } +}; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index bb8cbaf4b46..19ba33e01e3 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -25,698 +25,6 @@ namespace cudf::groupby::detail::hash { -template -struct update_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - using DeviceType = cudf::device_storage_type_t; - DeviceType* source_casted = reinterpret_cast(source); - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - 
cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; - DeviceType* source_casted = reinterpret_cast(source); - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t>() && - cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; - DeviceType* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -// The shared memory will already have it squared -template -struct update_target_element_gmem()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - Target value = static_cast(source_casted[source_index]); - - 
cudf::detail::atomic_add(&target.element(target_index), value); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_mul(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and -// non-fixed point column -template -struct update_target_element_gmem< - Source, - cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -// TODO: VALID and ALL have same code -template -struct update_target_element_gmem< - Source, - cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - auto source_argmax_index = source_casted[source_index]; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); - if (old != cudf::detail::ARGMAX_SENTINEL) { - while (source_column.element(source_argmax_index) > - source_column.element(old)) { - old = - cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; -template -struct update_target_element_gmem< - Source, - 
cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - auto source_argmin_index = source_casted[source_index]; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); - if (old != cudf::detail::ARGMIN_SENTINEL) { - while (source_column.element(source_argmin_index) < - source_column.element(old)) { - old = - cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct gmem_element_aggregator { - template - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - update_target_element_gmem{}( - target, target_index, source_column, source, source_index, source_null); - } -}; - -template -struct update_target_element_shmem { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_min(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; - - DeviceTarget* target_casted = reinterpret_cast(target); - cudf::detail::atomic_min(&target_casted[target_index], - static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && 
cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_max(&target_casted[target_index], - static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; - - DeviceTarget* target_casted = reinterpret_cast(target); - cudf::detail::atomic_max(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t>() && - cudf::is_fixed_point()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; - - DeviceTarget* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - auto value = 
static_cast(source.element(source_index)); - cudf::detail::atomic_add(&target_casted[target_index], value * value); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_mul(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], Target{1}); - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], Target{1}); - - // Assumes target is already set to be valid - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - auto old = cudf::detail::atomic_cas( - &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); - if (old != cudf::detail::ARGMAX_SENTINEL) { - while (source.element(source_index) > source.element(old)) { - old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); - } - } - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - auto old = cudf::detail::atomic_cas( - 
&target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); - if (old != cudf::detail::ARGMIN_SENTINEL) { - while (source.element(source_index) < source.element(old)) { - old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); - } - } - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct shmem_element_aggregator { - template - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - update_target_element_shmem{}( - target, target_index, target_null, source, source_index); - } -}; - template __device__ constexpr bool is_supported() { From f75f2c927121e279c74bf49a1fbc5002fe057607 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 17:16:30 -0700 Subject: [PATCH 035/135] Add missing headers --- cpp/src/groupby/hash/global_memory_aggregator.cuh | 3 +++ cpp/src/groupby/hash/shared_memory_aggregator.cuh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 4dd39e640e0..9f38750060b 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -16,8 +16,11 @@ #pragma once +#include +#include #include #include +#include #include namespace cudf::groupby::detail::hash { diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index c2d72d84b5b..ef46c9b4cb4 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -16,8 +16,11 @@ #pragma once +#include +#include #include #include +#include #include namespace cudf::groupby::detail::hash { From feb93c36c280324e91efc818e7fef3ec07c8f441 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Sep 2024 15:25:34 -0700 Subject: [PATCH 036/135] Separate files to reduce build time --- cpp/CMakeLists.txt | 3 + cpp/src/groupby/hash/compute_groupby.cu | 32 ++ cpp/src/groupby/hash/compute_groupby.cuh | 312 +++++++++++++++++ cpp/src/groupby/hash/compute_groupby_null.cu | 32 ++ .../groupby/hash/compute_single_pass_aggs.cuh | 122 +------ .../groupby/hash/flatten_single_pass_aggs.cpp | 138 ++++++++ .../groupby/hash/flatten_single_pass_aggs.hpp | 34 ++ cpp/src/groupby/hash/groupby.cu | 324 ++---------------- cpp/src/groupby/hash/helpers.cuh | 23 +- 9 files changed, 594 insertions(+), 426 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_groupby.cu create mode 100644 cpp/src/groupby/hash/compute_groupby.cuh create mode 100644 cpp/src/groupby/hash/compute_groupby_null.cu create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.cpp create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfec788fc0a..e405f907289 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,6 +315,9 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_groupby_null.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..111e7c7972a 
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_groupby.cuh"
+
+namespace cudf::groupby::detail::hash {
+
+template std::unique_ptr<table> 
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + row_comparator_t const& d_row_equal, + cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh new file mode 100644 index 00000000000..5b2cc6e1ba6 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_single_pass_aggs.cuh" +#include "compute_single_pass_aggs.hpp" +#include "multi_pass_functors.cuh" + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) + { + result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + } + + auto to_dense_agg_result(cudf::aggregation const& agg) + { + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); + } + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(aggregation const& agg) + { + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. 
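+    // Illustrative example (values invented for this note): gathering
+    // col = ["x", null, "z"] through an ARGMIN result of [0, -1, 2] under the
+    // NULLIFY policy selected below for nullable results turns the
+    // out-of-bounds -1 into a null row in the dense output.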
+ column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); + } + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::min_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::max_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::mean_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); + } + + void visit(cudf::detail::var_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col.size(), + ::cudf::detail::var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::std_aggregation const& agg) override + { + if 
(dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); + } +}; + +/** + * @brief Gather sparse results into dense using `gather_map` and add to + * `dense_cache` + * + * @see groupby_null_templated() + */ +template +void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; + bitmask_type const* row_bitmask_ptr = + skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + */ +template +std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + Equal const& d_row_equal, + cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + cuco::extent{num_keys}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first + auto gather_map = compute_single_pass_aggs( + keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(keys, + requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + skip_key_rows_with_nulls, + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu new file mode 100644 index 00000000000..1420bd2a987 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_groupby.cuh" + +namespace cudf::groupby::detail::hash { + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 617a4411243..464365c0416 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -17,7 +17,8 @@ #pragma once #include "compute_aggregations.hpp" -#include "compute_single_pass_aggs.hpp" +// #include "compute_single_pass_aggs.hpp" +#include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" #include "single_pass_functors.cuh" @@ -40,10 +41,7 @@ #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { template // TODO pass block @@ -118,7 +116,7 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, auto storage = SetRef::storage_ref_type(window_extent, windows); auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, global_set.key_eq(), - probing_scheme_type{global_set.hash_function()}, + probing_scheme_t{global_set.hash_function()}, {}, storage); auto const block = cooperative_groups::this_thread_block(); @@ -165,111 +163,6 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) -{ - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} - template int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) { @@ -362,7 +255,7 @@ rmm::device_uvector compute_single_pass_aggs( extent_type, cuda::thread_scope_block, typename SetType::key_equal, - probing_scheme_type, + probing_scheme_t, cuco::cuda_allocator, cuco::storage>; using shared_set_ref_type = typename shared_set_type::ref_type<>; @@ -458,7 +351,4 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..2d34a757a6f --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..d79e826112b --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 2c32930c061..62434bf5fd2 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,24 +14,18 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +#include "compute_groupby.cuh" #include "groupby/common/utils.hpp" #include "helpers.cuh" -#include "multi_pass_functors.cuh" #include #include -#include #include #include #include #include -#include -#include #include #include -#include #include #include #include @@ -54,10 +48,7 @@ #include #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { /** @@ -99,306 +90,36 @@ constexpr bool array_contains(std::array const& haystack, T needle) * @return true `t` is valid for a hash based groupby * @return false `t` is invalid for a hash based groupby */ -bool constexpr is_hash_aggregation(aggregation::Kind t) +constexpr bool is_hash_aggregation(aggregation::Kind t) { return array_contains(hash_aggregations, t); } -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. 
The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - ::cudf::detail::var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - 
sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool keys_have_nulls, + null_policy include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - skip_key_rows_with_nulls, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); } } @@ -442,11 +163,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 9287325c3fb..9918aa5575a 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,7 +23,6 @@ #include namespace cudf::groupby::detail::hash { - // TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested // types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element @@ -32,12 +31,6 @@ CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; /// Number of slots per thread CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; -/// Probing scheme type used by groupby hash table -using probing_scheme_type = cuco::linear_probing< - GROUPBY_CG_SIZE, - cudf::experimental::row::hash::device_row_hasher>; - /// Thread block size CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; @@ -60,4 +53,20 @@ CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) return cudf::util::div_rounding_up_safe(num, base) * base; } +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing< + GROUPBY_CG_SIZE, + cudf::experimental::row::hash::device_row_hasher>; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + } // namespace cudf::groupby::detail::hash From 29cba47ac7dcd7df48a3427d06b59ee74011bd9e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Sep 2024 15:46:43 -0700 Subject: [PATCH 037/135] Minor cleanups --- cpp/src/groupby/hash/compute_groupby.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index 5b2cc6e1ba6..4dab0ae29cf 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -16,7 +16,7 @@ #pragma once #include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +// #include "compute_single_pass_aggs.hpp" #include "multi_pass_functors.cuh" #include From 523737fd13c2ffb57171d0e3080c2c7ca9e522d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Sep 2024 19:43:08 -0700 Subject: [PATCH 038/135] More explicit instantiations --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/hash/compute_groupby.cu | 13 ++++++++-- cpp/src/groupby/hash/compute_groupby.cuh | 2 +- cpp/src/groupby/hash/compute_groupby_null.cu | 14 +++++++++-- cpp/src/groupby/hash/helpers.cuh | 15 ++++++++--- cpp/src/groupby/hash/multi_pass_functors.cu | 26 ++++++++++++++++++++ cpp/src/groupby/hash/multi_pass_functors.cuh | 18 ++++++-------- 7 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 cpp/src/groupby/hash/multi_pass_functors.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e405f907289..76552a88d7c 100644 --- 
a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -319,6 +319,7 @@ add_library( src/groupby/hash/compute_groupby_null.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/multi_pass_functors.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 111e7c7972a..4944fed9b68 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -18,14 +18,23 @@ namespace cudf::groupby::detail::hash { +template void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, cudf::detail::result_cache* cache, bool skip_key_rows_with_nulls, row_comparator_t const& d_row_equal, - cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index 4dab0ae29cf..6599a93f730 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -176,7 +176,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor{ + var_hash_functor{ set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu index 1420bd2a987..fc05e98ed4f 100644 --- a/cpp/src/groupby/hash/compute_groupby_null.cu +++ b/cpp/src/groupby/hash/compute_groupby_null.cu @@ -18,14 +18,24 @@ namespace cudf::groupby::detail::hash { +template void sparse_to_dense_results( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + template std::unique_ptr
 compute_groupby(
   table_view const& keys,
   host_span<aggregation_request const> requests,
   cudf::detail::result_cache* cache,
   bool skip_key_rows_with_nulls,
   nullable_row_comparator_t const& d_row_equal,
-  cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC> const& d_row_hash,
+  row_hash_t const& d_row_hash,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index 9918aa5575a..650b936372d 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -53,11 +53,12 @@ CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num)
   return cudf::util::div_rounding_up_safe(num, base) * base;
 }
 
-/// Probing scheme type used by groupby hash table
-using probing_scheme_t = cuco::linear_probing<
-  GROUPBY_CG_SIZE,
+using row_hash_t =
   cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                   cudf::nullate::DYNAMIC>>;
+                                                   cudf::nullate::DYNAMIC>;
+
+/// Probing scheme type used by groupby hash table
+using probing_scheme_t = cuco::linear_probing<GROUPBY_CG_SIZE, row_hash_t>;
 
 using row_comparator_t = cudf::experimental::row::equality::device_row_comparator<
   false,
   cudf::nullate::DYNAMIC,
   cudf::experimental::row::equality::nan_equal_physical_equality_comparator>;
@@ -69,4 +70,10 @@ using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_
   cudf::nullate::DYNAMIC,
   cudf::experimental::row::equality::nan_equal_physical_equality_comparator>;
 
+using hash_set_ref_t = cuco::
+  static_set_ref<cudf::size_type, cuda::thread_scope_device, row_comparator_t, probing_scheme_t, cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>, cuco::op::find_tag>;
+
+using nullable_hash_set_ref_t = cuco::
+  static_set_ref<cudf::size_type, cuda::thread_scope_device, nullable_row_comparator_t, probing_scheme_t, cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>, cuco::op::find_tag>;
+
 } // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/multi_pass_functors.cu b/cpp/src/groupby/hash/multi_pass_functors.cu
new file mode 100644
index 00000000000..1f4c2a6a923
--- /dev/null
+++ b/cpp/src/groupby/hash/multi_pass_functors.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
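var_hash_functor, whose explicit instantiations this new translation unit pins, accumulates one term of Var = Σ (x − mean)² / (n − ddof) per non-null source row into its group's slot; the device functor that follows performs the same reduction with one relaxed atomic add per row. The math in scalar host form, as an illustrative sketch:

#include <vector>

double group_variance(std::vector<double> const& group, int ddof)
{
  auto const n = static_cast<double>(group.size());
  if (n - ddof <= 0) { return 0.0; }  // the device functor leaves such groups null

  double sum = 0.0;
  for (double x : group) { sum += x; }
  double const mean = sum / n;

  // One (x - mean)^2 / (n - ddof) term per row, matching the per-row
  // contribution the functor folds in with fetch_add.
  double var = 0.0;
  for (double x : group) { var += (x - mean) * (x - mean) / (n - ddof); }
  return var;
}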
+ */ + +#include "helpers.cuh" +#include "multi_pass_functors.cuh" + +namespace cudf::groupby::detail::hash { + +// explicit template instantiation to reduce build time +template struct var_hash_functor; +template struct var_hash_functor; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_functors.cuh b/cpp/src/groupby/hash/multi_pass_functors.cuh index 6fbec5fe19e..98668d0cb45 100644 --- a/cpp/src/groupby/hash/multi_pass_functors.cuh +++ b/cpp/src/groupby/hash/multi_pass_functors.cuh @@ -29,10 +29,9 @@ #include -namespace cudf { -namespace detail { +namespace cudf::groupby::detail::hash { -template +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -75,11 +74,11 @@ struct var_hash_functor { __device__ cuda::std::enable_if_t()> operator()( column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -90,7 +89,7 @@ struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } __device__ inline void operator()(size_type source_index) @@ -111,5 +110,4 @@ struct var_hash_functor { } }; -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash From a5743459acdb246d420c6501cfde41d2279b2eaa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 19:08:26 -0700 Subject: [PATCH 039/135] Test rollback --- cpp/src/groupby/hash/compute_aggregations.cu | 40 +++++++++++-------- cpp/src/groupby/hash/compute_aggregations.hpp | 22 +++++----- .../groupby/hash/compute_single_pass_aggs.cuh | 27 ++++++++----- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 73dca45edf7..7eb8216ee13 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -225,32 +225,36 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template -constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) +constexpr std::pair compute_shared_memory_size(Kernel kernel, int grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - size_t dynamic_shmem_size; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); + size_t dynamic_shmem_size = 0; + + auto const cuda_error = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + return {cuda_error, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; } } // namespace -void compute_aggregations(int grid_size, - 
cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) +cudaError_t compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) { - auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); + auto const [cuda_error, shmem_size] = compute_shared_memory_size(compute_aggs_kernel, grid_size); + + if (cuda_error != cudaSuccess) { return cuda_error; } + // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto const shmem_agg_pointer_size = @@ -269,6 +273,8 @@ void compute_aggregations(int grid_size, aggs, shmem_agg_size, shmem_agg_pointer_size); + + return cudaSuccess; } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 87c37158cd0..862462c8b9f 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -26,16 +26,16 @@ namespace cudf::groupby::detail::hash { -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream); +cudaError_t compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 464365c0416..a52d6ecd530 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -312,17 +312,22 @@ rmm::device_uvector compute_single_pass_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); + auto const cuda_error = compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), 
+ block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + if (cuda_error != cudaSuccess) { + constexpr bool uses_direct_aggs = true; + direct_aggregations.set_value_async(uses_direct_aggs, stream); + } if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; From 4b2b55fe37350dd1c48ddb92b1b03aa387d458e2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 19:32:01 -0700 Subject: [PATCH 040/135] More explicit instantiations --- cpp/CMakeLists.txt | 5 +- cpp/src/groupby/hash/compute_groupby.cu | 10 - cpp/src/groupby/hash/compute_groupby.cuh | 207 +----------------- cpp/src/groupby/hash/compute_groupby_null.cu | 11 - .../hash/hash_compound_agg_finalizer.cu | 25 +++ .../hash/hash_compound_agg_finalizer.cuh | 197 +++++++++++++++++ .../groupby/hash/sparse_to_dense_results.cu | 32 +++ .../groupby/hash/sparse_to_dense_results.cuh | 65 ++++++ .../hash/sparse_to_dense_results_null.cu | 33 +++ ...i_pass_functors.cu => var_hash_functor.cu} | 2 +- ...pass_functors.cuh => var_hash_functor.cuh} | 0 11 files changed, 360 insertions(+), 227 deletions(-) create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cu create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results.cu create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results.cuh create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results_null.cu rename cpp/src/groupby/hash/{multi_pass_functors.cu => var_hash_functor.cu} (96%) rename cpp/src/groupby/hash/{multi_pass_functors.cuh => var_hash_functor.cuh} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 76552a88d7c..b48b480f6fc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -318,8 +318,11 @@ add_library( src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_groupby_null.cu src/groupby/hash/flatten_single_pass_aggs.cpp + src/groupby/hash/sparse_to_dense_results.cu + src/groupby/hash/sparse_to_dense_results_null.cu src/groupby/hash/groupby.cu - src/groupby/hash/multi_pass_functors.cu + src/groupby/hash/var_hash_functor.cu + src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 4944fed9b68..7965d0891a7 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -18,16 +18,6 @@ namespace cudf::groupby::detail::hash { -template void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index 6599a93f730..e97853fa155 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -18,217 +18,16 @@ #include "compute_single_pass_aggs.cuh" // #include "compute_single_pass_aggs.hpp" #include "multi_pass_functors.cuh" +#include "sparse_to_dense_results.cuh" #include #include #include #include -namespace cudf::groupby::detail::hash { - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; +#include - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, 
unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} +namespace cudf::groupby::detail::hash { /** * @brief Computes groupby using hash table. diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu index fc05e98ed4f..1f9707902cc 100644 --- a/cpp/src/groupby/hash/compute_groupby_null.cu +++ b/cpp/src/groupby/hash/compute_groupby_null.cu @@ -18,17 +18,6 @@ namespace cudf::groupby::detail::hash { -template void sparse_to_dense_results( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - nullable_hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..e7a7af92f15 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.cuh" +#include "helpers.cuh" + +namespace cudf::groupby::detail::hash { + +template class hash_compound_agg_finalizer; +template class hash_compound_agg_finalizer; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh new file mode 100644 index 00000000000..a9326873282 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) + { + result_type = cudf::is_dictionary(col.type()) ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + } + + auto to_dense_agg_result(cudf::aggregation const& agg) + { + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); + } + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(aggregation const& agg) + { + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); + } + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::min_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::max_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::mean_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); + } + + void visit(cudf::detail::var_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + 
auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::std_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); + } +}; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..760926afa13 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "helpers.cuh" +#include "sparse_to_dense_results.cuh" + +namespace cudf::groupby::detail::hash { + +template void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.cuh new file mode 100644 index 00000000000..8d40358d0c8 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
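hash_compound_agg_finalizer above assembles each compound result from the single-pass partials: MEAN as SUM divided by COUNT_VALID via a DIV binary op, STD as a SQRT unary op over VARIANCE. The same composition in scalar form, as a sketch only; the real visitors operate on whole columns:

#include <cmath>

struct partials {
  double sum;
  double count;  // COUNT_VALID
  double var;    // accumulated by var_hash_functor
};

double finalize_mean(partials const& p) { return p.sum / p.count; }
double finalize_std(partials const& p) { return std::sqrt(p.var); }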
+ */ +#pragma once + +#include "compute_single_pass_aggs.cuh" +#include "hash_compound_agg_finalizer.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +/** + * @brief Gather sparse results into dense using `gather_map` and add to + * `dense_cache` + * + * @see groupby_null_templated() + */ +template +void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; + bitmask_type const* row_bitmask_ptr = + skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu new file mode 100644 index 00000000000..b6820f7f6db --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
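sparse_to_dense_results leans on the gather map produced by the single-pass step: sparse result columns are sized like the input but populated only at hash-slot positions, and one gather per result compacts them into dense output. A host-side sketch of that compaction with simplified types:

#include <vector>

std::vector<double> gather_dense(std::vector<double> const& sparse,
                                 std::vector<int> const& gather_map)
{
  std::vector<double> dense;
  dense.reserve(gather_map.size());
  // gather_map holds one populated slot index per distinct key
  for (int slot : gather_map) { dense.push_back(sparse[slot]); }
  return dense;
}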
+ */ + +#include "helpers.cuh" +#include "sparse_to_dense_results.cuh" + +namespace cudf::groupby::detail::hash { + +template void sparse_to_dense_results( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_functors.cu b/cpp/src/groupby/hash/var_hash_functor.cu similarity index 96% rename from cpp/src/groupby/hash/multi_pass_functors.cu rename to cpp/src/groupby/hash/var_hash_functor.cu index 1f4c2a6a923..4881f4ed85e 100644 --- a/cpp/src/groupby/hash/multi_pass_functors.cu +++ b/cpp/src/groupby/hash/var_hash_functor.cu @@ -15,7 +15,7 @@ */ #include "helpers.cuh" -#include "multi_pass_functors.cuh" +#include "var_hash_functor.cuh" namespace cudf::groupby::detail::hash { diff --git a/cpp/src/groupby/hash/multi_pass_functors.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 100% rename from cpp/src/groupby/hash/multi_pass_functors.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh From 44806bacac8dedbf78cd0852d52322f4a5c0ec64 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 20:00:26 -0700 Subject: [PATCH 041/135] Add missing headers + more explicit instantiations --- cpp/CMakeLists.txt | 2 + cpp/src/groupby/hash/compute_groupby.cuh | 2 +- .../groupby/hash/compute_single_pass_aggs.cu | 59 +++++++++++++++++++ .../hash/compute_single_pass_aggs_null.cu | 59 +++++++++++++++++++ .../hash/hash_compound_agg_finalizer.cuh | 2 + .../groupby/hash/sparse_to_dense_results.cuh | 1 + 6 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cu create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3dfcbe69b2f..6854295a6a6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -318,6 +318,8 @@ add_library( src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_groupby_null.cu src/groupby/hash/flatten_single_pass_aggs.cpp + src/groupby/hash/compute_single_pass_aggs_null.cu + src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/sparse_to_dense_results.cu src/groupby/hash/sparse_to_dense_results_null.cu src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index e97853fa155..146900f61eb 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -17,8 +17,8 @@ #include "compute_single_pass_aggs.cuh" // #include "compute_single_pass_aggs.hpp" -#include "multi_pass_functors.cuh" #include "sparse_to_dense_results.cuh" +#include "var_hash_functor.cuh" #include #include diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu new file mode 100644 index 00000000000..aa883f25315 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
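The single-pass kernels instantiated by this new file have each input row locate its group's slot in the hash set and fold its value into the sparse output with a relaxed atomic, the same fetch_add pattern var_hash_functor uses. A minimal device-side sketch for one SUM column; illustrative only, since the real kernel dispatches on value type and aggregation kind:

#include <cuda/atomic>

// One thread per input row: `slot` is the index insert_and_find returned for
// the row's key. Relaxed ordering suffices because only each slot's final
// value is observed after the kernel completes.
__device__ void sum_into_slot(double* sparse_sums, int slot, double value)
{
  cuda::atomic_ref<double, cuda::thread_scope_device> ref{sparse_sums[slot]};
  ref.fetch_add(value, cuda::std::memory_order_relaxed);
}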
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_single_pass_aggs.cuh"
+#include "helpers.cuh"
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+namespace hash {
+
+using global_set_t = cuco::static_set<cudf::size_type,
+                                      cuco::extent<int64_t>,
+                                      cuda::thread_scope_device,
+                                      row_comparator_t,
+                                      probing_scheme_t,
+                                      cudf::detail::cuco_allocator<char>,
+                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
+
+template void extract_populated_keys<global_set_t>(
+  global_set_t const& key_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
+
+template auto create_sparse_results_table<global_set_t>(
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> aggs,
+  bool direct_aggregations,
+  global_set_t const& global_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
+
+template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs<global_set_t>(
+  cudf::table_view const& keys,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  global_set_t& global_set,
+  bool keys_have_nulls,
+  null_policy include_null_keys,
+  rmm::cuda_stream_view stream);
+
+} // namespace hash
+} // namespace detail
+} // namespace groupby
+} // namespace cudf
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
new file mode 100644
index 00000000000..f0889a362fe
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
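extract_populated_keys, instantiated above for the flat-key comparator and below for the nullable one, is a thin wrapper over cuco's retrieve_all: copy out every occupied slot, then shrink the output to the returned end iterator. A standalone sketch of that round trip; capacity and sentinel values are arbitrary here, and constructor details vary across cuco versions:

#include <cuco/static_set.cuh>

#include <thrust/device_vector.h>
#include <thrust/distance.h>

#include <vector>

int main()
{
  std::vector<int> h_keys{1, 2, 2, 3, 1};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());

  cuco::static_set<int> set{cuco::extent<std::size_t>{16}, cuco::empty_key<int>{-1}};
  set.insert(keys.begin(), keys.end());

  // retrieve_all writes the distinct keys and returns the end iterator,
  // mirroring extract_populated_keys' resize-by-distance step.
  thrust::device_vector<int> unique(set.size());
  auto const end = set.retrieve_all(unique.begin());
  unique.resize(thrust::distance(unique.begin(), end));  // {1, 2, 3} in some order
  return 0;
}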
+ */ + +#include "compute_single_pass_aggs.cuh" +#include "helpers.cuh" + +namespace cudf { +namespace groupby { +namespace detail { +namespace hash { + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template auto create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + global_set_t& global_set, + bool keys_have_nulls, + null_policy include_null_keys, + rmm::cuda_stream_view stream); + +} // namespace hash +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh index a9326873282..1c40b77b5a1 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh @@ -18,7 +18,9 @@ #include "var_hash_functor.cuh" #include +#include #include +#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.cuh index 8d40358d0c8..b89fc308e6e 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cuh +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cuh @@ -20,6 +20,7 @@ #include "var_hash_functor.cuh" #include +#include #include #include #include From 85bf877889cafb0d7963cdc1ebe53f6ea6432276 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 20:02:12 -0700 Subject: [PATCH 042/135] Reorder files --- cpp/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6854295a6a6..0d9529ef58d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,14 +317,14 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_groupby_null.cu - src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/flatten_single_pass_aggs.cpp + src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/hash/sparse_to_dense_results.cu src/groupby/hash/sparse_to_dense_results_null.cu - src/groupby/hash/groupby.cu src/groupby/hash/var_hash_functor.cu - src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu From 049acffb986d4ae5d56f95df713e75730cc0452f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 20:31:30 -0700 Subject: [PATCH 043/135] Fix typos + add missing header --- cpp/src/groupby/hash/compute_single_pass_aggs.cu | 3 +-- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 1 + cpp/src/groupby/hash/compute_single_pass_aggs_null.cu | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index aa883f25315..2770dc2a84c 
100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
@@ -49,8 +49,7 @@ template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   global_set_t& global_set,
-  bool keys_have_nulls,
-  null_policy include_null_keys,
+  bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
 } // namespace hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
index a52d6ecd530..b2891c1df1f 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
@@ -22,6 +22,7 @@
 #include "helpers.cuh"
 #include "single_pass_functors.cuh"
+#include
 #include
 #include
 #include
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
index f0889a362fe..e566c2c5d27 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
@@ -49,8 +49,7 @@ template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   global_set_t& global_set,
-  bool keys_have_nulls,
-  null_policy include_null_keys,
+  bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
 } // namespace hash
From 2d42b9b10aeedb6d80e83b738e7a7a79ad1dfb92 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Thu, 26 Sep 2024 10:50:17 -0700
Subject: [PATCH 044/135] Revert temp rollback

---
 cpp/src/groupby/hash/compute_aggregations.cu  | 39 +++++++++----------
 cpp/src/groupby/hash/compute_aggregations.hpp | 22 +++++------
 .../groupby/hash/compute_single_pass_aggs.cuh | 27 ++++++-------
 3 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu
index 7eb8216ee13..54f37c7f397 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cu
+++ b/cpp/src/groupby/hash/compute_aggregations.cu
@@ -225,35 +225,36 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows,
 constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; }
 template <typename Kernel>
-constexpr std::pair<cudaError_t, size_t> compute_shared_memory_size(Kernel kernel, int grid_size)
+constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size)
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
+  CUDF_EXPECTS(active_blocks_per_sm >= 1, "active_blocks_per_sm must be at least 1");
+  CUDF_EXPECTS(grid_size >= 1, "grid_size must be at least 1");
+
   size_t dynamic_shmem_size = 0;
-  auto const cuda_error = cudaOccupancyAvailableDynamicSMemPerBlock(
-    &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE);
-  return {cuda_error, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)};
+  CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
+    &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE));
+  return get_previous_multiple_of_8(0.5 * dynamic_shmem_size);
 }
 } // namespace
-cudaError_t compute_aggregations(int grid_size,
-                                 cudf::size_type num_input_rows,
-                                 bitmask_type const* row_bitmask,
-                                 bool skip_rows_with_nulls,
-                                 cudf::size_type* local_mapping_index,
-                                 cudf::size_type* global_mapping_index,
-                                 cudf::size_type* block_cardinality,
-                                 cudf::table_device_view input_values,
-                                 cudf::mutable_table_device_view output_values,
-                                 cudf::aggregation::Kind const* aggs,
-                                 rmm::cuda_stream_view stream)
+void
compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) { - auto const [cuda_error, shmem_size] = compute_shared_memory_size(compute_aggs_kernel, grid_size); - - if (cuda_error != cudaSuccess) { return cuda_error; } + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -273,8 +274,6 @@ cudaError_t compute_aggregations(int grid_size, aggs, shmem_agg_size, shmem_agg_pointer_size); - - return cudaSuccess; } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 862462c8b9f..87c37158cd0 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -26,16 +26,16 @@ namespace cudf::groupby::detail::hash { -cudaError_t compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream); +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index b2891c1df1f..051259bf9f4 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -313,22 +313,17 @@ rmm::device_uvector compute_single_pass_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - auto const cuda_error = compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); - - if (cuda_error != cudaSuccess) { - constexpr bool uses_direct_aggs = true; - direct_aggregations.set_value_async(uses_direct_aggs, stream); - } + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; From 45573e0b1f69c8c497407e98ad4f9dbe3baf36c4 Mon Sep 17 00:00:00 2001 From: Yunsong 
Wang Date: Thu, 26 Sep 2024 12:37:39 -0700 Subject: [PATCH 045/135] Cleanups --- cpp/CMakeLists.txt | 3 - cpp/src/groupby/hash/compute_groupby.cu | 111 +++++- cpp/src/groupby/hash/compute_groupby.cuh | 111 ------ cpp/src/groupby/hash/compute_groupby.hpp | 68 ++++ cpp/src/groupby/hash/compute_groupby_null.cu | 31 -- .../groupby/hash/compute_single_pass_aggs.cu | 375 ++++++++++++++++-- .../groupby/hash/compute_single_pass_aggs.cuh | 355 ----------------- .../groupby/hash/compute_single_pass_aggs.hpp | 15 +- .../hash/compute_single_pass_aggs_null.cu | 58 --- cpp/src/groupby/hash/groupby.cu | 2 +- .../groupby/hash/sparse_to_dense_results.cu | 61 ++- ...esults.cuh => sparse_to_dense_results.hpp} | 38 +- .../hash/sparse_to_dense_results_null.cu | 33 -- 13 files changed, 599 insertions(+), 662 deletions(-) delete mode 100644 cpp/src/groupby/hash/compute_groupby.cuh create mode 100644 cpp/src/groupby/hash/compute_groupby.hpp delete mode 100644 cpp/src/groupby/hash/compute_groupby_null.cu delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu rename cpp/src/groupby/hash/{sparse_to_dense_results.cuh => sparse_to_dense_results.hpp} (57%) delete mode 100644 cpp/src/groupby/hash/sparse_to_dense_results_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0d9529ef58d..663f2210ef4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -316,14 +316,11 @@ add_library( src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu - src/groupby/hash/compute_groupby_null.cu - src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/hash/sparse_to_dense_results.cu - src/groupby/hash/sparse_to_dense_results_null.cu src/groupby/hash/var_hash_functor.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 7965d0891a7..9643567a825 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -13,11 +13,110 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once -#include "compute_groupby.cuh" +#include "compute_single_pass_aggs.cuh" +// #include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include namespace cudf::groupby::detail::hash { +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. 
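+ * (For example, a MEAN request is flattened into SUM and COUNT_VALID
+ * aggregations; if SUM is also requested directly, the cached sparse SUM
+ * result can be reused rather than recomputed. This is illustrative of the
+ * intent; the exact reuse depends on the aggregations requested.)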
+ * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + */ +template +std::unique_ptr
compute_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + Equal const& d_row_equal, + row_hash_t const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + cuco::extent{num_keys}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first + auto gather_map = compute_single_pass_aggs( + keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(keys, + requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + skip_key_rows_with_nulls, + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, @@ -28,4 +127,14 @@ template std::unique_ptr
compute_groupby( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh deleted file mode 100644 index 146900f61eb..00000000000 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "compute_single_pass_aggs.cuh" -// #include "compute_single_pass_aggs.hpp" -#include "sparse_to_dense_results.cuh" -#include "var_hash_functor.cuh" - -#include -#include -#include -#include - -#include - -namespace cudf::groupby::detail::hash { - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -template -std::unique_ptr
compute_groupby( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, - Equal const& d_row_equal, - cudf::experimental::row::hash::device_row_hasher const& d_row_hash, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const set = cuco::static_set{ - cuco::extent{num_keys}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_row_equal, - probing_scheme_t{d_row_hash}, - cuco::thread_scope_device, - cuco::storage{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - skip_key_rows_with_nulls, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); -} - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp new file mode 100644 index 00000000000..358c81365a0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "helpers.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. 
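+ * (For instance, VARIANCE and STD are, roughly speaking, finalized in a
+ * second pass that reads the single-pass SUM and COUNT results; see
+ * var_hash_functor.cuh.)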
+ * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + Equal const& d_row_equal, + row_hash_t const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu deleted file mode 100644 index 1f9707902cc..00000000000 --- a/cpp/src/groupby/hash/compute_groupby_null.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "compute_groupby.cuh" - -namespace cudf::groupby::detail::hash { - -template std::unique_ptr
compute_groupby( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, - nullable_row_comparator_t const& d_row_equal, - row_hash_t const& d_row_hash, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index 2770dc2a84c..b5c68ea639a 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,342 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" +#include "compute_aggregations.hpp" +// #include "compute_single_pass_aggs.hpp" +#include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" +#include "single_pass_functors.cuh" -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +template +// TODO pass block +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + // TODO: un-init + bool inserted; + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + __syncthreads(); + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + auto const input_idx = shared_set_indices[cur_idx]; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = + *global_set.insert_and_find(input_idx).first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
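+ *
+ * Rough sketch of the resulting mappings (illustrative pseudocode only;
+ * the exact logic lives in find_local_mapping/find_global_mapping above):
+ *
+ *   for each input row r handled by this block:
+ *     local_mapping_index[r] = dense rank of r's key within the block
+ *                              (0 .. cardinality - 1, via the shared set)
+ *   if cardinality < GROUPBY_CARDINALITY_THRESHOLD:
+ *     for each block-local rank s:
+ *       global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + s] =
+ *         *global_set.insert_and_find(shared_set_indices[s]).first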
+ */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + // TODO: indices inserted in each shared memory set + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { + find_local_mapping(cur_idx, + num_input_rows, + shared_insert_ref, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + + block.sync(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); + } + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +auto create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? 
false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + } + return sparse_table; +} +} // namespace + +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream) +{ + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto const num_input_rows = keys.num_rows(); + + auto row_bitmask = + skip_rows_with_nulls + ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} using global_set_t = cuco::static_set, @@ -30,20 +359,6 @@ using global_set_t = cuco::static_set, cuco::storage>; -template void extract_populated_keys( - global_set_t const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - -template auto create_sparse_results_table( - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - global_set_t const& global_set, - 
rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, @@ -52,7 +367,19 @@ template rmm::device_uvector compute_single_pass_aggs, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + nullable_global_set_t& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh deleted file mode 100644 index 051259bf9f4..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "compute_aggregations.hpp" -// #include "compute_single_pass_aggs.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include - -namespace cudf::groupby::detail::hash { - -template -// TODO pass block -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - SetType shared_set, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* cardinality, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - // TODO: un-init - bool inserted; - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. 
- __syncthreads(); - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - auto const input_idx = shared_set_indices[cur_idx]; - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = - *global_set.insert_and_find(input_idx).first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. - */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - // TODO: indices inserted in each shared memory set - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_t{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, - num_input_rows, - shared_insert_ref, - row_bitmask, - skip_rows_with_nulls, - &cardinality, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - - block.sync(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, 
GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} - -template -void extract_populated_keys(SetType const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); -} - -// make table that will hold sparse results -template -auto create_sparse_results_table(cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - GlobalSetType const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform(flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - auto const nullable = - (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = - (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto const col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - cudf::table sparse_table(std::move(sparse_columns)); - // If no direct aggregations, initialize the sparse table - // only for the keys inserted in global hash set - if (!direct_aggregations) { - auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); - extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); - } - // Else initialize the whole table - else { - cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); - } - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - SetType& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream) -{ - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - auto const num_input_rows = keys.num_rows(); - - auto row_bitmask = - skip_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table - rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, - stream); - rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); - - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); - - if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 848ace94ff9..6cbea9fcd3c 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -25,10 +24,7 @@ #include #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` @@ -39,11 +35,6 @@ rmm::device_uvector compute_single_pass_aggs( cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, - bool keys_have_nulls, - null_policy include_null_keys, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream); - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu deleted file mode 100644 index e566c2c5d27..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "compute_single_pass_aggs.cuh" -#include "helpers.cuh" - -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { - -using global_set_t = cuco::static_set, - cuda::thread_scope_device, - nullable_row_comparator_t, - probing_scheme_t, - cudf::detail::cuco_allocator, - cuco::storage>; - -template void extract_populated_keys( - global_set_t const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - -template auto create_sparse_results_table( - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - global_set_t const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - -template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - global_set_t& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream); - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 62434bf5fd2..b307b8a8d1f 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "compute_groupby.cuh" +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" #include "helpers.cuh" diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index 760926afa13..a416e2124ce 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,55 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once -#include "helpers.cuh" -#include "sparse_to_dense_results.cuh" +#include "hash_compound_agg_finalizer.cuh" + +#include +#include +#include +#include +#include + +#include +#include namespace cudf::groupby::detail::hash { +/** + * @brief Gather sparse results into dense using `gather_map` and add to + * `dense_cache` + * + * @see groupby_null_templated() + */ +template +void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; + bitmask_type const* row_bitmask_ptr = + skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} template void sparse_to_dense_results(table_view const& keys, host_span requests, @@ -29,4 +73,15 @@ template void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +template void sparse_to_dense_results( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.hpp similarity index 57% rename from cpp/src/groupby/hash/sparse_to_dense_results.cuh rename to cpp/src/groupby/hash/sparse_to_dense_results.hpp index b89fc308e6e..bfdc42953ad 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cuh +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -15,18 +15,16 @@ */ #pragma once -#include "compute_single_pass_aggs.cuh" -#include "hash_compound_agg_finalizer.cuh" -#include "var_hash_functor.cuh" - -#include #include -#include -#include -#include +#include +#include +#include +#include -namespace cudf::groupby::detail::hash { +#include +#include +namespace cudf::groupby::detail::hash { /** * @brief Gather sparse results into dense using `gather_map` and add to * `dense_cache` @@ -42,25 +40,5 @@ void sparse_to_dense_results(table_view const& keys, SetType set, bool skip_key_rows_with_nulls, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - + rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu deleted file mode 100644 index b6820f7f6db..00000000000 --- a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.cuh" -#include "sparse_to_dense_results.cuh" - -namespace cudf::groupby::detail::hash { - -template void sparse_to_dense_results( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - nullable_hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace cudf::groupby::detail::hash From dec49a828409709ce1f585d470d3138802164be4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 12:50:35 -0700 Subject: [PATCH 046/135] Header cleanups --- cpp/src/groupby/hash/compute_aggregations.cu | 1 - cpp/src/groupby/hash/compute_groupby.cu | 7 +++---- cpp/src/groupby/hash/sparse_to_dense_results.cu | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 54f37c7f397..8b559ffc1be 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -21,7 +21,6 @@ #include "single_pass_functors.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 9643567a825..4aa03d17999 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -13,17 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once -#include "compute_single_pass_aggs.cuh" -// #include "compute_single_pass_aggs.hpp" +#include "compute_single_pass_aggs.hpp" #include "helpers.cuh" -#include "sparse_to_dense_results.cuh" +#include "sparse_to_dense_results.hpp" #include "var_hash_functor.cuh" #include #include #include +#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index a416e2124ce..7f7290141f9 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once #include "hash_compound_agg_finalizer.cuh" +#include "helpers.cuh" #include +#include #include #include #include From 777400978907f9d88ba9f172a8f1b82d9e8118a5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 14:06:20 -0700 Subject: [PATCH 047/135] More cleanups for hash_compound_agg_finalizer --- cpp/CMakeLists.txt | 1 - cpp/src/groupby/hash/compute_groupby.cu | 2 +- .../hash/hash_compound_agg_finalizer.cu | 177 +++++++++++++++- .../hash/hash_compound_agg_finalizer.cuh | 199 ------------------ .../hash/hash_compound_agg_finalizer.hpp | 69 ++++++ cpp/src/groupby/hash/helpers.cuh | 20 +- .../groupby/hash/sparse_to_dense_results.cu | 2 +- cpp/src/groupby/hash/var_hash_functor.cu | 26 --- cpp/src/groupby/hash/var_hash_functor.cuh | 2 + 9 files changed, 264 insertions(+), 234 deletions(-) delete mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp delete mode 100644 cpp/src/groupby/hash/var_hash_functor.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 663f2210ef4..cd92e086329 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -321,7 +321,6 @@ add_library( src/groupby/hash/groupby.cu src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/hash/sparse_to_dense_results.cu - src/groupby/hash/var_hash_functor.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 4aa03d17999..1eb208c588d 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -82,7 +82,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); - auto const set = cuco::static_set{ + auto set = cuco::static_set{ cuco::extent{num_keys}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index e7a7af92f15..119ac8cf6fd 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -14,10 +14,185 @@ * limitations under the License. */ -#include "hash_compound_agg_finalizer.cuh" +#include "hash_compound_agg_finalizer.hpp" #include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? 
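/* A worked micro-example of the sentinel trick described above (values made
 * up for illustration): for MIN on a strings column
 *
 *   col                = ["b", "a", <null>]   // one group spanning all rows
 *   sparse ARGMIN slot = 1                    // row index of the minimum
 *
 * while a group containing only nulls keeps ARGMIN_SENTINEL (-1) in its slot;
 * gathering `col` through that index column with NULLIFY turns the
 * out-of-bounds -1 into a null output element, which is exactly the MIN
 * result we need. */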
cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = 
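/* STD is never computed directly: it reuses the VARIANCE visitor with the
 * same ddof and finishes with an elementwise square root, i.e. per group
 *
 *   var = (sum(x_i^2) - n * mean^2) / (n - ddof),   std = sqrt(var)
 *
 * which is why the SQRT unary operation below runs on the already-dense
 * variance result. */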
dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} template class hash_compound_agg_finalizer; template class hash_compound_agg_finalizer; diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh deleted file mode 100644 index 1c40b77b5a1..00000000000 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "var_hash_functor.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cudf::groupby::detail::hash { - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, 
stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..16cbe92511f --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 650b936372d..c1dd68c2b78 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -70,10 +70,20 @@ using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_ cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; -using hash_set_ref_t = cuco:: - static_set_ref>, cuco::op::find_tag, >; - -using nullable_hash_set_ref_t = cuco:: - static_set_ref>, cuco::op::find_tag, >; +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + cuco::op::find_tag>; + +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + cuco::op::find_tag>; } // namespace cudf::groupby::detail::hash diff --git 
a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index 7f7290141f9..af61173fb6a 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "hash_compound_agg_finalizer.cuh" +#include "hash_compound_agg_finalizer.hpp" #include "helpers.cuh" #include diff --git a/cpp/src/groupby/hash/var_hash_functor.cu b/cpp/src/groupby/hash/var_hash_functor.cu deleted file mode 100644 index 4881f4ed85e..00000000000 --- a/cpp/src/groupby/hash/var_hash_functor.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.cuh" -#include "var_hash_functor.cuh" - -namespace cudf::groupby::detail::hash { - -// explicit template instantiation to reduce build time -template struct var_hash_functor; -template struct var_hash_functor; - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/var_hash_functor.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh index 98668d0cb45..abcd57263f4 100644 --- a/cpp/src/groupby/hash/var_hash_functor.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -16,6 +16,8 @@ #pragma once +#include "helpers.cuh" + #include #include #include From b4422c0427237d04970f26ec01e40f140f79e723 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 15:58:40 -0700 Subject: [PATCH 048/135] Separate create_sparse_results_table --- cpp/CMakeLists.txt | 1 + .../groupby/hash/compute_single_pass_aggs.cu | 79 +---------- .../hash/create_sparse_results_table.cu | 125 ++++++++++++++++++ .../hash/create_sparse_results_table.hpp | 42 ++++++ cpp/src/groupby/hash/helpers.cuh | 17 +++ 5 files changed, 187 insertions(+), 77 deletions(-) create mode 100644 cpp/src/groupby/hash/create_sparse_results_table.cu create mode 100644 cpp/src/groupby/hash/create_sparse_results_table.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cd92e086329..7f90fb388dc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,6 +317,7 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/hash/hash_compound_agg_finalizer.cu diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index b5c68ea639a..465d58ed9ef 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -15,7 +15,8 @@ */ #include "compute_aggregations.hpp" -// #include "compute_single_pass_aggs.hpp" +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" #include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" #include "single_pass_functors.cuh" @@ -172,66 +173,6 @@ int 
max_occupancy_grid_size(Kernel kernel, cudf::size_type n) auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); return std::min(grid_size, num_blocks); } - -template -void extract_populated_keys(SetType const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); -} - -// make table that will hold sparse results -template -auto create_sparse_results_table(cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - GlobalSetType const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform(flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - auto const nullable = - (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = - (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto const col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - cudf::table sparse_table(std::move(sparse_columns)); - // If no direct aggregations, initialize the sparse table - // only for the keys inserted in global hash set - if (!direct_aggregations) { - auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); - extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); - } - // Else initialize the whole table - else { - cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); - } - return sparse_table; -} } // namespace /** @@ -351,14 +292,6 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } -using global_set_t = cuco::static_set, - cuda::thread_scope_device, - row_comparator_t, - probing_scheme_t, - cudf::detail::cuco_allocator, - cuco::storage>; - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, @@ -367,14 +300,6 @@ template rmm::device_uvector compute_single_pass_aggs, - cuda::thread_scope_device, - nullable_row_comparator_t, - probing_scheme_t, - cudf::detail::cuco_allocator, - cuco::storage>; - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..7ae0184528d --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
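/* Restating the policy the lambda above encodes for each sparse output
 * column (a summary of the existing branches, not new behavior):
 *
 *   COUNT_VALID / COUNT_ALL   -> non-nullable; a count always exists
 *   VARIANCE / STD            -> starts ALL_NULL; filled by a later pass
 *   all other aggregations    -> ALL_NULL iff the input column has nulls
 *
 * Dictionary inputs aggregate over the decoded keys, hence the keys' type is
 * used to pick the target type below. */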
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + } + return sparse_table; +} + +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..2daa88289c0 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +// make table that will hold sparse results +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index c1dd68c2b78..651a6a2014a 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -70,6 +71,22 @@ using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_ cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + using hash_set_ref_t = cuco::static_set_ref< cudf::size_type, cuda::thread_scope_device, From 8ce4cda47cb6e9010f121b7cfda853884bfef22e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 17:48:28 -0700 Subject: [PATCH 049/135] Add groupby multi-aggs test --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/multi_aggs_tests.cpp | 115 +++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 cpp/tests/groupby/multi_aggs_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..66d70c0c7cf 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -137,6 +137,7 @@ ConfigureTest( groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp + groupby/multi_aggs_tests.cpp groupby/nth_element_tests.cpp groupby/nunique_tests.cpp groupby/product_scan_tests.cpp diff --git a/cpp/tests/groupby/multi_aggs_tests.cpp b/cpp/tests/groupby/multi_aggs_tests.cpp new file mode 100644 index 00000000000..ae491a8f796 --- /dev/null +++ b/cpp/tests/groupby/multi_aggs_tests.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf::test::iterators; + +namespace { +template +std::unique_ptr create_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity, + Elements elements) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + std::vector> src_cols(num_columns); + for (int idx = 0; idx < num_columns; idx++) { + if (include_validity) { + src_cols[idx] = + cudf::test::fixed_width_column_wrapper(elements, elements + num_rows, valids); + } else { + src_cols[idx] = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); + } + } + std::vector> columns(num_columns); + std::transform(src_cols.begin(), + src_cols.end(), + columns.begin(), + [](cudf::test::fixed_width_column_wrapper& in) { + auto ret = in.release(); + // pre-cache the null count + [[maybe_unused]] auto const nulls = ret->has_nulls(); + return ret; + }); + return std::make_unique(std::move(columns)); +} + +template +std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows) +{ + auto rand_elements = + cudf::detail::make_counting_transform_iterator(0, [](T i) { return rand(); }); + return create_fixed_table(num_columns, num_rows, false, rand_elements); +} +} // namespace + +template +struct groupby_multi_aggs_test : public cudf::test::BaseFixture {}; + +template +std::vector convert(std::initializer_list in) +{ + std::vector out(std::cbegin(in), std::cend(in)); + return out; +} + +using supported_types = cudf::test::Concat>; +TYPED_TEST_SUITE(groupby_multi_aggs_test, supported_types); +using K = int32_t; + +TYPED_TEST(groupby_multi_aggs_test, basic) +{ + using V = TypeParam; + + auto constexpr num_cols = 3'000; + auto constexpr num_rows = 100'000; + auto keys = create_random_fixed_table(1, num_rows); + + auto vals = create_random_fixed_table(num_cols, num_rows); + + std::vector requests; + for (auto i = 0; i < num_cols; i++) { + requests.emplace_back(); + + requests[i].values = vals->get_column(i).view(); + requests[i].aggregations.push_back( + std::move(cudf::make_mean_aggregation())); + requests[i].aggregations.push_back( + std::move(cudf::make_min_aggregation())); + requests[i].aggregations.push_back( + std::move(cudf::make_max_aggregation())); + requests[i].aggregations.push_back( + std::move(cudf::make_count_aggregation())); + } + + cudf::groupby::groupby gb_obj{keys->view()}; + + auto result = gb_obj.aggregate(requests, cudf::test::get_default_stream()); +} From 06cf48f5bfb7b1e26e2ce88fa8388e76954e0f3d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 18:12:23 -0700 Subject: [PATCH 050/135] Further separate compute_single_pass_aggs --- cpp/CMakeLists.txt | 1 + .../groupby/hash/compute_single_pass_aggs.cu | 284 +---------------- .../groupby/hash/compute_single_pass_aggs.cuh | 295 ++++++++++++++++++ .../hash/compute_single_pass_aggs_null.cu | 28 ++ 4 files changed, 325 insertions(+), 283 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7f90fb388dc..4bc37eb212c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,6 +317,7 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu + 
src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index 465d58ed9ef..f8b0f65b92f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -14,284 +14,10 @@ * limitations under the License. */ -#include "compute_aggregations.hpp" +#include "compute_single_pass_aggs.cuh" #include "compute_single_pass_aggs.hpp" -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include namespace cudf::groupby::detail::hash { -namespace { -template -// TODO pass block -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - SetType shared_set, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* cardinality, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - // TODO: un-init - bool inserted; - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. - __syncthreads(); - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - auto const input_idx = shared_set_indices[cur_idx]; - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = - *global_set.insert_and_find(input_idx).first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
- */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - // TODO: indices inserted in each shared memory set - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_t{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, - num_input_rows, - shared_insert_ref, - row_bitmask, - skip_rows_with_nulls, - &cardinality, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - - block.sync(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} -} // namespace - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - SetType& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream) -{ - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - auto const num_input_rows = keys.num_rows(); - - auto row_bitmask = - skip_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table - rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, - stream); - rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); - - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); - - if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, @@ -299,12 +25,4 @@ template rmm::device_uvector compute_single_pass_aggs compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - nullable_global_set_t& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git 
a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh new file mode 100644 index 00000000000..d74b7ac4aa0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_aggregations.hpp" +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +template +// TODO pass block +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + // TODO: un-init + bool inserted; + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + __syncthreads(); + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + auto const input_idx = shared_set_indices[cur_idx]; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = + *global_set.insert_and_find(input_idx).first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
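 *
 * Schematically, the two-level mapping built here is
 *
 *   input row i --local_mapping_index[i]--> slot s in the block's
 *                                           shared-memory set
 *   slot s --global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + s]-->
 *                                           row in the global sparse table
 *
 * so the aggregation pass can fold each block's pre-aggregates into the
 * global result without re-hashing any keys.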
+ */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + // TODO: indices inserted in each shared memory set + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { + find_local_mapping(cur_idx, + num_input_rows, + shared_insert_ref, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + + block.sync(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); + } + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} +} // namespace + +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream) +{ + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto const num_input_rows = keys.num_rows(); + + auto row_bitmask = + skip_rows_with_nulls + ? 
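/* Two notes on the setup above, and a roadmap for what follows: the capacity
 * factor 1.43 is roughly 1 / 0.7, i.e. the shared-memory set is sized so that
 * GROUPBY_SHM_MAX_ELEMENTS distinct keys stay at or below a 70% load factor.
 * The driver then proceeds in three steps: size the mapping kernel by
 * occupancy, launch it to fill the local/global mapping indices and per-block
 * cardinalities, and finally run the aggregation pass, falling back to direct
 * global-memory aggregation for any block whose cardinality overflowed the
 * shared-memory set. */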
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu new file mode 100644 index 00000000000..b88f1a952d5 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_single_pass_aggs.cuh" +#include "compute_single_pass_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + nullable_global_set_t& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash From 4a952982d9c557138f1983cb79a3cd1f74485c3d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 19:23:24 -0700 Subject: [PATCH 051/135] test --- cpp/src/groupby/hash/compute_aggregations.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 8b559ffc1be..75d89af1313 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -234,6 +234,9 @@ constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) size_t dynamic_shmem_size = 0; + std::cout << "### active_blocks_per_sm: " << active_blocks_per_sm << " grid_size: " << grid_size + << "\n"; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); From 4b247b1f323b506f6511bcb7ba37a6b31b0f92b4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 17:07:18 -0700 Subject: [PATCH 052/135] Renaming + minor cleanups --- cpp/src/groupby/hash/compute_aggregations.cu | 51 ++++++++++--------- cpp/src/groupby/hash/compute_aggregations.hpp | 4 +- .../groupby/hash/compute_single_pass_aggs.cuh | 2 + .../hash/create_sparse_results_table.cu | 10 ++-- .../hash/create_sparse_results_table.hpp | 2 +- 5 files changed, 38 insertions(+), 31 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 75d89af1313..0a47d14d140 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -15,6 +15,7 @@ */ #include "compute_aggregations.hpp" +#include "create_sparse_results_table.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" #include "shared_memory_aggregator.cuh" @@ -69,12 +70,12 @@ __device__ void initialize_shared_memory_aggregates(int col_start, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, cudf::size_type cardinality, - cudf::aggregation::Kind const* aggs) + cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - aggs[col_idx], + d_agg_kinds[col_idx], initialize_shmem{}, s_aggregates_pointer[col_idx], idx, @@ -92,7 +93,7 @@ __device__ void compute_pre_aggregrates(int col_start, cudf::size_type* 
local_mapping_index, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) + cudf::aggregation::Kind const* d_agg_kinds) { // TODO grid_1d utility for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; @@ -104,7 +105,7 @@ __device__ void compute_pre_aggregrates(int col_start, auto input_col = input_values.column(col_idx); cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], + d_agg_kinds[col_idx], shmem_element_aggregator{}, s_aggregates_pointer[col_idx], map_idx, @@ -124,7 +125,7 @@ __device__ void compute_final_aggregates(int col_start, cudf::size_type* global_mapping_index, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) + cudf::aggregation::Kind const* d_agg_kinds) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; @@ -132,7 +133,7 @@ __device__ void compute_final_aggregates(int col_start, auto output_col = output_values.column(col_idx); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - aggs[col_idx], + d_agg_kinds[col_idx], gmem_element_aggregator{}, output_col, out_idx, @@ -146,17 +147,17 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) +CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -195,7 +196,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, s_aggregates_pointer, s_aggregates_valid_pointer, cardinality, - aggs); + d_agg_kinds); block.sync(); compute_pre_aggregrates(col_start, col_end, @@ -206,7 +207,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, local_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, - aggs); + d_agg_kinds); block.sync(); compute_final_aggregates(col_start, col_end, @@ -216,7 +217,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, global_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, - aggs); + d_agg_kinds); block.sync(); } } @@ -253,10 +254,12 @@ void compute_aggregations(int grid_size, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, rmm::cuda_stream_view stream) { - auto const shmem_size = 
compute_shared_memory_size(compute_aggs_kernel, grid_size); + auto const shmem_size = compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -264,7 +267,7 @@ void compute_aggregations(int grid_size, round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( + compute_d_agg_kinds_kernel<<>>( num_input_rows, row_bitmask, skip_rows_with_nulls, @@ -273,7 +276,7 @@ void compute_aggregations(int grid_size, block_cardinality, input_values, output_values, - aggs, + d_agg_kinds, shmem_agg_size, shmem_agg_pointer_size); } diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 87c37158cd0..badf8079875 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -35,7 +35,9 @@ void compute_aggregations(int grid_size, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index d74b7ac4aa0..51c131b59eb 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -263,7 +263,9 @@ rmm::device_uvector compute_single_pass_aggs( block_cardinality.data(), *d_values, *d_sparse_table, + flattened_values, d_agg_kinds.data(), + agg_kinds, stream); if (direct_aggregations.value(stream)) { diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 7ae0184528d..fa3e1b3a2ba 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -48,7 +48,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, @@ -58,7 +58,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values std::vector> sparse_columns; std::transform(flattened_values.begin(), flattened_values.end(), - aggs.begin(), + agg_kinds.begin(), std::back_inserter(sparse_columns), [stream](auto const& col, auto const& agg) { auto const nullable = @@ -89,7 +89,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values // Else initialize the whole table else { cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); } return sparse_table; } @@ -107,7 +107,7 @@ template void extract_populated_keys( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - 
std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, global_set_t const& global_set, rmm::device_uvector& populated_keys, @@ -116,7 +116,7 @@ template cudf::table create_sparse_results_table( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, nullable_global_set_t const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 2daa88289c0..f2810bd0235 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -34,7 +34,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, From 90597288981a8813e9497a57939b8fe603472bed Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 17:07:48 -0700 Subject: [PATCH 053/135] Remove unused code --- cpp/src/groupby/hash/compute_aggregations.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 0a47d14d140..dda4c1c5773 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -230,14 +230,8 @@ constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - CUDF_EXPECTS(active_blocks_per_sm >= 1, "active_blocks_per_sm must be larger than 1"); - CUDF_EXPECTS(grid_size >= 1, "grid_size must be larger than 1"); - size_t dynamic_shmem_size = 0; - std::cout << "### active_blocks_per_sm: " << active_blocks_per_sm << " grid_size: " << grid_size - << "\n"; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); From 87312faf8d34f8cebf6b10bacb9c7482af9e3b16 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 17:48:47 -0700 Subject: [PATCH 054/135] Make compute_aggregations return sparse table --- cpp/src/groupby/hash/compute_aggregations.cu | 76 +++++++++++++++---- cpp/src/groupby/hash/compute_aggregations.hpp | 29 +++---- .../groupby/hash/compute_single_pass_aggs.cuh | 42 +++++----- 3 files changed, 98 insertions(+), 49 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index dda4c1c5773..7861884562a 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -239,22 +239,36 @@ constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) } // namespace -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* 
d_agg_kinds, - std::vector const& agg_kinds, - rmm::cuda_stream_view stream) +template +cudf::table compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) { auto const shmem_size = compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + direct_aggregations, + global_set, + populated_keys, + stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto output_values = *d_sparse_table; + // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto const shmem_agg_pointer_size = @@ -273,6 +287,42 @@ void compute_aggregations(int grid_size, d_agg_kinds, shmem_agg_size, shmem_agg_pointer_size); + + return sparse_table; } +template cudf::table compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + global_set_t& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + nullable_global_set_t& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index badf8079875..bcb996b645d 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -26,18 +26,21 @@ namespace cudf::groupby::detail::hash { -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - rmm::cuda_stream_view stream); +template +cudf::table compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool 
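+                                 // when true, rows whose key contains a null
+                                 // are filtered out via row_bitmask and never
+                                 // contribute to any aggregate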
skip_rows_with_nulls,
+                                 cudf::size_type* local_mapping_index,
+                                 cudf::size_type* global_mapping_index,
+                                 cudf::size_type* block_cardinality,
+                                 cudf::table_device_view input_values,
+                                 cudf::table_view const& flattened_values,
+                                 cudf::aggregation::Kind const* d_agg_kinds,
+                                 std::vector<cudf::aggregation::Kind> const& agg_kinds,
+                                 bool direct_aggregations,
+                                 SetType& global_set,
+                                 rmm::device_uvector<cudf::size_type>& populated_keys,
+                                 rmm::cuda_stream_view stream);
 } // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
index 51c131b59eb..3372d88e714 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
@@ -242,31 +242,27 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
   auto const d_agg_kinds = cudf::detail::make_device_uvector_async(
     agg_kinds, stream, rmm::mr::get_current_device_resource());
-  // make table that will hold sparse results
-  cudf::table sparse_table = create_sparse_results_table(flattened_values,
-                                                         d_agg_kinds.data(),
-                                                         agg_kinds,
-                                                         direct_aggregations.value(stream),
-                                                         global_set,
-                                                         populated_keys,
-                                                         stream);
+
   // prepare to launch kernel to do the actual aggregation
-  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
-  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_values = table_device_view::create(flattened_values, stream);
 
-  compute_aggregations(grid_size,
-                       num_input_rows,
-                       static_cast<bitmask_type const*>(row_bitmask.data()),
-                       skip_rows_with_nulls,
-                       local_mapping_index.data(),
-                       global_mapping_index.data(),
-                       block_cardinality.data(),
-                       *d_values,
-                       *d_sparse_table,
-                       flattened_values,
-                       d_agg_kinds.data(),
-                       agg_kinds,
-                       stream);
+  cudf::table sparse_table =
+    compute_aggregations(grid_size,
+                         num_input_rows,
+                         static_cast<bitmask_type const*>(row_bitmask.data()),
+                         skip_rows_with_nulls,
+                         local_mapping_index.data(),
+                         global_mapping_index.data(),
+                         block_cardinality.data(),
+                         *d_values,
+                         flattened_values,
+                         d_agg_kinds.data(),
+                         agg_kinds,
+                         direct_aggregations.value(stream),
+                         global_set,
+                         populated_keys,
+                         stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
 
   if (direct_aggregations.value(stream)) {
     auto const stride = GROUPBY_BLOCK_SIZE * grid_size;

From bb7187dea240bcb9c6c6bddec37aa914bae6f23b Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Sun, 29 Sep 2024 18:17:11 -0700
Subject: [PATCH 055/135] Add rollback if encountering CUDA errors

---
 cpp/src/groupby/hash/compute_aggregations.cu  | 59 +++++++++++--------
 cpp/src/groupby/hash/compute_aggregations.hpp | 33 ++++++-----
 .../groupby/hash/compute_single_pass_aggs.cuh | 16 ++++-
 3 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu
index 7861884562a..d2d7aa83568 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cu
+++ b/cpp/src/groupby/hash/compute_aggregations.cu
@@ -225,38 +225,44 @@ CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows,
 constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; }
 
 template <typename Kernel>
-constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size)
+constexpr std::pair<cudaError_t, size_t> compute_shared_memory_size(Kernel kernel,
+                                                                    int grid_size) noexcept
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
 
   size_t
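  // cudaOccupancyAvailableDynamicSMemPerBlock (queried below) reports how much
  // dynamic shared memory each block may use when active_blocks_per_sm blocks
  // are resident on an SM; on failure the new code clears the error state and
  // returns the status to the caller, which then falls back to direct
  // (global-memory) aggregation instead of throwing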
dynamic_shmem_size = 0; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); + auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + if (status != cudaSuccess) { cudaGetLastError(); } + return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; } } // namespace template -cudf::table compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) +std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) { - auto const shmem_size = compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + auto const [status, shmem_size] = + compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + + if (status != cudaSuccess) { direct_aggregations = true; } // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, @@ -266,8 +272,11 @@ cudf::table compute_aggregations(int grid_size, global_set, populated_keys, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto output_values = *d_sparse_table; + + if (status != cudaSuccess) { return {status, sparse_table}; } + + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto output_values = *d_sparse_table; // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -288,10 +297,10 @@ cudf::table compute_aggregations(int grid_size, shmem_agg_size, shmem_agg_pointer_size); - return sparse_table; + return {status, sparse_table}; } -template cudf::table compute_aggregations( +template std::pair compute_aggregations( int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, @@ -308,7 +317,7 @@ template cudf::table compute_aggregations( rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); -template cudf::table compute_aggregations( +template std::pair compute_aggregations( int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index bcb996b645d..1c382f01195 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -22,25 +22,28 @@ #include +#include + #include namespace cudf::groupby::detail::hash { template -cudf::table 
compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); +std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 3372d88e714..25c8aed957c 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -246,7 +246,7 @@ rmm::device_uvector compute_single_pass_aggs( // prepare to launch kernel to do the actual aggregation auto d_values = table_device_view::create(flattened_values, stream); - cudf::table sparse_table = + auto [status, sparse_table] = compute_aggregations(grid_size, num_input_rows, static_cast(row_bitmask.data()), @@ -264,7 +264,19 @@ rmm::device_uvector compute_single_pass_aggs( stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - if (direct_aggregations.value(stream)) { + if (status != cudaSuccess) { + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } else if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), From 5f05ca72e20e8d658dbb35ab1a614321f640aab9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 18:21:33 -0700 Subject: [PATCH 056/135] Add explicit instantiations for compute_aggregations --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/hash/compute_aggregations.cu | 302 +---------------- cpp/src/groupby/hash/compute_aggregations.cuh | 303 ++++++++++++++++++ .../groupby/hash/compute_aggregations_null.cu | 37 +++ 4 files changed, 342 insertions(+), 301 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh create mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4bc37eb212c..743c8cd1f7c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,6 +315,7 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu diff --git 
a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index d2d7aa83568..8e70a3a77f0 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -14,292 +14,10 @@ * limitations under the License. */ +#include "compute_aggregations.cuh" #include "compute_aggregations.hpp" -#include "create_sparse_results_table.hpp" -#include "global_memory_aggregator.cuh" -#include "helpers.cuh" -#include "shared_memory_aggregator.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include namespace cudf::groupby::detail::hash { -namespace { -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - d_agg_kinds[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - d_agg_kinds[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); - } - } - } -} - -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view 
output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - d_agg_kinds[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) -{ - auto const block = cooperative_groups::this_thread_block(); - auto const cardinality = block_cardinality[block.group_index().x]; - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - - auto const num_cols = output_values.num_columns(); - - __shared__ int col_start; - __shared__ int col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - - if (block.thread_rank() == 0) { - col_start = 0; - col_end = 0; - } - block.sync(); - - while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - d_agg_kinds); - block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - } -} - -constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -constexpr std::pair compute_shared_memory_size(Kernel kernel, - int grid_size) noexcept -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size = 0; - - auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); - if (status != cudaSuccess) { cudaGetLastError(); } - return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; -} - -} // namespace - -template -std::pair compute_aggregations( - int 
grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const [status, shmem_size] = - compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); - - if (status != cudaSuccess) { direct_aggregations = true; } - - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds, - agg_kinds, - direct_aggregations, - global_set, - populated_keys, - stream); - - if (status != cudaSuccess) { return {status, sparse_table}; } - - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto output_values = *d_sparse_table; - - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_d_agg_kinds_kernel<<>>( - num_input_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - d_agg_kinds, - shmem_agg_size, - shmem_agg_pointer_size); - - return {status, sparse_table}; -} - template std::pair compute_aggregations( int grid_size, cudf::size_type num_input_rows, @@ -316,22 +34,4 @@ template std::pair compute_aggregations( global_set_t& global_set, rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); - -template std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - nullable_global_set_t& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..a7da0ec6e85 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_aggregations.hpp" +#include "create_sparse_results_table.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // TODO grid_1d utility + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } + } +} + +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = 
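+      // each block owns a GROUPBY_SHM_MAX_ELEMENTS-wide slice of
+      // global_mapping_index, so this block's cur_idx-th local slot maps to
+      // the global sparse-table row fetched here: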
global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + block.sync(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + d_agg_kinds); + block.sync(); + compute_pre_aggregrates(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +constexpr std::pair compute_shared_memory_size(Kernel kernel, + int grid_size) noexcept +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size = 0; + + auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + if (status != cudaSuccess) { cudaGetLastError(); } + return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; +} + +} // namespace + +template +std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + 
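+  // d_agg_kinds is the device-resident copy of the aggregation kinds used for
+  // per-row dispatch inside the kernel, while the host-side agg_kinds vector
+  // below is what create_sparse_results_table consumes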
cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const [status, shmem_size] = + compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + + if (status != cudaSuccess) { direct_aggregations = true; } + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + direct_aggregations, + global_set, + populated_keys, + stream); + + if (status != cudaSuccess) { return {status, sparse_table}; } + + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto output_values = *d_sparse_table; + + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto const shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_d_agg_kinds_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + d_agg_kinds, + shmem_agg_size, + shmem_agg_pointer_size); + + return {status, sparse_table}; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu new file mode 100644 index 00000000000..d2c2a5f5830 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + nullable_global_set_t& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash From 2e30d9ba039f0cda763cd11ddd2fcacd826698e2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 11:16:32 -0700 Subject: [PATCH 057/135] Clean up the shmem agg determination logic --- cpp/CMakeLists.txt | 1 - cpp/src/groupby/hash/compute_aggregations.cu | 273 +++++++++++++++- cpp/src/groupby/hash/compute_aggregations.cuh | 303 ------------------ cpp/src/groupby/hash/compute_aggregations.hpp | 32 +- .../groupby/hash/compute_aggregations_null.cu | 37 --- .../groupby/hash/compute_single_pass_aggs.cuh | 74 +++-- 6 files changed, 311 insertions(+), 409 deletions(-) delete mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh delete mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 743c8cd1f7c..4bc37eb212c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,7 +315,6 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu - src/groupby/hash/compute_aggregations_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 8e70a3a77f0..0b1008493f4 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -14,24 +14,263 @@ * limitations under the License. 
*/ -#include "compute_aggregations.cuh" #include "compute_aggregations.hpp" +#include "create_sparse_results_table.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include namespace cudf::groupby::detail::hash { -template std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - global_set_t& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); +namespace { +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // TODO grid_1d utility + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + 
s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } + } +} + +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + block.sync(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + d_agg_kinds); + block.sync(); + compute_pre_aggregrates(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +} // namespace + +constexpr std::pair can_use_shmem_aggs(int grid_size) noexcept +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size = 0; + + auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, 
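/*
 * For context on the query being made here:
 * cudaOccupancyAvailableDynamicSMemPerBlock reports how much dynamic shared
 * memory each block could be given when active_blocks_per_sm blocks of
 * GROUPBY_BLOCK_SIZE threads are resident per SM. Only half of that figure
 * is kept just below, rounded down to a multiple of 8, presumably as an
 * alignment-friendly safety margin. A standalone sketch of the same call:
 *
 *   size_t avail = 0;
 *   cudaError_t err = cudaOccupancyAvailableDynamicSMemPerBlock(
 *     &avail, some_kernel, blocks_per_sm, threads_per_block);
 *   if (err != cudaSuccess) { cudaGetLastError(); }  // clear, then fall back
 *
 * (some_kernel, blocks_per_sm, and threads_per_block stand in for the
 * actual arguments used here.)
 */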
compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + auto const success = status == cudaSuccess; + if (!success) { cudaGetLastError(); } + + return {success, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; +} + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + size_t shmem_size, + rmm::cuda_stream_view stream) +{ + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto const shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + d_agg_kinds, + shmem_agg_size, + shmem_agg_pointer_size); +} } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh deleted file mode 100644 index a7da0ec6e85..00000000000 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
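// The host wrapper above hands compute_aggs_kernel one dynamic shared-memory
// allocation carved into three consecutive regions, matching the
// reinterpret_casts at the top of the kernel:
//
//   [ agg value/validity storage: shmem_agg_size bytes ]
//   [ s_aggregates_pointer[]:       pointer_size bytes ]
//   [ s_aggregates_valid_pointer[]: pointer_size bytes ]
//
// i.e. the two pointer tables live at byte offsets total_agg_size and
// total_agg_size + pointer_size inside the extern __shared__ block.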
- */ -#pragma once - -#include "compute_aggregations.hpp" -#include "create_sparse_results_table.hpp" -#include "global_memory_aggregator.cuh" -#include "helpers.cuh" -#include "shared_memory_aggregator.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include - -namespace cudf::groupby::detail::hash { -namespace { -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - d_agg_kinds[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - d_agg_kinds[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); - } - } - } -} - -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = 
global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - d_agg_kinds[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) -{ - auto const block = cooperative_groups::this_thread_block(); - auto const cardinality = block_cardinality[block.group_index().x]; - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - - auto const num_cols = output_values.num_columns(); - - __shared__ int col_start; - __shared__ int col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - - if (block.thread_rank() == 0) { - col_start = 0; - col_end = 0; - } - block.sync(); - - while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - d_agg_kinds); - block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - } -} - -constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -constexpr std::pair compute_shared_memory_size(Kernel kernel, - int grid_size) noexcept -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size = 0; - - auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); - if (status != cudaSuccess) { cudaGetLastError(); } - return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; -} - -} // namespace - -template -std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - 
cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const [status, shmem_size] = - compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); - - if (status != cudaSuccess) { direct_aggregations = true; } - - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds, - agg_kinds, - direct_aggregations, - global_set, - populated_keys, - stream); - - if (status != cudaSuccess) { return {status, sparse_table}; } - - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto output_values = *d_sparse_table; - - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_d_agg_kinds_kernel<<>>( - num_input_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - d_agg_kinds, - shmem_agg_size, - shmem_agg_pointer_size); - - return {status, sparse_table}; -} -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 1c382f01195..f01d9f24c66 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -28,22 +27,19 @@ namespace cudf::groupby::detail::hash { -template -std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); +std::pair can_use_shmem_aggs(int grid_size) noexcept; + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + size_t shmem_size, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu deleted file mode 100644 index d2c2a5f5830..00000000000 --- a/cpp/src/groupby/hash/compute_aggregations_null.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "compute_aggregations.cuh" -#include "compute_aggregations.hpp" - -namespace cudf::groupby::detail::hash { -template std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - nullable_global_set_t& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 25c8aed957c..b534e9b8f1e 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -243,28 +243,22 @@ rmm::device_uvector compute_single_pass_aggs( auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); + auto const [uses_shmem_aggs, shmem_size] = can_use_shmem_aggs(grid_size); - auto [status, sparse_table] = - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); + // make table that will hold sparse results + cudf::table sparse_table = + create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + uses_shmem_aggs ? 
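/*
 * The ternary being assembled here chooses how the sparse table is
 * initialized: when the shared-memory path is unavailable, every row must
 * take the direct (global-atomic) path, so the flag is forced to true;
 * otherwise the device-computed direct_aggregations flag is honored. The
 * restructured control flow is, in outline:
 *
 *   if (!uses_shmem_aggs) -> row-wise direct aggregation over all rows
 *   else                  -> shared-memory kernel, then a direct pass
 *                            (compute_direct_aggregates) only if some
 *                            blocks exceeded the cardinality threshold
 */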
direct_aggregations.value(stream) : true, + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - if (status != cudaSuccess) { + if (!uses_shmem_aggs) { thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -276,20 +270,34 @@ rmm::device_uvector compute_single_pass_aggs( static_cast(row_bitmask.data()), skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); - } else if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); + } else { + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + shmem_size, + stream); + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } } // Add results back to sparse_results cache From c24247553284582bf1d6c3fd4abe313788cc28a9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 11:23:22 -0700 Subject: [PATCH 058/135] Fix mismatch --- cpp/src/groupby/hash/compute_aggregations.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 0b1008493f4..aec6f39501e 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -226,7 +226,7 @@ constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * } // namespace -constexpr std::pair can_use_shmem_aggs(int grid_size) noexcept +std::pair can_use_shmem_aggs(int grid_size) noexcept { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); From 30e572e1fa4841eaab69f62be111838d21fe6cbd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:00:01 -0700 Subject: [PATCH 059/135] Clean up device aggregators --- .../groupby/hash/global_memory_aggregator.cuh | 113 ++++++------------ .../groupby/hash/shared_memory_aggregator.cuh | 112 +++++++---------- 2 files changed, 79 insertions(+), 146 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 9f38750060b..abf8bd71483 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -25,29 +25,23 @@ namespace cudf::groupby::detail::hash { -template +template struct update_target_element_gmem { __device__ void 
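/*
 * The diff starting here (patch 059) drops the target_has_nulls and
 * source_has_nulls non-type template parameters from every aggregator
 * specialization, cutting the instantiation count per Source/kind pair by
 * 4x. Guards of the form
 *
 *   if (source_has_nulls and source_null[source_index]) { return; }
 *
 * become unconditional runtime checks, and later in the series (patches
 * 063 and 064) they are hoisted into the dispatching functors altogether.
 */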
operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_null) const { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -57,7 +51,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; @@ -65,16 +59,14 @@ struct update_target_element_gmem< cudf::detail::atomic_min(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -84,23 +76,21 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_min(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -110,22 +100,20 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_max(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -135,7 +123,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; @@ -143,16 +131,14 @@ struct 
update_target_element_gmem< cudf::detail::atomic_max(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -162,23 +148,21 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t>() && cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -188,14 +172,14 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; @@ -207,7 +191,6 @@ struct update_target_element_gmem< * dictionary. 
* */ -template struct update_target_from_dictionary_gmem { template {}( + update_target_element_gmem{}( target, target_index, source_column, source, source_index, source_null); } template +template struct update_target_element_gmem< dictionary32, k, - target_has_nulls, - source_has_nulls, std::enable_if_t> { __device__ void operator()(mutable_column_device_view target, @@ -264,12 +242,12 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } dispatch_type_and_aggregation( source_column.child(cudf::dictionary_column_view::keys_column_index).type(), k, - update_target_from_dictionary_gmem{}, + update_target_from_dictionary_gmem{}, target, target_index, source_column, @@ -280,11 +258,9 @@ struct update_target_element_gmem< }; // The shared memory will already have it squared -template +template struct update_target_element_gmem()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -293,7 +269,7 @@ struct update_target_element_gmem; Target* source_casted = reinterpret_cast(source); @@ -301,15 +277,13 @@ struct update_target_element_gmem(target_index), value); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -318,25 +292,23 @@ struct update_target_element_gmem; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_mul(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; // Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and // non-fixed point column -template +template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -356,12 +328,10 @@ struct update_target_element_gmem< }; // TODO: VALID and ALL have same code -template +template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -380,12 +350,10 @@ struct update_target_element_gmem< } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -395,7 +363,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmax_index = source_casted[source_index]; @@ -409,15 +377,13 @@ struct update_target_element_gmem< } } - if (target_has_nulls and target.is_null(target_index)) { 
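/*
 * The ARGMAX merge above uses a publish-then-compete CAS pattern: the first
 * atomic_cas tries to install this candidate row index in place of
 * ARGMAX_SENTINEL, and if some index was already published, the loop keeps
 * re-attempting the swap while the candidate's value beats the currently
 * installed winner's. A scalar analogue (names illustrative):
 *
 *   auto old = atomic_cas(&slot, SENTINEL, mine);
 *   if (old != SENTINEL)
 *     while (value[mine] > value[old])
 *       old = atomic_cas(&slot, old, mine);
 *
 * ARGMIN below is identical with the comparison reversed.
 */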
target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -427,7 +393,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmin_index = source_casted[source_index]; @@ -441,11 +407,10 @@ struct update_target_element_gmem< } } - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template struct gmem_element_aggregator { template __device__ void operator()(cudf::mutable_column_device_view target, @@ -455,7 +420,7 @@ struct gmem_element_aggregator { cudf::size_type source_index, bool* source_null) const noexcept { - update_target_element_gmem{}( + update_target_element_gmem{}( target, target_index, source_column, source, source_index, source_null); } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index ef46c9b4cb4..624a56710d5 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -25,28 +25,22 @@ namespace cudf::groupby::detail::hash { -template +template struct update_target_element_shmem { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, - cudf::size_type source_index) const noexcept + cudf::size_type source_index) const { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, @@ -55,23 +49,21 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, @@ -80,7 +72,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; @@ -89,16 +81,14 @@ struct update_target_element_shmem< DeviceTarget* 
target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, @@ -107,22 +97,20 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_max(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, @@ -131,7 +119,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; @@ -142,16 +130,14 @@ struct update_target_element_shmem< cudf::detail::atomic_max(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, @@ -160,23 +146,21 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t>() && cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, @@ -185,7 +169,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; @@ -196,11 +180,10 @@ struct update_target_element_shmem< 
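/*
 * A condensed restatement of the casting discipline used throughout these
 * shared-memory specializations: the target is a raw std::byte buffer, not
 * a column view, so each functor recovers a typed slot before the atomic,
 * and fixed-point types go through their device storage representation:
 *
 *   using Target  = cudf::detail::target_type_t<Source, k>;
 *   using Storage = cudf::device_storage_type_t<Target>;
 *   auto* slot    = reinterpret_cast<Storage*>(target) + target_index;
 *
 * No new behavior is implied; this only summarizes the casts above.
 */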
cudf::detail::atomic_add(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template struct update_target_from_dictionary_shmem { template {}( + update_target_element_shmem{}( target, target_index, target_null, source, source_index); } template +template struct update_target_element_shmem< dictionary32, k, - target_has_nulls, - source_has_nulls, std::enable_if_t> { __device__ void operator()(std::byte* target, @@ -240,12 +221,12 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, - update_target_from_dictionary_shmem{}, + update_target_from_dictionary_shmem{}, target, target_index, target_null, @@ -254,11 +235,9 @@ struct update_target_element_shmem< } }; -template +template struct update_target_element_shmem()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -266,22 +245,20 @@ struct update_target_element_shmem; Target* target_casted = reinterpret_cast(target); auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target_casted[target_index], value * value); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -289,23 +266,21 @@ struct update_target_element_shmem; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_mul(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -313,7 +288,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); @@ -321,12 +296,10 @@ struct update_target_element_shmem< } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -342,12 +315,10 @@ struct update_target_element_shmem< } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, @@ -356,7 +327,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and 
source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); @@ -368,16 +339,14 @@ struct update_target_element_shmem< } } - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, @@ -386,7 +355,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); @@ -398,11 +367,10 @@ struct update_target_element_shmem< } } - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template struct shmem_element_aggregator { template __device__ void operator()(std::byte* target, @@ -411,7 +379,7 @@ struct shmem_element_aggregator { cudf::column_device_view source, cudf::size_type source_index) const noexcept { - update_target_element_shmem{}( + update_target_element_shmem{}( target, target_index, target_null, source, source_index); } }; From 0916fe79c2f9c0acf8720d794c585f7458c0c7fd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:08:01 -0700 Subject: [PATCH 060/135] Header cleanups --- cpp/src/groupby/hash/groupby.cu | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b307b8a8d1f..c206da91375 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -19,21 +19,13 @@ #include "helpers.cuh" #include -#include -#include -#include -#include #include #include -#include #include -#include #include #include -#include #include #include -#include #include #include #include @@ -42,11 +34,10 @@ #include -#include -#include - +#include #include #include +#include namespace cudf::groupby::detail::hash { namespace { From e7ff94dfff54b0b511dae3b4c4642ac61599de97 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:16:37 -0700 Subject: [PATCH 061/135] More header cleanups --- cpp/src/groupby/hash/compute_groupby.cu | 4 ---- cpp/src/groupby/hash/groupby.cu | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 1eb208c588d..9021846f71e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -19,12 +19,9 @@ #include "sparse_to_dense_results.hpp" #include "var_hash_functor.cuh" -#include #include -#include #include #include -#include #include #include #include @@ -38,7 +35,6 @@ #include namespace cudf::groupby::detail::hash { - /** * @brief Computes groupby using hash table. 
* diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c206da91375..03b1a40d224 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -28,11 +28,11 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -41,7 +41,6 @@ namespace cudf::groupby::detail::hash { namespace { - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. From d01f0a2e7e48cdafb85b94c47805de97259434fa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:27:12 -0700 Subject: [PATCH 062/135] Switch to cuda::std utilities for device APIs --- .../groupby/hash/global_memory_aggregator.cuh | 62 ++++++++++--------- .../groupby/hash/shared_memory_aggregator.cuh | 62 ++++++++++--------- 2 files changed, 68 insertions(+), 56 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index abf8bd71483..f8baf7d84ba 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf::groupby::detail::hash { template @@ -42,8 +44,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -67,8 +69,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -91,8 +93,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -114,8 +116,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -139,8 +141,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -163,8 +165,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - std::enable_if_t>() && - cudf::is_fixed_point()>> { + cuda::std::enable_if_t>() && + cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view 
source_column, @@ -194,7 +196,7 @@ struct update_target_element_gmem< struct update_target_from_dictionary_gmem { template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source_column, @@ -207,7 +209,7 @@ struct update_target_from_dictionary_gmem { } template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source_column, @@ -233,8 +235,8 @@ template struct update_target_element_gmem< dictionary32, k, - std::enable_if_t> { + cuda::std::enable_if_t> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source_column, @@ -259,9 +261,10 @@ struct update_target_element_gmem< // The shared memory will already have it squared template -struct update_target_element_gmem()>> { +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -282,9 +285,10 @@ struct update_target_element_gmem -struct update_target_element_gmem()>> { +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -309,7 +313,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_VALID, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -332,7 +337,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_ALL, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -354,8 +360,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::ARGMAX, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -384,8 +390,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::ARGMIN, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 624a56710d5..3f1b4f01375 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf::groupby::detail::hash { template @@ -41,8 +43,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && 
cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -64,8 +66,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -89,8 +91,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -111,8 +113,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -138,8 +140,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -161,8 +163,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - std::enable_if_t>() && - cudf::is_fixed_point()>> { + cuda::std::enable_if_t>() && + cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -187,7 +189,7 @@ struct update_target_element_shmem< struct update_target_from_dictionary_shmem { template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -199,7 +201,7 @@ struct update_target_from_dictionary_shmem { } template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -213,8 +215,8 @@ template struct update_target_element_shmem< dictionary32, k, - std::enable_if_t> { + cuda::std::enable_if_t> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -236,9 +238,10 @@ struct update_target_element_shmem< }; template -struct update_target_element_shmem()>> { +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -257,9 +260,10 @@ struct update_target_element_shmem -struct update_target_element_shmem()>> { +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -281,7 +285,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::COUNT_VALID, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -300,7 +305,8 @@ template struct 
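/*
 * These enable_if_t constraints are purely compile-time, so std:: and
 * cuda::std:: behave identically here; the switch to the libcu++ spellings
 * presumably just standardizes what these device-code headers depend on.
 * The resulting constraint shape, sketched with a placeholder kind K:
 *
 *   template <typename Source>
 *   struct update_target_element_shmem<
 *     Source, K, cuda::std::enable_if_t<cudf::is_fixed_width<Source>()>> {
 *     // ...
 *   };
 */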
update_target_element_shmem< Source, cudf::aggregation::COUNT_ALL, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -319,8 +325,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::ARGMAX, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -347,8 +353,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::ARGMIN, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, From 221bed4efd28b2149aecbc80215fdb0961d9f155 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 16:45:02 -0700 Subject: [PATCH 063/135] Clean up shared aggregator early exit logic --- .../groupby/hash/global_memory_aggregator.cuh | 2 +- .../groupby/hash/shared_memory_aggregator.cuh | 28 ++----------------- cpp/src/groupby/hash/single_pass_functors.cuh | 1 + 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index f8baf7d84ba..636c6e97c28 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -352,7 +352,7 @@ struct update_target_element_gmem< cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); - // It is assumed the output for COUNT_VALID is initialized to be all valid + // It is assumed the output for COUNT_ALL is initialized to be all valid } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 3f1b4f01375..9be2e43eac0 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -51,8 +51,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], @@ -74,8 +72,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; using DeviceSource = cudf::device_storage_type_t; @@ -99,8 +95,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_max(&target_casted[target_index], @@ -121,8 +115,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; @@ -148,8 +140,6 @@ struct update_target_element_shmem< cudf::column_device_view source, 
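/*
 * This commit centralizes the early exit that each specialization used to
 * perform on its own: the dispatching shmem_element_aggregator (at the end
 * of this diff) now checks the source's validity exactly once before
 * dispatch, compiled out for COUNT_ALL because that aggregation must count
 * null rows as well:
 *
 *   if constexpr (k != cudf::aggregation::COUNT_ALL) {
 *     if (source.is_null(source_index)) { return; }
 *   }
 *
 * The following commit (patch 064) applies the same treatment to the
 * global-memory aggregators.
 */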
cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], @@ -171,8 +161,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; @@ -223,8 +211,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -248,8 +234,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); auto value = static_cast(source.element(source_index)); @@ -270,8 +254,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_mul(&target_casted[target_index], @@ -293,8 +275,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], Target{1}); @@ -333,8 +313,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); auto old = cudf::detail::atomic_cas( @@ -361,8 +339,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); auto old = cudf::detail::atomic_cas( @@ -385,9 +361,11 @@ struct shmem_element_aggregator { cudf::column_device_view source, cudf::size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element_shmem{}( target, target_index, target_null, source, source_index); } }; - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 19ba33e01e3..1a3e761c7ae 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -25,6 +25,7 @@ namespace cudf::groupby::detail::hash { +// TODO: TO BE REMOVED template __device__ constexpr bool is_supported() { From b31f16f38c5e3fb2e813f2f90aecdb8edfe82beb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 16:48:13 -0700 Subject: [PATCH 064/135] Clean up global aggregator early exit logic --- .../groupby/hash/global_memory_aggregator.cuh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff 
--git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 636c6e97c28..053c95b40a4 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -53,8 +53,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -78,7 +76,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; DeviceType* source_casted = reinterpret_cast(source); @@ -102,7 +99,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_max(&target.element(target_index), @@ -125,7 +121,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; @@ -150,7 +145,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -174,7 +168,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; @@ -244,8 +237,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } - dispatch_type_and_aggregation( source_column.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -272,7 +263,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -296,7 +286,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -369,7 +358,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmax_index = source_casted[source_index]; @@ -399,7 +387,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmin_index = source_casted[source_index]; @@ -426,6 +413,9 @@ struct gmem_element_aggregator { cudf::size_type source_index, bool* source_null) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if 
(source_null[source_index]) { return; } + } update_target_element_gmem{}( target, target_index, source_column, source, source_index, source_null); } From 9cea918d443b1737c4ca3ba18c3ca1cf46822c21 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 10:49:10 -0700 Subject: [PATCH 065/135] Fix merge conflicts --- cpp/src/groupby/hash/single_pass_functors.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 1a3e761c7ae..6d10c8065ca 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -18,7 +18,7 @@ #include "helpers.cuh" -#include +#include #include #include #include @@ -217,7 +217,7 @@ struct compute_direct_aggregates { if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; @@ -293,7 +293,7 @@ struct compute_single_pass_aggs_fn { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; From fe9c212f570a3ef334887cace12338bdb6ea1794 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 12:40:49 -0700 Subject: [PATCH 066/135] Clean up device aggregator early exit logic --- .../detail/aggregation/device_aggregators.cuh | 27 +++---------------- 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 10be5e1d36f..dd92568465b 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -51,8 +51,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -72,8 +70,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -96,8 +92,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -117,8 +111,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -141,8 +133,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; 
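Patches 063, 064, and 066 apply the same refactor three times over: the per-element null check that every aggregation specialization used to repeat is hoisted into the single dispatching functor, where if constexpr removes it at compile time for COUNT_ALL, the one kind that must also count null rows. A minimal sketch of the pattern, using stand-in names (Kind, update_element, aggregate) rather than the libcudf ones, and simplified int columns with a per-row bool null flag:

// Sketch only: stand-in kinds, not cudf::aggregation::Kind.
enum class Kind { SUM, COUNT_ALL };

template <Kind k>
__device__ void update_element(int* target, int value)
{
  if constexpr (k == Kind::SUM) { atomicAdd(target, value); }
  else { atomicAdd(target, 1); }  // COUNT_ALL counts the row unconditionally
}

// The dispatcher owns the one null check; specializations stay guard-free.
template <Kind k>
__device__ void aggregate(int* target, int const* source, bool const* is_null, int i)
{
  if constexpr (k != Kind::COUNT_ALL) {
    if (is_null[i]) { return; }  // compiled out entirely for COUNT_ALL
  }
  update_element<k>(target, source[i]);
}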
cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); @@ -162,8 +152,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -227,8 +215,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -249,8 +235,6 @@ struct update_target_element; auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target.element(target_index), value * value); @@ -267,8 +251,6 @@ struct update_target_element; cudf::detail::atomic_mul(&target.element(target_index), static_cast(source.element(source_index))); @@ -286,8 +268,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), Target{1}); @@ -323,8 +303,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMAX_SENTINEL, source_index); @@ -349,8 +327,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMIN_SENTINEL, source_index); @@ -376,6 +352,9 @@ struct elementwise_aggregator { column_device_view source, size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element{}(target, target_index, source, source_index); } }; From a7a9d75757b88dab1d03419d348a0fd56edb5516 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 16:14:38 -0700 Subject: [PATCH 067/135] Add traits to minimize code duplication --- .../detail/aggregation/device_aggregators.cuh | 99 ++++++------------- 1 file changed, 28 insertions(+), 71 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index dd92568465b..bc370c59296 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -29,6 +29,25 @@ #include namespace cudf::detail { +/// Checks if an aggregation kind needs to operate on the underlying storage type +template +__device__ constexpr bool uses_underlying_type() +{ + return k == aggregation::MIN or k == aggregation::MAX or k == aggregation::SUM; +} + +/// Gets the underlying target type for the given source type and aggregation kind +template +using underlying_target_t = + cuda::std::conditional_t(), + cudf::device_storage_type_t>, + cudf::detail::target_type_t>; + +/// Gets the underlying source type for the given source type and aggregation kind +template +using underlying_source_t = + cuda::std::conditional_t(), cudf::device_storage_type_t, Source>; + template struct update_target_element { __device__ void 
operator()(mutable_column_device_view target, @@ -44,35 +63,14 @@ template struct update_target_element< Source, aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -85,35 +83,14 @@ template struct update_target_element< Source, aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using Target = target_type_t; - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -127,34 +104,14 @@ struct update_target_element< Source, aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { + !cudf::is_timestamp()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; + 
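Patch 067's three traits collapse the duplicated fixed-point and non-fixed-point specializations of MIN, MAX, and SUM into one overload each: uses_underlying_type names the kinds whose atomics must run on the storage representation, and the two aliases resolve to the device storage type only in that case. A host-side sketch of the same layering, with a hypothetical fixed_point64 wrapper standing in for cudf's fixed-point types:

#include <cstdint>
#include <type_traits>

struct fixed_point64 { std::int64_t value; };  // hypothetical wrapper type

template <typename T> struct storage_type { using type = T; };
template <> struct storage_type<fixed_point64> { using type = std::int64_t; };

enum class Kind { SUM, MIN, MAX, COUNT_ALL };

template <Kind k>
constexpr bool uses_underlying_type()
{
  return k == Kind::MIN || k == Kind::MAX || k == Kind::SUM;
}

// One alias replaces the parallel fixed-point / non-fixed-point overloads:
// atomics touch the integer rep for MIN/MAX/SUM, the wrapper otherwise.
template <typename Target, Kind k>
using underlying_target_t =
  std::conditional_t<uses_underlying_type<k>(), typename storage_type<Target>::type, Target>;

static_assert(std::is_same_v<underlying_target_t<fixed_point64, Kind::SUM>, std::int64_t>);
static_assert(std::is_same_v<underlying_target_t<fixed_point64, Kind::COUNT_ALL>, fixed_point64>);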
using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); From cb042ef8c066d095b93d93517bbe31a3f8e6de6b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 16:32:08 -0700 Subject: [PATCH 068/135] Use traits to avoid code duplication --- .../groupby/hash/global_memory_aggregator.cuh | 90 ++----------------- .../groupby/hash/shared_memory_aggregator.cuh | 89 +++--------------- 2 files changed, 18 insertions(+), 161 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 053c95b40a4..62be580fa43 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include #include #include @@ -44,8 +44,7 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -53,31 +52,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MIN, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - using DeviceType = cudf::device_storage_type_t; + using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_min(&target.element(target_index), static_cast(source_casted[source_index])); @@ -90,30 +65,7 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MAX, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -121,9 +73,7 @@ struct update_target_element_gmem< cudf::size_type 
source_index, bool* source_null) const noexcept { - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; + using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_max(&target.element(target_index), static_cast(source_casted[source_index])); @@ -137,7 +87,7 @@ struct update_target_element_gmem< Source, cudf::aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + !cudf::is_timestamp()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -145,35 +95,11 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::SUM, - cuda::std::enable_if_t>() && - cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; + using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 9be2e43eac0..5bea0defe29 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include #include #include @@ -43,38 +43,15 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_min(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MIN, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; 
DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], @@ -87,38 +64,15 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_max(&target_casted[target_index], - static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MAX, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_max(&target_casted[target_index], @@ -133,38 +87,15 @@ struct update_target_element_shmem< Source, cudf::aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::SUM, - cuda::std::enable_if_t>() && - cudf::is_fixed_point()>> { + !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], From ecdd3fddbcf4e90a8e27f4f34f47b7e82901aa7a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Oct 2024 13:36:06 -0700 Subject: [PATCH 069/135] Cannot query shmem with nested type dispatcher --- .../groupby/hash/compute_single_pass_aggs.cuh | 79 +++++++++++++------ .../groupby/hash/global_memory_aggregator.cuh | 2 + .../groupby/hash/shared_memory_aggregator.cuh | 2 + 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index b534e9b8f1e..aad63d33b91 100644 --- 
a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,7 +210,57 @@ rmm::device_uvector compute_single_pass_aggs( ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; + auto const has_dictionary_input = std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { + return cudf::is_dictionary(col.type());}); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + if (has_dictionary_input) { + // make table that will hold sparse results + cudf::table sparse_table = + create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + has_dictionary_input, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; + } + auto const grid_size = max_occupancy_grid_size( compute_mapping_indices, num_input_rows); @@ -235,22 +285,12 @@ rmm::device_uvector compute_single_pass_aggs( direct_aggregations.data()); stream.synchronize(); - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - - auto const [uses_shmem_aggs, shmem_size] = can_use_shmem_aggs(grid_size); - // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - uses_shmem_aggs ? 
direct_aggregations.value(stream) : true, + direct_aggregations.value(stream), global_set, populated_keys, stream); @@ -258,19 +298,9 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - if (!uses_shmem_aggs) { - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } else { + auto const [valid, shmem_size] = can_use_shmem_aggs(grid_size); + CUDF_EXPECTS(valid, "this must be usable"); + compute_aggregations(grid_size, num_input_rows, static_cast(row_bitmask.data()), @@ -298,7 +328,6 @@ rmm::device_uvector compute_single_pass_aggs( skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); } - } // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 62be580fa43..cd0bb64c4ee 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -150,6 +150,7 @@ struct update_target_from_dictionary_gmem { * * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` */ +/* template struct update_target_element_gmem< dictionary32, @@ -175,6 +176,7 @@ struct update_target_element_gmem< source_null); } }; +*/ // The shared memory will already have it squared template diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 5bea0defe29..3b85ccf2ead 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -130,6 +130,7 @@ struct update_target_from_dictionary_shmem { } }; +/* template struct update_target_element_shmem< dictionary32, @@ -153,6 +154,7 @@ struct update_target_element_shmem< static_cast(source.element(source_index))); } }; +*/ template struct update_target_element_shmem< From 2a96255de99cf03dcf3d1616480f174ac8dc2493 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 10:48:12 -0700 Subject: [PATCH 070/135] Remove unused overloads --- .../groupby/hash/global_memory_aggregator.cuh | 74 ------------------- .../groupby/hash/shared_memory_aggregator.cuh | 51 ------------- 2 files changed, 125 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index cd0bb64c4ee..08d2c0552b3 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -104,80 +104,6 @@ struct update_target_element_gmem< } }; -/** - * @brief Function object to update a single element in a target column using - * the dictionary key addressed by the specific index. - * - * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a - * dictionary. 
- * - */ -struct update_target_from_dictionary_gmem { - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - update_target_element_gmem{}( - target, target_index, source_column, source, source_index, source_null); - } - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - } -}; - -/** - * @brief Specialization function for dictionary type and aggregations. - * - * The `source` column is a dictionary type. This functor de-references the - * dictionary's keys child column and maps the input source index through - * the dictionary's indices child column to pass to the `update_target_element` - * in the above `update_target_from_dictionary` using the type-dispatcher to - * resolve the keys column type. - * - * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` - */ -/* -template -struct update_target_element_gmem< - dictionary32, - k, - cuda::std::enable_if_t> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - dispatch_type_and_aggregation( - source_column.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary_gmem{}, - target, - target_index, - source_column, - source, - source_index, - source_null); - } -}; -*/ - // The shared memory will already have it squared template struct update_target_element_gmem< diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 3b85ccf2ead..c5713e4a72e 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -105,57 +105,6 @@ struct update_target_element_shmem< } }; -struct update_target_from_dictionary_shmem { - template ()>* = nullptr> - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - update_target_element_shmem{}( - target, target_index, target_null, source, source_index); - } - template ()>* = nullptr> - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - } -}; - -/* -template -struct update_target_element_shmem< - dictionary32, - k, - cuda::std::enable_if_t> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - dispatch_type_and_aggregation( - source.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary_shmem{}, - target, - target_index, - target_null, - source.child(cudf::dictionary_column_view::keys_column_index), - static_cast(source.element(source_index))); - } -}; -*/ - template struct update_target_element_shmem< Source, From aa30df0e1749eb860ccf23897fc6169b4b9ec29e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 
10:48:25 -0700 Subject: [PATCH 071/135] Formatting --- .../groupby/hash/compute_single_pass_aggs.cuh | 118 +++++++++--------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index aad63d33b91..e8b9ca3ff9b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,8 +210,10 @@ rmm::device_uvector compute_single_pass_aggs( ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; - auto const has_dictionary_input = std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { - return cudf::is_dictionary(col.type());}); + auto const has_dictionary_input = + std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { + return cudf::is_dictionary(col.type()); + }); // 'populated_keys' contains inserted row_indices (keys) of global hash set rmm::device_uvector populated_keys(keys.num_rows(), stream); @@ -221,22 +223,21 @@ rmm::device_uvector compute_single_pass_aggs( auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); if (has_dictionary_input) { - // make table that will hold sparse results - cudf::table sparse_table = - create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - has_dictionary_input, - global_set, - populated_keys, - stream); - - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + has_dictionary_input, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); thrust::for_each_n( rmm::exec_policy(stream), @@ -250,15 +251,15 @@ rmm::device_uvector compute_single_pass_aggs( skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } - return populated_keys; + return populated_keys; } auto const grid_size = max_occupancy_grid_size( @@ -286,14 +287,13 @@ rmm::device_uvector compute_single_pass_aggs( stream.synchronize(); // make table that will hold sparse results - cudf::table sparse_table = - create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - 
stream); + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); // prepare to launch kernel to do the actual aggregation auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); @@ -301,33 +301,33 @@ rmm::device_uvector compute_single_pass_aggs( auto const [valid, shmem_size] = can_use_shmem_aggs(grid_size); CUDF_EXPECTS(valid, "this must be usable"); - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - shmem_size, - stream); - if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + shmem_size, + stream); + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); From c0e1a323162c1a85c1bd5c13e313f1bcc6dec28d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 12:50:06 -0700 Subject: [PATCH 072/135] Fix dict request determination logic --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index e8b9ca3ff9b..37cfd4c0f4f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,9 +210,9 @@ rmm::device_uvector compute_single_pass_aggs( ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; - auto const has_dictionary_input = - std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { - return cudf::is_dictionary(col.type()); + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); }); // 'populated_keys' contains inserted row_indices (keys) of global hash set @@ -225,12 +225,12 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - if (has_dictionary_input) { + if (has_dictionary_request) { // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - has_dictionary_input, + has_dictionary_request, global_set, populated_keys, stream); From fc5dc018ab7461c3f74fb78013d23e4d49310736 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:06:25 -0700 Subject: [PATCH 073/135] Remove can_use_shmem_aggs logic --- cpp/src/groupby/hash/compute_aggregations.cu | 14 +++++--------- cpp/src/groupby/hash/compute_aggregations.hpp | 7 ------- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 4 ---- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index aec6f39501e..915ede5154b 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -226,19 +226,15 @@ constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * } // namespace -std::pair can_use_shmem_aggs(int grid_size) noexcept +size_t available_shared_memory_size(int grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); size_t dynamic_shmem_size = 0; - - auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); - auto const success = status == cudaSuccess; - if (!success) { cudaGetLastError(); } - - return {success, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } void compute_aggregations(int grid_size, @@ -251,9 +247,9 @@ void compute_aggregations(int grid_size, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, - size_t shmem_size, rmm::cuda_stream_view stream) { + auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto const shmem_agg_pointer_size = diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index f01d9f24c66..d0e8e354d12 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -21,14 +21,8 @@ #include -#include - -#include - namespace cudf::groupby::detail::hash { -std::pair can_use_shmem_aggs(int grid_size) noexcept; - void compute_aggregations(int grid_size, cudf::size_type num_input_rows, bitmask_type 
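Patch 072 is a behavioral fix, not a rename: the earlier predicate scanned the groupby keys for dictionary columns, but the shared-memory fallback is decided by what the kernel aggregates, which is the request values, not the keys. A reduced sketch of the corrected routing, with a stand-in Request type in place of cudf::groupby::aggregation_request:

#include <algorithm>
#include <vector>

struct Request { bool values_are_dictionary; };  // stand-in, not the real type

// Route on the request values; the keys never decide this fallback.
bool needs_global_memory_path(std::vector<Request> const& requests)
{
  return std::any_of(requests.begin(), requests.end(),
                     [](Request const& r) { return r.values_are_dictionary; });
}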
const* row_bitmask, @@ -39,7 +33,6 @@ void compute_aggregations(int grid_size, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, - size_t shmem_size, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 37cfd4c0f4f..73a69f81200 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -298,9 +298,6 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto const [valid, shmem_size] = can_use_shmem_aggs(grid_size); - CUDF_EXPECTS(valid, "this must be usable"); - compute_aggregations(grid_size, num_input_rows, static_cast(row_bitmask.data()), @@ -311,7 +308,6 @@ rmm::device_uvector compute_single_pass_aggs( *d_values, *d_sparse_table, d_agg_kinds.data(), - shmem_size, stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; From c1a421f6bafcdcc0e54c82939850ee165b84923f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:07:55 -0700 Subject: [PATCH 074/135] Remove groupby multi-aggs cpp tests --- cpp/tests/CMakeLists.txt | 1 - cpp/tests/groupby/multi_aggs_tests.cpp | 115 ------------------------- 2 files changed, 116 deletions(-) delete mode 100644 cpp/tests/groupby/multi_aggs_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c3672999eba..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -137,7 +137,6 @@ ConfigureTest( groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp - groupby/multi_aggs_tests.cpp groupby/nth_element_tests.cpp groupby/nunique_tests.cpp groupby/product_scan_tests.cpp diff --git a/cpp/tests/groupby/multi_aggs_tests.cpp b/cpp/tests/groupby/multi_aggs_tests.cpp deleted file mode 100644 index ae491a8f796..00000000000 --- a/cpp/tests/groupby/multi_aggs_tests.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
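Patch 073 replaces the try-and-report can_use_shmem_aggs pair with available_shared_memory_size, which asks the CUDA occupancy API how much dynamic shared memory each block may use at the kernel's achieved residency, then keeps half of the reported budget rounded down to an 8-byte multiple. A standalone sketch of that sizing, with placeholder kernel and block-size arguments and the CUDF_CUDA_TRY error handling omitted:

#include <cuda_runtime.h>
#include <cstddef>

std::size_t usable_dynamic_shmem(void const* kernel, int blocks_per_sm, int block_size)
{
  std::size_t dynamic_shmem = 0;
  // How much dynamic shared memory can each block take while keeping
  // `blocks_per_sm` blocks of `block_size` threads resident on one SM?
  cudaOccupancyAvailableDynamicSMemPerBlock(&dynamic_shmem, kernel, blocks_per_sm, block_size);
  // Keep half the budget (as the patch does) and round down to an 8-byte
  // multiple so the per-column byte arrays stay aligned for wide aggregates.
  return (dynamic_shmem / 2) / 8 * 8;
}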
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -using namespace cudf::test::iterators; - -namespace { -template -std::unique_ptr create_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows, - bool include_validity, - Elements elements) -{ - auto valids = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector> src_cols(num_columns); - for (int idx = 0; idx < num_columns; idx++) { - if (include_validity) { - src_cols[idx] = - cudf::test::fixed_width_column_wrapper(elements, elements + num_rows, valids); - } else { - src_cols[idx] = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - } - } - std::vector> columns(num_columns); - std::transform(src_cols.begin(), - src_cols.end(), - columns.begin(), - [](cudf::test::fixed_width_column_wrapper& in) { - auto ret = in.release(); - // pre-cache the null count - [[maybe_unused]] auto const nulls = ret->has_nulls(); - return ret; - }); - return std::make_unique(std::move(columns)); -} - -template -std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows) -{ - auto rand_elements = - cudf::detail::make_counting_transform_iterator(0, [](T i) { return rand(); }); - return create_fixed_table(num_columns, num_rows, false, rand_elements); -} -} // namespace - -template -struct groupby_multi_aggs_test : public cudf::test::BaseFixture {}; - -template -std::vector convert(std::initializer_list in) -{ - std::vector out(std::cbegin(in), std::cend(in)); - return out; -} - -using supported_types = cudf::test::Concat>; -TYPED_TEST_SUITE(groupby_multi_aggs_test, supported_types); -using K = int32_t; - -TYPED_TEST(groupby_multi_aggs_test, basic) -{ - using V = TypeParam; - - auto constexpr num_cols = 3'000; - auto constexpr num_rows = 100'000; - auto keys = create_random_fixed_table(1, num_rows); - - auto vals = create_random_fixed_table(num_cols, num_rows); - - std::vector requests; - for (auto i = 0; i < num_cols; i++) { - requests.emplace_back(); - - requests[i].values = vals->get_column(i).view(); - requests[i].aggregations.push_back( - std::move(cudf::make_mean_aggregation())); - requests[i].aggregations.push_back( - std::move(cudf::make_min_aggregation())); - requests[i].aggregations.push_back( - std::move(cudf::make_max_aggregation())); - requests[i].aggregations.push_back( - std::move(cudf::make_count_aggregation())); - } - - cudf::groupby::groupby gb_obj{keys->view()}; - - auto result = gb_obj.aggregate(requests, cudf::test::get_default_stream()); -} From 7a7ad6140c40cd9324006cfeba969436ff455470 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:39:59 -0700 Subject: [PATCH 075/135] Renaming for clarity --- cpp/CMakeLists.txt | 2 +- .../groupby/hash/compute_single_pass_aggs.cuh | 24 +-- ...s.cu => compute_single_pass_shmem_aggs.cu} | 154 +++++++++--------- ...hpp => compute_single_pass_shmem_aggs.hpp} | 22 +-- 4 files changed, 101 insertions(+), 101 deletions(-) rename cpp/src/groupby/hash/{compute_aggregations.cu => compute_single_pass_shmem_aggs.cu} (60%) rename cpp/src/groupby/hash/{compute_aggregations.hpp => compute_single_pass_shmem_aggs.hpp} (52%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5c4fd5979dc..ea476f96af4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,10 +315,10 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu - src/groupby/hash/compute_aggregations.cu 
src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu + src/groupby/hash/compute_single_pass_shmem_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 73a69f81200..86133605d44 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -15,8 +15,8 @@ */ #pragma once -#include "compute_aggregations.hpp" #include "compute_single_pass_aggs.hpp" +#include "compute_single_pass_shmem_aggs.hpp" #include "create_sparse_results_table.hpp" #include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" @@ -298,17 +298,17 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); + compute_single_pass_shmem_aggs(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu similarity index 60% rename from cpp/src/groupby/hash/compute_aggregations.cu rename to cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 915ede5154b..2f41b6b23d5 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "compute_aggregations.hpp" +#include "compute_single_pass_shmem_aggs.hpp" #include "create_sparse_results_table.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" @@ -64,13 +64,13 @@ __device__ void calculate_columns_to_aggregate(int& col_start, } } -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* d_agg_kinds) +__device__ void initialize_shared_memory_aggs(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { @@ -84,16 +84,16 @@ __device__ void initialize_shared_memory_aggregates(int col_start, } } -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) +__device__ void compute_pre_aggregrations(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) { // TODO grid_1d utility for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; @@ -117,15 +117,15 @@ __device__ void compute_pre_aggregrates(int col_start, } } -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) +__device__ void compute_final_aggregations(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; @@ -147,17 +147,17 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) +CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type 
num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -190,34 +190,34 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, cardinality, total_agg_size); block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - d_agg_kinds); + initialize_shared_memory_aggs(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + d_agg_kinds); block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); + compute_pre_aggregrations(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); + compute_final_aggregations(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); block.sync(); } } @@ -233,21 +233,21 @@ size_t available_shared_memory_size(int grid_size) size_t dynamic_shmem_size = 0; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream) +void compute_single_pass_shmem_aggs(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream) { auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need two pointers to arrays in shmem @@ -256,7 +256,7 @@ void compute_aggregations(int grid_size, round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( + single_pass_shmem_aggs_kernel<<>>( num_input_rows, 
row_bitmask, skip_rows_with_nulls, diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp similarity index 52% rename from cpp/src/groupby/hash/compute_aggregations.hpp rename to cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp index d0e8e354d12..033cfa39a8c 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp @@ -23,16 +23,16 @@ namespace cudf::groupby::detail::hash { -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream); +void compute_single_pass_shmem_aggs(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash From 9a7d432e3267bb351037a93c293a946909cc2569 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:44:03 -0700 Subject: [PATCH 076/135] Renaming --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 2f41b6b23d5..9f03b5af08b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,7 +64,7 @@ __device__ void calculate_columns_to_aggregate(int& col_start, } } -__device__ void initialize_shared_memory_aggs(int col_start, +__device__ void initialize_shmem_aggregations(int col_start, int col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, @@ -190,7 +190,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, cardinality, total_agg_size); block.sync(); - initialize_shared_memory_aggs(col_start, + initialize_shmem_aggregations(col_start, col_end, output_values, s_aggregates_pointer, From c81cbdd6b3ce332827bc7c75a9940fcfbde3e8fb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:16:59 -0700 Subject: [PATCH 077/135] Add rollback for insufficient shared memory case --- .../groupby/hash/compute_single_pass_aggs.cuh | 16 +++++++++++----- .../hash/compute_single_pass_shmem_aggs.cu | 11 +++++++---- .../hash/compute_single_pass_shmem_aggs.hpp | 4 ++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 86133605d44..2340595afc8 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -225,12 +225,21 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - if (has_dictionary_request) { + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + auto const has_sufficient_shmem = 
available_shared_memory_size(grid_size) > + (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); + auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; + + // Use naive global memory aggregations when there are dictionary columns to aggregate or when + // there is insufficient dynamic shared memory for shared memory aggregations + if (uses_global_aggs) { // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - has_dictionary_request, + uses_global_aggs, global_set, populated_keys, stream); @@ -262,9 +271,6 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 9f03b5af08b..70fd2dbc36f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -237,6 +237,8 @@ size_t available_shared_memory_size(int grid_size) return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } +size_t shmem_agg_pointer_size(int num_cols) { return sizeof(void*) * num_cols; } + void compute_single_pass_shmem_aggs(int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream) @@ -252,10 +254,11 @@ void compute_single_pass_shmem_aggs(int grid_size, auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + auto const shmem_pointer_size = shmem_agg_pointer_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + CUDF_EXPECTS(shmem_size > shmem_pointer_size * 2, + "Not enough space for shared memory aggregations"); + auto const shmem_agg_size = shmem_size - shmem_pointer_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -267,6 +270,6 @@ void compute_single_pass_shmem_aggs(int grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_agg_pointer_size); + shmem_pointer_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp index 033cfa39a8c..c871752e7e3 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp @@ -23,6 +23,10 @@ namespace cudf::groupby::detail::hash { +size_t available_shared_memory_size(int grid_size); + +size_t shmem_agg_pointer_size(int num_cols); + void compute_single_pass_shmem_aggs(int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash From ed3e92bcd6a187da4c2d926128f09c0ef6ba3615 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:21:47 -0700 Subject: [PATCH 078/135] Minor cleanups --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-)
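[Editor's note: a minimal, self-contained sketch of the fallback rule introduced above; the helper name and the sizes in the asserts are illustrative assumptions, not cudf API. Only the shape of the test mirrors the diff: two per-column pointer arrays must fit in the available dynamic shared memory, and dictionary requests always take the global-memory path.]

#include <cstddef>

// Hypothetical stand-in for the has_sufficient_shmem / uses_global_aggs logic above.
constexpr bool can_use_shmem_path(std::size_t available_shmem_bytes,
                                  int num_output_cols,
                                  bool has_dictionary_request)
{
  // Two pointer arrays are staged per launch: one addressing each column's
  // aggregate values in shared memory, one addressing its validity flags.
  auto const pointer_bytes = sizeof(void*) * static_cast<std::size_t>(num_output_cols);
  return not has_dictionary_request and available_shmem_bytes > 2 * pointer_bytes;
}

// Assuming 8-byte device pointers:
static_assert(can_use_shmem_path(48 * 1024, 8, false));     // ample shared memory
static_assert(not can_use_shmem_path(64, 8, false));        // 2 * 8 * 8 = 128 > 64 bytes
static_assert(not can_use_shmem_path(48 * 1024, 8, true));  // dictionaries always fall back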
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 2340595afc8..e3c6df48638 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,11 +210,6 @@ rmm::device_uvector compute_single_pass_aggs( ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; - auto const has_dictionary_request = std::any_of( - requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { - return cudf::is_dictionary(request.values.type()); - }); - // 'populated_keys' contains inserted row_indices (keys) of global hash set rmm::device_uvector populated_keys(keys.num_rows(), stream); @@ -230,9 +225,13 @@ rmm::device_uvector compute_single_pass_aggs( num_input_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); - auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); + }); - // Use naive global memory aggregations when there are dictionary columns to aggregate or when + // Use naive global memory aggregations when there are dictionary columns to aggregate or // there is insufficient dynamic shared memory for shared memory aggregations if (uses_global_aggs) { From 7c1aa4a8bf37faa7c022d4f02c79e80e82c60b09 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:30:29 -0700 Subject: [PATCH 079/135] Minor fix --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index e3c6df48638..76c8d77ff84 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -225,11 +225,11 @@ rmm::device_uvector compute_single_pass_aggs( num_input_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); - auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; auto const has_dictionary_request = std::any_of( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); }); + auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; // Use naive global memory aggregations when there are dictionary columns to aggregate or // there is insufficient dynamic shared memory for shared memory aggregations if (uses_global_aggs) { From e9766786cebcb91fecfc087d90404bd378657a34 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:46:19 -0700 Subject: [PATCH 080/135] Revert custom cuco --- rapids_config.cmake | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rapids_config.cmake b/rapids_config.cmake index 96df5adedac..3a88769f6e7 100644 --- a/rapids_config.cmake +++ b/rapids_config.cmake @@ -11,10 +11,6 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= - -set(rapids-cmake-repo PointKernel/rapids-cmake) -set(rapids-cmake-branch cuco-hash-function) - file(READ "${CMAKE_CURRENT_LIST_DIR}/VERSION" _rapids_version) if(_rapids_version MATCHES [[^([0-9][0-9])\.([0-9][0-9])\.([0-9][0-9])]]) set(RAPIDS_VERSION_MAJOR "${CMAKE_MATCH_1}") From e028fa5d74cf0fad01af24efaf7a9b37ef477f07 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 15:55:29 -0700 Subject: [PATCH 081/135] Set proper ref type on host --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 76c8d77ff84..5210e6db67b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -123,8 +123,6 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, auto const block = cooperative_groups::this_thread_block(); shared_set.initialize(block); - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - __shared__ cudf::size_type cardinality; if (block.thread_rank() == 0) { cardinality = 0; } block.sync(); @@ -136,7 +134,7 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cur_idx += stride) { find_local_mapping(cur_idx, num_input_rows, - shared_insert_ref, + shared_set, row_bitmask, skip_rows_with_nulls, &cardinality, @@ -200,7 +198,7 @@ rmm::device_uvector compute_single_pass_aggs( probing_scheme_t, cuco::cuda_allocator, cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type<>; + using shared_set_ref_type = typename shared_set_type::ref_type; auto constexpr window_extent = cuco::make_window_extent(extent_type{}); auto const num_input_rows = keys.num_rows(); From 5ea276cf6488e2c287db410ff5f4884d269fd055 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 15:46:28 -0700 Subject: [PATCH 082/135] Clean up mapping indices calculations --- .../groupby/hash/compute_single_pass_aggs.cuh | 73 +++++++++---------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 5210e6db67b..9a697bfb4e8 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -45,8 +45,8 @@ namespace cudf::groupby::detail::hash { namespace { template -// TODO pass block -__device__ void find_local_mapping(cudf::size_type cur_idx, +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, cudf::size_type num_input_rows, SetType shared_set, bitmask_type const* row_bitmask, @@ -55,48 +55,50 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { - cudf::size_type result_idx; - // TODO: un-init - bool inserted; - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - auto const result = shared_set.insert_and_find(cur_idx); + cudf::size_type result_idx{}; + bool inserted{}; + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + auto const result = shared_set.insert_and_find(idx); result_idx = *result.first; inserted = result.second; // inserted a new element if (result.second) { auto const shared_set_index = 
atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; } } // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. - __syncthreads(); - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + block.sync(); + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } } } template -__device__ void find_global_mapping(cudf::size_type cur_idx, +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, SetType global_set, cudf::size_type* shared_set_indices, cudf::size_type* global_mapping_index) { - auto const input_idx = shared_set_indices[cur_idx]; - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = - *global_set.insert_and_find(input_idx).first; + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + // for a unique key in shared memory hash set, `global_mapping_index` stores + // its match in global hash set + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } } /* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
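+ *
+ * Worked illustration (editor's example with made-up indices): if row 42 is the 8th
+ * distinct key found by block 3, then local_mapping_index[42] == 7, and after the
+ * block-to-global insert the row's output slot is
+ * global_mapping_index[3 * GROUPBY_SHM_MAX_ELEMENTS + 7], the same composition the
+ * final aggregation step uses to scatter block-local results into the sparse table.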
*/ template CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, @@ -129,10 +131,11 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, auto const stride = cudf::detail::grid_1d::grid_stride(); - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, num_input_rows, shared_set, row_bitmask, @@ -147,16 +150,12 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, if (block.thread_rank() == 0) { *direct_aggregations = true; } break; } - - block.sync(); } - // Insert unique keys from shared to global hash set + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); } if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } @@ -268,11 +267,9 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table + // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table + // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); From d32b1e7d9159d1d6bfb8ed1d87b16adfba91a278 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 16:24:06 -0700 Subject: [PATCH 083/135] Minor cleanups for find_global_mapping --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 9a697bfb4e8..411bc0a1b1e 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -77,17 +77,17 @@ __device__ void find_local_mapping(cooperative_groups::thread_block const& block } } -template +template __device__ void find_global_mapping(cooperative_groups::thread_block const& block, cudf::size_type cardinality, - SetType global_set, + GlobalSetT global_set, cudf::size_type* shared_set_indices, cudf::size_type* global_mapping_index) { + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { auto const input_idx = shared_set_indices[idx]; - // for a unique key in shared memory hash set, `global_mapping_index` stores - // its match in global hash set global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = 
*global_set.insert_and_find(input_idx).first; } From 32655cf694675141ba526990844432e52f4f8fff Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 16:43:31 -0700 Subject: [PATCH 084/135] Use size_type instead of int --- .../hash/compute_single_pass_shmem_aggs.cu | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 70fd2dbc36f..5bdde0ff832 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -36,36 +36,34 @@ namespace cudf::groupby::detail::hash { namespace { -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, +__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, + cudf::size_type& col_end, cudf::mutable_table_device_view output_values, - int num_input_cols, + cudf::size_type num_input_cols, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, std::byte* shared_set_aggregates, cudf::size_type cardinality, - int total_agg_size) + cudf::size_type total_agg_size) { - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } + col_start = col_end; + cudf::size_type bytes_allocated = 0; + cudf::size_type valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + cudf::size_type next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + cudf::size_type next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; } } -__device__ void initialize_shmem_aggregations(int col_start, - int col_end, +__device__ void initialize_shmem_aggregations(cudf::size_type col_start, + cudf::size_type col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, @@ -84,8 +82,8 @@ __device__ void initialize_shmem_aggregations(int col_start, } } -__device__ void compute_pre_aggregrations(int col_start, - int col_end, +__device__ void compute_pre_aggregrations(cudf::size_type col_start, + cudf::size_type col_end, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, cudf::table_device_view input_values, @@ -117,8 +115,8 @@ __device__ void compute_pre_aggregrations(int col_start, } } -__device__ void compute_final_aggregations(int col_start, - int col_end, +__device__ void compute_final_aggregations(cudf::size_type col_start, + cudf::size_type col_end, 
cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::size_type cardinality, @@ -156,8 +154,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) + cudf::size_type total_agg_size, + cudf::size_type pointer_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -165,8 +163,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, auto const num_cols = output_values.num_columns(); - __shared__ int col_start; - __shared__ int col_end; + __shared__ cudf::size_type col_start; + __shared__ cudf::size_type col_end; extern __shared__ std::byte shared_set_aggregates[]; std::byte** s_aggregates_pointer = reinterpret_cast(shared_set_aggregates + total_agg_size); @@ -180,15 +178,17 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, block.sync(); while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); + if (block.thread_rank() == 0) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + } block.sync(); initialize_shmem_aggregations(col_start, col_end, @@ -226,7 +226,7 @@ constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * } // namespace -size_t available_shared_memory_size(int grid_size) +size_t available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -237,9 +237,9 @@ size_t available_shared_memory_size(int grid_size) return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -size_t shmem_agg_pointer_size(int num_cols) { return sizeof(void*) * num_cols; } +size_t shmem_agg_pointer_size(cudf::size_type num_cols) { return sizeof(void*) * num_cols; } -void compute_single_pass_shmem_aggs(int grid_size, +void compute_single_pass_shmem_aggs(cudf::size_type grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, From 2548871749a63c5f7f76a0783d29e4edda0a4513 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 17:06:24 -0700 Subject: [PATCH 085/135] Renaming + spacing for clarity --- .../hash/compute_single_pass_shmem_aggs.cu | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 5bdde0ff832..1406303a8f4 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -39,7 +39,7 @@ namespace { __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, - cudf::size_type num_input_cols, + cudf::size_type output_size, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, std::byte* shared_set_aggregates, @@ -48,17 +48,22 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, { col_start = col_end; 
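// Worked example (editor's illustration, hypothetical sizes): with cardinality == 128
// and an aggregation value packed as 8 bytes per group, valid_col_size below rounds
// sizeof(bool) * 128 up to 128 and next_col_size rounds 8 * 128 up to 1024, so the
// column is packed only if its 1152-byte footprint still fits in total_agg_size.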
cudf::size_type bytes_allocated = 0; - cudf::size_type valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - cudf::size_type next_col_size = + + auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + + while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const next_col_size = round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - cudf::size_type next_col_total_size = valid_col_size + next_col_size; + auto const next_col_total_size = next_col_size + valid_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; s_aggregates_valid_pointer[col_end] = reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; - col_end++; + ++col_end; } } From 1b09ec14061c66880155719abd7890b495a3bc44 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 17:29:23 -0700 Subject: [PATCH 086/135] Clean up shared memory agg init --- .../groupby/hash/compute_single_pass_shmem_aggs.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 1406303a8f4..0d271115adc 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -67,7 +67,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, } } -__device__ void initialize_shmem_aggregations(cudf::size_type col_start, +__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, cudf::size_type col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, @@ -76,7 +77,7 @@ __device__ void initialize_shmem_aggregations(cudf::size_type col_start, cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), d_agg_kinds[col_idx], initialize_shmem{}, @@ -85,6 +86,7 @@ __device__ void initialize_shmem_aggregations(cudf::size_type col_start, s_aggregates_valid_pointer[col_idx]); } } + block.sync(); } __device__ void compute_pre_aggregrations(cudf::size_type col_start, @@ -195,14 +197,16 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, total_agg_size); } block.sync(); - initialize_shmem_aggregations(col_start, + + initialize_shmem_aggregations(block, + col_start, col_end, output_values, s_aggregates_pointer, s_aggregates_valid_pointer, cardinality, d_agg_kinds); - block.sync(); + compute_pre_aggregrations(col_start, col_end, row_bitmask, From 56d75fbd14bb82b3f2f6265b0dfac09f2398eaa9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 11:05:01 -0700 Subject: [PATCH 087/135] Move compute_mapping_indices to its own TU to reduce build time --- cpp/CMakeLists.txt | 2 + .../groupby/hash/compute_mapping_indices.cu | 35 ++++ .../groupby/hash/compute_mapping_indices.cuh | 188 ++++++++++++++++++ .../groupby/hash/compute_mapping_indices.hpp | 42 ++++ .../hash/compute_mapping_indices_null.cu | 35 ++++ 
.../groupby/hash/compute_single_pass_aggs.cuh | 177 ++--------------- .../hash/hash_compound_agg_finalizer.cu | 4 +- cpp/src/groupby/hash/helpers.cuh | 16 +- .../groupby/hash/sparse_to_dense_results.cu | 29 ++- 9 files changed, 344 insertions(+), 184 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.cu create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.cuh create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.hpp create mode 100644 cpp/src/groupby/hash/compute_mapping_indices_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ea476f96af4..e4f44a85947 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -316,6 +316,8 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_mapping_indices.cu + src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/compute_single_pass_shmem_aggs.cu diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu new file mode 100644 index 00000000000..1cbe70d651f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type max_occupancy_grid_size>( + cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh new file mode 100644 index 00000000000..91e7c83a2a2 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_mapping_indices.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx{}; + bool inserted{}; + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + auto const result = shared_set.insert_and_find(idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + block.sync(); + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + // element was already in set + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, + SetRef global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } +} + +/* + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
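+ *
+ * Illustration (editor's note, using the constants from helpers.cuh): with
+ * GROUPBY_CARDINALITY_THRESHOLD == 128, a block whose distinct-key count reaches the
+ * threshold raises *direct_aggregations and stops filling its shared set; its rows are
+ * then handled by the global-memory aggregation pass, while blocks that stay under the
+ * threshold keep the shared-memory fast path.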
+ */ +template +CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + // TODO: indices inserted in each shared memory set + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ cuco::window windows[window_extent.value()]; + + auto raw_set = cuco::static_set_ref{ + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + cuco::thread_scope_block, + cuco::aow_storage_ref{ + window_extent, windows}}; + auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); + + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, + num_input_rows, + shared_set, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + } + + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +int max_occupancy_grid_size(cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream) +{ + mapping_indices_kernel<<>>(num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + direct_aggregations); + stream.synchronize(); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp new file mode 100644 index 00000000000..d2cf3450730 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/* + * @brief Computes the maximum number of active blocks of the given kernel that can be executed on + * the underlying device + */ +template +[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu new file mode 100644 index 00000000000..1b04016f9a1 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type +max_occupancy_grid_size>(cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + nullable_hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 411bc0a1b1e..974b973b1fa 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "compute_mapping_indices.hpp" #include "compute_single_pass_aggs.hpp" #include "compute_single_pass_shmem_aggs.hpp" #include "create_sparse_results_table.hpp" @@ -22,12 +23,8 @@ #include "helpers.cuh" #include "single_pass_functors.cuh" -#include #include -#include #include -#include -#include #include #include #include @@ -40,139 +37,10 @@ #include #include -#include +#include +#include namespace cudf::groupby::detail::hash { -namespace { -template -__device__ void find_local_mapping(cooperative_groups::thread_block const& block, - cudf::size_type idx, - cudf::size_type num_input_rows, - SetType shared_set, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* cardinality, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx{}; - bool inserted{}; - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { - auto const result = shared_set.insert_and_find(idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = idx; - local_mapping_index[idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. - block.sync(); - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { - // element was already in set - if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cooperative_groups::thread_block const& block, - cudf::size_type cardinality, - GlobalSetT global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - // for all unique keys in shared memory hash set, stores their matches in - // global hash set to `global_mapping_index` - for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - auto const input_idx = shared_set_indices[idx]; - global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = - *global_set.insert_and_find(input_idx).first; - } -} - -/* - * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given - * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds - * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating - * `global_set` or setting `global_mapping_index`. 
Else, we insert the unique keys found to the - * global hash set, and save the row index of the global sparse table in `global_mapping_index`. - */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - // TODO: indices inserted in each shared memory set - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_t{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - - __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto idx = cudf::detail::grid_1d::global_thread_id(); - idx - block.thread_rank() < num_input_rows; - idx += stride) { - find_local_mapping(block, - idx, - num_input_rows, - shared_set, - row_bitmask, - skip_rows_with_nulls, - &cardinality, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - } - - // Insert unique keys from shared to global hash set if block-cardinality - // doesn't exceed the threshold upper-limit - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} -} // namespace - /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` @@ -186,20 +54,6 @@ rmm::device_uvector compute_single_pass_aggs( bool skip_rows_with_nulls, rmm::cuda_stream_view stream) { - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - auto const num_input_rows = keys.num_rows(); auto row_bitmask = @@ -217,9 +71,7 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); + auto const grid_size = max_occupancy_grid_size(num_input_rows); auto const 
has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( @@ -274,17 +126,16 @@ rmm::device_uvector compute_single_pass_aggs( stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); + compute_mapping_indices(grid_size, + num_input_rows, + global_set_ref, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data(), + stream); // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 119ac8cf6fd..221e63ac121 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -194,7 +194,7 @@ void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation c dense_results->add_result(col, agg, std::move(result)); } -template class hash_compound_agg_finalizer; -template class hash_compound_agg_finalizer; +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 651a6a2014a..f00996b6127 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -45,6 +45,16 @@ CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + /** * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. 
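 * For example, round_to_multiple_of_8(13) returns 16 and round_to_multiple_of_8(16)
 * returns 16, which keeps each per-column shared-memory slice 8-byte aligned.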
*/ @@ -87,20 +97,22 @@ using nullable_global_set_t = cuco::static_set, cuco::storage>; +template using hash_set_ref_t = cuco::static_set_ref< cudf::size_type, cuda::thread_scope_device, row_comparator_t, probing_scheme_t, cuco::aow_storage_ref>, - cuco::op::find_tag>; + Op>; +template using nullable_hash_set_ref_t = cuco::static_set_ref< cudf::size_type, cuda::thread_scope_device, nullable_row_comparator_t, probing_scheme_t, cuco::aow_storage_ref>, - cuco::op::find_tag>; + Op>; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index af61173fb6a..36dc306879e 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -28,12 +28,6 @@ #include namespace cudf::groupby::detail::hash { -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ template void sparse_to_dense_results(table_view const& keys, host_span requests, @@ -64,23 +58,24 @@ void sparse_to_dense_results(table_view const& keys, } } -template void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +template void sparse_to_dense_results>( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -template void sparse_to_dense_results( +template void sparse_to_dense_results>( table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - nullable_hash_set_ref_t set, + nullable_hash_set_ref_t set, bool skip_key_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); From ab5ef604bb7245624dae30830169b8ea96a59b56 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 12:26:25 -0700 Subject: [PATCH 088/135] Clean up the shared memory init function --- .../hash/compute_single_pass_shmem_aggs.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 0d271115adc..9e3c62f46ac 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -15,7 +15,6 @@ */ #include "compute_single_pass_shmem_aggs.hpp" -#include "create_sparse_results_table.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" #include "shared_memory_aggregator.cuh" @@ -36,6 +35,9 @@ namespace cudf::groupby::detail::hash { namespace { +// Prepares shared memory data required by each output column, exits if +// no enough memory space to perform the shared memory aggregation for the +// current output column __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, @@ -67,6 +69,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, } } +// Each block initialize its own shared memory 
__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, cudf::size_type col_start, cudf::size_type col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, cudf::size_type cardinality, cudf::aggregation::Kind const* d_agg_kinds) @@ -100,14 +103,13 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, bool** s_aggregates_valid_pointer, cudf::aggregation::Kind const* d_agg_kinds) { - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; + for (auto idx = cudf::detail::grid_1d::global_thread_id(); idx < num_input_rows; + idx += cudf::detail::grid_1d::grid_stride()) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)) { + auto const map_idx = local_mapping_index[idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); + auto const input_col = input_values.column(col_idx); cudf::detail::dispatch_type_and_aggregation(input_col.type(), d_agg_kinds[col_idx], @@ -116,7 +118,7 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, map_idx, s_aggregates_valid_pointer[col_idx], input_col, - cur_idx); + idx); } } } @@ -218,6 +220,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, s_aggregates_valid_pointer, d_agg_kinds); block.sync(); + compute_final_aggregations(col_start, col_end, input_values, From 5bfe6ea4781ea188ca380172039f49444a76e3b1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 12:35:21 -0700 Subject: [PATCH 089/135] Add reminder --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 9e3c62f46ac..3c0a00c4798 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -58,6 +58,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); auto const next_col_total_size = next_col_size + valid_col_size;
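[Editorial aside] PATCH 088 above replaces hand-rolled global indexing with cudf's `grid_1d` helpers. For readers unfamiliar with the idiom, the self-contained kernel below shows the raw CUDA equivalent of a grid-stride loop; it is an illustration, not part of the patch.

```cuda
#include <cstdint>

// Grid-stride loop: each thread starts at its global id and advances by the
// total number of threads in the grid, so a fixed-size grid covers any number
// of rows. This is the pattern grid_1d::global_thread_id()/grid_stride()
// encapsulate.
__global__ void scale_rows(float const* in, float* out, std::int64_t num_rows)
{
  auto const stride = static_cast<std::int64_t>(blockDim.x) * gridDim.x;
  for (auto idx = static_cast<std::int64_t>(blockDim.x) * blockIdx.x + threadIdx.x;
       idx < num_rows;
       idx += stride) {
    out[idx] = 2.0f * in[idx];  // placeholder per-row work
  }
}
```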
+ // TODO: it seems early exit will break the follow-up calculations. To verify if (bytes_allocated + next_col_total_size > total_agg_size) { break; } s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; @@ -107,10 +108,8 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, idx += cudf::detail::grid_1d::grid_stride()) { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)) { auto const map_idx = local_mapping_index[idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto const input_col = input_values.column(col_idx); - cudf::detail::dispatch_type_and_aggregation(input_col.type(), d_agg_kinds[col_idx], shmem_element_aggregator{}, From d597ea70edc403563f2fd3c5f7fa6bf1be55b1e7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 13:09:20 -0700 Subject: [PATCH 090/135] Remove unused header --- cpp/src/groupby/groupby.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include From 1e85f08551b0b5f184717f1e2180ff5ceb622098 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:18:40 -0700 Subject: [PATCH 091/135] Renaming + API cleanups --- cpp/src/groupby/hash/compute_groupby.cu | 26 +++++++--- cpp/src/groupby/hash/compute_groupby.hpp | 2 +- .../groupby/hash/compute_single_pass_aggs.cu | 5 +- .../groupby/hash/compute_single_pass_aggs.cuh | 51 ++++++++----------- .../groupby/hash/compute_single_pass_aggs.hpp | 7 +-- .../hash/compute_single_pass_aggs_null.cu | 5 +- .../groupby/hash/sparse_to_dense_results.cu | 8 +-- .../groupby/hash/sparse_to_dense_results.hpp | 2 +- 8 files changed, 56 insertions(+), 50 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 9021846f71e..377d0361bd1 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -65,21 +65,21 @@ template std::unique_ptr
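[Editorial aside] The compute_groupby changes below keep the same overall single-pass scheme: each input row index is inserted into a cuco::static_set keyed by row equality, and the row's values are aggregated into the sparse output slot of whichever row index represents its key. A minimal sketch of that insert-and-aggregate step, with `SetRef` and `aggregate_row` as illustrative placeholders rather than the patch's actual types:

```cuda
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>

// Sketch of the per-row body of the single-pass aggregation functor. SetRef
// stands in for a cuco::static_set device ref taken with
// cuco::op::insert_and_find; aggregate_row stands in for the dispatched
// per-column aggregation into the sparse results table.
template <typename SetRef, typename AggregateRowFn>
__device__ void insert_and_aggregate(SetRef set_ref,
                                     cudf::size_type row_idx,
                                     cudf::bitmask_type const* row_bitmask,
                                     bool skip_rows_with_nulls,
                                     AggregateRowFn aggregate_row)
{
  if (skip_rows_with_nulls and not cudf::bit_is_set(row_bitmask, row_idx)) { return; }
  // Insert this row's index; if an equal key row was inserted earlier,
  // `slot` points at that representative index instead.
  auto const [slot, inserted] = set_ref.insert_and_find(row_idx);
  aggregate_row(*slot /* target row in sparse table */, row_idx /* source row */);
}
```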
compute_groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, Equal const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); + auto const num_rows = static_cast(keys.num_rows()); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); auto set = cuco::static_set{ - cuco::extent{num_keys}, + cuco::extent{num_rows}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, @@ -89,9 +89,19 @@ std::unique_ptr
compute_groupby(table_view const& keys, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; + auto row_bitmask = + skip_rows_with_nulls + ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); + auto gather_map = compute_single_pass_aggs(num_rows, + static_cast(row_bitmask.data()), + requests, + &sparse_results, + set, + skip_rows_with_nulls, + stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, @@ -100,7 +110,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, cache, gather_map, set.ref(cuco::find), - skip_key_rows_with_nulls, + skip_rows_with_nulls, stream, mr); @@ -116,7 +126,7 @@ template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, @@ -126,7 +136,7 @@ template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, nullable_row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp index 358c81365a0..a11c1db4262 100644 --- a/cpp/src/groupby/hash/compute_groupby.hpp +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -59,7 +59,7 @@ template std::unique_ptr compute_groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, Equal const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index f8b0f65b92f..8ba78653957 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,8 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, global_set_t& global_set, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 974b973b1fa..fab5887d7b8 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include @@ -47,22 +47,16 @@ namespace cudf::groupby::detail::hash { */ template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, bool skip_rows_with_nulls, rmm::cuda_stream_view stream) { - auto const num_input_rows = keys.num_rows(); - - auto row_bitmask = - skip_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); + rmm::device_uvector populated_keys(num_rows, stream); // flatten the aggs to a table that can be operated on by aggregate_row auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); @@ -71,7 +65,7 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size(num_input_rows); + auto const grid_size = max_occupancy_grid_size(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( @@ -96,16 +90,15 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_rows, + hash::compute_single_pass_aggs_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + row_bitmask, + skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); // Add results back to sparse_results cache @@ -120,16 +113,16 @@ rmm::device_uvector compute_single_pass_aggs( } // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank - rmm::device_uvector local_mapping_index(num_input_rows, stream); + rmm::device_uvector local_mapping_index(num_rows, stream); // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); compute_mapping_indices(grid_size, - num_input_rows, + num_rows, global_set_ref, - static_cast(row_bitmask.data()), + row_bitmask, skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), @@ -150,8 +143,8 @@ rmm::device_uvector compute_single_pass_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); compute_single_pass_shmem_aggs(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), + num_rows, + row_bitmask, skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), @@ -164,14 +157,14 @@ rmm::device_uvector compute_single_pass_aggs( auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - keys.num_rows(), + num_rows, compute_direct_aggregates{global_set_ref, *d_values, *d_sparse_table, d_agg_kinds.data(), block_cardinality.data(), stride, - static_cast(row_bitmask.data()), + row_bitmask, skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 6cbea9fcd3c..a0d2452d39f 100644 --- 
a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #pragma once #include +#include #include -#include #include #include @@ -31,7 +31,8 @@ namespace cudf::groupby::detail::hash { */ template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu index b88f1a952d5..be7c667766c 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,8 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, nullable_global_set_t& global_set, diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index 36dc306879e..e960cc1f4e0 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -35,14 +35,14 @@ void sparse_to_dense_results(table_view const& keys, cudf::detail::result_cache* dense_results, device_span gather_map, SetType set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto row_bitmask = cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + skip_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; for (auto const& request : requests) { auto const& agg_v = request.aggregations; @@ -65,7 +65,7 @@ template void sparse_to_dense_results>( cudf::detail::result_cache* dense_results, device_span gather_map, hash_set_ref_t set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -76,7 +76,7 @@ template void sparse_to_dense_results>( cudf::detail::result_cache* dense_results, device_span gather_map, nullable_hash_set_ref_t set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp index bfdc42953ad..2c14cc1e7f6 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.hpp +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -38,7 +38,7 @@ void sparse_to_dense_results(table_view const& keys, cudf::detail::result_cache* dense_results, device_span gather_map, SetType set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash From 80f92752599c1dfa016b39951e1252d1add2494b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:28:09 -0700 Subject: [PATCH 092/135] Get rid of redundant bitmask calculation --- cpp/src/groupby/hash/compute_groupby.cu | 2 +- cpp/src/groupby/hash/sparse_to_dense_results.cu | 15 ++++----------- cpp/src/groupby/hash/sparse_to_dense_results.hpp | 4 ++-- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 377d0361bd1..95bf74c9e84 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -104,7 +104,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, stream); // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, + sparse_to_dense_results(static_cast(row_bitmask.data()), requests, &sparse_results, cache, diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index e960cc1f4e0..adba5dee8f5 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -29,7 +28,7 @@ namespace cudf::groupby::detail::hash { template -void sparse_to_dense_results(table_view const& keys, +void sparse_to_dense_results(bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, @@ -39,11 +38,6 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - for (auto const& request : requests) { auto const& agg_v = request.aggregations; auto const& col = request.values; @@ -51,7 +45,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -59,7 +53,7 @@ void sparse_to_dense_results(table_view const& keys, } template void sparse_to_dense_results>( - table_view const& keys, + bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, @@ -70,7 +64,7 @@ template void sparse_to_dense_results>( rmm::device_async_resource_ref mr); template void sparse_to_dense_results>( - table_view const& keys, + bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, @@ -79,5 +73,4 @@ template void sparse_to_dense_results>( bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp index 2c14cc1e7f6..31fc02e7a38 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.hpp +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -16,8 +16,8 @@ #pragma once #include +#include #include -#include #include #include @@ -32,7 +32,7 @@ namespace cudf::groupby::detail::hash { * @see groupby_null_templated() */ template -void sparse_to_dense_results(table_view const& keys, +void sparse_to_dense_results(bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, From 5baa2cf1a140710339ded54d741e2c4b42289195 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:31:47 -0700 Subject: [PATCH 093/135] Add missing header --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index fab5887d7b8..ca2256d78dd 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -33,9 +33,11 @@ #include #include +#include #include #include +#include #include #include From 53e0e00ca5edf16d9a2a4e51411a1bfc64a7bc81 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:44:59 -0700 Subject: [PATCH 094/135] Add missing header --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index ca2256d78dd..4e41429bd46 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -32,6 +32,7 @@ #include #include +#include #include #include From 57a450af4110f83194e3eed7e952093f4111968f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:49:46 -0700 Subject: [PATCH 095/135] Clean up headers --- cpp/src/groupby/hash/compute_groupby.cu | 3 ++- cpp/src/groupby/hash/compute_mapping_indices.cuh | 1 - cpp/src/groupby/hash/compute_mapping_indices.hpp | 1 - cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 1 - cpp/src/groupby/hash/compute_single_pass_aggs.hpp | 1 - cpp/src/groupby/hash/sparse_to_dense_results.cu | 1 - cpp/src/groupby/hash/sparse_to_dense_results.hpp | 1 - 7 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 95bf74c9e84..7565e8ecfbb 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -91,7 +92,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, auto row_bitmask = skip_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; // Compute all single pass aggs first diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 91e7c83a2a2..dd369a123ca 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -19,7 +19,6 @@ #include "helpers.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index d2cf3450730..d8047f9a5d8 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 4e41429bd46..fb199c28b21 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -24,7 +24,6 @@ #include "single_pass_functors.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index a0d2452d39f..7dda9d4c4be 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index adba5dee8f5..eb037e69937 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -18,7 +18,6 @@ #include "helpers.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp index 31fc02e7a38..0b8975d235c 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.hpp +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include From 5b92cd01167e462f83b04a58a0c04298581c99d3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 10:10:47 -0700 Subject: [PATCH 096/135] Minor cleanup on ref type determination --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index fb199c28b21..e55f095f764 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -67,7 +67,8 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size(num_rows); + auto const grid_size = + max_occupancy_grid_size>(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( From 98aa46829a1aaafebd10d7608b03c11ef7e0deab Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 10:57:54 -0700 Subject: [PATCH 097/135] Add device 
num_bitmask_words device utility --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 3c0a00c4798..5fa818df2ae 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -35,6 +35,16 @@ namespace cudf::groupby::detail::hash { namespace { +/// Computes number of *actual* bitmask_type elements needed +__device__ constexpr size_type num_bitmask_words(size_type number_of_bits) +{ + // TODO: This duplicates `cudf::num_bitmask_words`. Converting it into + // a public host-device utility will require non-trivial effort, so the + // cleanup will be addressed in a separate PR. + return cudf::util::div_rounding_up_safe(number_of_bits, + detail::size_in_bits()); +} + // Prepares shared memory data required by each output column, exits if // no enough memory space to perform the shared memory aggregation for the // current output column From 8be8d158928ffc85adad904885bc6ce7f424f724 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 10:59:14 -0700 Subject: [PATCH 098/135] Fix a minor bug determining column C++ type --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 5fa818df2ae..7f4d8030dd4 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,8 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); while (bytes_allocated < total_agg_size && col_end < output_size) { - auto const next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + auto const next_col_size = round_to_multiple_of_8( + sizeof(cudf::id_to_type(output_values.column(col_end).type().id())) * cardinality); auto const next_col_total_size = next_col_size + valid_col_size; // TODO: it seems early exit will break the followup calculatons. 
To verify From d3c465ba7dc18e0ea6f6876485fc849d5cd3c803 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 11:31:21 -0700 Subject: [PATCH 099/135] Bug fix: use type_dispatcher --- .../hash/compute_single_pass_shmem_aggs.cu | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 7f4d8030dd4..ed8e4d1d756 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -35,6 +36,16 @@ namespace cudf::groupby::detail::hash { namespace { +/// Functor used by type dispatcher returning the size of the underlying C++ type +struct size_of_functor { + template + __device__ constexpr cudf::size_type operator()() + { + return sizeof(T); + } +}; + +/* /// Computes number of *actual* bitmask_type elements needed __device__ constexpr size_type num_bitmask_words(size_type number_of_bits) { @@ -42,8 +53,9 @@ __device__ constexpr size_type num_bitmask_words(size_type number_of_bits) // a public host-device utility will require non-trivial effort, so the // cleanup will be addressed in a separate PR. return cudf::util::div_rounding_up_safe(number_of_bits, - detail::size_in_bits()); + cudf::detail::size_in_bits()); } +*/ // Prepares shared memory data required by each output column, exits if // no enough memory space to perform the shared memory aggregation for the @@ -64,8 +76,9 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const col_idx = col_end; auto const next_col_size = round_to_multiple_of_8( - sizeof(cudf::id_to_type(output_values.column(col_end).type().id())) * cardinality); + cudf::type_dispatcher(output_values.column(col_idx).type(), size_of_functor{}) * cardinality); auto const next_col_total_size = next_col_size + valid_col_size; // TODO: it seems early exit will break the followup calculatons. 
To verify From f109b814e1cb74fd340de7e0f930c3afcf79032b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 11:47:05 -0700 Subject: [PATCH 100/135] Pass block to compute_final_aggregations --- .../groupby/hash/compute_single_pass_shmem_aggs.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index ed8e4d1d756..7df3de93b34 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -146,7 +146,8 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, } } -__device__ void compute_final_aggregations(cudf::size_type col_start, +__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, cudf::size_type col_end, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, @@ -156,8 +157,8 @@ __device__ void compute_final_aggregations(cudf::size_type col_start, bool** s_aggregates_valid_pointer, cudf::aggregation::Kind const* d_agg_kinds) { - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto out_idx = global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto output_col = output_values.column(col_idx); @@ -168,10 +169,11 @@ __device__ void compute_final_aggregations(cudf::size_type col_start, out_idx, input_values.column(col_idx), s_aggregates_pointer[col_idx], - cur_idx, + idx, s_aggregates_valid_pointer[col_idx]); } } + block.sync(); } /* Takes the local_mapping_index and global_mapping_index to compute @@ -243,7 +245,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, s_aggregates_valid_pointer, d_agg_kinds); block.sync(); - compute_final_aggregations(col_start, + compute_final_aggregations(block, + col_start, col_end, input_values, output_values, @@ -252,7 +255,6 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, s_aggregates_pointer, s_aggregates_valid_pointer, d_agg_kinds); - block.sync(); } } From 280db67bab7a38c62f9bc8c934b4bed49f89cf57 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 13:44:13 -0700 Subject: [PATCH 101/135] Cleanup: use offsets instead of pointers to save memory space --- .../groupby/hash/compute_single_pass_aggs.cuh | 2 +- .../hash/compute_single_pass_shmem_aggs.cu | 130 ++++++++++-------- .../hash/compute_single_pass_shmem_aggs.hpp | 2 +- .../groupby/hash/global_memory_aggregator.cuh | 48 +++---- .../groupby/hash/shared_memory_aggregator.cuh | 38 ++--- cpp/src/groupby/hash/single_pass_functors.cuh | 30 ++-- 6 files changed, 129 insertions(+), 121 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index e55f095f764..94c7f4b59c7 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -70,7 +70,7 @@ rmm::device_uvector compute_single_pass_aggs( auto const grid_size = max_occupancy_grid_size>(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > - (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); +
(shmem_offsets_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 7df3de93b34..1a984d6f100 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,9 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, cudf::size_type output_size, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, + cudf::size_type* target_offsets, + cudf::size_type* target_mask_offsets, cudf::size_type cardinality, cudf::size_type total_agg_size) { @@ -84,9 +83,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, // TODO: it seems early exit will break the followup calculatons. To verify if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + target_offsets[col_end] = bytes_allocated; + target_mask_offsets[col_end] = bytes_allocated + next_col_size; bytes_allocated += next_col_total_size; ++col_end; @@ -98,19 +96,22 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c cudf::size_type col_start, cudf::size_type col_end, cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggs, + cudf::size_type* target_offsets, + cudf::size_type* target_mask_offsets, cudf::size_type cardinality, cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + std::byte* target = reinterpret_cast(shared_set_aggs + target_offsets[col_idx]); + bool* target_mask = reinterpret_cast(shared_set_aggs + target_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), d_agg_kinds[col_idx], initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); + target, + target_mask, + idx); } } block.sync(); @@ -120,27 +121,32 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, cudf::size_type col_end, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, - cudf::table_device_view input_values, + cudf::table_device_view source, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggs, + cudf::size_type* target_offsets, + cudf::size_type* target_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) { - for (auto idx = cudf::detail::grid_1d::global_thread_id(); idx < num_input_rows; - idx += cudf::detail::grid_1d::grid_stride()) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)) { - auto const map_idx = local_mapping_index[idx]; + for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows; + source_idx += cudf::detail::grid_1d::grid_stride()) { + if 
(not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) { + auto const target_idx = local_mapping_index[source_idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto const input_col = input_values.column(col_idx); - cudf::detail::dispatch_type_and_aggregation(input_col.type(), + auto const source_col = source.column(col_idx); + + std::byte* target = reinterpret_cast(shared_set_aggs + target_offsets[col_idx]); + bool* target_mask = reinterpret_cast(shared_set_aggs + target_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(source_col.type(), d_agg_kinds[col_idx], shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - idx); + target, + target_mask, + target_idx, + source_col, + source_idx); } } } @@ -150,27 +156,33 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons cudf::size_type col_start, cudf::size_type col_end, cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, + cudf::mutable_table_device_view target, cudf::size_type cardinality, cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggs, + cudf::size_type* agg_res_offsets, + cudf::size_type* agg_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) { + // Aggregate shared memory sources to global memory targets for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - auto out_idx = global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; + auto const target_idx = + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); + auto target_col = target.column(col_idx); + + std::byte* source = reinterpret_cast(shared_set_aggs + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shared_set_aggs + agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), d_agg_kinds[col_idx], gmem_element_aggregator{}, - output_col, - out_idx, + target_col, + target_idx, input_values.column(col_idx), - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); + source, + source_mask, + idx); } } block.sync(); @@ -188,7 +200,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, cudf::size_type total_agg_size, - cudf::size_type pointer_size) + cudf::size_type offsets_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -198,11 +210,12 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, __shared__ cudf::size_type col_start; __shared__ cudf::size_type col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + extern __shared__ std::byte shared_set_aggs[]; + + cudf::size_type* target_offsets = + reinterpret_cast(shared_set_aggs + total_agg_size); + cudf::size_type* target_mask_offsets = + reinterpret_cast(shared_set_aggs + total_agg_size + offsets_size); if (block.thread_rank() == 0) { 
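  // [Editorial note, not part of the original patch] The dynamic shared memory
  // window set up just above is laid out as three consecutive regions, sized
  // on the host as total_agg_size + 2 * offsets_size bytes:
  //
  //   |<------ total_agg_size ------>|<- offsets_size ->|<--- offsets_size --->|
  //   | per-column values and masks  |  target_offsets  |  target_mask_offsets |
  //
  // A column's value slice starts at shared_set_aggs + target_offsets[col_idx],
  // and its validity slice at shared_set_aggs + target_mask_offsets[col_idx].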
col_start = 0; @@ -216,9 +229,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_end, output_values, num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, + target_offsets, + target_mask_offsets, cardinality, total_agg_size); } @@ -228,8 +240,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_start, col_end, output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, + shared_set_aggs, + target_offsets, + target_mask_offsets, cardinality, d_agg_kinds); @@ -240,8 +253,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, input_values, num_rows, local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, + shared_set_aggs, + target_offsets, + target_mask_offsets, d_agg_kinds); block.sync(); @@ -252,8 +266,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, output_values, cardinality, global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, + shared_set_aggs, + target_offsets, + target_mask_offsets, d_agg_kinds); } } @@ -273,7 +288,7 @@ size_t available_shared_memory_size(cudf::size_type grid_size) return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -size_t shmem_agg_pointer_size(cudf::size_type num_cols) { return sizeof(void*) * num_cols; } +size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } void compute_single_pass_shmem_aggs(cudf::size_type grid_size, cudf::size_type num_input_rows, @@ -288,13 +303,12 @@ void compute_single_pass_shmem_aggs(cudf::size_type grid_size, rmm::cuda_stream_view stream) { auto const shmem_size = available_shared_memory_size(grid_size); - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_pointer_size = shmem_agg_pointer_size(output_values.num_columns()); + // For each aggregation, need one offset determining where the aggregation is + // performed, another indicating the validity of the aggregation + auto const offsets_size = shmem_offsets_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(shmem_size > shmem_pointer_size * 2, - "No enough space for shared memory aggregations"); - auto const shmem_agg_size = shmem_size - shmem_pointer_size * 2; + CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); + auto const shmem_agg_size = shmem_size - offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -306,6 +320,6 @@ void compute_single_pass_shmem_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_pointer_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp index c871752e7e3..73db4750a1f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp @@ -25,7 +25,7 @@ namespace cudf::groupby::detail::hash { size_t available_shared_memory_size(int grid_size); -size_t shmem_agg_pointer_size(int num_cols); +size_t shmem_offsets_size(int num_cols); void compute_single_pass_shmem_aggs(int grid_size, cudf::size_type num_input_rows, diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh 
b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 08d2c0552b3..89394790117 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -33,8 +33,8 @@ struct update_target_element_gmem { cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const + bool* source_mask, + cudf::size_type source_index) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -49,8 +49,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); @@ -70,8 +70,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); @@ -92,8 +92,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); @@ -114,8 +114,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -137,8 +137,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -162,8 +162,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -186,8 +186,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -209,8 +209,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -238,8 +238,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type 
source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -264,14 +264,14 @@ struct gmem_element_aggregator { cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { if constexpr (k != cudf::aggregation::COUNT_ALL) { - if (source_null[source_index]) { return; } + if (source_mask[source_index]) { return; } } update_target_element_gmem{}( - target, target_index, source_column, source, source_index, source_null); + target, target_index, source_column, source, source_mask, source_index); } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index c5713e4a72e..f4be32ed723 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -30,8 +30,8 @@ namespace cudf::groupby::detail::hash { template struct update_target_element_shmem { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const { @@ -45,8 +45,8 @@ struct update_target_element_shmem< cudf::aggregation::MIN, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -56,7 +56,7 @@ struct update_target_element_shmem< DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -66,8 +66,8 @@ struct update_target_element_shmem< cudf::aggregation::MAX, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -78,7 +78,7 @@ struct update_target_element_shmem< cudf::detail::atomic_max(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -89,8 +89,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -101,7 +101,7 @@ struct update_target_element_shmem< cudf::detail::atomic_add(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -111,8 +111,8 @@ struct update_target_element_shmem< cudf::aggregation::SUM_OF_SQUARES, cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type 
target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -121,7 +121,7 @@ struct update_target_element_shmem< auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target_casted[target_index], value * value); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -131,8 +131,8 @@ struct update_target_element_shmem< cudf::aggregation::PRODUCT, cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -141,7 +141,7 @@ struct update_target_element_shmem< cudf::detail::atomic_mul(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -152,8 +152,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -170,8 +170,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -190,8 +190,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -205,7 +205,7 @@ struct update_target_element_shmem< } } - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -216,8 +216,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -231,15 +231,15 @@ struct update_target_element_shmem< } } - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; struct shmem_element_aggregator { template __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -247,7 +247,7 @@ struct shmem_element_aggregator { if (source.is_null(source_index)) { return; } } update_target_element_shmem{}( - target, target_index, target_null, source, source_index); + target, target_mask, target_index, source, source_index); } }; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 6d10c8065ca..93b2bff8990 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh 
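[Editorial aside] Before the single_pass_functors.cuh hunk below: every shared-memory aggregator above follows the same two-step pattern, an atomic read-modify-write on the target slot followed by clearing a "still null" flag. A self-contained sketch of the MIN case; the names are illustrative, and `cuda::atomic_ref` stands in for the `cudf::detail::atomic_*` helpers the patch uses:

```cuda
#include <cuda/atomic>

// MIN-with-validity update: the target slot was initialized to the identity
// value (e.g. INT_MAX for MIN), so fetch_min folds the new value in
// atomically. The mask write is a benign race: every thread that contributed
// a value writes false, and all writers agree.
__device__ void atomic_min_with_mask(int* target, bool* target_mask,
                                     int target_index, int value)
{
  cuda::atomic_ref<int, cuda::thread_scope_block> slot{target[target_index]};
  slot.fetch_min(value, cuda::memory_order_relaxed);
  if (target_mask[target_index]) { target_mask[target_index] = false; }
}
```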
@@ -74,41 +74,35 @@ __device__ T get_identity() template struct initialize_target_element { __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null) const noexcept + bool* target_mask, + cudf::size_type idx) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } }; -// TODO: are the conditions correctly checked? template struct initialize_target_element()>> { __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null) const noexcept + bool* target_mask, + cudf::size_type idx) const noexcept { - using DeviceType = cudf::device_storage_type_t; - DeviceType* target_casted = reinterpret_cast(target); - target_casted[target_index] = get_identity(); + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); - if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) { - target_null[target_index] = false; - } else { - target_null[target_index] = true; - } + target_casted[idx] = get_identity(); + target_mask[idx] = !(k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID); } }; struct initialize_shmem { template + // TODO naming __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null) const noexcept + bool* target_mask, + cudf::size_type idx) const noexcept { - // TODO: typecasting work for every datatype - - initialize_target_element{}(target, target_index, target_null); + initialize_target_element{}(target, target_mask, idx); } }; From 8a0551e9c235906ad70438606fcd57a3686999e3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 14:10:32 -0700 Subject: [PATCH 102/135] Rename for clarity --- .../hash/compute_single_pass_shmem_aggs.cu | 74 ++++++++++--------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 1a984d6f100..444cfdf4d79 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,8 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, cudf::size_type output_size, - cudf::size_type* target_offsets, - cudf::size_type* target_mask_offsets, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, cudf::size_type cardinality, cudf::size_type total_agg_size) { @@ -83,8 +83,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, // TODO: it seems early exit will break the followup calculatons. 
 .../hash/compute_single_pass_shmem_aggs.cu    | 74 ++++++++++---------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
index 1a984d6f100..444cfdf4d79 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
@@ -64,8 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                                                cudf::size_type& col_end,
                                                cudf::mutable_table_device_view output_values,
                                                cudf::size_type output_size,
-                                               cudf::size_type* target_offsets,
-                                               cudf::size_type* target_mask_offsets,
+                                               cudf::size_type* shmem_agg_res_offsets,
+                                               cudf::size_type* shmem_agg_mask_offsets,
                                                cudf::size_type cardinality,
                                                cudf::size_type total_agg_size)
 {
@@ -83,8 +83,8 @@
     // TODO: it seems early exit will break the follow-up calculations. To be verified
     if (bytes_allocated + next_col_total_size > total_agg_size) { break; }
 
-    target_offsets[col_end]      = bytes_allocated;
-    target_mask_offsets[col_end] = bytes_allocated + next_col_size;
+    shmem_agg_res_offsets[col_end]  = bytes_allocated;
+    shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
 
     bytes_allocated += next_col_total_size;
     ++col_end;
@@ -96,16 +96,18 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c
                                               cudf::size_type col_start,
                                               cudf::size_type col_end,
                                               cudf::mutable_table_device_view output_values,
-                                              std::byte* shared_set_aggs,
-                                              cudf::size_type* target_offsets,
-                                              cudf::size_type* target_mask_offsets,
+                                              std::byte* shmem_agg_storage,
+                                              cudf::size_type* shmem_agg_res_offsets,
+                                              cudf::size_type* shmem_agg_mask_offsets,
                                               cudf::size_type cardinality,
                                               cudf::aggregation::Kind const* d_agg_kinds)
 {
   for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
     for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
-      std::byte* target = reinterpret_cast<std::byte*>(shared_set_aggs + target_offsets[col_idx]);
-      bool* target_mask = reinterpret_cast<bool*>(shared_set_aggs + target_mask_offsets[col_idx]);
+      std::byte* target =
+        reinterpret_cast<std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+      bool* target_mask =
+        reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
       cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(),
                                                   d_agg_kinds[col_idx],
                                                   initialize_shmem{},
@@ -124,11 +126,12 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start,
                                           cudf::table_device_view source,
                                           cudf::size_type num_input_rows,
                                           cudf::size_type* local_mapping_index,
-                                          std::byte* shared_set_aggs,
-                                          cudf::size_type* target_offsets,
-                                          cudf::size_type* target_mask_offsets,
+                                          std::byte* shmem_agg_storage,
+                                          cudf::size_type* shmem_agg_res_offsets,
+                                          cudf::size_type* shmem_agg_mask_offsets,
                                           cudf::aggregation::Kind const* d_agg_kinds)
 {
+  // Aggregates global memory sources to shared memory targets
   for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows;
        source_idx += cudf::detail::grid_1d::grid_stride()) {
     if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) {
@@ -136,8 +139,10 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start,
       for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
         auto const source_col = source.column(col_idx);
 
-        std::byte* target = reinterpret_cast<std::byte*>(shared_set_aggs + target_offsets[col_idx]);
-        bool* target_mask = reinterpret_cast<bool*>(shared_set_aggs + target_mask_offsets[col_idx]);
+        std::byte* target =
+          reinterpret_cast<std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+        bool* target_mask =
+          reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
 
         cudf::detail::dispatch_type_and_aggregation(source_col.type(),
                                                     d_agg_kinds[col_idx],
@@ -159,20 +164,21 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons
                                            cudf::mutable_table_device_view target,
                                            cudf::size_type cardinality,
                                            cudf::size_type* global_mapping_index,
-                                           std::byte* shared_set_aggs,
+                                           std::byte* shmem_agg_storage,
                                            cudf::size_type* agg_res_offsets,
                                            cudf::size_type* agg_mask_offsets,
                                            cudf::aggregation::Kind const* d_agg_kinds)
 {
-  // Aggregate shared memory sources to global memory targets
+  // Aggregates shared memory sources to global memory targets
   for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
     auto const target_idx =
global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto target_col = target.column(col_idx); - std::byte* source = reinterpret_cast(shared_set_aggs + agg_res_offsets[col_idx]); - bool* source_mask = reinterpret_cast(shared_set_aggs + agg_mask_offsets[col_idx]); + std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), d_agg_kinds[col_idx], @@ -210,12 +216,12 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, __shared__ cudf::size_type col_start; __shared__ cudf::size_type col_end; - extern __shared__ std::byte shared_set_aggs[]; + extern __shared__ std::byte shmem_agg_storage[]; - cudf::size_type* target_offsets = - reinterpret_cast(shared_set_aggs + total_agg_size); - cudf::size_type* target_mask_offsets = - reinterpret_cast(shared_set_aggs + total_agg_size + offsets_size); + cudf::size_type* shmem_agg_res_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size); + cudf::size_type* shmem_agg_mask_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size + offsets_size); if (block.thread_rank() == 0) { col_start = 0; @@ -229,8 +235,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_end, output_values, num_cols, - target_offsets, - target_mask_offsets, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, cardinality, total_agg_size); } @@ -240,9 +246,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_start, col_end, output_values, - shared_set_aggs, - target_offsets, - target_mask_offsets, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, cardinality, d_agg_kinds); @@ -253,9 +259,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, input_values, num_rows, local_mapping_index, - shared_set_aggs, - target_offsets, - target_mask_offsets, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, d_agg_kinds); block.sync(); @@ -266,9 +272,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, output_values, cardinality, global_mapping_index, - shared_set_aggs, - target_offsets, - target_mask_offsets, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, d_agg_kinds); } } From 5c493008eb6274299a74bad14d83f354aa23efaf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 15:54:53 -0700 Subject: [PATCH 103/135] Minor improvement to reduce build time --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 444cfdf4d79..cd1759a10ea 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -75,9 +75,11 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); while (bytes_allocated < total_agg_size && col_end < output_size) { - auto const col_idx = col_end; - auto const next_col_size = round_to_multiple_of_8( - cudf::type_dispatcher(output_values.column(col_idx).type(), size_of_functor{}) * cardinality); + auto const col_idx = col_end; + auto const 
next_col_size =
+      round_to_multiple_of_8(cudf::type_dispatcher(
+                               output_values.column(col_idx).type(), size_of_functor{}) *
+                             cardinality);
     auto const next_col_total_size = next_col_size + valid_col_size;
 
     // TODO: it seems early exit will break the follow-up calculations. To be verified

From 99010b301a767418e85a16d305383ac7a485b815 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 16:42:59 -0700
Subject: [PATCH 104/135] Use mask logic instead of null logic

---
 cpp/src/groupby/hash/global_memory_aggregator.cuh | 14 ++------------
 cpp/src/groupby/hash/shared_memory_aggregator.cuh | 15 ++++++++-------
 cpp/src/groupby/hash/single_pass_functors.cuh     |  7 ++++++-
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh
index 89394790117..4c682ad6fae 100644
--- a/cpp/src/groupby/hash/global_memory_aggregator.cuh
+++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh
@@ -33,7 +33,6 @@ struct update_target_element_gmem {
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
@@ -49,7 +48,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using DeviceType = cudf::detail::underlying_target_t;
@@ -70,7 +68,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using DeviceType = cudf::detail::underlying_target_t;
@@ -92,7 +89,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using DeviceType = cudf::detail::underlying_target_t;
@@ -114,7 +110,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -137,7 +132,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -162,7 +156,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -186,7 +179,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -209,7 +201,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -238,7 +229,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
{
     using Target = cudf::detail::target_type_t;
@@ -268,10 +258,10 @@ struct gmem_element_aggregator {
                              cudf::size_type source_index) const noexcept
   {
     if constexpr (k != cudf::aggregation::COUNT_ALL) {
-      if (source_mask[source_index]) { return; }
+      if (!source_mask[source_index]) { return; }
     }
     update_target_element_gmem{}(
-      target, target_index, source_column, source, source_mask, source_index);
+      target, target_index, source_column, source, source_index);
   }
 };

diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh
index f4be32ed723..32248025fe2 100644
--- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh
+++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh
@@ -56,7 +56,8 @@ struct update_target_element_shmem<
     DeviceTarget* target_casted = reinterpret_cast<DeviceTarget*>(target);
     cudf::detail::atomic_min(&target_casted[target_index],
                              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -78,7 +79,7 @@ struct update_target_element_shmem<
     cudf::detail::atomic_max(&target_casted[target_index],
                              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -101,7 +102,7 @@ struct update_target_element_shmem<
     cudf::detail::atomic_add(&target_casted[target_index],
                              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -121,7 +122,7 @@ struct update_target_element_shmem<
     auto value = static_cast<Target>(source.element<Source>(source_index));
     cudf::detail::atomic_add(&target_casted[target_index], value * value);
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -141,7 +142,7 @@ struct update_target_element_shmem<
     cudf::detail::atomic_mul(&target_casted[target_index],
                              static_cast<Target>(source.element<Source>(source_index)));
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -205,7 +206,7 @@ struct update_target_element_shmem<
       }
     }
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -231,7 +232,7 @@ struct update_target_element_shmem<
       }
     }
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };

diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 93b2bff8990..5c788522ac9 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -91,7 +91,12 @@ struct initialize_target_element(target);
 
     target_casted[idx] = get_identity<DeviceType>();
-    target_mask[idx] = !(k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID);
+
+    if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) {
+      target_mask[idx] = true;
+    } else {
+      target_mask[idx] = false;
+    }
   }
 };

From d071662e234a3b40213c7e7b57ccd2f86ed96801 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 17:03:58 -0700
Subject: [PATCH 105/135] Minor header cleanup

---
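Series note carried over from the previous patch: the per-element flag now follows cudf's bitmask polarity, true means valid. Targets start invalid (except COUNT_*, which are always valid) and the first aggregated non-null source element marks them valid. An illustrative helper equivalent to the pattern repeated in those hunks; the branch-before-write presumably avoids redundant shared-memory stores once a target is already valid:

    // Sketch only, not library code.
    __device__ inline void mark_target_valid(bool* target_mask, int target_index)
    {
      if (!target_mask[target_index]) { target_mask[target_index] = true; }
    }
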
 cpp/src/groupby/hash/var_hash_functor.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/src/groupby/hash/var_hash_functor.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh
index e02c322c68f..bb55cc9188c 100644
--- a/cpp/src/groupby/hash/var_hash_functor.cuh
+++ b/cpp/src/groupby/hash/var_hash_functor.cuh
@@ -15,8 +15,6 @@
  */
 #pragma once
 
-#include "helpers.cuh"
-
 #include
 #include
 #include

From 4e2c2cc8f4d66901536a9282df577a072d1f0edb Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 19:08:48 -0700
Subject: [PATCH 106/135] Remove unused code + clean up null check

---
 .../groupby/hash/compute_single_pass_shmem_aggs.cu | 12 ------------
 cpp/src/groupby/hash/global_memory_aggregator.cuh  |  5 ++---
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
index cd1759a10ea..314adf336a5 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
@@ -45,18 +45,6 @@ struct size_of_functor {
   }
 };
 
-/*
-/// Computes number of *actual* bitmask_type elements needed
-__device__ constexpr size_type num_bitmask_words(size_type number_of_bits)
-{
-  // TODO: This duplicates `cudf::num_bitmask_words`. Converting it into
-  // a public host-device utility will require non-trivial effort, so the
-  // cleanup will be addressed in a separate PR.
-  return cudf::util::div_rounding_up_safe(number_of_bits,
-                                          cudf::detail::size_in_bits());
-}
-*/
-
 // Prepares shared memory data required by each output column, exits if
 // there is not enough memory space to perform the shared memory aggregation
 // for the current output column
diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh
index 4c682ad6fae..5747a10ed1b 100644
--- a/cpp/src/groupby/hash/global_memory_aggregator.cuh
+++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh
@@ -257,9 +257,8 @@ struct gmem_element_aggregator {
                              bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
-    if constexpr (k != cudf::aggregation::COUNT_ALL) {
-      if (!source_mask[source_index]) { return; }
-    }
+    if (!source_mask[source_index]) { return; }
+
     update_target_element_gmem{}(
       target, target_index, source_column, source, source_index);
   }

From c2514f6147ac13e53f68f04220f2356b58fa4a3f Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 19:30:28 -0700
Subject: [PATCH 107/135] Use cuda::std::byte on device

---
 .../hash/compute_single_pass_shmem_aggs.cu    | 23 +++++++++---------
 .../groupby/hash/global_memory_aggregator.cuh | 23 +++++++++---------
 .../groupby/hash/shared_memory_aggregator.cuh | 24 ++++++++++---------
 cpp/src/groupby/hash/single_pass_functors.cuh |  8 ++++---
 4 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
index 314adf336a5..9874e2f7444 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
@@ -31,8 +31,7 @@
 
 #include
 #include
-
-#include
+#include
 
 namespace cudf::groupby::detail::hash {
 namespace {
@@ -86,7 +85,7 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c
                                               cudf::size_type col_start,
                                               cudf::size_type col_end,
                                               cudf::mutable_table_device_view output_values,
-                                              std::byte* shmem_agg_storage,
+                                              cuda::std::byte* shmem_agg_storage,
                                               cudf::size_type*
shmem_agg_res_offsets, cudf::size_type* shmem_agg_mask_offsets, cudf::size_type cardinality, @@ -94,8 +93,8 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - std::byte* target = - reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); bool* target_mask = reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), @@ -116,7 +115,7 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, cudf::table_device_view source, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, - std::byte* shmem_agg_storage, + cuda::std::byte* shmem_agg_storage, cudf::size_type* shmem_agg_res_offsets, cudf::size_type* shmem_agg_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) @@ -129,8 +128,8 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto const source_col = source.column(col_idx); - std::byte* target = - reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); bool* target_mask = reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); @@ -154,7 +153,7 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons cudf::mutable_table_device_view target, cudf::size_type cardinality, cudf::size_type* global_mapping_index, - std::byte* shmem_agg_storage, + cuda::std::byte* shmem_agg_storage, cudf::size_type* agg_res_offsets, cudf::size_type* agg_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) @@ -166,8 +165,8 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto target_col = target.column(col_idx); - std::byte* source = - reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + cuda::std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), @@ -206,7 +205,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, __shared__ cudf::size_type col_start; __shared__ cudf::size_type col_end; - extern __shared__ std::byte shmem_agg_storage[]; + extern __shared__ cuda::std::byte shmem_agg_storage[]; cudf::size_type* shmem_agg_res_offsets = reinterpret_cast(shmem_agg_storage + total_agg_size); diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 5747a10ed1b..fa4190491e9 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -23,6 +23,7 @@ #include #include +#include #include namespace cudf::groupby::detail::hash { @@ -32,7 +33,7 @@ struct update_target_element_gmem { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { 
CUDF_UNREACHABLE("Invalid source type and aggregation combination."); @@ -47,7 +48,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; @@ -67,7 +68,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; @@ -88,7 +89,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; @@ -109,7 +110,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -131,7 +132,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -155,7 +156,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -178,7 +179,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -200,7 +201,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -228,7 +229,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -253,7 +254,7 @@ struct gmem_element_aggregator { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, bool* source_mask, cudf::size_type source_index) const noexcept { diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 32248025fe2..c5bdfe253ea 100644 --- 
a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -23,13 +23,14 @@ #include #include +#include #include namespace cudf::groupby::detail::hash { template struct update_target_element_shmem { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -44,7 +45,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::MIN, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -66,7 +67,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::MAX, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -89,7 +90,7 @@ struct update_target_element_shmem< cudf::aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_timestamp()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -111,7 +112,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::SUM_OF_SQUARES, cuda::std::enable_if_t()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -131,7 +132,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::PRODUCT, cuda::std::enable_if_t()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -152,12 +153,13 @@ struct update_target_element_shmem< cudf::aggregation::COUNT_VALID, cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, cudf::size_type source_index) const noexcept { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], Target{1}); @@ -170,7 +172,7 @@ struct update_target_element_shmem< cudf::aggregation::COUNT_ALL, cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -190,7 +192,7 @@ struct update_target_element_shmem< cudf::aggregation::ARGMAX, cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -216,7 +218,7 @@ struct update_target_element_shmem< cudf::aggregation::ARGMIN, cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { - __device__ void 
operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -238,7 +240,7 @@ struct update_target_element_shmem< struct shmem_element_aggregator { template - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 5c788522ac9..eaa71ae6d8c 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf::groupby::detail::hash { // TODO: TO BE REMOVED @@ -73,7 +75,7 @@ __device__ T get_identity() template struct initialize_target_element { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept { @@ -83,7 +85,7 @@ struct initialize_target_element { template struct initialize_target_element()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept { @@ -103,7 +105,7 @@ struct initialize_target_element // TODO naming - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept { From 51114c9b57bc81adf84e5fee72962606e9b89779 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:43:00 -0700 Subject: [PATCH 108/135] Revert agg details --- .../cudf/detail/aggregation/aggregation.cuh | 21 +---- .../detail/aggregation/device_aggregators.cuh | 91 ++++++++++++++++--- 2 files changed, 77 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 91d25d99c1d..de53e7586cd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -170,7 +170,7 @@ struct identity_initializer { } template - constexpr T get_identity() + T get_identity() { if (k == aggregation::ARGMAX || k == aggregation::ARGMIN) { if constexpr (cudf::is_timestamp()) @@ -186,25 +186,6 @@ struct identity_initializer { } public: - template - __device__ std::enable_if_t(), void> operator()( - cudf::mutable_column_device_view target, cudf::size_type target_index) - { - using DeviceType = device_storage_type_t; - using ElementType = - cuda::std::conditional_t() && !cudf::is_fixed_point(), - Target, - DeviceType>; - target.element(target_index) = get_identity(); - } - - template - __device__ std::enable_if_t(), void> operator()( - cudf::mutable_column_device_view target, cudf::size_type target_index) - { - CUDF_UNREACHABLE("Unsupported aggregation for initializing values"); - } - template std::enable_if_t(), void> operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index bc370c59296..204eee49a2a 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -50,12 +49,31 @@ using underlying_source_t = template struct update_target_element { + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + using Target = target_type_t; + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; @@ -63,14 +81,16 @@ template struct update_target_element< Source, aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using DeviceTarget = cudf::detail::underlying_target_t; - using DeviceSource = cudf::detail::underlying_source_t; + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -83,14 +103,35 @@ template struct update_target_element< Source, aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using DeviceTarget = cudf::detail::underlying_target_t; - using DeviceSource = cudf::detail::underlying_source_t; + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -104,14 +145,34 @@ struct update_target_element< Source, aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_timestamp()>> { + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + 
cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using DeviceTarget = cudf::detail::underlying_target_t; - using DeviceSource = cudf::detail::underlying_source_t; + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); @@ -142,10 +203,10 @@ struct update_target_from_dictionary { template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept { } }; From a3c6eb24e4ec3561a1fbd39e289bc8b7e03521b2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:48:47 -0700 Subject: [PATCH 109/135] Fetch trunk aggregators --- .../groupby/hash/global_memory_aggregator.cuh | 29 ++++++++++++------- .../groupby/hash/shared_memory_aggregator.cuh | 23 ++++++++++----- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index fa4190491e9..50e89c727ff 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -13,28 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include #include #include #include -#include #include #include #include namespace cudf::groupby::detail::hash { - template struct update_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - cuda::std::byte* source, - cudf::size_type source_index) const noexcept + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -169,7 +166,6 @@ struct update_target_element_gmem< } }; -// TODO: VALID and ALL have same code template struct update_target_element_gmem< Source, @@ -249,6 +245,18 @@ struct update_target_element_gmem< } }; +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. 
+ */ struct gmem_element_aggregator { template __device__ void operator()(cudf::mutable_column_device_view target, @@ -258,11 +266,12 @@ struct gmem_element_aggregator { bool* source_mask, cudf::size_type source_index) const noexcept { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid if (!source_mask[source_index]) { return; } update_target_element_gmem{}( target, target_index, source_column, source, source_index); } }; - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index c5bdfe253ea..9cbeeb34b86 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -13,28 +13,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include #include #include #include -#include #include #include #include namespace cudf::groupby::detail::hash { - template struct update_target_element_shmem { - __device__ void operator()(cuda::std::byte* target, - bool* target_mask, - cudf::size_type target_index, - cudf::column_device_view source, - cudf::size_type source_index) const + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -238,6 +232,18 @@ struct update_target_element_shmem< } }; +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ struct shmem_element_aggregator { template __device__ void operator()(cuda::std::byte* target, @@ -246,6 +252,7 @@ struct shmem_element_aggregator { cudf::column_device_view source, cudf::size_type source_index) const noexcept { + // Check nullability for all aggregation kinds but `COUNT_ALL` if constexpr (k != cudf::aggregation::COUNT_ALL) { if (source.is_null(source_index)) { return; } } From a8f8ab3519ddd1750ac40576bab71551a802cad3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:52:53 -0700 Subject: [PATCH 110/135] Fetch trunk hash_compound_agg_finalizer --- cpp/src/groupby/hash/hash_compound_agg_finalizer.cu | 5 ++--- cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 221e63ac121..37a61c1a22c 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
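Between the hunks here: `hash_compound_agg_finalizer` runs the compound aggregations (VARIANCE, STD) as a post-pass, with `var_hash_functor` accumulating per-group squared deviations through the hash set. A rough sketch of the finalization math only (assumed semantics, not the functor itself; with ddof = 1 this is the sample variance, and STD is its square root):

    #include <limits>

    // Sketch: per-group variance from the accumulated sum of squared deviations.
    double finalize_variance(double sum_sq_dev, int group_count, int ddof)
    {
      return (group_count - ddof > 0) ? sum_sq_dev / (group_count - ddof)
                                      : std::numeric_limits<double>::quiet_NaN();
    }
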
@@ -173,7 +173,7 @@ void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::var_aggregation c
   cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream);
 
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator(0),
     col.size(),
     var_hash_functor{
@@ -196,5 +196,4 @@ void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::std_aggregation c
 
 template class hash_compound_agg_finalizer>;
 template class hash_compound_agg_finalizer>;
-
 } // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
index 16cbe92511f..8bee1a92c40 100644
--- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
+++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 9746891d80374589ee1ee0847f9882206dbb70c1 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Sat, 19 Oct 2024 14:54:52 -0700
Subject: [PATCH 111/135] Fetch trunk groupby

---
 cpp/src/groupby/hash/groupby.cu | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index c2947316c9f..30e1d52fdbf 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -32,7 +32,6 @@
 #include
 #include
-#include
 #include
 #include
 
@@ -80,7 +79,7 @@ constexpr bool array_contains(std::array const& haystack, T needle)
  * @return true `t` is valid for a hash based groupby
  * @return false `t` is invalid for a hash based groupby
  */
-constexpr bool is_hash_aggregation(aggregation::Kind t)
+bool constexpr is_hash_aggregation(aggregation::Kind t)
 {
   return array_contains(hash_aggregations, t);
 }
@@ -88,8 +87,8 @@ std::unique_ptr<table>
dispatch_groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool keys_have_nulls, - null_policy include_null_keys, + bool const keys_have_nulls, + null_policy const include_null_keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -105,11 +104,11 @@ std::unique_ptr
dispatch_groupby(table_view const& keys, if (cudf::detail::has_nested_columns(keys)) { auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); return compute_groupby( - keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); return compute_groupby( - keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } } // namespace From 91c75a2185c140ae8eb763154c76615a2cb18591 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 15:01:07 -0700 Subject: [PATCH 112/135] Fetch trunk compute_groupby --- cpp/src/groupby/hash/compute_groupby.cu | 86 ++++++++----------- cpp/src/groupby/hash/compute_groupby.hpp | 24 ++++-- .../hash/create_sparse_results_table.hpp | 11 +++ 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 7565e8ecfbb..bd2e5c8148e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,74 +14,50 @@ * limitations under the License. */ +#include "compute_groupby.hpp" #include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "sparse_to_dense_results.hpp" -#include "var_hash_functor.cuh" +#include #include #include #include #include #include -#include #include #include #include +#include #include #include +#include #include namespace cudf::groupby::detail::hash { -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -template +template std::unique_ptr
compute_groupby(table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, Equal const& d_row_equal, - row_hash_t const& d_row_hash, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { // convert to int64_t to avoid potential overflow with large `keys` - auto const num_rows = static_cast(keys.num_rows()); + auto const num_keys = static_cast(keys.num_rows()); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); - auto set = cuco::static_set{ - cuco::extent{num_rows}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy + auto const set = cuco::static_set{ + num_keys, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, probing_scheme_t{d_row_hash}, @@ -96,22 +72,25 @@ std::unique_ptr
compute_groupby(table_view const& keys, : rmm::device_buffer{}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs(num_rows, - static_cast(row_bitmask.data()), - requests, - &sparse_results, - set, - skip_rows_with_nulls, - stream); + compute_single_pass_aggs(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set.ref(cuco::insert_and_find), + requests, + &sparse_results, + stream); + + // Extract the populated indices from the hash set and create a gather map. + // Gathering using this map from sparse results will give dense results. + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(static_cast(row_bitmask.data()), - requests, + sparse_to_dense_results(requests, &sparse_results, cache, gather_map, set.ref(cuco::find), - skip_rows_with_nulls, + static_cast(row_bitmask.data()), stream, mr); @@ -123,24 +102,29 @@ std::unique_ptr
compute_groupby(table_view const& keys, mr); } -template std::unique_ptr
compute_groupby( +template rmm::device_uvector extract_populated_keys( + global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template rmm::device_uvector extract_populated_keys( + nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -template std::unique_ptr
compute_groupby( +template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, nullable_row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp index a11c1db4262..77243dc0a4f 100644 --- a/cpp/src/groupby/hash/compute_groupby.hpp +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,6 @@ */ #pragma once -#include "helpers.cuh" - #include #include #include @@ -54,15 +52,27 @@ namespace cudf::groupby::detail::hash { * requested in `requests`, we gather sparse results into a column of dense * results using the aforementioned index vector. Dense results are stored into * the in/out parameter `cache`. + * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys */ -template +template std::unique_ptr compute_groupby(table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, Equal const& d_row_equal, - row_hash_t const& d_row_hash, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index f2810bd0235..6e667228045 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -25,6 +25,17 @@ #include namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. 
+ *
+ * @tparam SetType Type of the key hash set
+ *
+ * @param key_set Key hash set
+ * TODO
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return An array of unique keys contained in `key_set`
+ */
 template
 void extract_populated_keys(SetType const& key_set,
                             rmm::device_uvector<cudf::size_type>& populated_keys,

From 17072b0b8b1490825d42cda1291ef2df73dc99a1 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 21 Oct 2024 10:12:47 -0700
Subject: [PATCH 113/135] Make mask const

---
 cpp/src/groupby/hash/create_sparse_results_table.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
index 5db4249740a..bc32e306b3f 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.cu
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -66,7 +66,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values
                         ? false
                         : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
                            agg == cudf::aggregation::STD);
-      auto mask_flag =
+      auto const mask_flag =
         (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
       auto const col_type = cudf::is_dictionary(col.type())
                               ? cudf::dictionary_column_view(col).keys().type()

From 4672734fba0cf8d799e39107aab99488a27c731a Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 21 Oct 2024 10:16:08 -0700
Subject: [PATCH 114/135] Use size_type instead of int

---
 cpp/src/groupby/hash/compute_mapping_indices.cuh        | 4 ++--
 cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp | 6 +++---
 cpp/src/groupby/hash/single_pass_functors.cuh           | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
index dd369a123ca..fa080709fd0 100644
--- a/cpp/src/groupby/hash/compute_mapping_indices.cuh
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -152,9 +152,9 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
 }
 
 template
-int max_occupancy_grid_size(cudf::size_type n)
+cudf::size_type max_occupancy_grid_size(cudf::size_type n)
 {
-  int max_active_blocks{-1};
+  cudf::size_type max_active_blocks{-1};
   CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0));
   auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors();
diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp
index 73db4750a1f..2fdb590324c 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp
@@ -23,11 +23,11 @@
 
 namespace cudf::groupby::detail::hash {
 
-size_t available_shared_memory_size(int grid_size);
+size_t available_shared_memory_size(cudf::size_type grid_size);
 
-size_t shmem_offsets_size(int num_cols);
+size_t shmem_offsets_size(cudf::size_type num_cols);
 
-void compute_single_pass_shmem_aggs(int grid_size,
+void compute_single_pass_shmem_aggs(cudf::size_type grid_size,
                                     cudf::size_type num_input_rows,
                                     bitmask_type const* row_bitmask,
                                     bool skip_rows_with_nulls,
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 9d8cc0ad73b..b36bdd32af5 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -188,7 +188,7 @@ struct compute_direct_aggregates
 {
   cudf::mutable_table_device_view output_values;
   cudf::aggregation::Kind const* __restrict__ aggs;
   cudf::size_type* block_cardinality;
-  int stride;
+  cudf::size_type stride;
   bitmask_type const* __restrict__ row_bitmask;
   bool skip_rows_with_nulls;
 
@@ -197,7 +197,7 @@
                             cudf::mutable_table_device_view output_values,
                             cudf::aggregation::Kind const* aggs,
                             cudf::size_type* block_cardinality,
-                            int stride,
+                            cudf::size_type stride,
                             bitmask_type const* row_bitmask,
                             bool skip_rows_with_nulls)
     : set(set),
@@ -213,7 +213,7 @@
   __device__ void operator()(cudf::size_type i)
   {
-    int block_id = (i % stride) / GROUPBY_BLOCK_SIZE;
+    auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE;
     if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and
         (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) {
       auto const result = set.insert_and_find(i);
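Note on `compute_direct_aggregates` above: it is the escape hatch for high-cardinality blocks. Blocks whose mapping pass saw at least `GROUPBY_CARDINALITY_THRESHOLD` distinct keys skip shared memory entirely, and their rows are aggregated straight into the global sparse table via `set.insert_and_find(i)`. A host-side sketch of the per-row rule only (the constants here are assumed values, not the shipped ones):

    constexpr int GROUPBY_BLOCK_SIZE            = 128;  // assumed
    constexpr int GROUPBY_CARDINALITY_THRESHOLD = 128;  // assumed

    // True when row `row` belongs to a block that must take the direct
    // global-memory path instead of the shared-memory fast path.
    bool aggregates_directly(int row, int stride, int const* block_cardinality)
    {
      int const block_id = (row % stride) / GROUPBY_BLOCK_SIZE;
      return block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD;
    }
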
From f8220d937ded20a76851ecbebe34cb87a94cc087 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Tue, 22 Oct 2024 12:57:26 -0700
Subject: [PATCH 115/135] Move global agg to its own TU + renaming

---
 cpp/CMakeLists.txt                            |   7 +-
 ...e_pass_aggs.cu => compute_aggregations.cu} |   6 +-
 ...pass_aggs.cuh => compute_aggregations.cuh} | 103 +++++++---------
 ...pass_aggs.hpp => compute_aggregations.hpp} |   2 +-
 ...s_null.cu => compute_aggregations_null.cu} |   6 +-
 .../hash/compute_global_memory_aggs.cu        | 111 ++++++++++++++++++
 .../hash/compute_global_memory_aggs.hpp       |  42 +++++++
 cpp/src/groupby/hash/compute_groupby.cu       |  16 +--
 ..._aggs.cu => compute_shared_memory_aggs.cu} |  24 ++--
 ...ggs.hpp => compute_shared_memory_aggs.hpp} |  22 ++--
 10 files changed, 236 insertions(+), 103 deletions(-)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.cu => compute_aggregations.cu} (85%)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.cuh => compute_aggregations.cuh} (64%)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.hpp => compute_aggregations.hpp} (95%)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs_null.cu => compute_aggregations_null.cu} (84%)
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cu
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.hpp
 rename cpp/src/groupby/hash/{compute_single_pass_shmem_aggs.cu => compute_shared_memory_aggs.cu} (94%)
 rename cpp/src/groupby/hash/{compute_single_pass_shmem_aggs.hpp => compute_shared_memory_aggs.hpp} (55%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ebd2b2b6d8c..57bcc2df604 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -371,9 +371,10 @@ add_library(
   src/groupby/hash/compute_groupby.cu
   src/groupby/hash/compute_mapping_indices.cu
   src/groupby/hash/compute_mapping_indices_null.cu
-  src/groupby/hash/compute_single_pass_aggs.cu
-  src/groupby/hash/compute_single_pass_aggs_null.cu
-  src/groupby/hash/compute_single_pass_shmem_aggs.cu
+  src/groupby/hash/compute_aggregations.cu
+  src/groupby/hash/compute_aggregations_null.cu
+  src/groupby/hash/compute_global_memory_aggs.cu
+  src/groupby/hash/compute_shared_memory_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_aggregations.cu
similarity index 85%
rename from cpp/src/groupby/hash/compute_single_pass_aggs.cu
rename to cpp/src/groupby/hash/compute_aggregations.cu
index 04519edf791..cac6c2224f0 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++
b/cpp/src/groupby/hash/compute_aggregations.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" namespace cudf::groupby::detail::hash { -template rmm::device_uvector compute_single_pass_aggs( +template rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh similarity index 64% rename from cpp/src/groupby/hash/compute_single_pass_aggs.cuh rename to cpp/src/groupby/hash/compute_aggregations.cuh index 6929d04dba1..8117b3fe0fa 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -15,9 +15,10 @@ */ #pragma once +#include "compute_aggregations.hpp" +#include "compute_global_memory_aggs.hpp" #include "compute_mapping_indices.hpp" -#include "compute_single_pass_aggs.hpp" -#include "compute_single_pass_shmem_aggs.hpp" +#include "compute_shared_memory_aggs.hpp" #include "create_sparse_results_table.hpp" #include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" @@ -35,12 +36,12 @@ #include #include -#include #include #include #include #include +#include namespace cudf::groupby::detail::hash { /** @@ -48,7 +49,7 @@ namespace cudf::groupby::detail::hash { * over the data and stores the results in `sparse_results` */ template -rmm::device_uvector compute_single_pass_aggs( +rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, @@ -57,16 +58,11 @@ rmm::device_uvector compute_single_pass_aggs( cudf::detail::result_cache* sparse_results, rmm::cuda_stream_view stream) { - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(num_rows, stream); - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size>(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > @@ -75,46 +71,26 @@ rmm::device_uvector compute_single_pass_aggs( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); }); - auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; - - // Use naive global memory aggregations when there are dictionary columns to aggregagte or - // there is no sufficient dynamic shared memory for shared memory aggregations - if (uses_global_aggs) { - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - uses_global_aggs, - global_set, - populated_keys, - stream); - - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - - 
thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_rows, - hash::compute_single_pass_aggs_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - row_bitmask, - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; + auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem; + + // Performs naive global memory aggregations when the workload is not compatible with shared + // memory, such as when aggregating dictionary columns or when there is insufficient dynamic + // shared memory for shared memory aggregations. + if (!is_shared_memory_compatible) { + return compute_global_memory_aggs(num_rows, + skip_rows_with_nulls, + row_bitmask, + flattened_values, + d_agg_kinds.data(), + agg_kinds, + global_set, + aggs, + sparse_results, + stream); } + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank rmm::device_uvector local_mapping_index(num_rows, stream); // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table @@ -122,6 +98,9 @@ rmm::device_uvector compute_single_pass_aggs( stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + compute_mapping_indices(grid_size, num_rows, global_set_ref, @@ -145,21 +124,21 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - compute_single_pass_shmem_aggs(grid_size, - num_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); + compute_shared_memory_aggs(grid_size, + num_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, num_rows, compute_direct_aggregates{global_set_ref, *d_values, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp similarity index 95% rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp rename to cpp/src/groupby/hash/compute_aggregations.hpp index e409b3ff685..829c3c808b0 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -29,7 +29,7 @@ namespace cudf::groupby::detail::hash { * over the data and stores the results in `sparse_results` */ template -rmm::device_uvector compute_single_pass_aggs( 
+rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu similarity index 84% rename from cpp/src/groupby/hash/compute_single_pass_aggs_null.cu rename to cpp/src/groupby/hash/compute_aggregations_null.cu index 135ba4188f2..1d7184227ea 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" namespace cudf::groupby::detail::hash { -template rmm::device_uvector compute_single_pass_aggs( +template rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu new file mode 100644 index 00000000000..ad0cfbb6e12 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} + +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); + +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp new file mode 100644 index 00000000000..0777b9ffd93 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index fd416710439..e1dbf2a3d9e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -14,8 +14,8 @@ * limitations under the License. */ +#include "compute_aggregations.hpp" #include "compute_groupby.hpp" -#include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "sparse_to_dense_results.hpp" @@ -71,13 +71,13 @@ std::unique_ptr
compute_groupby(table_view const& keys, : rmm::device_buffer{}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs(num_keys, - skip_rows_with_nulls, - static_cast(row_bitmask.data()), - set, - requests, - &sparse_results, - stream); + auto gather_map = compute_aggregations(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set, + requests, + &sparse_results, + stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(requests, diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu similarity index 94% rename from cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu rename to cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 9874e2f7444..9b479eae037 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "compute_single_pass_shmem_aggs.hpp" +#include "compute_shared_memory_aggs.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" #include "shared_memory_aggregator.cuh" @@ -285,17 +285,17 @@ size_t available_shared_memory_size(cudf::size_type grid_size) size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } -void compute_single_pass_shmem_aggs(cudf::size_type grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream) +void compute_shared_memory_aggs(cudf::size_type grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream) { auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need one offset determining where the aggregation is diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp similarity index 55% rename from cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp rename to cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 2fdb590324c..7dc2b448a60 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -27,16 +27,16 @@ size_t available_shared_memory_size(cudf::size_type grid_size); size_t shmem_offsets_size(cudf::size_type num_cols); -void compute_single_pass_shmem_aggs(cudf::size_type grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream); +void compute_shared_memory_aggs(cudf::size_type grid_size, + cudf::size_type num_input_rows, + bitmask_type const* 
row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash From aeef28bff91d620733c5cedb8609e3f4c0125c7a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Oct 2024 13:42:34 -0700 Subject: [PATCH 116/135] Rename for clarity --- cpp/src/groupby/hash/compute_aggregations.cuh | 14 +++++++++----- cpp/src/groupby/hash/single_pass_functors.cuh | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 8117b3fe0fa..83dce5813ac 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -97,7 +97,9 @@ rmm::device_uvector compute_aggregations( rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); + + // Flag indicating whether a global memory aggregation fallback is required or not + rmm::device_scalar needs_global_memory_fallback(false, stream); auto global_set_ref = global_set.ref(cuco::op::insert_and_find); @@ -109,14 +111,16 @@ rmm::device_uvector compute_aggregations( local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), - direct_aggregations.data(), + needs_global_memory_fallback.data(), stream); + auto const needs_fallback = needs_global_memory_fallback.value(stream); + // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - direct_aggregations.value(stream), + needs_fallback, global_set, populated_keys, stream); @@ -135,12 +139,12 @@ rmm::device_uvector compute_aggregations( *d_sparse_table, d_agg_kinds.data(), stream); - if (direct_aggregations.value(stream)) { + if (needs_fallback) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::counting_iterator{0}, num_rows, - compute_direct_aggregates{global_set_ref, + global_memory_fallback_fn{global_set_ref, *d_values, *d_sparse_table, d_agg_kinds.data(), diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index b36bdd32af5..dc43dbb7179 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -182,7 +182,7 @@ struct initialize_sparse_table { }; template -struct compute_direct_aggregates { +struct global_memory_fallback_fn { SetType set; cudf::table_device_view input_values; cudf::mutable_table_device_view output_values; @@ -192,7 +192,7 @@ struct compute_direct_aggregates { bitmask_type const* __restrict__ row_bitmask; bool skip_rows_with_nulls; - compute_direct_aggregates(SetType set, + global_memory_fallback_fn(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, From 6b323f0d2991f61077c74be9f5c1d740681de2f2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Oct 2024 14:16:44 -0700 Subject: [PATCH 117/135] Rename direct_aggregations as needs_global_memory_fallback --- .../groupby/hash/compute_mapping_indices.cu | 2 +- 
.../groupby/hash/compute_mapping_indices.cuh | 23 ++++++++++--------- .../groupby/hash/compute_mapping_indices.hpp | 2 +- .../hash/compute_mapping_indices_null.cu | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu index 1cbe70d651f..5b746b87a14 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices> cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations, + bool* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index fa080709fd0..0ff567c28f0 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -95,7 +95,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations) + bool* needs_global_memory_fallback) { // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; @@ -137,7 +137,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, block.sync(); if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } + if (block.thread_rank() == 0) { *needs_global_memory_fallback = true; } break; } } @@ -171,17 +171,18 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations, + bool* needs_global_memory_fallback, rmm::cuda_stream_view stream) { - mapping_indices_kernel<<>>(num, - global_set, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - direct_aggregations); + mapping_indices_kernel<<>>( + num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + needs_global_memory_fallback); stream.synchronize(); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index d8047f9a5d8..b4eb2597118 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -36,6 +36,6 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations, + bool* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu index 1b04016f9a1..cfccd0a0009 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices_null.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices Date: Tue, 22 Oct 2024 15:49:04 -0700 Subject: [PATCH 118/135] Use atomic_flag to avoid UB --- cpp/src/groupby/hash/compute_aggregations.cuh | 11 
+++++++++-- cpp/src/groupby/hash/compute_mapping_indices.cu | 2 +- cpp/src/groupby/hash/compute_mapping_indices.cuh | 7 ++++--- cpp/src/groupby/hash/compute_mapping_indices.hpp | 4 +++- cpp/src/groupby/hash/compute_mapping_indices_null.cu | 2 +- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 83dce5813ac..9df9779f209 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -99,7 +100,7 @@ rmm::device_uvector compute_aggregations( rmm::device_uvector block_cardinality(grid_size, stream); // Flag indicating whether a global memory aggregation fallback is required or not - rmm::device_scalar needs_global_memory_fallback(false, stream); + rmm::device_scalar needs_global_memory_fallback(stream); auto global_set_ref = global_set.ref(cuco::op::insert_and_find); @@ -114,7 +115,13 @@ rmm::device_uvector compute_aggregations( needs_global_memory_fallback.data(), stream); - auto const needs_fallback = needs_global_memory_fallback.value(stream); + cuda::std::atomic_flag h_needs_fallback; + CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, + needs_global_memory_fallback.data(), + sizeof(cuda::std::atomic_flag), + cudaMemcpyDefault, + stream.value())); + auto const needs_fallback = h_needs_fallback.test(); // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu index 5b746b87a14..519d7cd2f1c 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices> cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback, + cuda::std::atomic_flag* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 0ff567c28f0..c5f542b7905 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -27,6 +27,7 @@ #include #include +#include #include @@ -95,7 +96,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback) + cuda::std::atomic_flag* needs_global_memory_fallback) { // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; @@ -137,7 +138,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, block.sync(); if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *needs_global_memory_fallback = true; } + if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); } break; } } @@ -171,7 +172,7 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback, + cuda::std::atomic_flag* needs_global_memory_fallback, rmm::cuda_stream_view stream) { 
mapping_indices_kernel<<>>( diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index b4eb2597118..473ad99e650 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -19,6 +19,8 @@ #include +#include + namespace cudf::groupby::detail::hash { /* * @brief Computes the maximum number of active blocks of the given kernel that can be executed on @@ -36,6 +38,6 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback, + cuda::std::atomic_flag* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu index cfccd0a0009..81c3c9e456f 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices_null.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices Date: Tue, 22 Oct 2024 16:21:35 -0700 Subject: [PATCH 119/135] Cleanups --- .../groupby/hash/compute_mapping_indices.cuh | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index c5f542b7905..0c7897d0f19 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -43,23 +44,27 @@ __device__ void find_local_mapping(cooperative_groups::thread_block const& block cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { - cudf::size_type result_idx{}; - bool inserted{}; - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { - auto const result = shared_set.insert_and_find(idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = idx; - local_mapping_index[idx] = shared_set_index; + auto const is_valid_input = + idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + auto const [result_idx, inserted] = [&]() { + if (is_valid_input) { + auto const result = shared_set.insert_and_find(idx); + auto const matched_idx = *result.first; + auto const inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + return cuda::std::pair{matched_idx, inserted}; } - } + return cuda::std::pair{0, false}; // dummy values + }(); // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. 
block.sync(); - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + if (is_valid_input) { // element was already in set if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } } @@ -98,7 +103,6 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, cudf::size_type* block_cardinality, cuda::std::atomic_flag* needs_global_memory_fallback) { - // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; // Shared set initialization From e3726e3c1b572edf53b75da350baad643d33bfc9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Oct 2024 16:42:25 -0700 Subject: [PATCH 120/135] Further split compute_global_memory_aggs --- cpp/CMakeLists.txt | 1 + .../hash/compute_global_memory_aggs.cu | 83 +---------------- .../hash/compute_global_memory_aggs.cuh | 89 +++++++++++++++++++ .../hash/compute_global_memory_aggs_null.cu | 32 +++++++ 4 files changed, 124 insertions(+), 81 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 57bcc2df604..fb098031f7d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -374,6 +374,7 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_aggregations_null.cu src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu src/groupby/hash/compute_shared_memory_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index ad0cfbb6e12..6025686953e 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -14,77 +14,10 @@ * limitations under the License. 
*/ -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" namespace cudf::groupby::detail::hash { -template -rmm::device_uvector compute_global_memory_aggs( - cudf::size_type num_rows, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - SetType& global_set, - std::vector>& aggregations, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream) -{ - auto constexpr uses_global_memory_aggs = true; - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(num_rows, stream); - - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds, - agg_kinds, - uses_global_memory_aggs, - global_set, - populated_keys, - stream); - - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator{0}, - num_rows, - hash::compute_single_pass_aggs_fn{ - global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggregations.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - template rmm::device_uvector compute_global_memory_aggs( cudf::size_type num_rows, bool skip_rows_with_nulls, @@ -96,16 +29,4 @@ template rmm::device_uvector compute_global_memory_aggs>& aggregations, cudf::detail::result_cache* sparse_results, rmm::cuda_stream_view stream); - -template rmm::device_uvector compute_global_memory_aggs( - cudf::size_type num_rows, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - nullable_global_set_t& global_set, - std::vector>& aggregations, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh new file mode 100644 index 00000000000..00db149c6d9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_global_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu new file mode 100644 index 00000000000..209e2b7f20a --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash From 3775ec85e06ace02c40c2c9c25866a2b22cf24cd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:27:30 -0700 Subject: [PATCH 121/135] Remove unused code --- cpp/src/groupby/hash/single_pass_functors.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index dc43dbb7179..abf29f098af 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -173,7 +173,6 @@ struct initialize_sparse_table { for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), aggs[col_idx], - // cudf::detail::identity_initializer{}, initialize_gmem{}, sparse_table.column(col_idx), key_idx); From 8dd5535bb9f73f48eb7c43631167c45fd5bcbebc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:42:50 -0700 Subject: [PATCH 122/135] Sync to make sure the data is valid --- cpp/src/groupby/hash/compute_aggregations.cuh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 9df9779f209..519e9f55eaf 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -121,6 +121,7 @@ rmm::device_uvector compute_aggregations( sizeof(cuda::std::atomic_flag), cudaMemcpyDefault, stream.value())); + stream.synchronize(); auto const needs_fallback = h_needs_fallback.test(); // make table that will hold sparse results @@ -146,21 +147,6 @@ rmm::device_uvector compute_aggregations( *d_sparse_table, d_agg_kinds.data(), stream); - if (needs_fallback) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy_nosync(stream), - thrust::counting_iterator{0}, - num_rows, - global_memory_fallback_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - row_bitmask, - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); From 59accf609e175f4f721c78b3d9515e37ae1cbb6e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:51:35 -0700 Subject: [PATCH 123/135] Add comments --- cpp/src/groupby/hash/compute_aggregations.cuh | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 519e9f55eaf..71db8d10e7d 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -148,6 +148,26 @@ rmm::device_uvector compute_aggregations( d_agg_kinds.data(), stream); + // The shared memory groupby is designed so that each thread block can 
handle up to 128 unique + // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store + // the temporary aggregation results. In these situations, we must fall back to a global memory + // aggregator to process the remaining aggregation requests. + if (needs_fallback) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + global_memory_fallback_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + row_bitmask, + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { From 8be28d02540d972434a0bba85eee7e8f4fa60243 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:55:53 -0700 Subject: [PATCH 124/135] Add comments --- cpp/src/groupby/hash/compute_aggregations.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 71db8d10e7d..68f5a8434f1 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -116,6 +116,8 @@ rmm::device_uvector compute_aggregations( stream); cuda::std::atomic_flag h_needs_fallback; + // Cannot use `device_scalar::value` as it requires a copy constructor, which + // `atomic_flag` doesn't have. CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, needs_global_memory_fallback.data(), sizeof(cuda::std::atomic_flag), From 91da22e5cf297a291fe1a5a2076ba076e055cf22 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:02:41 -0700 Subject: [PATCH 125/135] Remove redundant sync --- cpp/src/groupby/hash/compute_mapping_indices.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 0c7897d0f19..d353830780f 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -188,6 +188,5 @@ void compute_mapping_indices(cudf::size_type grid_size, global_mapping_index, block_cardinality, needs_global_memory_fallback); - stream.synchronize(); } } // namespace cudf::groupby::detail::hash From cf289d10dee2b07565064fd566328dd2d916cc2d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:18:53 -0700 Subject: [PATCH 126/135] Add CUDF_UNREACHABLE instead of silent break + remove outdated comments --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 +++-- cpp/src/groupby/hash/helpers.cuh | 2 -- cpp/src/groupby/hash/single_pass_functors.cuh | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 9b479eae037..3e961fa1d76 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -69,8 +69,9 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cardinality); auto const next_col_total_size = next_col_size + valid_col_size; - // TODO: it seems early exit will break the followup calculatons. 
To verify - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + if (bytes_allocated + next_col_total_size > total_agg_size) { + CUDF_UNREACHABLE("No enough memory space for shared memory aggregations"); + } shmem_agg_res_offsets[col_end] = bytes_allocated; shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 0d117ca35b3..7879518e660 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,8 +23,6 @@ #include namespace cudf::groupby::detail::hash { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index abf29f098af..28c6ba717f1 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -103,7 +103,6 @@ struct initialize_target_element - // TODO naming __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept From a1d139aad9e461400de5225e9ffee83aded7af96 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:20:10 -0700 Subject: [PATCH 127/135] Add doc --- cpp/src/groupby/hash/create_sparse_results_table.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 416a5cff0d1..8155ce852e0 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -34,7 +34,7 @@ namespace cudf::groupby::detail::hash { * @tparam SetType Type of the key hash set * * @param key_set Key hash set - * TODO + * @param populated_keys Array of unique keys * @param stream CUDA stream used for device memory operations and kernel launches * @return An array of unique keys contained in `key_set` */ From f9f201a76f4e1edae93b5ea2be87e38ccba7ba0e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 28 Oct 2024 10:22:36 -0700 Subject: [PATCH 128/135] Fix leftover --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0f9d5118e13..ce147a988ea 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -376,7 +376,6 @@ add_library( src/groupby/hash/compute_mapping_indices.cu src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_shared_memory_aggs.cu - src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu From fef1ca8a08ba90bbf9b298ca9756903786c6ca44 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 17:51:27 -0700 Subject: [PATCH 129/135] Renaming for clarity + add missing func --- cpp/src/groupby/hash/compute_aggregations.cuh | 4 ++-- .../groupby/hash/compute_shared_memory_aggs.cu | 15 ++++++++++----- .../groupby/hash/compute_shared_memory_aggs.hpp | 4 +++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index f730ef96c5f..e8b29a0e7a8 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ 
-66,9 +66,9 @@ rmm::device_uvector compute_aggregations( auto const grid_size = max_occupancy_grid_size>(num_rows); - auto const available_shmem_size = available_shared_memory_size(grid_size); + auto const available_shmem_size = get_available_shared_memory_size(grid_size); auto const has_sufficient_shmem = - available_shmem_size > (shmem_offsets_size(flattened_values.num_columns()) * 2); + available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 12c02a1865e..3371b667be7 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -275,7 +275,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, } } // namespace -std::size_t available_shared_memory_size(cudf::size_type grid_size) +std::size_t get_available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -287,6 +287,11 @@ std::size_t available_shared_memory_size(cudf::size_type grid_size) ALIGNMENT); } +std::size_t compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} + void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, cudf::size_type num_input_rows, @@ -302,11 +307,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, { // For each aggregation, need one offset determining where the aggregation is // performed, another indicating the validity of the aggregation - auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); + auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2, + CUDF_EXPECTS(available_shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); - auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2; + auto const shmem_agg_size = available_shmem_size - offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -318,6 +323,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_offsets_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 23b9858afa3..65b658a021d 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -22,7 +22,9 @@ #include namespace cudf::groupby::detail::hash { -std::size_t available_shared_memory_size(cudf::size_type grid_size); +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); + +std::size_t compute_shmem_offsets_size(cudf::size_type num_cols); void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, From 8ccd81789af6f0719c82c8c194a22aaa97c48d86 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 18:29:18 -0700 Subject: [PATCH 130/135] Minor fix --- 
cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 3371b667be7..a47ba27558b 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -74,9 +74,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, ALIGNMENT); auto const next_col_total_size = next_col_size + valid_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { - CUDF_UNREACHABLE("Not enough memory for shared memory aggregations"); - } + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } shmem_agg_res_offsets[col_end] = bytes_allocated; shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; From 0c315f8bbe8f696b1a3d55e28935e1faa48ca11e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 4 Nov 2024 13:03:41 -0800 Subject: [PATCH 131/135] Update comments --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index a47ba27558b..c15d8d44127 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -47,9 +47,8 @@ struct size_of_functor { /// Shared memory data alignment CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; -// Prepares shared memory data required by each output column, exits if -// no enough memory space to perform the shared memory aggregation for the -// current output column +// Allocates shared memory required for output columns. Exits if there is insufficient memory to +// perform shared memory aggregation for the current output column. 
__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, From 7131c9f7985f6e5d733217fce64c61d1413194bf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Nov 2024 13:25:11 -0800 Subject: [PATCH 132/135] Apply suggestions from code review Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/groupby/hash/single_pass_functors.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index e22998b01dd..7a9a95f3059 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -108,8 +108,8 @@ struct initialize_shmem { template struct initialize_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index) const noexcept + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } From 5c6b33c3390448da9bf27d2e7b7b8a2cb4de8337 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Nov 2024 13:37:35 -0800 Subject: [PATCH 133/135] Make compute_shmem_offsets_size constexpr --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 ----- cpp/src/groupby/hash/compute_shared_memory_aggs.hpp | 5 ++++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index c15d8d44127..f0361ccced2 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -284,11 +284,6 @@ std::size_t get_available_shared_memory_size(cudf::size_type grid_size) ALIGNMENT); } -std::size_t compute_shmem_offsets_size(cudf::size_type num_cols) -{ - return sizeof(cudf::size_type) * num_cols; -} - void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, cudf::size_type num_input_rows, diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 65b658a021d..346956cdab0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -24,7 +24,10 @@ namespace cudf::groupby::detail::hash { std::size_t get_available_shared_memory_size(cudf::size_type grid_size); -std::size_t compute_shmem_offsets_size(cudf::size_type num_cols); +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, From b05fab40a19c093e087f0cf55e4b8db99ca056fb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Nov 2024 18:03:53 -0800 Subject: [PATCH 134/135] Formatting --- cpp/src/groupby/hash/single_pass_functors.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 7a9a95f3059..572098c75f8 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -108,8 +108,7 @@ struct initialize_shmem { template struct initialize_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view, - cudf::size_type) const noexcept + __device__ 
void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } From 96fbaa97c18988f60ba148c86c2f95add3e8e598 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 7 Nov 2024 17:07:17 -0800 Subject: [PATCH 135/135] Update cpp/src/groupby/hash/single_pass_functors.cuh --- cpp/src/groupby/hash/single_pass_functors.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 572098c75f8..048c9252773 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
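The fallback-signal pattern that patches 117 through 124 converge on can be read in isolation. The stand-alone sketch below is illustrative only and is not part of the patch series: the kernel name detect_overflow, the flag variables, and the per-block cardinality inputs are simplified stand-ins for mapping_indices_kernel, needs_global_memory_fallback, and GROUPBY_CARDINALITY_THRESHOLD, and the zero-fill of the flag's storage is an assumption of this sketch. It shows why cuda::std::atomic_flag replaces the earlier bool*: any number of thread blocks may raise the flag concurrently, and test_and_set() makes those concurrent writes well-defined, whereas simultaneous non-atomic stores to a bool are a data race. It also mirrors the host-side readback adopted in patch 122, which cannot go through device_scalar::value because atomic_flag has no copy constructor.

// Illustrative sketch (not part of the patch series): signaling a device-side
// condition back to the host with cuda::std::atomic_flag.
#include <cuda/std/atomic>

#include <cstdio>

__global__ void detect_overflow(int const* block_cardinality,
                                int threshold,
                                cuda::std::atomic_flag* needs_fallback)
{
  // One thread per block publishes its block's condition. Many blocks may
  // call test_and_set() concurrently; unlike plain stores to a bool, this
  // atomic read-modify-write is well-defined.
  if (threadIdx.x == 0 && block_cardinality[blockIdx.x] >= threshold) {
    needs_fallback->test_and_set();
  }
}

int main()
{
  int const h_cardinality[4] = {10, 50, 200, 30};  // hypothetical per-block counts
  int* d_cardinality{};
  cudaMalloc(&d_cardinality, sizeof(h_cardinality));
  cudaMemcpy(d_cardinality, h_cardinality, sizeof(h_cardinality), cudaMemcpyHostToDevice);

  cuda::std::atomic_flag* d_flag{};
  cudaMalloc(&d_flag, sizeof(cuda::std::atomic_flag));
  // Assumption of this sketch: an all-zero byte pattern is the cleared state.
  cudaMemset(d_flag, 0, sizeof(cuda::std::atomic_flag));

  detect_overflow<<<4, 32>>>(d_cardinality, 128, d_flag);

  // atomic_flag is not copyable, so copy its raw storage back to the host;
  // cudaMemcpy to pageable host memory blocks until the copy (and the prior
  // kernel on the default stream) has completed, so the flag is safe to read.
  cuda::std::atomic_flag h_flag;
  cudaMemcpy(&h_flag, d_flag, sizeof(cuda::std::atomic_flag), cudaMemcpyDefault);
  std::printf("needs global memory fallback: %s\n", h_flag.test() ? "yes" : "no");

  cudaFree(d_cardinality);
  cudaFree(d_flag);
  return 0;
}

Block 2's cardinality (200) exceeds the threshold of 128, so the program prints "needs global memory fallback: yes"; with all counts below the threshold it prints "no". The same decision drives compute_aggregations above: when the flag reads true, rows belonging to over-threshold blocks are re-aggregated through global_memory_fallback_fn in global memory instead of the shared memory path.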