From 1fa441e2d2e1048b5f35b6d92afb5917ad523884 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 19 Aug 2024 12:04:07 -0700
Subject: [PATCH 001/135] Update docs

---
 cpp/src/groupby/hash/groupby.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 35161eada28..741a20f72a3 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -505,7 +505,7 @@ void compute_single_pass_aggs(table_view const& keys,
 
 /**
  * @brief Computes and returns a device vector containing all populated keys in
- * `map`.
+ * `key_set`.
  */
 template <typename SetType>
 rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,

From 65e1b5a5a33a5e8be36692fbd7829dbcd7c12e12 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 19 Aug 2024 12:05:13 -0700
Subject: [PATCH 002/135] Minor improvement

---
 cpp/src/groupby/hash/groupby.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 741a20f72a3..f8a3563a4ad 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -570,7 +570,7 @@ std::unique_ptr<table> groupby(table_view const& keys,
   auto const comparator_helper = [&](auto const d_key_equal) {
     auto const set = cuco::static_set{
       num_keys,
-      0.5,  // desired load factor
+      cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% occupancy
       cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
       d_key_equal,
       probing_scheme_type{d_row_hash},

From c58ddeff1bffdf3054a119b8afa3cd118507f463 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Tue, 20 Aug 2024 15:17:52 -0700
Subject: [PATCH 003/135] Migrate the GQE shared memory groupby to cudf

---
 cpp/src/groupby/hash/groupby.cu           | 590 ++++++++++++--
 cpp/src/groupby/hash/groupby_functors.cuh | 908 ++++++++++++++++++++++
 2 files changed, 1438 insertions(+), 60 deletions(-)
 create mode 100644 cpp/src/groupby/hash/groupby_functors.cuh

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index f8a3563a4ad..a536c48143e 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -14,8 +14,11 @@
 * limitations under the License.
*/ +#include "cuco/types.cuh" +#include "cudf/utilities/error.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "groupby_functors.cuh" +#include "groupby_kernels.cuh" #include #include @@ -30,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -61,8 +65,11 @@ namespace { // TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested // types and `cg_size = 1`for flat data to improve performance +auto constexpr window_size = 1; +auto constexpr cg_size = 1; + using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key + cg_size, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher>; @@ -420,80 +427,550 @@ void sparse_to_dense_results(table_view const& keys, } } +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + // make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, +template +auto create_sparse_results_table(cudf::table_view const& flattened_values, + const cudf::aggregation::Kind* d_aggs, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream) { // TODO single allocation - room for performance improvement - std::vector> sparse_columns; + std::vector> sparse_columns; std::transform( flattened_values.begin(), flattened_values.end(), aggs.begin(), std::back_inserter(sparse_columns), [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - + bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type()
                        : col.type();
      return make_fixed_width_column(
        cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
    });
  cudf::table sparse_table(std::move(sparse_columns));
  // If there are no direct aggregations, initialize the sparse table
  // only for the keys that were inserted into the global hash set
  if (!direct_aggregations) {
    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
    extract_populated_keys(global_set, populated_keys, stream);
    thrust::for_each_n(rmm::exec_policy(stream),
                       thrust::make_counting_iterator(0),
                       populated_keys.size(),
                       initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs});
  }
  // Otherwise, initialize the whole table
  else {
    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
    cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
  }
  return sparse_table;
}

template <typename SetType>
__device__ void find_local_mapping(cudf::size_type cur_idx,
                                   cudf::size_type num_input_rows,
                                   cudf::size_type* cardinality,
                                   SetType shared_set,
                                   cudf::size_type* local_mapping_index,
                                   cudf::size_type* shared_set_indices)
{
  cudf::size_type result_idx;
  bool inserted;
  if (cur_idx < num_input_rows) {
    auto const result = shared_set.insert_and_find(cur_idx);
    result_idx        = *result.first;
    inserted          = result.second;
    // inserted a new element
    if (result.second) {
      auto shared_set_index                = atomicAdd(cardinality, 1);
      shared_set_indices[shared_set_index] = cur_idx;
      local_mapping_index[cur_idx]         = shared_set_index;
    }
  }
  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to
  // all threads in the thread block.
  __syncthreads();
  if (cur_idx < num_input_rows) {
    // element was already in the set
    if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; }
  }
}

template <typename SetType>
__device__ void find_global_mapping(cudf::size_type cur_idx,
                                    SetType global_set,
                                    cudf::size_type* shared_set_indices,
                                    cudf::size_type* global_mapping_index,
                                    cudf::size_type shared_set_num_elements)
{
  auto input_idx = shared_set_indices[cur_idx];
  auto result    = global_set.insert_and_find(input_idx);
  global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first;
}

/*
 * Inserts keys into the shared memory hash set, and stores the row index of the local
 * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a
 * thread block exceeds `cardinality_threshold`, the threads in that block exit without
 * updating `global_set` or setting `global_mapping_index`. Otherwise, the unique keys found
 * are inserted into the global hash set, and the row index of the global sparse table is
 * saved in `global_mapping_index`.
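 *
 * As an illustrative example, using the constants chosen in `compute_single_pass_set_aggs`
 * below (`block_size` = 128, `cardinality_threshold` = 128): a thread block that encounters
 * at most 128 distinct keys performs its pre-aggregation entirely in shared memory, while a
 * higher-cardinality block falls back to direct atomic aggregation on the global table (see
 * `compute_direct_aggregates`).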
 */
template <class SetRef,
          cudf::size_type shared_set_num_elements,
          cudf::size_type cardinality_threshold,
          class GlobalSetType,
          class WindowExtent,
          class KeyEqual,
          class RowHasher>
__global__ void compute_mapping_indices(GlobalSetType global_set,
                                        cudf::size_type num_input_rows,
                                        WindowExtent window_extent,
                                        KeyEqual d_key_equal,
                                        RowHasher d_row_hash,
                                        cudf::size_type* local_mapping_index,
                                        cudf::size_type* global_mapping_index,
                                        cudf::size_type* block_cardinality,
                                        bool* direct_aggregations)
{
  __shared__ cudf::size_type shared_set_indices[shared_set_num_elements];

  // Shared set initialization
  __shared__ typename SetRef::window_type windows[window_extent.value()];
  auto storage    = SetRef::storage_ref_type(window_extent, windows);
  auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
                           d_key_equal,
                           probing_scheme_type{d_row_hash},
                           {},
                           storage);
  auto const block = cooperative_groups::this_thread_block();
  shared_set.initialize(block);
  block.sync();

  auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find);

  __shared__ cudf::size_type cardinality;

  if (threadIdx.x == 0) { cardinality = 0; }

  __syncthreads();

  int num_loops = cudf::util::div_rounding_up_safe(
    num_input_rows, static_cast<cudf::size_type>(blockDim.x * gridDim.x));
  auto end_idx = num_loops * blockDim.x * gridDim.x;

  for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx;
       cur_idx += blockDim.x * gridDim.x) {
    find_local_mapping(cur_idx,
                       num_input_rows,
                       &cardinality,
                       shared_insert_ref,
                       local_mapping_index,
                       shared_set_indices);

    __syncthreads();

    if (cardinality >= cardinality_threshold) {
      if (threadIdx.x == 0) { *direct_aggregations = true; }
      break;
    }

    __syncthreads();
  }

  // Insert unique keys from shared to global hash set
  if (cardinality < cardinality_threshold) {
    for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) {
      find_global_mapping(
        cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements);
    }
  }

  if (threadIdx.x == 0) { block_cardinality[blockIdx.x] = cardinality; }
}

int find_num_sms()
{
  int dev_id{-1};
  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
  int num_sms{-1};
  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
  return num_sms;
}

template <typename FuncType>
int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows, int num_sms)
{
  int max_active_blocks{-1};
  CUDF_CUDA_TRY(
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, block_size, 0));
  auto max_grid_size       = max_active_blocks * num_sms;
  int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, block_size);
  return std::min(max_grid_size, needed_active_blocks);
}

__device__ __host__ size_t round_to_multiple_of_8(size_t num)
{
  size_t constexpr multiple_of = 8;
  return cudf::util::div_rounding_up_safe(num, multiple_of) * multiple_of;
}

__device__ void calculate_columns_to_aggregate(int& col_start,
                                               int& col_end,
                                               cudf::mutable_table_device_view output_values,
                                               int num_input_cols,
                                               std::byte** s_aggregates_pointer,
                                               bool** s_aggregates_valid_pointer,
                                               std::byte* shared_set_aggregates,
                                               cudf::size_type cardinality,
                                               int total_agg_size)
{
  if (threadIdx.x == 0) {
    col_start           = col_end;
    int bytes_allocated = 0;
    int valid_col_size  = round_to_multiple_of_8(sizeof(bool) * cardinality);
    while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) {
      int next_col_size =
        round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality);
      int next_col_total_size = valid_col_size +
next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* aggs) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + aggs[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } +} + +template +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + aggs[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +template +__global__ void compute_aggregates(cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) +{ + cudf::size_type cardinality = block_cardinality[blockIdx.x]; + if (cardinality >= cardinality_threshold) { return; } + int num_input_cols = output_values.num_columns(); + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** 
s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + __shared__ int col_start; + __shared__ int col_end; + if (threadIdx.x == 0) { + col_start = 0; + col_end = 0; + } + __syncthreads(); + while (col_end < num_input_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_input_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + __syncthreads(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + aggs); + __syncthreads(); + compute_pre_aggregrates(col_start, + col_end, + input_values, + num_input_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + } +} + +size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +size_t find_shmem_size(FuncType func, int block_size, int grid_size, int num_sms) +{ + auto active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, num_sms); + + size_t dynamic_smem_size; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_smem_size, func, active_blocks_per_sm, block_size)); + return get_previous_multiple_of_8(0.5 * dynamic_smem_size); +} + +template +void launch_compute_aggregates(int block_size, + int grid_size, + int num_sms, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) +{ + auto compute_aggregates_fn_ptr = + compute_aggregates; + size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, block_size, grid_size, num_sms); + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; + compute_aggregates + <<>>(local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + num_input_rows, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); +} + /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) +template +rmm::device_uvector compute_single_pass_set_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + rmm::cuda_stream_view stream, + KeyEqual d_key_equal, + RowHasher d_row_hash) { + auto constexpr block_size = 128; + auto constexpr cardinality_threshold = 128; + + auto const num_input_rows = keys.num_rows(); + + // We add additional `block_size`, because after the 
number of elements in the local hash set + // exceeds the threshold, all threads in the thread block can still insert one more element. + auto constexpr shared_set_num_elements = cardinality_threshold + block_size; + // shared_set_num_elements with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(shared_set_num_elements) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + int num_sms = find_num_sms(); + auto compute_mapping_indices_fn_ptr = compute_mapping_indices; + int grid_size = + find_grid_size(compute_mapping_indices_fn_ptr, block_size, num_input_rows, num_sms); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * shared_set_num_elements, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + d_key_equal, + d_row_hash, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + // flatten the aggs to a table that can be operated on by aggregate_row auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_aggs.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); // prepare to launch kernel to do the actual aggregation auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - auto row_bitmask = - skip_key_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + auto d_values = table_device_view::create(flattened_values, stream); + + launch_compute_aggregates( + block_size, + grid_size, + num_sms, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + num_input_rows, + d_aggs.data(), + stream); + + if (direct_aggregations.value(stream)) { + int stride = block_size * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_aggs.data(), + block_cardinality.data(), + stride, + block_size, + cardinality_threshold}); + extract_populated_keys(global_set, populated_keys, stream); + } + // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { @@ -501,6 +978,8 @@ void compute_single_pass_aggs(table_view const& keys, sparse_results->add_result( flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); } + + return populated_keys; } /** @@ -580,17 +1059,8 @@ std::unique_ptr
groupby(table_view const& keys, stream.value()}; // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + auto gather_map = compute_single_pass_set_aggs( + keys, requests, &sparse_results, set, stream, d_key_equal, d_row_hash); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, diff --git a/cpp/src/groupby/hash/groupby_functors.cuh b/cpp/src/groupby/hash/groupby_functors.cuh new file mode 100644 index 00000000000..5630e838272 --- /dev/null +++ b/cpp/src/groupby/hash/groupby_functors.cuh @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if 
(target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem()>> { + __device__ 
void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +// TODO: VALID and ALL have same code +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), 
cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + update_target_element_gmem{}( + target, target_index, source_column, source, source_index, source_null); + } +}; + +template +struct update_target_element_shmem { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + using DeviceTarget = cudf::device_storage_type_t; + using 
DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { 
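      // note: the shared memory validity flag uses null == true semantics, so clearing
      // it here marks the slot as holding a valid aggregate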
target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + 
source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct shmem_element_aggregator { + template + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + update_target_element_shmem{}( + target, target_index, target_null, source, source_index); + } +}; + +template +__device__ constexpr bool is_supported() +{ + return cudf::is_fixed_width() and + ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::MIN) or + (k == cudf::aggregation::MAX) or (k == cudf::aggregation::COUNT_VALID) or + (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::ARGMAX) or + (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::SUM_OF_SQUARES) or + (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or + (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported()); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + using DeviceType = cudf::device_storage_type_t; + return cudf::detail::corresponding_operator_t::template identity(); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator"); +} + +template +__device__ T get_identity() +{ + if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) { + if constexpr (cudf::is_timestamp()) + return k == cudf::aggregation::ARGMAX + ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} + : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)}; + else { + using DeviceType = cudf::device_storage_type_t; + return k == cudf::aggregation::ARGMAX + ? static_cast(cudf::detail::ARGMAX_SENTINEL) + : static_cast(cudf::detail::ARGMIN_SENTINEL); + } + } + return identity_from_operator(); +} + +template +struct initialize_target_element { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +// TODO: are the conditions correctly checked? 
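// Initializes a shared memory aggregate slot to the identity of its operator: ARGMIN/ARGMAX
// slots get their sentinel indices, and the remaining operators get the identity from
// `corresponding_operator_t` (see `get_identity` above). COUNT_VALID and COUNT_ALL outputs
// start out valid (their identity is zero); every other aggregate starts out "null" until a
// value is aggregated into it.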
+template +struct initialize_target_element()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); + target_casted[target_index] = get_identity(); + + if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) { + target_null[target_index] = false; + } else { + target_null[target_index] = true; + } + } +}; + +struct initialize_shmem { + template + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null) const noexcept + { + // TODO: typecasting work for every datatype + + initialize_target_element{}(target, target_index, target_null); + } +}; + +template +struct initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct compute_direct_aggregates { + SetType set; + cudf::table_device_view input_values; + cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + int stride; + int block_size; + cudf::size_type cardinality_threshold; + compute_direct_aggregates(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + int stride, + int block_size, + cudf::size_type cardinality_threshold) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + block_cardinality(block_cardinality), + stride(stride), + block_size(block_size), + cardinality_threshold(cardinality_threshold) + { + } + 
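  // Fallback path for high-cardinality blocks: rows whose thread block exceeded the
  // cardinality threshold skip the shared memory pre-aggregation and are aggregated
  // directly into the global (sparse) output table through the global hash set.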
__device__ void operator()(cudf::size_type i) + { + int block_id = (i % stride) / block_size; + if (block_cardinality[block_id] >= cardinality_threshold) { + auto const result = set.insert_and_find(i); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; + +} // namespace cudf::groupby::detail::hash From d604d0a75fc55e49a1167c1606f83aa5e470c31f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Aug 2024 17:52:46 -0700 Subject: [PATCH 004/135] Many cleanups --- cpp/src/groupby/hash/groupby.cu | 328 +----------------- cpp/src/groupby/hash/groupby_kernels.cuh | 111 ------ cpp/src/groupby/hash/helpers.cuh | 45 +++ cpp/src/groupby/hash/kernels.cuh | 324 +++++++++++++++++ ...ss_kernels.cuh => multi_pass_functors.cuh} | 0 ..._functors.cuh => single_pass_functors.cuh} | 77 ++++ 6 files changed, 453 insertions(+), 432 deletions(-) delete mode 100644 cpp/src/groupby/hash/groupby_kernels.cuh create mode 100644 cpp/src/groupby/hash/helpers.cuh create mode 100644 cpp/src/groupby/hash/kernels.cuh rename cpp/src/groupby/hash/{multi_pass_kernels.cuh => multi_pass_functors.cuh} (100%) rename cpp/src/groupby/hash/{groupby_functors.cuh => single_pass_functors.cuh} (91%) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index a536c48143e..92e8f4a6f8b 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "cuco/types.cuh" -#include "cudf/utilities/error.hpp" #include "groupby/common/utils.hpp" -#include "groupby_functors.cuh" -#include "groupby_kernels.cuh" +#include "helpers.cuh" +#include "kernels.cuh" +#include "multi_pass_functors.cuh" +#include "single_pass_functors.cuh" #include #include @@ -28,12 +28,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -49,7 +47,6 @@ #include #include -#include #include #include @@ -63,16 +60,6 @@ namespace detail { namespace hash { namespace { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -auto constexpr window_size = 1; -auto constexpr cg_size = 1; - -using probing_scheme_type = cuco::linear_probing< - cg_size, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -485,128 +472,6 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, return sparse_table; } -template -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - cudf::size_type* cardinality, - SetType shared_set, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - bool inserted; - if (cur_idx < num_input_rows) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. 
- __syncthreads(); - if (cur_idx < num_input_rows) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index, - cudf::size_type shared_set_num_elements) -{ - auto input_idx = shared_set_indices[cur_idx]; - auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `cardinality_threshold`, the threads in that block will exit without updating - * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the - * global hash set, and save the row index of the global sparse table in `global_mapping_index`. - */ -template -__global__ void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - KeyEqual d_key_equal, - RowHasher d_row_hash, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - __shared__ cudf::size_type shared_set_indices[shared_set_num_elements]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - block.sync(); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - - if (threadIdx.x == 0) { cardinality = 0; } - - __syncthreads(); - - int num_loops = - cudf::util::div_rounding_up_safe(num_input_rows, (cudf::size_type)(blockDim.x * gridDim.x)); - auto end_idx = num_loops * blockDim.x * gridDim.x; - - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx; - cur_idx += blockDim.x * gridDim.x) { - find_local_mapping(cur_idx, - num_input_rows, - &cardinality, - shared_insert_ref, - local_mapping_index, - shared_set_indices); - - __syncthreads(); - - if (cardinality >= cardinality_threshold) { - if (threadIdx.x == 0) { *direct_aggregations = true; } - break; - } - - __syncthreads(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < cardinality_threshold) { - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - find_global_mapping( - cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements); - } - } - - if (threadIdx.x == 0) block_cardinality[blockIdx.x] = cardinality; -} - int find_num_sms() { int dev_id{-1}; @@ -615,6 +480,7 @@ int find_num_sms() CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); return num_sms; } + template int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows, int num_sms) { @@ -626,186 +492,6 @@ int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows return std::min(max_grid_size, needed_active_blocks); } -__device__ 
__host__ size_t round_to_multiple_of_8(size_t num) -{ - size_t constexpr multiple_of = 8; - return cudf::util::div_rounding_up_safe(num, multiple_of) * multiple_of; -} - -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* aggs) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - aggs[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); - } - } -} - -template -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - aggs[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - 
s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -template -__global__ void compute_aggregates(cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) -{ - cudf::size_type cardinality = block_cardinality[blockIdx.x]; - if (cardinality >= cardinality_threshold) { return; } - int num_input_cols = output_values.num_columns(); - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - __shared__ int col_start; - __shared__ int col_end; - if (threadIdx.x == 0) { - col_start = 0; - col_end = 0; - } - __syncthreads(); - while (col_end < num_input_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_input_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - __syncthreads(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - aggs); - __syncthreads(); - compute_pre_aggregrates(col_start, - col_end, - input_values, - num_input_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - __syncthreads(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - __syncthreads(); - } -} - size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template @@ -885,7 +571,7 @@ rmm::device_uvector compute_single_pass_set_aggs( typename SetType::key_equal, probing_scheme_type, cuco::cuda_allocator, - cuco::storage>; + cuco::storage>; using shared_set_ref_type = typename shared_set_type::ref_type<>; auto constexpr window_extent = cuco::make_window_extent(extent_type{}); @@ -1054,7 +740,7 @@ std::unique_ptr
groupby(table_view const& keys, d_key_equal, probing_scheme_type{d_row_hash}, cuco::thread_scope_device, - cuco::storage<1>{}, + cuco::storage{}, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh deleted file mode 100644 index 9abfe22950a..00000000000 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "multi_pass_kernels.cuh" - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { -/** - * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, - * and populate `set` with indices of unique keys - * - * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If - * the index was not present in the set, insert they index and then copy it to the output. If the - * key was already present in the set, then the inserted index is aggregated with the existing row. - * This aggregation is done for every element `j` in the row by applying aggregation operation `j` - * between the new and existing element. - * - * Instead of storing the entire rows from `input_keys` and `input_values` in - * the hashset, we instead store the row indices. For example, when inserting - * row at index `i` from `input_keys` into the hash set, the value `i` is what - * gets stored for the hash set's "key". It is assumed the `set` was constructed - * with a custom comparator that uses these row indices to check for equality - * between key rows. For example, comparing two keys `k0` and `k1` will compare - * the two rows `input_keys[k0] ?= input_keys[k1]` - * - * The exact size of the result is not known a priori, but can be upper bounded - * by the number of rows in `input_keys` & `input_values`. Therefore, it is - * assumed `output_values` has sufficient storage for an equivalent number of - * rows. In this way, after all rows are aggregated, `output_values` will likely - * be "sparse", meaning that not all rows contain the result of an aggregation. - * - * @tparam SetType The type of the hash set device ref - */ -template -struct compute_single_pass_aggs_fn { - SetType set; - table_device_view input_values; - mutable_table_device_view output_values; - aggregation::Kind const* __restrict__ aggs; - bitmask_type const* __restrict__ row_bitmask; - bool skip_rows_with_nulls; - - /** - * @brief Construct a new compute_single_pass_aggs_fn functor object - * - * @param set_ref Hash set object to insert key,value pairs into. - * @param input_values The table whose rows will be aggregated in the values - * of the hash set - * @param output_values Table that stores the results of aggregating rows of - * `input_values`. 
- * @param aggs The set of aggregation operations to perform across the
- * columns of the `input_values` rows
- * @param row_bitmask Bitmask where bit `i` indicates the presence of a null
- * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true`
- * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing
- * null values should be skipped. It `true`, it is assumed `row_bitmask` is a
- * bitmask where bit `i` indicates the presence of a null value in row `i`.
- */
-  compute_single_pass_aggs_fn(SetType set,
-                              table_device_view input_values,
-                              mutable_table_device_view output_values,
-                              aggregation::Kind const* aggs,
-                              bitmask_type const* row_bitmask,
-                              bool skip_rows_with_nulls)
-    : set(set),
-      input_values(input_values),
-      output_values(output_values),
-      aggs(aggs),
-      row_bitmask(row_bitmask),
-      skip_rows_with_nulls(skip_rows_with_nulls)
-  {
-  }
-
-  __device__ void operator()(size_type i)
-  {
-    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
-      auto const result = set.insert_and_find(i);
-
-      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
-    }
-  }
-};
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
new file mode 100644
index 00000000000..32aca69accf
--- /dev/null
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace cudf::groupby::detail::hash {
+
+CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num)
+{
+  std::size_t constexpr base = 8;
+  return cudf::util::div_rounding_up_safe(num, base) * base;
+}
+
+// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested
+// types and `cg_size = 1` for flat data to improve performance
+/// Number of threads to handle each input element
+CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
+/// Number of slots per thread
+CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
+
+using probing_scheme_type = cuco::linear_probing<
+  GROUPBY_CG_SIZE,
+  cudf::experimental::row::hash::device_row_hasher>;
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh
new file mode 100644
index 00000000000..a50083f2082
--- /dev/null
+++ b/cpp/src/groupby/hash/kernels.cuh
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cudf/types.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include + +namespace cudf::groupby::detail::hash { + +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* aggs) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + aggs[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } +} + +template +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto 
out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + aggs[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +template +CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) +{ + cudf::size_type cardinality = block_cardinality[blockIdx.x]; + if (cardinality >= cardinality_threshold) { return; } + int num_input_cols = output_values.num_columns(); + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + __shared__ int col_start; + __shared__ int col_end; + if (threadIdx.x == 0) { + col_start = 0; + col_end = 0; + } + __syncthreads(); + while (col_end < num_input_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_input_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + __syncthreads(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + aggs); + __syncthreads(); + compute_pre_aggregrates(col_start, + col_end, + input_values, + num_input_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + __syncthreads(); + } +} + +template +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + cudf::size_type* cardinality, + SetType shared_set, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + bool inserted; + if (cur_idx < num_input_rows) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. 
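+  // (Descriptive note, added for clarity: only the thread that wins the insertion writes the
+  // `local_mapping_index` entry for its row, so threads that found a duplicate may read that
+  // entry only after this barrier.)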
+ __syncthreads(); + if (cur_idx < num_input_rows) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index, + cudf::size_type shared_set_num_elements) +{ + auto input_idx = shared_set_indices[cur_idx]; + auto result = global_set.insert_and_find(input_idx); + global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a + * threadblock exceeds `cardinality_threshold`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. + */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + KeyEqual d_key_equal, + RowHasher d_row_hash, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + __shared__ cudf::size_type shared_set_indices[shared_set_num_elements]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_key_equal, + probing_scheme_type{d_row_hash}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + block.sync(); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + + if (threadIdx.x == 0) { cardinality = 0; } + + __syncthreads(); + + int num_loops = + cudf::util::div_rounding_up_safe(num_input_rows, (cudf::size_type)(blockDim.x * gridDim.x)); + auto end_idx = num_loops * blockDim.x * gridDim.x; + + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx; + cur_idx += blockDim.x * gridDim.x) { + find_local_mapping(cur_idx, + num_input_rows, + &cardinality, + shared_insert_ref, + local_mapping_index, + shared_set_indices); + + __syncthreads(); + + if (cardinality >= cardinality_threshold) { + if (threadIdx.x == 0) { *direct_aggregations = true; } + break; + } + + __syncthreads(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < cardinality_threshold) { + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + find_global_mapping( + cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements); + } + } + + if (threadIdx.x == 0) block_cardinality[blockIdx.x] = cardinality; +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_functors.cuh similarity index 100% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/multi_pass_functors.cuh diff --git a/cpp/src/groupby/hash/groupby_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh similarity index 91% rename from cpp/src/groupby/hash/groupby_functors.cuh rename 
to cpp/src/groupby/hash/single_pass_functors.cuh
index 5630e838272..2b92ed63098 100644
--- a/cpp/src/groupby/hash/groupby_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -21,6 +21,7 @@
 #include 
 
 namespace cudf::groupby::detail::hash {
+
 template 
+struct compute_single_pass_aggs_fn {
+  SetType set;
+  table_device_view input_values;
+  mutable_table_device_view output_values;
+  aggregation::Kind const* __restrict__ aggs;
+  bitmask_type const* __restrict__ row_bitmask;
+  bool skip_rows_with_nulls;
+
+  /**
+   * @brief Construct a new compute_single_pass_aggs_fn functor object
+   *
+   * @param set Hash set object to insert key,value pairs into.
+   * @param input_values The table whose rows will be aggregated in the values
+   * of the hash set
+   * @param output_values Table that stores the results of aggregating rows of
+   * `input_values`.
+   * @param aggs The set of aggregation operations to perform across the
+   * columns of the `input_values` rows
+   * @param row_bitmask Bitmask where bit `i` indicates the presence of a null
+   * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true`
+   * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing
+   * null values should be skipped. If `true`, it is assumed `row_bitmask` is a
+   * bitmask where bit `i` indicates the presence of a null value in row `i`.
+   */
+  compute_single_pass_aggs_fn(SetType set,
+                              table_device_view input_values,
+                              mutable_table_device_view output_values,
+                              aggregation::Kind const* aggs,
+                              bitmask_type const* row_bitmask,
+                              bool skip_rows_with_nulls)
+    : set(set),
+      input_values(input_values),
+      output_values(output_values),
+      aggs(aggs),
+      row_bitmask(row_bitmask),
+      skip_rows_with_nulls(skip_rows_with_nulls)
+  {
+  }
+
+  __device__ void operator()(size_type i)
+  {
+    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
+      auto const result = set.insert_and_find(i);
+
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
+    }
+  }
+};
+
 }  // namespace cudf::groupby::detail::hash

From 9ab1c0229fdb0e3f0014ee089f84d723be017fef Mon Sep 17 00:00:00 2001
From: Yunsong Wang 
Date: Tue, 20 Aug 2024 17:56:52 -0700
Subject: [PATCH 005/135] Minor cleanups: use CCCL traits in device APIs

---
 cpp/src/groupby/hash/multi_pass_functors.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/groupby/hash/multi_pass_functors.cuh b/cpp/src/groupby/hash/multi_pass_functors.cuh
index 7043eafdc10..6fbec5fe19e 100644
--- a/cpp/src/groupby/hash/multi_pass_functors.cuh
+++ b/cpp/src/groupby/hash/multi_pass_functors.cuh
@@ -25,6 +25,7 @@
 #include 
 #include 
+#include 
 
 #include 
 
@@ -64,17 +65,15 @@ struct var_hash_functor {
   }
 
   template 
-  __device__ std::enable_if_t()> operator()(column_device_view const& source,
-                                            size_type source_index,
-                                            size_type target_index) noexcept
+  __device__ cuda::std::enable_if_t()> operator()(
+    column_device_view const& source, size_type source_index, size_type target_index) noexcept
  {
     CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination.");
   }
 
   template 
-  __device__ std::enable_if_t()> operator()(column_device_view const& source,
-                                            size_type source_index,
-                                            size_type target_index) noexcept
+  __device__ cuda::std::enable_if_t()> operator()(
+    column_device_view const& source, size_type source_index, size_type target_index) noexcept
   {
     using Target  = target_type_t;
     using SumType = target_type_t;
@@ -93,6 +92,7 @@ struct var_hash_functor {
     if (target_has_nulls and
target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { From db1b26ab3585a956a35a4a5e74ff56885dcaaa96 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Aug 2024 18:27:49 -0700 Subject: [PATCH 006/135] Move more constexpr to the helper --- cpp/src/groupby/hash/groupby.cu | 91 ++++++++----------- cpp/src/groupby/hash/helpers.cuh | 27 ++++-- cpp/src/groupby/hash/kernels.cuh | 46 ++++------ cpp/src/groupby/hash/single_pass_functors.cuh | 9 +- 4 files changed, 83 insertions(+), 90 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 92e8f4a6f8b..36f7bd08a37 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -505,7 +505,6 @@ size_t find_shmem_size(FuncType func, int block_size, int grid_size, int num_sms return get_previous_multiple_of_8(0.5 * dynamic_smem_size); } -template void launch_compute_aggregates(int block_size, int grid_size, int num_sms, @@ -518,8 +517,7 @@ void launch_compute_aggregates(int block_size, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { - auto compute_aggregates_fn_ptr = - compute_aggregates; + auto compute_aggregates_fn_ptr = compute_aggregates; size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, block_size, grid_size, num_sms); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -527,16 +525,15 @@ void launch_compute_aggregates(int block_size, round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; - compute_aggregates - <<>>(local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - num_input_rows, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); + compute_aggregates<<>>(local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + num_input_rows, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); } /** @@ -553,17 +550,9 @@ rmm::device_uvector compute_single_pass_set_aggs( KeyEqual d_key_equal, RowHasher d_row_hash) { - auto constexpr block_size = 128; - auto constexpr cardinality_threshold = 128; - - auto const num_input_rows = keys.num_rows(); - - // We add additional `block_size`, because after the number of elements in the local hash set - // exceeds the threshold, all threads in the thread block can still insert one more element. 
- auto constexpr shared_set_num_elements = cardinality_threshold + block_size; - // shared_set_num_elements with 0.7 occupancy + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy auto constexpr shared_set_capacity = - static_cast(static_cast(shared_set_num_elements) * 1.43); + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); using extent_type = cuco::extent; using shared_set_type = cuco::static_set compute_single_pass_set_aggs( using shared_set_ref_type = typename shared_set_type::ref_type<>; auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + auto const num_input_rows = keys.num_rows(); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); int num_sms = find_num_sms(); auto compute_mapping_indices_fn_ptr = compute_mapping_indices; int grid_size = - find_grid_size(compute_mapping_indices_fn_ptr, block_size, num_input_rows, num_sms); + find_grid_size(compute_mapping_indices_fn_ptr, GROUPBY_BLOCK_SIZE, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); // 'global_mapping_index' maps from the local pre-aggregate table to the row index of // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * shared_set_num_elements, + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - d_key_equal, - d_row_hash, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + d_key_equal, + d_row_hash, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); stream.synchronize(); // 'populated_keys' contains inserted row_indices (keys) of global hash set @@ -628,21 +617,20 @@ rmm::device_uvector compute_single_pass_set_aggs( auto d_values = table_device_view::create(flattened_values, stream); - launch_compute_aggregates( - block_size, - grid_size, - num_sms, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - num_input_rows, - d_aggs.data(), - stream); + launch_compute_aggregates(GROUPBY_BLOCK_SIZE, + grid_size, + num_sms, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + num_input_rows, + d_aggs.data(), + stream); if (direct_aggregations.value(stream)) { - int stride = block_size * grid_size; + int stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), keys.num_rows(), @@ -652,8 +640,7 @@ rmm::device_uvector compute_single_pass_set_aggs( d_aggs.data(), block_cardinality.data(), stride, - block_size, - cardinality_threshold}); + GROUPBY_BLOCK_SIZE}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 32aca69accf..9e5e628966c 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -24,22 +24,37 @@ namespace cudf::groupby::detail::hash { -CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) 
-{ - std::size_t constexpr base = 8; - return cudf::util::div_rounding_up_safe(num, base) * base; -} - // TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested // types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + /// Number of slots per thread CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; +/// Probing scheme type used by groupby hash table using probing_scheme_type = cuco::linear_probing< GROUPBY_CG_SIZE, cudf::experimental::row::hash::device_row_hasher>; +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add additional `block_size`, because after the number of elements in the local hash set +// exceeds the threshold, all threads in the thread block can still insert one more element. +/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) +{ + std::size_t constexpr base = 8; + return cudf::util::div_rounding_up_safe(num, base) * base; +} + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index a50083f2082..aee3b416b2d 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -101,7 +101,6 @@ __device__ void compute_pre_aggregrates(int col_start, } } -template __device__ void compute_final_aggregates(int col_start, int col_end, cudf::table_device_view input_values, @@ -113,7 +112,7 @@ __device__ void compute_final_aggregates(int col_start, cudf::aggregation::Kind const* aggs) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx]; + auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto output_col = output_values.column(col_idx); @@ -132,7 +131,6 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -template CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -144,7 +142,7 @@ CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, int pointer_size) { cudf::size_type cardinality = block_cardinality[blockIdx.x]; - if (cardinality >= cardinality_threshold) { return; } + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } int num_input_cols = output_values.num_columns(); extern __shared__ std::byte shared_set_aggregates[]; std::byte** s_aggregates_pointer = @@ -186,15 +184,15 @@ CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, s_aggregates_valid_pointer, aggs); __syncthreads(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + 
global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); __syncthreads(); } } @@ -233,24 +231,21 @@ template __device__ void find_global_mapping(cudf::size_type cur_idx, SetType global_set, cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index, - cudf::size_type shared_set_num_elements) + cudf::size_type* global_mapping_index) { auto input_idx = shared_set_indices[cur_idx]; auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * shared_set_num_elements + cur_idx] = *result.first; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; } /* * Inserts keys into the shared memory hash set, and stores the row index of the local * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `cardinality_threshold`, the threads in that block will exit without updating - * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the - * global hash set, and save the row index of the global sparse table in `global_mapping_index`. + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. */ template = cardinality_threshold) { + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { if (threadIdx.x == 0) { *direct_aggregations = true; } break; } @@ -311,10 +306,9 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, } // Insert unique keys from shared to global hash set - if (cardinality < cardinality_threshold) { + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - find_global_mapping( - cur_idx, global_set, shared_set_indices, global_mapping_index, shared_set_num_elements); + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); } } diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 2b92ed63098..324a2286a3e 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -877,29 +877,26 @@ struct compute_direct_aggregates { cudf::size_type* block_cardinality; int stride; int block_size; - cudf::size_type cardinality_threshold; compute_direct_aggregates(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, int stride, - int block_size, - cudf::size_type cardinality_threshold) + int block_size) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), stride(stride), - block_size(block_size), - cardinality_threshold(cardinality_threshold) + block_size(block_size) { } __device__ void operator()(cudf::size_type i) { int block_id = (i % stride) / block_size; - if (block_cardinality[block_id] >= cardinality_threshold) { + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } From 9993283b9ef3805e08a890e9d9d0c755f8a7ad05 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Aug 2024 18:58:59 -0700 Subject: [PATCH 
007/135] More cleanups with constexprs --- cpp/src/groupby/hash/groupby.cu | 43 +++++++++---------- cpp/src/groupby/hash/single_pass_functors.cuh | 9 ++-- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 36f7bd08a37..41dff4e5d0d 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -482,31 +482,30 @@ int find_num_sms() } template -int find_grid_size(FuncType func, int block_size, cudf::size_type num_input_rows, int num_sms) +int find_grid_size(FuncType func, cudf::size_type num_input_rows, int num_sms) { int max_active_blocks{-1}; CUDF_CUDA_TRY( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, block_size, 0)); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, GROUPBY_BLOCK_SIZE, 0)); auto max_grid_size = max_active_blocks * num_sms; - int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, block_size); + int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, GROUPBY_BLOCK_SIZE); return std::min(max_grid_size, needed_active_blocks); } size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template -size_t find_shmem_size(FuncType func, int block_size, int grid_size, int num_sms) +size_t find_shmem_size(FuncType func, int grid_size, int num_sms) { auto active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, num_sms); size_t dynamic_smem_size; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_smem_size, func, active_blocks_per_sm, block_size)); + &dynamic_smem_size, func, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_smem_size); } -void launch_compute_aggregates(int block_size, - int grid_size, +void launch_compute_aggregates(int grid_size, int num_sms, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, @@ -518,22 +517,23 @@ void launch_compute_aggregates(int block_size, rmm::cuda_stream_view stream) { auto compute_aggregates_fn_ptr = compute_aggregates; - size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, block_size, grid_size, num_sms); + size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, grid_size, num_sms); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; - compute_aggregates<<>>(local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - num_input_rows, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); + compute_aggregates<<>>( + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + num_input_rows, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); } /** @@ -574,8 +574,7 @@ rmm::device_uvector compute_single_pass_set_aggs( KeyEqual, RowHasher, decltype(window_extent)>; - int grid_size = - find_grid_size(compute_mapping_indices_fn_ptr, GROUPBY_BLOCK_SIZE, num_input_rows, num_sms); + int grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector 
local_mapping_index(num_input_rows, stream); @@ -617,8 +616,7 @@ rmm::device_uvector compute_single_pass_set_aggs( auto d_values = table_device_view::create(flattened_values, stream); - launch_compute_aggregates(GROUPBY_BLOCK_SIZE, - grid_size, + launch_compute_aggregates(grid_size, num_sms, local_mapping_index.data(), global_mapping_index.data(), @@ -639,8 +637,7 @@ rmm::device_uvector compute_single_pass_set_aggs( *d_sparse_table, d_aggs.data(), block_cardinality.data(), - stride, - GROUPBY_BLOCK_SIZE}); + stride}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 324a2286a3e..170539576d1 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -876,26 +876,23 @@ struct compute_direct_aggregates { cudf::aggregation::Kind const* __restrict__ aggs; cudf::size_type* block_cardinality; int stride; - int block_size; compute_direct_aggregates(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, - int stride, - int block_size) + int stride) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), - stride(stride), - block_size(block_size) + stride(stride) { } __device__ void operator()(cudf::size_type i) { - int block_id = (i % stride) / block_size; + int block_id = (i % stride) / GROUPBY_BLOCK_SIZE; if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); From c96d02cb8168cc7b9ff2997d2febc2a02c613aa4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 09:42:56 -0700 Subject: [PATCH 008/135] Add doc --- cpp/src/groupby/hash/helpers.cuh | 3 +++ cpp/src/groupby/hash/single_pass_functors.cuh | 1 + 2 files changed, 4 insertions(+) diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 9e5e628966c..9287325c3fb 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -51,6 +51,9 @@ CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; +/** + * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. 
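+ *
+ * For example, `round_to_multiple_of_8(13)` returns 16, while a value that is already a
+ * multiple of 8, such as 16, is returned unchanged.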
+ */ CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) { std::size_t constexpr base = 8; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 170539576d1..a8cc7492c52 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -890,6 +890,7 @@ struct compute_direct_aggregates { stride(stride) { } + __device__ void operator()(cudf::size_type i) { int block_id = (i % stride) / GROUPBY_BLOCK_SIZE; From 7cd14d6c7e13a9d94bd5b365f06a47ceea395d66 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 10:00:35 -0700 Subject: [PATCH 009/135] Renaming --- cpp/src/groupby/hash/groupby.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 41dff4e5d0d..1a13bcde8fa 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -541,7 +541,7 @@ void launch_compute_aggregates(int grid_size, * over the data and stores the results in `sparse_results` */ template -rmm::device_uvector compute_single_pass_set_aggs( +rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, cudf::detail::result_cache* sparse_results, @@ -729,7 +729,7 @@ std::unique_ptr
groupby(table_view const& keys, stream.value()}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_set_aggs( + auto gather_map = compute_single_pass_aggs( keys, requests, &sparse_results, set, stream, d_key_equal, d_row_hash); // Compact all results from sparse_results and insert into cache From 1e04c10ccc99b28f10956dee2ae56ef3344d2a7f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 15:51:52 -0700 Subject: [PATCH 010/135] Fix cardinality bench --- cpp/benchmarks/groupby/group_max.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index f41285008c4..b9a701a71f4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -101,4 +101,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") + .add_int64_axis("num_aggregations", {1}) .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); From 47aee18270590c357d927b1c605c9b50792659ec Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Aug 2024 13:20:47 -0700 Subject: [PATCH 011/135] More cleanups with CG --- cpp/src/groupby/hash/groupby.cu | 2 +- cpp/src/groupby/hash/kernels.cuh | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 1a13bcde8fa..730b03bee2a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -574,7 +574,7 @@ rmm::device_uvector compute_single_pass_aggs( KeyEqual, RowHasher, decltype(window_extent)>; - int grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); + auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index aee3b416b2d..6299a3b2acf 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -21,6 +21,7 @@ #include "single_pass_functors.cuh" #include +#include #include namespace cudf::groupby::detail::hash { @@ -278,16 +279,15 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, __shared__ cudf::size_type cardinality; - if (threadIdx.x == 0) { cardinality = 0; } + if (block.thread_rank() == 0) { cardinality = 0; } - __syncthreads(); + block.sync(); - int num_loops = - cudf::util::div_rounding_up_safe(num_input_rows, (cudf::size_type)(blockDim.x * gridDim.x)); - auto end_idx = num_loops * blockDim.x * gridDim.x; + auto const stride = cudf::detail::grid_1d::grid_stride(); - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < end_idx; - cur_idx += blockDim.x * gridDim.x) { + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { find_local_mapping(cur_idx, num_input_rows, &cardinality, @@ -295,24 +295,25 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, local_mapping_index, shared_set_indices); - __syncthreads(); + block.sync(); if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (threadIdx.x == 0) { *direct_aggregations = true; } + if 
(block.thread_rank() == 0) { *direct_aggregations = true; } break; } - __syncthreads(); + block.sync(); } // Insert unique keys from shared to global hash set if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); } } - if (threadIdx.x == 0) block_cardinality[blockIdx.x] = cardinality; + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } } } // namespace cudf::groupby::detail::hash From 6eb34598f6304c42a1a2e5ba01efd7a8a5ee69d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 11:39:49 -0700 Subject: [PATCH 012/135] Use custom cuco --- rapids_config.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rapids_config.cmake b/rapids_config.cmake index 3a88769f6e7..96df5adedac 100644 --- a/rapids_config.cmake +++ b/rapids_config.cmake @@ -11,6 +11,10 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= + +set(rapids-cmake-repo PointKernel/rapids-cmake) +set(rapids-cmake-branch cuco-hash-function) + file(READ "${CMAKE_CURRENT_LIST_DIR}/VERSION" _rapids_version) if(_rapids_version MATCHES [[^([0-9][0-9])\.([0-9][0-9])\.([0-9][0-9])]]) set(RAPIDS_VERSION_MAJOR "${CMAKE_MATCH_1}") From ee5f7fa2eb121ba32525bf2b0e5614fed32903f9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 13:34:14 -0700 Subject: [PATCH 013/135] Cleanups with new key_eq and hash_function --- cpp/src/groupby/hash/groupby.cu | 23 +++++++---------------- cpp/src/groupby/hash/kernels.cuh | 12 +++--------- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 730b03bee2a..0d53a0f46ea 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -516,8 +516,7 @@ void launch_compute_aggregates(int grid_size, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { - auto compute_aggregates_fn_ptr = compute_aggregates; - size_t d_shmem_size = find_shmem_size(compute_aggregates_fn_ptr, grid_size, num_sms); + size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size, num_sms); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = @@ -540,15 +539,13 @@ void launch_compute_aggregates(int grid_size, * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template +template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, - rmm::cuda_stream_view stream, - KeyEqual d_key_equal, - RowHasher d_row_hash) + rmm::cuda_stream_view stream) { // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy auto constexpr shared_set_capacity = @@ -568,12 +565,9 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - int num_sms = find_num_sms(); - auto compute_mapping_indices_fn_ptr = compute_mapping_indices; + int num_sms = find_num_sms(); + auto compute_mapping_indices_fn_ptr = + 
compute_mapping_indices; auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table @@ -588,8 +582,6 @@ rmm::device_uvector compute_single_pass_aggs( <<>>(global_set_ref, num_input_rows, window_extent, - d_key_equal, - d_row_hash, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), @@ -729,8 +721,7 @@ std::unique_ptr
groupby(table_view const& keys, stream.value()}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, stream, d_key_equal, d_row_hash); + auto gather_map = compute_single_pass_aggs(keys, requests, &sparse_results, set, stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 6299a3b2acf..1b7add50024 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -246,16 +246,10 @@ __device__ void find_global_mapping(cudf::size_type cur_idx, * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. */ -template +template CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cudf::size_type num_input_rows, WindowExtent window_extent, - KeyEqual d_key_equal, - RowHasher d_row_hash, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -267,8 +261,8 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, __shared__ typename SetRef::window_type windows[window_extent.value()]; auto storage = SetRef::storage_ref_type(window_extent, windows); auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, + global_set.key_eq(), + probing_scheme_type{global_set.hash_function()}, {}, storage); auto const block = cooperative_groups::this_thread_block(); From aa4e9570d232feb7bcc8f5af5e0774ec82d762d0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 13:45:44 -0700 Subject: [PATCH 014/135] Remove the redundant num_sms function --- cpp/src/groupby/hash/groupby.cu | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0d53a0f46ea..c8f0a816b77 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -472,22 +472,13 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, return sparse_table; } -int find_num_sms() -{ - int dev_id{-1}; - CUDF_CUDA_TRY(cudaGetDevice(&dev_id)); - int num_sms{-1}; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - return num_sms; -} - template -int find_grid_size(FuncType func, cudf::size_type num_input_rows, int num_sms) +int find_grid_size(FuncType func, cudf::size_type num_input_rows) { int max_active_blocks{-1}; CUDF_CUDA_TRY( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, GROUPBY_BLOCK_SIZE, 0)); - auto max_grid_size = max_active_blocks * num_sms; + auto max_grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, GROUPBY_BLOCK_SIZE); return std::min(max_grid_size, needed_active_blocks); } @@ -495,9 +486,10 @@ int find_grid_size(FuncType func, cudf::size_type num_input_rows, int num_sms) size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template -size_t find_shmem_size(FuncType func, int grid_size, int num_sms) +size_t find_shmem_size(FuncType func, int grid_size) { - auto active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, num_sms); + auto active_blocks_per_sm = + 
cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); size_t dynamic_smem_size; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( @@ -506,7 +498,6 @@ size_t find_shmem_size(FuncType func, int grid_size, int num_sms) } void launch_compute_aggregates(int grid_size, - int num_sms, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -516,7 +507,7 @@ void launch_compute_aggregates(int grid_size, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { - size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size, num_sms); + size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = @@ -565,10 +556,10 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - int num_sms = find_num_sms(); auto compute_mapping_indices_fn_ptr = compute_mapping_indices; - auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows, num_sms); + auto const grid_size = find_grid_size( + compute_mapping_indices_fn_ptr, num_input_rows, cudf::detail::num_multiprocessors()); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); @@ -609,7 +600,6 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); launch_compute_aggregates(grid_size, - num_sms, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), From 4fdb4b87ea51f35694045b6e0cb9deabdd3782e1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 13:52:20 -0700 Subject: [PATCH 015/135] Add missing header + minor cleanup --- cpp/src/groupby/hash/groupby.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c8f0a816b77..aeb506efc9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -558,8 +559,7 @@ rmm::device_uvector compute_single_pass_aggs( auto compute_mapping_indices_fn_ptr = compute_mapping_indices; - auto const grid_size = find_grid_size( - compute_mapping_indices_fn_ptr, num_input_rows, cudf::detail::num_multiprocessors()); + auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); From 4049aeb25d4070a2609b05a237beb0ff94ce87b3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 14:23:46 -0700 Subject: [PATCH 016/135] Clean up grid_size and shmem_size utilities --- cpp/src/groupby/hash/groupby.cu | 70 ++++++++++++++++---------------- cpp/src/groupby/hash/kernels.cuh | 18 ++++---- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index aeb506efc9a..eb4f856e289 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -473,49 +473,49 @@ auto create_sparse_results_table(cudf::table_view const& 
flattened_values, return sparse_table; } -template -int find_grid_size(FuncType func, cudf::size_type num_input_rows) +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) { int max_active_blocks{-1}; - CUDF_CUDA_TRY( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, GROUPBY_BLOCK_SIZE, 0)); - auto max_grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - int needed_active_blocks = cudf::util::div_rounding_up_safe(num_input_rows, GROUPBY_BLOCK_SIZE); - return std::min(max_grid_size, needed_active_blocks); + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); } size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } -template -size_t find_shmem_size(FuncType func, int grid_size) +template +size_t compute_shared_memory_size(Kernel kernel, int grid_size) { - auto active_blocks_per_sm = + auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - size_t dynamic_smem_size; + size_t dynamic_shmem_size; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_smem_size, func, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_smem_size); + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -void launch_compute_aggregates(int grid_size, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) +void compute_aggregations(int grid_size, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) { - size_t d_shmem_size = find_shmem_size(compute_aggregates, grid_size); + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto shmem_agg_pointer_size = round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - auto shmem_agg_size = d_shmem_size - shmem_agg_pointer_size * 2; - compute_aggregates<<>>( + auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( local_mapping_index, global_mapping_index, block_cardinality, @@ -559,7 +559,7 @@ rmm::device_uvector compute_single_pass_aggs( auto compute_mapping_indices_fn_ptr = compute_mapping_indices; - auto const grid_size = find_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); + auto const grid_size = max_occupancy_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table 
rmm::device_uvector local_mapping_index(num_input_rows, stream); @@ -599,15 +599,15 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); - launch_compute_aggregates(grid_size, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - num_input_rows, - d_aggs.data(), - stream); + compute_aggregations(grid_size, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + num_input_rows, + d_aggs.data(), + stream); if (direct_aggregations.value(stream)) { int stride = GROUPBY_BLOCK_SIZE * grid_size; diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 1b7add50024..b8aa7304725 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -132,15 +132,15 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggregates(cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type num_input_rows, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) { cudf::size_type cardinality = block_cardinality[blockIdx.x]; if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } From 690fceea020a979f458d3f526ff3d3fbc7cc8a7d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 15:26:04 -0700 Subject: [PATCH 017/135] Minor cleanups with CG --- cpp/src/groupby/hash/groupby.cu | 6 +++--- cpp/src/groupby/hash/kernels.cuh | 35 ++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index eb4f856e289..3f73e12adab 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -499,12 +499,12 @@ size_t compute_shared_memory_size(Kernel kernel, int grid_size) } void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, cudf::aggregation::Kind const* aggs, rmm::cuda_stream_view stream) { @@ -516,12 +516,12 @@ void compute_aggregations(int grid_size, // The rest of shmem is utilized for the actual arrays in shmem auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; compute_aggs_kernel<<>>( + num_input_rows, local_mapping_index, global_mapping_index, block_cardinality, input_values, output_values, - num_input_rows, aggs, shmem_agg_size, shmem_agg_pointer_size); @@ -600,12 +600,12 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); compute_aggregations(grid_size, + num_input_rows, local_mapping_index.data(), 
global_mapping_index.data(), block_cardinality.data(), *d_values, *d_sparse_table, - num_input_rows, d_aggs.data(), stream); diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index b8aa7304725..3051901fb37 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -132,42 +132,47 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::size_type num_input_rows, cudf::aggregation::Kind const* aggs, int total_agg_size, int pointer_size) { - cudf::size_type cardinality = block_cardinality[blockIdx.x]; + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - int num_input_cols = output_values.num_columns(); + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; extern __shared__ std::byte shared_set_aggregates[]; std::byte** s_aggregates_pointer = reinterpret_cast(shared_set_aggregates + total_agg_size); bool** s_aggregates_valid_pointer = reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - __shared__ int col_start; - __shared__ int col_end; - if (threadIdx.x == 0) { + + if (block.thread_rank() == 0) { col_start = 0; col_end = 0; } - __syncthreads(); - while (col_end < num_input_cols) { + block.sync(); + + while (col_end < num_cols) { calculate_columns_to_aggregate(col_start, col_end, output_values, - num_input_cols, + num_cols, s_aggregates_pointer, s_aggregates_valid_pointer, shared_set_aggregates, cardinality, total_agg_size); - __syncthreads(); + block.sync(); initialize_shared_memory_aggregates(col_start, col_end, output_values, @@ -175,16 +180,16 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, s_aggregates_valid_pointer, cardinality, aggs); - __syncthreads(); + block.sync(); compute_pre_aggregrates(col_start, col_end, input_values, - num_input_rows, + num_rows, local_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, aggs); - __syncthreads(); + block.sync(); compute_final_aggregates(col_start, col_end, input_values, @@ -194,7 +199,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type* local_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, aggs); - __syncthreads(); + block.sync(); } } From 716a73c9e87e609bac8c29df813dfb418232fe59 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 16:40:57 -0700 Subject: [PATCH 018/135] Improve docs for aggregation details --- cpp/include/cudf/detail/aggregation/aggregation.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index b257eef1e9e..78d9951670d 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1497,7 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); * * @tparam F Type of callable * @param k The `aggregation::Kind` 
value to dispatch - * aram f The callable that accepts an `aggregation::Kind` non-type template + * @param f The callable that accepts an `aggregation::Kind` non-type template * argument. * @param args Parameter pack forwarded to the `operator()` invocation * @return Forwards the return value of the callable. @@ -1626,6 +1626,8 @@ struct dispatch_source { * parameter of the callable `F` * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind` * non-type template parameter for the second template parameter of the callable + * @param f The callable that accepts `data_type` and `aggregation::Kind` non-type template + * arguments. * @param args Parameter pack forwarded to the `operator()` invocation * `F`. */ @@ -1644,8 +1646,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d * @brief Returns the target `data_type` for the specified aggregation k * performed on elements of type source_type. * - * aram source_type The element type to be aggregated - * aram k The aggregation + * @param source_type The element type to be aggregated + * @param k The aggregation * @return data_type The target_type of k performed on source_type * elements */ From 3c8403ddde6da2bfaf5694b2f2a3789a0513baa1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 17:21:37 -0700 Subject: [PATCH 019/135] Minor cleanup --- cpp/src/groupby/hash/groupby.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 3f73e12adab..2833ca55522 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -555,11 +555,10 @@ rmm::device_uvector compute_single_pass_aggs( auto const num_input_rows = keys.num_rows(); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - - auto compute_mapping_indices_fn_ptr = - compute_mapping_indices; - auto const grid_size = max_occupancy_grid_size(compute_mapping_indices_fn_ptr, num_input_rows); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); From e7224cbdc67418e0d9b7c93547e6a4faef59ad35 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 18:10:44 -0700 Subject: [PATCH 020/135] Update device operator overloads to agg identity_initializer --- .../cudf/detail/aggregation/aggregation.cuh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index ecf2f610697..bc2d0edbeba 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -636,6 +636,25 @@ struct identity_initializer { } public: + template + __device__ std::enable_if_t(), void> operator()( + cudf::mutable_column_device_view target, cudf::size_type target_index) + { + using DeviceType = device_storage_type_t; + using ElementType = + cuda::std::conditional_t() && !cudf::is_fixed_point(), + Target, + DeviceType>; + target.element(target_index) = get_identity(); + } + + template + __device__ std::enable_if_t(), void> operator()( + cudf::mutable_column_device_view target, cudf::size_type target_index) + { + CUDF_UNREACHABLE("Unsupported aggregation for 
initializing values"); + } + template std::enable_if_t(), void> operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) From 124aac0994af7f3f204c5150b20101cdc053867f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 18:12:23 -0700 Subject: [PATCH 021/135] Clean up groupby details for ODR --- .../groupby/hash/compute_single_pass_aggs.cuh | 480 ++++++++++++++++++ .../groupby/hash/compute_single_pass_aggs.hpp | 45 ++ cpp/src/groupby/hash/groupby.cu | 341 +------------ cpp/src/groupby/hash/kernels.cuh | 112 ---- cpp/src/groupby/hash/single_pass_functors.cuh | 2 +- 5 files changed, 528 insertions(+), 452 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.hpp diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh new file mode 100644 index 00000000000..3b36f8a1f81 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "kernels.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace hash { + +template +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + cudf::size_type* cardinality, + SetType shared_set, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + bool inserted; + if (cur_idx < num_input_rows) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + __syncthreads(); + if (cur_idx < num_input_rows) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + auto input_idx = shared_set_indices[cur_idx]; + auto result = global_set.insert_and_find(input_idx); + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. 
If the number of unique keys found in a + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. + */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_type{global_set.hash_function()}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + block.sync(); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + + if (block.thread_rank() == 0) { cardinality = 0; } + + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { + find_local_mapping(cur_idx, + num_input_rows, + &cardinality, + shared_insert_ref, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + + block.sync(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); + } + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +size_t compute_shared_memory_size(Kernel kernel, int grid_size) +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); +} + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) +{ + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); + 
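// --------------------------------------------------------------------------------
// [editor's sketch, not part of the patch] How the occupancy-driven sizing above
// works end to end: cudaOccupancyMaxActiveBlocksPerMultiprocessor caps the grid at
// what the device can keep resident, and cudaOccupancyAvailableDynamicSMemPerBlock
// reports the dynamic shared memory each resident block may claim. The kernel name
// `example_kernel` and the block size 128 are illustrative assumptions; assumes
// <cuda_runtime.h> and <algorithm>.
__global__ void example_kernel(int* out) { out[blockIdx.x * blockDim.x + threadIdx.x] = 1; }

inline void example_launch_budget(int num_rows, int& grid_size, size_t& shmem_per_block)
{
  int max_active_blocks{-1};
  // Resident blocks per SM for this kernel at 128 threads and no dynamic shmem
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, example_kernel, 128, 0);
  int dev_id{-1};
  cudaGetDevice(&dev_id);
  int num_sms{-1};
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id);
  // Grid: the smaller of "what stays resident" and "what the input needs"
  grid_size = std::min(max_active_blocks * num_sms, (num_rows + 127) / 128);
  // Dynamic shared memory per block at that residency, rounded down to a multiple
  // of 8 bytes so the pointer carving below stays aligned
  size_t dynamic_shmem{0};
  cudaOccupancyAvailableDynamicSMemPerBlock(
    &dynamic_shmem, example_kernel, max_active_blocks, 128);
  shmem_per_block = dynamic_shmem / 8 * 8;
}
// --------------------------------------------------------------------------------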
// For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( + num_input_rows, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); +} + +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +auto create_sparse_results_table(cudf::table_view const& flattened_values, + const cudf::aggregation::Kind* d_aggs, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform( + flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); + } + // Else initialise the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + } + return sparse_table; +} + +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + rmm::cuda_stream_view stream) +{ + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto const num_input_rows = keys.num_rows(); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_aggs.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + + 
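// --------------------------------------------------------------------------------
// [editor's sketch, not part of the patch] The launch below performs shared-memory
// pre-aggregation: rows are folded into per-block shared-memory bins through the
// local mapping (cheap shared-memory atomics), and only one update per (block, bin)
// reaches the global sparse table through the global mapping. A toy SUM-only
// version with illustrative names and an int payload; `local_idx` is assumed to be
// built by the same grid-stride partition that compute_mapping_indices uses, and
// the high-cardinality fallback handled by `direct_aggregations` is omitted.
__global__ void toy_shmem_sum(int const* values,
                              int const* local_idx,          // row -> dense bin within its block
                              int const* global_idx,         // (block, bin) -> global table slot
                              int const* block_cardinality,  // bins actually used per block
                              int* global_sums,
                              int num_rows,
                              int bins_per_block)
{
  extern __shared__ int partial[];  // one accumulator per bin of this block
  int const num_bins = block_cardinality[blockIdx.x];
  for (int b = threadIdx.x; b < num_bins; b += blockDim.x) { partial[b] = 0; }
  __syncthreads();
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_rows;
       i += gridDim.x * blockDim.x) {
    atomicAdd(&partial[local_idx[i]], values[i]);  // contended, but stays in shared memory
  }
  __syncthreads();
  for (int b = threadIdx.x; b < num_bins; b += blockDim.x) {
    // one global atomic per (block, bin) instead of one per input row
    atomicAdd(&global_sums[global_idx[blockIdx.x * bins_per_block + b]], partial[b]);
  }
}
// --------------------------------------------------------------------------------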
compute_aggregations(grid_size, + num_input_rows, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_aggs.data(), + stream); + + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_aggs.data(), + block_cardinality.data(), + stride}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} + +} // namespace hash +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp new file mode 100644 index 00000000000..73a85d67627 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +CUDF_EXPORT rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + rmm::cuda_stream_view stream); + +} // namespace hash +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ceda6c5a4cb..e93c8b46613 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,9 +14,10 @@ * limitations under the License. 
*/ +#include "compute_single_pass_aggs.cuh" +#include "compute_single_pass_aggs.hpp" #include "groupby/common/utils.hpp" #include "helpers.cuh" -#include "kernels.cuh" #include "multi_pass_functors.cuh" #include "single_pass_functors.cuh" @@ -52,7 +53,6 @@ #include #include -#include #include namespace cudf { @@ -105,76 +105,6 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; @@ -342,40 +272,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final dense_results->add_result(col, agg, std::move(result)); } }; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) -{ - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? 
cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} /** * @brief Gather sparse results into dense using `gather_map` and add to @@ -415,239 +311,6 @@ void sparse_to_dense_results(table_view const& keys, } } -template -void extract_populated_keys(SetType const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); -} - -// make table that will hold sparse results -template -auto create_sparse_results_table(cudf::table_view const& flattened_values, - const cudf::aggregation::Kind* d_aggs, - std::vector aggs, - bool direct_aggregations, - GlobalSetType const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - cudf::table sparse_table(std::move(sparse_columns)); - // If no direct aggregations, initialize the sparse table - // only for the keys inserted in global hash set - if (!direct_aggregations) { - auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); - extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); - } - // Else initialise the whole table - else { - cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); - } - return sparse_table; -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} - -size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -size_t compute_shared_memory_size(Kernel kernel, int grid_size) -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); -} - -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) -{ - auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( - num_input_rows, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - SetType& global_set, - rmm::cuda_stream_view stream) -{ - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - 
using shared_set_ref_type = typename shared_set_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - auto const num_input_rows = keys.num_rows(); - - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table - rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, - stream); - rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); - - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_aggs.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - - compute_aggregations(grid_size, - num_input_rows, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_aggs.data(), - stream); - - if (direct_aggregations.value(stream)) { - int stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_aggs.data(), - block_cardinality.data(), - stride}); - extract_populated_keys(global_set, populated_keys, stream); - } - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `key_set`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - /** * @brief Computes groupby using hash table. 
* diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 3051901fb37..7db66d0f526 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -203,116 +203,4 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, } } -template -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - cudf::size_type* cardinality, - SetType shared_set, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - bool inserted; - if (cur_idx < num_input_rows) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. - __syncthreads(); - if (cur_idx < num_input_rows) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - auto input_idx = shared_set_indices[cur_idx]; - auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
- */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_type{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - block.sync(); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - - if (block.thread_rank() == 0) { cardinality = 0; } - - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, - num_input_rows, - &cardinality, - shared_insert_ref, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - - block.sync(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index a8cc7492c52..b6bf7a9d500 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -861,7 +861,7 @@ struct initialize_sparse_table { for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), aggs[col_idx], - initialize_gmem{}, + cudf::detail::identity_initializer{}, sparse_table.column(col_idx), key_idx); } From 50094f7b2f817a346e502879fea7f6f50e89dd2a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 19:18:12 -0700 Subject: [PATCH 022/135] Revert back to GQE init --- cpp/include/cudf/detail/aggregation/aggregation.cuh | 2 +- cpp/src/groupby/hash/single_pass_functors.cuh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index bc2d0edbeba..82383023ef1 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -620,7 +620,7 @@ struct identity_initializer { } template - T get_identity() + constexpr T get_identity() { if (k == aggregation::ARGMAX || k == aggregation::ARGMIN) { if constexpr (cudf::is_timestamp()) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index b6bf7a9d500..5aff267fd6f 100644 --- 
a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -861,7 +861,8 @@ struct initialize_sparse_table { for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), aggs[col_idx], - cudf::detail::identity_initializer{}, + // cudf::detail::identity_initializer{}, + initialize_gmem{}, sparse_table.column(col_idx), key_idx); } From 13620c7056159daac575dcd8c6f02d3abd3beed2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 11:29:24 -0700 Subject: [PATCH 023/135] Pass null policies to agg kernels --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 15 ++++++++++++++- cpp/src/groupby/hash/compute_single_pass_aggs.hpp | 2 ++ cpp/src/groupby/hash/groupby.cu | 13 ++++++------- cpp/src/groupby/hash/single_pass_functors.cuh | 14 +++++++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 3b36f8a1f81..70fda95bc8e 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -92,6 +93,8 @@ template CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cudf::size_type num_input_rows, WindowExtent window_extent, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -379,6 +382,7 @@ rmm::device_uvector compute_single_pass_aggs( cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, + bool skip_key_rows_with_nulls, rmm::cuda_stream_view stream) { // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy @@ -397,6 +401,11 @@ rmm::device_uvector compute_single_pass_aggs( auto const num_input_rows = keys.num_rows(); + auto row_bitmask = + skip_key_rows_with_nulls + ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
+      : rmm::device_buffer{};
+
   auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
   auto const grid_size = max_occupancy_grid_size(
     compute_mapping_indices,
     num_input_rows);
@@ -414,6 +423,8 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
     <<>>(global_set_ref,
                          num_input_rows,
                          window_extent,
+                         static_cast<bitmask_type const*>(row_bitmask.data()),
+                         skip_key_rows_with_nulls,
                          local_mapping_index.data(),
                          global_mapping_index.data(),
                          block_cardinality.data(),
@@ -459,7 +470,9 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
                                              *d_sparse_table,
                                              d_aggs.data(),
                                              block_cardinality.data(),
-                                             stride});
+                                             stride,
+                                             static_cast<bitmask_type const*>(row_bitmask.data()),
+                                             skip_key_rows_with_nulls});
     extract_populated_keys(global_set, populated_keys, stream);
   }
 
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
index 73a85d67627..70eb7bb0c89 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
@@ -37,6 +37,7 @@ CUDF_EXPORT rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   SetType& global_set,
+  bool skip_key_rows_with_nulls,
   rmm::cuda_stream_view stream);
 
 }  // namespace hash
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index e93c8b46613..2c32930c061 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -19,7 +19,6 @@
 #include "groupby/common/utils.hpp"
 #include "helpers.cuh"
 #include "multi_pass_functors.cuh"
-#include "single_pass_functors.cuh"
 
 #include
 #include
@@ -286,14 +285,12 @@ void sparse_to_dense_results(table_view const& keys,
                              cudf::detail::result_cache* dense_results,
                              device_span<size_type const> gather_map,
                              SetType set,
-                             bool keys_have_nulls,
-                             null_policy include_null_keys,
+                             bool skip_key_rows_with_nulls,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
   auto row_bitmask =
     cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
-  bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
   bitmask_type const* row_bitmask_ptr =
     skip_key_rows_with_nulls ? static_cast<bitmask_type const*>(row_bitmask.data()) : nullptr;
@@ -349,6 +346,8 @@ std::unique_ptr<table>
groupby(table_view const& keys,
   auto const num_keys = static_cast(keys.num_rows());
   auto const null_keys_are_equal = null_equality::EQUAL;
   auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
+  auto const skip_key_rows_with_nulls =
+    keys_have_nulls and include_null_keys == null_policy::EXCLUDE;

   auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream);
   auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys};
@@ -372,7 +371,8 @@ std::unique_ptr groupby(table_view const& keys,
     stream.value()};

   // Compute all single pass aggs first
-  auto gather_map = compute_single_pass_aggs(keys, requests, &sparse_results, set, stream);
+  auto gather_map = compute_single_pass_aggs(
+    keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream);

   // Compact all results from sparse_results and insert into cache
   sparse_to_dense_results(keys,
@@ -381,8 +381,7 @@ std::unique_ptr
groupby(table_view const& keys, cache, gather_map, set.ref(cuco::find), - keys_have_nulls, - include_null_keys, + skip_key_rows_with_nulls, stream, mr); diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 5aff267fd6f..9ab774e5fe1 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -877,25 +877,33 @@ struct compute_direct_aggregates { cudf::aggregation::Kind const* __restrict__ aggs; cudf::size_type* block_cardinality; int stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + compute_direct_aggregates(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, - int stride) + int stride, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), - stride(stride) + stride(stride), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) { } __device__ void operator()(cudf::size_type i) { int block_id = (i % stride) / GROUPBY_BLOCK_SIZE; - if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } From 47de4b3305cd2db6dc9d1b0deceeeb9282fdd377 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 13:27:26 -0700 Subject: [PATCH 024/135] Add notes + cleanups --- .../groupby/hash/compute_single_pass_aggs.cuh | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 70fda95bc8e..8ee4aecfc10 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -42,22 +42,28 @@ namespace detail { namespace hash { template +// TODO pass block __device__ void find_local_mapping(cudf::size_type cur_idx, cudf::size_type num_input_rows, - cudf::size_type* cardinality, SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { cudf::size_type result_idx; + // TODO: un-init bool inserted; - if (cur_idx < num_input_rows) { + if (cur_idx < num_input_rows + // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) + ) { auto const result = shared_set.insert_and_find(cur_idx); result_idx = *result.first; inserted = result.second; // inserted a new element if (result.second) { - auto shared_set_index = atomicAdd(cardinality, 1); + auto const shared_set_index = atomicAdd(cardinality, 1); shared_set_indices[shared_set_index] = cur_idx; local_mapping_index[cur_idx] = shared_set_index; } @@ -65,7 +71,9 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. 
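// As a minimal sketch of that visibility rule, assuming a toy 256-thread block
// (`toy_kernel` and `s_slot` are illustrative names, not identifiers from this series):
//
//   __global__ void toy_kernel(int* out)
//   {
//     __shared__ int s_slot[256];
//     s_slot[threadIdx.x] = threadIdx.x;   // each thread publishes into its own slot
//     __syncthreads();                     // barrier: shared-memory writes become visible
//     out[threadIdx.x] = s_slot[(threadIdx.x + 1) % 256];  // now safe to read a peer's slot
//   }
//
// Without the barrier, reading a neighboring slot races with that neighbor's write;
// that is exactly the hazard on `local_mapping_index` here.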
__syncthreads(); - if (cur_idx < num_input_rows) { + if (cur_idx < num_input_rows + // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) + ) { // element was already in set if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } } @@ -77,9 +85,9 @@ __device__ void find_global_mapping(cudf::size_type cur_idx, cudf::size_type* shared_set_indices, cudf::size_type* global_mapping_index) { - auto input_idx = shared_set_indices[cur_idx]; - auto result = global_set.insert_and_find(input_idx); - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = *result.first; + auto const input_idx = shared_set_indices[cur_idx]; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = + *global_set.insert_and_find(input_idx).first; } /* @@ -100,6 +108,7 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cudf::size_type* block_cardinality, bool* direct_aggregations) { + // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; // Shared set initialization @@ -112,14 +121,11 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, storage); auto const block = cooperative_groups::this_thread_block(); shared_set.initialize(block); - block.sync(); auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); auto const stride = cudf::detail::grid_1d::grid_stride(); @@ -129,8 +135,10 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cur_idx += stride) { find_local_mapping(cur_idx, num_input_rows, - &cardinality, shared_insert_ref, + row_bitmask, + skip_rows_with_nulls, + &cardinality, local_mapping_index, shared_set_indices); @@ -382,7 +390,7 @@ rmm::device_uvector compute_single_pass_aggs( cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream) { // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy @@ -402,7 +410,7 @@ rmm::device_uvector compute_single_pass_aggs( auto const num_input_rows = keys.num_rows(); auto row_bitmask = - skip_key_rows_with_nulls + skip_rows_with_nulls ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; @@ -424,7 +432,7 @@ rmm::device_uvector compute_single_pass_aggs( num_input_rows, window_extent, static_cast(row_bitmask.data()), - skip_key_rows_with_nulls, + skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), @@ -472,7 +480,7 @@ rmm::device_uvector compute_single_pass_aggs( block_cardinality.data(), stride, static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); } From 2f04781da3f3d078915151ad429e874e3913321e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 16:30:13 -0700 Subject: [PATCH 025/135] Fix null bugs --- .../groupby/hash/compute_single_pass_aggs.cuh | 16 +++++---- cpp/src/groupby/hash/kernels.cuh | 35 ++++++++++++------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 8ee4aecfc10..96bc851b9e6 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -55,9 +55,8 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, cudf::size_type result_idx; // TODO: un-init bool inserted; - if (cur_idx < num_input_rows - // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) - ) { + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { auto const result = shared_set.insert_and_find(cur_idx); result_idx = *result.first; inserted = result.second; @@ -71,9 +70,8 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. 
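// The gating enabled just below follows one convention throughout this series
// (a sketch; `mask` stands for the bitmask produced by cudf::detail::bitmask_and over
// the key columns, in which bit i is 1 iff row i has no null key):
//
//   bool const participates =
//     not skip_rows_with_nulls or cudf::bit_is_set(mask, cur_idx);
//   if (participates) { /* insert cur_idx into the set and aggregate it */ }
//
// bitmask_type packs 32 rows per word, so bit_is_set(mask, i) tests bit (i % 32) of
// word (i / 32). When skip_rows_with_nulls is false, `mask` may be an empty buffer,
// so the short-circuit order matters: bit_is_set must not be evaluated in that case.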
__syncthreads(); - if (cur_idx < num_input_rows - // and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) - ) { + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { // element was already in set if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } } @@ -295,6 +293,8 @@ size_t compute_shared_memory_size(Kernel kernel, int grid_size) void compute_aggregations(int grid_size, cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -312,6 +312,8 @@ void compute_aggregations(int grid_size, auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; compute_aggs_kernel<<>>( num_input_rows, + row_bitmask, + skip_rows_with_nulls, local_mapping_index, global_mapping_index, block_cardinality, @@ -460,6 +462,8 @@ rmm::device_uvector compute_single_pass_aggs( compute_aggregations(grid_size, num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh index 7db66d0f526..ecbb013902f 100644 --- a/cpp/src/groupby/hash/kernels.cuh +++ b/cpp/src/groupby/hash/kernels.cuh @@ -76,6 +76,8 @@ __device__ void initialize_shared_memory_aggregates(int col_start, __device__ void compute_pre_aggregrates(int col_start, int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::table_device_view input_values, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, @@ -83,21 +85,24 @@ __device__ void compute_pre_aggregrates(int col_start, bool** s_aggregates_valid_pointer, cudf::aggregation::Kind const* aggs) { + // TODO grid_1d utility for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; cur_idx += blockDim.x * gridDim.x) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } } } } @@ -133,6 +138,8 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -183,6 +190,8 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, block.sync(); compute_pre_aggregrates(col_start, col_end, + row_bitmask, + skip_rows_with_nulls, input_values, num_rows, local_mapping_index, From 
4a0d7a05000f6d48cc87e34c23cd95da2755f4d6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Sep 2024 17:27:30 -0700 Subject: [PATCH 026/135] Make var const --- .../groupby/hash/compute_single_pass_aggs.cuh | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 96bc851b9e6..c059c2dd3cf 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -346,23 +346,24 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, { // TODO single allocation - room for performance improvement std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); + std::transform(flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); cudf::table sparse_table(std::move(sparse_columns)); // If no direct aggregations, initialize the sparse table // only for the keys inserted in global hash set From 398c9f4009891b6dd1acbcb62aaac65650178b47 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 10:48:49 -0700 Subject: [PATCH 027/135] Make vars const --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index c059c2dd3cf..7bdef7e6d2b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -306,10 +306,10 @@ void compute_aggregations(int grid_size, auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation - auto shmem_agg_pointer_size = + auto const shmem_agg_pointer_size = round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - auto shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; compute_aggs_kernel<<>>( num_input_rows, row_bitmask, From c1c53a3a25de296ca70bfa14f063405d47a95682 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 11:46:43 -0700 Subject: [PATCH 028/135] Cleanups for ODR --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/hash/compute_aggregations.cu | 272 ++++++++++++++++++ cpp/src/groupby/hash/compute_aggregations.hpp | 41 +++ .../groupby/hash/compute_single_pass_aggs.cuh | 51 +--- .../groupby/hash/compute_single_pass_aggs.hpp | 2 +- cpp/src/groupby/hash/single_pass_functors.cuh | 3 + 6 files changed, 322 insertions(+), 48 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_aggregations.cu create mode 100644 cpp/src/groupby/hash/compute_aggregations.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..0f096bd7e4f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -314,6 +314,7 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu src/groupby/hash/groupby.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..218f513e964 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_aggregations.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* aggs) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + aggs[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + // TODO grid_1d utility + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + aggs[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } + } +} + +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* aggs) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = 
output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + aggs[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + int total_agg_size, + int pointer_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + block.sync(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + aggs); + block.sync(); + compute_pre_aggregrates(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + block.sync(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + aggs); + block.sync(); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); +} + +} // namespace + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) +{ + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of 
the aggregation + auto const shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + aggs, + shmem_agg_size, + shmem_agg_pointer_size); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp new file mode 100644 index 00000000000..87c37158cd0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 7bdef7e6d2b..78af61639c2 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#pragma once + +#include "compute_aggregations.hpp" #include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "kernels.cuh" @@ -32,6 +35,7 @@ #include #include +#include #include #include @@ -277,53 +281,6 @@ int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) return std::min(grid_size, num_blocks); } -size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -size_t compute_shared_memory_size(Kernel kernel, int grid_size) -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); -} - -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) -{ - auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( - num_input_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - aggs, - shmem_agg_size, - shmem_agg_pointer_size); -} - template void extract_populated_keys(SetType const& key_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 70eb7bb0c89..12e5ff459e9 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -32,7 +32,7 @@ namespace hash { * over the data and stores the results in `sparse_results` */ template -CUDF_EXPORT rmm::device_uvector compute_single_pass_aggs( +rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 9ab774e5fe1..bb8cbaf4b46 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -16,7 +16,10 @@ #pragma once +#include "helpers.cuh" + #include +#include #include #include From 367d698c615468dc77375ba85f4a60cddaaa3012 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 12:01:52 -0700 Subject: [PATCH 029/135] Fix a typo --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 78af61639c2..58baefbe1ab 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh 
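The hunk below lands in `create_sparse_results_table`, which picks between two
initialization strategies for the sparse output table; in outline (a sketch assembled
from the surrounding patches, with the per-type dispatch elided):

    if (!direct_aggregations) {
      // Shared-memory path: only slots belonging to keys already inserted into the
      // global set are ever written, so initializing just those rows suffices.
      extract_populated_keys(global_set, populated_keys, stream);
      thrust::for_each_n(
        rmm::exec_policy(stream),
        thrust::make_counting_iterator(0),
        populated_keys.size(),
        initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
    } else {
      // Direct-aggregation path: any input row may update any slot, so every row must
      // start from its aggregation's identity value.
      cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
      cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
    }

The spelling fix in this hunk touches the comment that introduces the second branch.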
@@ -332,7 +332,7 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, populated_keys.size(), initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); } - // Else initialise the whole table + // Else initialize the whole table else { cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); From df15519bedba15bf30c6be2077cd5422f93583ee Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 12:29:04 -0700 Subject: [PATCH 030/135] Renaming for clarity --- .../groupby/hash/compute_single_pass_aggs.cuh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 58baefbe1ab..8d91c382dbe 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -294,7 +294,7 @@ void extract_populated_keys(SetType const& key_set, // make table that will hold sparse results template auto create_sparse_results_table(cudf::table_view const& flattened_values, - const cudf::aggregation::Kind* d_aggs, + cudf::aggregation::Kind const* d_agg_kinds, std::vector aggs, bool direct_aggregations, GlobalSetType const& global_set, @@ -327,10 +327,11 @@ auto create_sparse_results_table(cudf::table_view const& flattened_values, if (!direct_aggregations) { auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_aggs}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); } // Else initialize the whole table else { @@ -404,11 +405,11 @@ rmm::device_uvector compute_single_pass_aggs( // flatten the aggs to a table that can be operated on by aggregate_row auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_aggs = cudf::detail::make_device_uvector_async( + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_aggs.data(), + d_agg_kinds.data(), agg_kinds, direct_aggregations.value(stream), global_set, @@ -427,7 +428,7 @@ rmm::device_uvector compute_single_pass_aggs( block_cardinality.data(), *d_values, *d_sparse_table, - d_aggs.data(), + d_agg_kinds.data(), stream); if (direct_aggregations.value(stream)) { @@ -438,7 +439,7 @@ rmm::device_uvector compute_single_pass_aggs( compute_direct_aggregates{global_set_ref, *d_values, *d_sparse_table, - d_aggs.data(), + d_agg_kinds.data(), block_cardinality.data(), stride, static_cast(row_bitmask.data()), From 2a39f8fcb17917733c195b6042d0036370d9f588 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 14:38:22 -0700 Subject: [PATCH 031/135] Remove unused file --- cpp/src/groupby/hash/kernels.cuh | 215 ------------------------------- 1 file changed, 215 deletions(-) delete mode 100644 cpp/src/groupby/hash/kernels.cuh diff --git a/cpp/src/groupby/hash/kernels.cuh b/cpp/src/groupby/hash/kernels.cuh deleted file mode 
100644 index ecbb013902f..00000000000 --- a/cpp/src/groupby/hash/kernels.cuh +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "cudf/types.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include - -namespace cudf::groupby::detail::hash { - -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* aggs) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - aggs[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, 
- cur_idx); - } - } - } -} - -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - aggs[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) -{ - auto const block = cooperative_groups::this_thread_block(); - auto const cardinality = block_cardinality[block.group_index().x]; - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - - auto const num_cols = output_values.num_columns(); - - __shared__ int col_start; - __shared__ int col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - - if (block.thread_rank() == 0) { - col_start = 0; - col_end = 0; - } - block.sync(); - - while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - aggs); - block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - aggs); - block.sync(); - } -} - -} // namespace cudf::groupby::detail::hash From 890ef4561f548e75d4c0fbe7ce8ab24b8882bf97 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 14:53:58 -0700 Subject: [PATCH 032/135] Add missing pragma once for header --- cpp/src/groupby/hash/compute_single_pass_aggs.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 12e5ff459e9..848ace94ff9 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ 
b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include From 57bdf2c5fcbcf7d72847cc650dc2ab775fa51b7e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 15:18:52 -0700 Subject: [PATCH 033/135] Minor fixes --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 8d91c382dbe..617a4411243 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -19,12 +19,12 @@ #include "compute_aggregations.hpp" #include "compute_single_pass_aggs.hpp" #include "helpers.cuh" -#include "kernels.cuh" #include "single_pass_functors.cuh" #include #include #include +#include #include #include #include From d5856784133a9a3de92adf05fa4b952b28ca55c0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 17:07:21 -0700 Subject: [PATCH 034/135] Fix dictionary test failures --- cpp/src/groupby/hash/compute_aggregations.cu | 2 + .../groupby/hash/global_memory_aggregator.cuh | 460 ++++++++++++ .../groupby/hash/shared_memory_aggregator.cuh | 416 +++++++++++ cpp/src/groupby/hash/single_pass_functors.cuh | 692 ------------------ 4 files changed, 878 insertions(+), 692 deletions(-) create mode 100644 cpp/src/groupby/hash/global_memory_aggregator.cuh create mode 100644 cpp/src/groupby/hash/shared_memory_aggregator.cuh diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 218f513e964..73dca45edf7 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -15,7 +15,9 @@ */ #include "compute_aggregations.hpp" +#include "global_memory_aggregator.cuh" #include "helpers.cuh" +#include "shared_memory_aggregator.cuh" #include "single_pass_functors.cuh" #include diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..4dd39e640e0 --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = 
reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + using DeviceType = cudf::device_storage_type_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column using + * the dictionary key addressed by the specific index. + * + * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a + * dictionary. + * + */ +template +struct update_target_from_dictionary_gmem { + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + update_target_element_gmem{}( + target, target_index, source_column, source, source_index, source_null); + } + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + } +}; + +/** + * @brief Specialization function for dictionary type and aggregations. + * + * The `source` column is a dictionary type. This functor de-references the + * dictionary's keys child column and maps the input source index through + * the dictionary's indices child column to pass to the `update_target_element` + * in the above `update_target_from_dictionary` using the type-dispatcher to + * resolve the keys column type. + * + * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` + * + * @tparam target_has_nulls Indicates presence of null elements in `target` + * @tparam source_has_nulls Indicates presence of null elements in `source`. 
+ */ +template +struct update_target_element_gmem< + dictionary32, + k, + target_has_nulls, + source_has_nulls, + std::enable_if_t> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + + dispatch_type_and_aggregation( + source_column.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary_gmem{}, + target, + target_index, + source_column, + source, + source_index, + source_null); + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +// TODO: VALID and ALL have same code +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for 
COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + if (source_has_nulls and source_null[source_index]) { return; } + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + std::byte* source, + cudf::size_type source_index, + bool* source_null) const noexcept + { + update_target_element_gmem{}( + target, target_index, source_column, source, source_index, source_null); + } +}; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..c2d72d84b5b --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +struct update_target_element_shmem { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = 
reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t>() && + cudf::is_fixed_point()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + + using DeviceTarget = cudf::device_storage_type_t; + using DeviceSource = cudf::device_storage_type_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_from_dictionary_shmem { + template ()>* = nullptr> + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + update_target_element_shmem{}( + target, target_index, target_null, source, source_index); + } + template ()>* = nullptr> + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + } +}; + +template +struct update_target_element_shmem< + dictionary32, + k, + target_has_nulls, + source_has_nulls, + std::enable_if_t> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + dispatch_type_and_aggregation( + source.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary_shmem{}, + target, + target_index, + target_null, + source.child(cudf::dictionary_column_view::keys_column_index), + static_cast(source.element(source_index))); + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* 
target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + target_has_nulls, + source_has_nulls, + std::enable_if_t()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = cudf::detail::target_type_t; + Target* target_casted = 
reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + } +}; + +template +struct shmem_element_aggregator { + template + __device__ void operator()(std::byte* target, + cudf::size_type target_index, + bool* target_null, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + update_target_element_shmem{}( + target, target_index, target_null, source, source_index); + } +}; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index bb8cbaf4b46..19ba33e01e3 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -25,698 +25,6 @@ namespace cudf::groupby::detail::hash { -template -struct update_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - using DeviceType = cudf::device_storage_type_t; - DeviceType* source_casted = reinterpret_cast(source); - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - 
cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; - DeviceType* source_casted = reinterpret_cast(source); - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t>() && - cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; - DeviceType* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -// The shared memory will already have it squared -template -struct update_target_element_gmem()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - Target value = static_cast(source_casted[source_index]); - - 
cudf::detail::atomic_add(&target.element(target_index), value); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_mul(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and -// non-fixed point column -template -struct update_target_element_gmem< - Source, - cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -// TODO: VALID and ALL have same code -template -struct update_target_element_gmem< - Source, - cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - auto source_argmax_index = source_casted[source_index]; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); - if (old != cudf::detail::ARGMAX_SENTINEL) { - while (source_column.element(source_argmax_index) > - source_column.element(old)) { - old = - cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; -template -struct update_target_element_gmem< - Source, - 
cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - if (source_has_nulls and source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - auto source_argmin_index = source_casted[source_index]; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); - if (old != cudf::detail::ARGMIN_SENTINEL) { - while (source_column.element(source_argmin_index) < - source_column.element(old)) { - old = - cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct gmem_element_aggregator { - template - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - update_target_element_gmem{}( - target, target_index, source_column, source, source_index, source_null); - } -}; - -template -struct update_target_element_shmem { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_min(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; - - DeviceTarget* target_casted = reinterpret_cast(target); - cudf::detail::atomic_min(&target_casted[target_index], - static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && 
cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_max(&target_casted[target_index], - static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; - - DeviceTarget* target_casted = reinterpret_cast(target); - cudf::detail::atomic_max(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t>() && - cudf::is_fixed_point()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; - - DeviceTarget* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - auto value = 
static_cast(source.element(source_index)); - cudf::detail::atomic_add(&target_casted[target_index], value * value); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_mul(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], Target{1}); - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], Target{1}); - - // Assumes target is already set to be valid - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - auto old = cudf::detail::atomic_cas( - &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); - if (old != cudf::detail::ARGMAX_SENTINEL) { - while (source.element(source_index) > source.element(old)) { - old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); - } - } - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - auto old = cudf::detail::atomic_cas( - 
&target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); - if (old != cudf::detail::ARGMIN_SENTINEL) { - while (source.element(source_index) < source.element(old)) { - old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); - } - } - - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct shmem_element_aggregator { - template - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - update_target_element_shmem{}( - target, target_index, target_null, source, source_index); - } -}; - template __device__ constexpr bool is_supported() { From f75f2c927121e279c74bf49a1fbc5002fe057607 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Sep 2024 17:16:30 -0700 Subject: [PATCH 035/135] Add missing headers --- cpp/src/groupby/hash/global_memory_aggregator.cuh | 3 +++ cpp/src/groupby/hash/shared_memory_aggregator.cuh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 4dd39e640e0..9f38750060b 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -16,8 +16,11 @@ #pragma once +#include +#include #include #include +#include #include namespace cudf::groupby::detail::hash { diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index c2d72d84b5b..ef46c9b4cb4 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -16,8 +16,11 @@ #pragma once +#include +#include #include #include +#include #include namespace cudf::groupby::detail::hash { From feb93c36c280324e91efc818e7fef3ec07c8f441 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Sep 2024 15:25:34 -0700 Subject: [PATCH 036/135] Separate files to reduce build time --- cpp/CMakeLists.txt | 3 + cpp/src/groupby/hash/compute_groupby.cu | 32 ++ cpp/src/groupby/hash/compute_groupby.cuh | 312 +++++++++++++++++ cpp/src/groupby/hash/compute_groupby_null.cu | 32 ++ .../groupby/hash/compute_single_pass_aggs.cuh | 122 +------ .../groupby/hash/flatten_single_pass_aggs.cpp | 138 ++++++++ .../groupby/hash/flatten_single_pass_aggs.hpp | 34 ++ cpp/src/groupby/hash/groupby.cu | 324 ++---------------- cpp/src/groupby/hash/helpers.cuh | 23 +- 9 files changed, 594 insertions(+), 426 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_groupby.cu create mode 100644 cpp/src/groupby/hash/compute_groupby.cuh create mode 100644 cpp/src/groupby/hash/compute_groupby_null.cu create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.cpp create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfec788fc0a..e405f907289 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,6 +315,9 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_groupby_null.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..111e7c7972a 
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_groupby.cuh"
+
+namespace cudf::groupby::detail::hash {
+
+template std::unique_ptr<table> 
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + row_comparator_t const& d_row_equal, + cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh new file mode 100644 index 00000000000..5b2cc6e1ba6 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_single_pass_aggs.cuh" +#include "compute_single_pass_aggs.hpp" +#include "multi_pass_functors.cuh" + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) + { + result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + } + + auto to_dense_agg_result(cudf::aggregation const& agg) + { + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); + } + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(aggregation const& agg) + { + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. 
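+    // Illustrative example (values invented for this note): gathering
+    // col = ["x", null, "z"] through an ARGMIN result of [0, -1, 2] under the
+    // NULLIFY policy selected below for nullable results turns the
+    // out-of-bounds -1 into a null row in the dense output.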
+ column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); + } + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::min_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::max_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::mean_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); + } + + void visit(cudf::detail::var_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col.size(), + ::cudf::detail::var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::std_aggregation const& agg) override + { + if 
(dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); + } +}; + +/** + * @brief Gather sparse results into dense using `gather_map` and add to + * `dense_cache` + * + * @see groupby_null_templated() + */ +template +void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; + bitmask_type const* row_bitmask_ptr = + skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + */ +template +std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + Equal const& d_row_equal, + cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + cuco::extent{num_keys}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first + auto gather_map = compute_single_pass_aggs( + keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(keys, + requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + skip_key_rows_with_nulls, + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu new file mode 100644 index 00000000000..1420bd2a987 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_groupby.cuh" + +namespace cudf::groupby::detail::hash { + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 617a4411243..464365c0416 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -17,7 +17,8 @@ #pragma once #include "compute_aggregations.hpp" -#include "compute_single_pass_aggs.hpp" +// #include "compute_single_pass_aggs.hpp" +#include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" #include "single_pass_functors.cuh" @@ -40,10 +41,7 @@ #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { template // TODO pass block @@ -118,7 +116,7 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, auto storage = SetRef::storage_ref_type(window_extent, windows); auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, global_set.key_eq(), - probing_scheme_type{global_set.hash_function()}, + probing_scheme_t{global_set.hash_function()}, {}, storage); auto const block = cooperative_groups::this_thread_block(); @@ -165,111 +163,6 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) -{ - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} - template int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) { @@ -362,7 +255,7 @@ rmm::device_uvector compute_single_pass_aggs( extent_type, cuda::thread_scope_block, typename SetType::key_equal, - probing_scheme_type, + probing_scheme_t, cuco::cuda_allocator, cuco::storage>; using shared_set_ref_type = typename shared_set_type::ref_type<>; @@ -458,7 +351,4 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..2d34a757a6f --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..d79e826112b --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 2c32930c061..62434bf5fd2 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,24 +14,18 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +#include "compute_groupby.cuh" #include "groupby/common/utils.hpp" #include "helpers.cuh" -#include "multi_pass_functors.cuh" #include #include -#include #include #include #include #include -#include -#include #include #include -#include #include #include #include @@ -54,10 +48,7 @@ #include #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { /** @@ -99,306 +90,36 @@ constexpr bool array_contains(std::array const& haystack, T needle) * @return true `t` is valid for a hash based groupby * @return false `t` is invalid for a hash based groupby */ -bool constexpr is_hash_aggregation(aggregation::Kind t) +constexpr bool is_hash_aggregation(aggregation::Kind t) { return array_contains(hash_aggregations, t); } -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. 
The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - ::cudf::detail::var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - 
sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool keys_have_nulls, + null_policy include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - skip_key_rows_with_nulls, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); } } @@ -442,11 +163,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 9287325c3fb..9918aa5575a 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,7 +23,6 @@ #include namespace cudf::groupby::detail::hash { - // TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested // types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element @@ -32,12 +31,6 @@ CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; /// Number of slots per thread CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; -/// Probing scheme type used by groupby hash table -using probing_scheme_type = cuco::linear_probing< - GROUPBY_CG_SIZE, - cudf::experimental::row::hash::device_row_hasher>; - /// Thread block size CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; @@ -60,4 +53,20 @@ CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) return cudf::util::div_rounding_up_safe(num, base) * base; } +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing< + GROUPBY_CG_SIZE, + cudf::experimental::row::hash::device_row_hasher>; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + } // namespace cudf::groupby::detail::hash From 29cba47ac7dcd7df48a3427d06b59ee74011bd9e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Sep 2024 15:46:43 -0700 Subject: [PATCH 037/135] Minor cleanups --- cpp/src/groupby/hash/compute_groupby.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index 5b2cc6e1ba6..4dab0ae29cf 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -16,7 +16,7 @@ #pragma once #include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +// #include "compute_single_pass_aggs.hpp" #include "multi_pass_functors.cuh" #include From 523737fd13c2ffb57171d0e3080c2c7ca9e522d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Sep 2024 19:43:08 -0700 Subject: [PATCH 038/135] More explicit instantiations --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/hash/compute_groupby.cu | 13 ++++++++-- cpp/src/groupby/hash/compute_groupby.cuh | 2 +- cpp/src/groupby/hash/compute_groupby_null.cu | 14 +++++++++-- cpp/src/groupby/hash/helpers.cuh | 15 ++++++++--- cpp/src/groupby/hash/multi_pass_functors.cu | 26 ++++++++++++++++++++ cpp/src/groupby/hash/multi_pass_functors.cuh | 18 ++++++-------- 7 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 cpp/src/groupby/hash/multi_pass_functors.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e405f907289..76552a88d7c 100644 --- 
a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -319,6 +319,7 @@ add_library( src/groupby/hash/compute_groupby_null.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/multi_pass_functors.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 111e7c7972a..4944fed9b68 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -18,14 +18,23 @@ namespace cudf::groupby::detail::hash { +template void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, cudf::detail::result_cache* cache, bool skip_key_rows_with_nulls, row_comparator_t const& d_row_equal, - cudf::experimental::row::hash::device_row_hasher const& d_row_hash, + row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index 4dab0ae29cf..6599a93f730 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -176,7 +176,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor{ + var_hash_functor{ set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu index 1420bd2a987..fc05e98ed4f 100644 --- a/cpp/src/groupby/hash/compute_groupby_null.cu +++ b/cpp/src/groupby/hash/compute_groupby_null.cu @@ -18,14 +18,24 @@ namespace cudf::groupby::detail::hash { +template void sparse_to_dense_results( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + template std::unique_ptr
 compute_groupby(
   table_view const& keys,
   host_span<aggregation_request const> requests,
   cudf::detail::result_cache* cache,
   bool skip_key_rows_with_nulls,
   nullable_row_comparator_t const& d_row_equal,
-  cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC> const& d_row_hash,
+  row_hash_t const& d_row_hash,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index 9918aa5575a..650b936372d 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -53,11 +53,12 @@ CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num)
   return cudf::util::div_rounding_up_safe(num, base) * base;
 }
 
-/// Probing scheme type used by groupby hash table
-using probing_scheme_t = cuco::linear_probing<
-  GROUPBY_CG_SIZE,
+using row_hash_t =
   cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                   cudf::nullate::DYNAMIC>>;
+                                                   cudf::nullate::DYNAMIC>;
+
+/// Probing scheme type used by groupby hash table
+using probing_scheme_t = cuco::linear_probing<GROUPBY_CG_SIZE, row_hash_t>;
 
 using row_comparator_t = cudf::experimental::row::equality::device_row_comparator<
   false,
   cudf::nullate::DYNAMIC,
   cudf::experimental::row::equality::nan_equal_physical_equality_comparator>;
@@ -69,4 +70,10 @@ using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_
   cudf::nullate::DYNAMIC,
   cudf::experimental::row::equality::nan_equal_physical_equality_comparator>;
 
+using hash_set_ref_t = cuco::
+  static_set_ref<cudf::size_type, cuda::thread_scope_device, row_comparator_t, probing_scheme_t, cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>, cuco::op::find_tag>;
+
+using nullable_hash_set_ref_t = cuco::
+  static_set_ref<cudf::size_type, cuda::thread_scope_device, nullable_row_comparator_t, probing_scheme_t, cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>, cuco::op::find_tag>;
+
 } // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/multi_pass_functors.cu b/cpp/src/groupby/hash/multi_pass_functors.cu
new file mode 100644
index 00000000000..1f4c2a6a923
--- /dev/null
+++ b/cpp/src/groupby/hash/multi_pass_functors.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
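var_hash_functor, whose explicit instantiations this new translation unit pins, accumulates one term of Var = Σ (x − mean)² / (n − ddof) per non-null source row into its group's slot; the device functor that follows performs the same reduction with one relaxed atomic add per row. The math in scalar host form, as an illustrative sketch:

#include <vector>

double group_variance(std::vector<double> const& group, int ddof)
{
  auto const n = static_cast<double>(group.size());
  if (n - ddof <= 0) { return 0.0; }  // the device functor leaves such groups null

  double sum = 0.0;
  for (double x : group) { sum += x; }
  double const mean = sum / n;

  // One (x - mean)^2 / (n - ddof) term per row, matching the per-row
  // contribution the functor folds in with fetch_add.
  double var = 0.0;
  for (double x : group) { var += (x - mean) * (x - mean) / (n - ddof); }
  return var;
}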
+ */ + +#include "helpers.cuh" +#include "multi_pass_functors.cuh" + +namespace cudf::groupby::detail::hash { + +// explicit template instantiation to reduce build time +template struct var_hash_functor; +template struct var_hash_functor; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_functors.cuh b/cpp/src/groupby/hash/multi_pass_functors.cuh index 6fbec5fe19e..98668d0cb45 100644 --- a/cpp/src/groupby/hash/multi_pass_functors.cuh +++ b/cpp/src/groupby/hash/multi_pass_functors.cuh @@ -29,10 +29,9 @@ #include -namespace cudf { -namespace detail { +namespace cudf::groupby::detail::hash { -template +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -75,11 +74,11 @@ struct var_hash_functor { __device__ cuda::std::enable_if_t()> operator()( column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -90,7 +89,7 @@ struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } __device__ inline void operator()(size_type source_index) @@ -111,5 +110,4 @@ struct var_hash_functor { } }; -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash From a5743459acdb246d420c6501cfde41d2279b2eaa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 19:08:26 -0700 Subject: [PATCH 039/135] Test rollback --- cpp/src/groupby/hash/compute_aggregations.cu | 40 +++++++++++-------- cpp/src/groupby/hash/compute_aggregations.hpp | 22 +++++----- .../groupby/hash/compute_single_pass_aggs.cuh | 27 ++++++++----- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 73dca45edf7..7eb8216ee13 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -225,32 +225,36 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } template -constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) +constexpr std::pair compute_shared_memory_size(Kernel kernel, int grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - size_t dynamic_shmem_size; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); + size_t dynamic_shmem_size = 0; + + auto const cuda_error = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + return {cuda_error, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; } } // namespace -void compute_aggregations(int grid_size, - 
cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream) +cudaError_t compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) { - auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); + auto const [cuda_error, shmem_size] = compute_shared_memory_size(compute_aggs_kernel, grid_size); + + if (cuda_error != cudaSuccess) { return cuda_error; } + // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto const shmem_agg_pointer_size = @@ -269,6 +273,8 @@ void compute_aggregations(int grid_size, aggs, shmem_agg_size, shmem_agg_pointer_size); + + return cudaSuccess; } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 87c37158cd0..862462c8b9f 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -26,16 +26,16 @@ namespace cudf::groupby::detail::hash { -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream); +cudaError_t compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 464365c0416..a52d6ecd530 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -312,17 +312,22 @@ rmm::device_uvector compute_single_pass_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); + auto const cuda_error = compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), 
+ block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + if (cuda_error != cudaSuccess) { + constexpr bool uses_direct_aggs = true; + direct_aggregations.set_value_async(uses_direct_aggs, stream); + } if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; From 4b2b55fe37350dd1c48ddb92b1b03aa387d458e2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 19:32:01 -0700 Subject: [PATCH 040/135] More explicit instantiations --- cpp/CMakeLists.txt | 5 +- cpp/src/groupby/hash/compute_groupby.cu | 10 - cpp/src/groupby/hash/compute_groupby.cuh | 207 +----------------- cpp/src/groupby/hash/compute_groupby_null.cu | 11 - .../hash/hash_compound_agg_finalizer.cu | 25 +++ .../hash/hash_compound_agg_finalizer.cuh | 197 +++++++++++++++++ .../groupby/hash/sparse_to_dense_results.cu | 32 +++ .../groupby/hash/sparse_to_dense_results.cuh | 65 ++++++ .../hash/sparse_to_dense_results_null.cu | 33 +++ ...i_pass_functors.cu => var_hash_functor.cu} | 2 +- ...pass_functors.cuh => var_hash_functor.cuh} | 0 11 files changed, 360 insertions(+), 227 deletions(-) create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cu create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results.cu create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results.cuh create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results_null.cu rename cpp/src/groupby/hash/{multi_pass_functors.cu => var_hash_functor.cu} (96%) rename cpp/src/groupby/hash/{multi_pass_functors.cuh => var_hash_functor.cuh} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 76552a88d7c..b48b480f6fc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -318,8 +318,11 @@ add_library( src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_groupby_null.cu src/groupby/hash/flatten_single_pass_aggs.cpp + src/groupby/hash/sparse_to_dense_results.cu + src/groupby/hash/sparse_to_dense_results_null.cu src/groupby/hash/groupby.cu - src/groupby/hash/multi_pass_functors.cu + src/groupby/hash/var_hash_functor.cu + src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 4944fed9b68..7965d0891a7 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -18,16 +18,6 @@ namespace cudf::groupby::detail::hash { -template void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index 6599a93f730..e97853fa155 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -18,217 +18,16 @@ #include "compute_single_pass_aggs.cuh" // #include "compute_single_pass_aggs.hpp" #include "multi_pass_functors.cuh" +#include "sparse_to_dense_results.cuh" #include #include #include #include -namespace cudf::groupby::detail::hash { - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; +#include - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, 
unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} +namespace cudf::groupby::detail::hash { /** * @brief Computes groupby using hash table. diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu index fc05e98ed4f..1f9707902cc 100644 --- a/cpp/src/groupby/hash/compute_groupby_null.cu +++ b/cpp/src/groupby/hash/compute_groupby_null.cu @@ -18,17 +18,6 @@ namespace cudf::groupby::detail::hash { -template void sparse_to_dense_results( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - nullable_hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..e7a7af92f15 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.cuh" +#include "helpers.cuh" + +namespace cudf::groupby::detail::hash { + +template class hash_compound_agg_finalizer; +template class hash_compound_agg_finalizer; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh new file mode 100644 index 00000000000..a9326873282 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) + { + result_type = cudf::is_dictionary(col.type()) ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + } + + auto to_dense_agg_result(cudf::aggregation const& agg) + { + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); + } + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(aggregation const& agg) + { + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); + } + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::min_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::max_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + } + + void visit(cudf::detail::mean_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); + } + + void visit(cudf::detail::var_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + 
auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } + + void visit(cudf::detail::std_aggregation const& agg) override + { + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); + } +}; + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..760926afa13 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "helpers.cuh" +#include "sparse_to_dense_results.cuh" + +namespace cudf::groupby::detail::hash { + +template void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.cuh new file mode 100644 index 00000000000..8d40358d0c8 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
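hash_compound_agg_finalizer above assembles each compound result from the single-pass partials: MEAN as SUM divided by COUNT_VALID via a DIV binary op, STD as a SQRT unary op over VARIANCE. The same composition in scalar form, as a sketch only; the real visitors operate on whole columns:

#include <cmath>

struct partials {
  double sum;
  double count;  // COUNT_VALID
  double var;    // accumulated by var_hash_functor
};

double finalize_mean(partials const& p) { return p.sum / p.count; }
double finalize_std(partials const& p) { return std::sqrt(p.var); }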
+ */ +#pragma once + +#include "compute_single_pass_aggs.cuh" +#include "hash_compound_agg_finalizer.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +/** + * @brief Gather sparse results into dense using `gather_map` and add to + * `dense_cache` + * + * @see groupby_null_templated() + */ +template +void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; + bitmask_type const* row_bitmask_ptr = + skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu new file mode 100644 index 00000000000..b6820f7f6db --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
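sparse_to_dense_results leans on the gather map produced by the single-pass step: sparse result columns are sized like the input but populated only at hash-slot positions, and one gather per result compacts them into dense output. A host-side sketch of that compaction with simplified types:

#include <vector>

std::vector<double> gather_dense(std::vector<double> const& sparse,
                                 std::vector<int> const& gather_map)
{
  std::vector<double> dense;
  dense.reserve(gather_map.size());
  // gather_map holds one populated slot index per distinct key
  for (int slot : gather_map) { dense.push_back(sparse[slot]); }
  return dense;
}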
+ */ + +#include "helpers.cuh" +#include "sparse_to_dense_results.cuh" + +namespace cudf::groupby::detail::hash { + +template void sparse_to_dense_results( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_functors.cu b/cpp/src/groupby/hash/var_hash_functor.cu similarity index 96% rename from cpp/src/groupby/hash/multi_pass_functors.cu rename to cpp/src/groupby/hash/var_hash_functor.cu index 1f4c2a6a923..4881f4ed85e 100644 --- a/cpp/src/groupby/hash/multi_pass_functors.cu +++ b/cpp/src/groupby/hash/var_hash_functor.cu @@ -15,7 +15,7 @@ */ #include "helpers.cuh" -#include "multi_pass_functors.cuh" +#include "var_hash_functor.cuh" namespace cudf::groupby::detail::hash { diff --git a/cpp/src/groupby/hash/multi_pass_functors.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 100% rename from cpp/src/groupby/hash/multi_pass_functors.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh From 44806bacac8dedbf78cd0852d52322f4a5c0ec64 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 20:00:26 -0700 Subject: [PATCH 041/135] Add missing headers + more explicit instantiations --- cpp/CMakeLists.txt | 2 + cpp/src/groupby/hash/compute_groupby.cuh | 2 +- .../groupby/hash/compute_single_pass_aggs.cu | 59 +++++++++++++++++++ .../hash/compute_single_pass_aggs_null.cu | 59 +++++++++++++++++++ .../hash/hash_compound_agg_finalizer.cuh | 2 + .../groupby/hash/sparse_to_dense_results.cuh | 1 + 6 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cu create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3dfcbe69b2f..6854295a6a6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -318,6 +318,8 @@ add_library( src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_groupby_null.cu src/groupby/hash/flatten_single_pass_aggs.cpp + src/groupby/hash/compute_single_pass_aggs_null.cu + src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/sparse_to_dense_results.cu src/groupby/hash/sparse_to_dense_results_null.cu src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh index e97853fa155..146900f61eb 100644 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ b/cpp/src/groupby/hash/compute_groupby.cuh @@ -17,8 +17,8 @@ #include "compute_single_pass_aggs.cuh" // #include "compute_single_pass_aggs.hpp" -#include "multi_pass_functors.cuh" #include "sparse_to_dense_results.cuh" +#include "var_hash_functor.cuh" #include #include diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu new file mode 100644 index 00000000000..aa883f25315 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
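The single-pass kernels instantiated by this new file have each input row locate its group's slot in the hash set and fold its value into the sparse output with a relaxed atomic, the same fetch_add pattern var_hash_functor uses. A minimal device-side sketch for one SUM column; illustrative only, since the real kernel dispatches on value type and aggregation kind:

#include <cuda/atomic>

// One thread per input row: `slot` is the index insert_and_find returned for
// the row's key. Relaxed ordering suffices because only each slot's final
// value is observed after the kernel completes.
__device__ void sum_into_slot(double* sparse_sums, int slot, double value)
{
  cuda::atomic_ref<double, cuda::thread_scope_device> ref{sparse_sums[slot]};
  ref.fetch_add(value, cuda::std::memory_order_relaxed);
}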
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_single_pass_aggs.cuh"
+#include "helpers.cuh"
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+namespace hash {
+
+using global_set_t = cuco::static_set<cudf::size_type,
+                                      cuco::extent<int64_t>,
+                                      cuda::thread_scope_device,
+                                      row_comparator_t,
+                                      probing_scheme_t,
+                                      cudf::detail::cuco_allocator<char>,
+                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
+
+template void extract_populated_keys<global_set_t>(
+  global_set_t const& key_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
+
+template auto create_sparse_results_table<global_set_t>(
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> aggs,
+  bool direct_aggregations,
+  global_set_t const& global_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
+
+template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs<global_set_t>(
+  cudf::table_view const& keys,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  global_set_t& global_set,
+  bool keys_have_nulls,
+  null_policy include_null_keys,
+  rmm::cuda_stream_view stream);
+
+} // namespace hash
+} // namespace detail
+} // namespace groupby
+} // namespace cudf
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
new file mode 100644
index 00000000000..f0889a362fe
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
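extract_populated_keys, instantiated above for the flat-key comparator and below for the nullable one, is a thin wrapper over cuco's retrieve_all: copy out every occupied slot, then shrink the output to the returned end iterator. A standalone sketch of that round trip; capacity and sentinel values are arbitrary here, and constructor details vary across cuco versions:

#include <cuco/static_set.cuh>

#include <thrust/device_vector.h>
#include <thrust/distance.h>

#include <vector>

int main()
{
  std::vector<int> h_keys{1, 2, 2, 3, 1};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());

  cuco::static_set<int> set{cuco::extent<std::size_t>{16}, cuco::empty_key<int>{-1}};
  set.insert(keys.begin(), keys.end());

  // retrieve_all writes the distinct keys and returns the end iterator,
  // mirroring extract_populated_keys' resize-by-distance step.
  thrust::device_vector<int> unique(set.size());
  auto const end = set.retrieve_all(unique.begin());
  unique.resize(thrust::distance(unique.begin(), end));  // {1, 2, 3} in some order
  return 0;
}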
+ */ + +#include "compute_single_pass_aggs.cuh" +#include "helpers.cuh" + +namespace cudf { +namespace groupby { +namespace detail { +namespace hash { + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template auto create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + global_set_t& global_set, + bool keys_have_nulls, + null_policy include_null_keys, + rmm::cuda_stream_view stream); + +} // namespace hash +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh index a9326873282..1c40b77b5a1 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh @@ -18,7 +18,9 @@ #include "var_hash_functor.cuh" #include +#include #include +#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.cuh index 8d40358d0c8..b89fc308e6e 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cuh +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cuh @@ -20,6 +20,7 @@ #include "var_hash_functor.cuh" #include +#include #include #include #include From 85bf877889cafb0d7963cdc1ebe53f6ea6432276 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 20:02:12 -0700 Subject: [PATCH 042/135] Reorder files --- cpp/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6854295a6a6..0d9529ef58d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,14 +317,14 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_groupby_null.cu - src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/flatten_single_pass_aggs.cpp + src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/hash/sparse_to_dense_results.cu src/groupby/hash/sparse_to_dense_results_null.cu - src/groupby/hash/groupby.cu src/groupby/hash/var_hash_functor.cu - src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu From 049acffb986d4ae5d56f95df713e75730cc0452f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Sep 2024 20:31:30 -0700 Subject: [PATCH 043/135] Fix typos + add missing header --- cpp/src/groupby/hash/compute_single_pass_aggs.cu | 3 +-- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 1 + cpp/src/groupby/hash/compute_single_pass_aggs_null.cu | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index aa883f25315..2770dc2a84c 
100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
@@ -49,8 +49,7 @@ template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   global_set_t& global_set,
-  bool keys_have_nulls,
-  null_policy include_null_keys,
+  bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
 } // namespace hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
index a52d6ecd530..b2891c1df1f 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
@@ -22,6 +22,7 @@
 #include "helpers.cuh"
 #include "single_pass_functors.cuh"
+#include
 #include
 #include
 #include
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
index f0889a362fe..e566c2c5d27 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
@@ -49,8 +49,7 @@ template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   global_set_t& global_set,
-  bool keys_have_nulls,
-  null_policy include_null_keys,
+  bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
 } // namespace hash
From 2d42b9b10aeedb6d80e83b738e7a7a79ad1dfb92 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Thu, 26 Sep 2024 10:50:17 -0700
Subject: [PATCH 044/135] Revert temp rollback

---
 cpp/src/groupby/hash/compute_aggregations.cu  | 39 +++++++++----------
 cpp/src/groupby/hash/compute_aggregations.hpp | 22 +++++------
 .../groupby/hash/compute_single_pass_aggs.cuh | 27 ++++++-------
 3 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu
index 7eb8216ee13..54f37c7f397 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cu
+++ b/cpp/src/groupby/hash/compute_aggregations.cu
@@ -225,35 +225,36 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows,
 constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; }
 template <typename Kernel>
-constexpr std::pair<cudaError_t, size_t> compute_shared_memory_size(Kernel kernel, int grid_size)
+constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size)
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
+  CUDF_EXPECTS(active_blocks_per_sm >= 1, "active_blocks_per_sm must be at least 1");
+  CUDF_EXPECTS(grid_size >= 1, "grid_size must be at least 1");
+
   size_t dynamic_shmem_size = 0;
-  auto const cuda_error = cudaOccupancyAvailableDynamicSMemPerBlock(
-    &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE);
-  return {cuda_error, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)};
+  CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
+    &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE));
+  return get_previous_multiple_of_8(0.5 * dynamic_shmem_size);
 }
 } // namespace
-cudaError_t compute_aggregations(int grid_size,
-                                 cudf::size_type num_input_rows,
-                                 bitmask_type const* row_bitmask,
-                                 bool skip_rows_with_nulls,
-                                 cudf::size_type* local_mapping_index,
-                                 cudf::size_type* global_mapping_index,
-                                 cudf::size_type* block_cardinality,
-                                 cudf::table_device_view input_values,
-                                 cudf::mutable_table_device_view output_values,
-                                 cudf::aggregation::Kind const* aggs,
-                                 rmm::cuda_stream_view stream)
+void
compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream) { - auto const [cuda_error, shmem_size] = compute_shared_memory_size(compute_aggs_kernel, grid_size); - - if (cuda_error != cudaSuccess) { return cuda_error; } + auto const shmem_size = compute_shared_memory_size(compute_aggs_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -273,8 +274,6 @@ cudaError_t compute_aggregations(int grid_size, aggs, shmem_agg_size, shmem_agg_pointer_size); - - return cudaSuccess; } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 862462c8b9f..87c37158cd0 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -26,16 +26,16 @@ namespace cudf::groupby::detail::hash { -cudaError_t compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - rmm::cuda_stream_view stream); +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index b2891c1df1f..051259bf9f4 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -313,22 +313,17 @@ rmm::device_uvector compute_single_pass_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - auto const cuda_error = compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); - - if (cuda_error != cudaSuccess) { - constexpr bool uses_direct_aggs = true; - direct_aggregations.set_value_async(uses_direct_aggs, stream); - } + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; From 45573e0b1f69c8c497407e98ad4f9dbe3baf36c4 Mon Sep 17 00:00:00 2001 From: Yunsong 
Wang Date: Thu, 26 Sep 2024 12:37:39 -0700 Subject: [PATCH 045/135] Cleanups --- cpp/CMakeLists.txt | 3 - cpp/src/groupby/hash/compute_groupby.cu | 111 +++++- cpp/src/groupby/hash/compute_groupby.cuh | 111 ------ cpp/src/groupby/hash/compute_groupby.hpp | 68 ++++ cpp/src/groupby/hash/compute_groupby_null.cu | 31 -- .../groupby/hash/compute_single_pass_aggs.cu | 375 ++++++++++++++++-- .../groupby/hash/compute_single_pass_aggs.cuh | 355 ----------------- .../groupby/hash/compute_single_pass_aggs.hpp | 15 +- .../hash/compute_single_pass_aggs_null.cu | 58 --- cpp/src/groupby/hash/groupby.cu | 2 +- .../groupby/hash/sparse_to_dense_results.cu | 61 ++- ...esults.cuh => sparse_to_dense_results.hpp} | 38 +- .../hash/sparse_to_dense_results_null.cu | 33 -- 13 files changed, 599 insertions(+), 662 deletions(-) delete mode 100644 cpp/src/groupby/hash/compute_groupby.cuh create mode 100644 cpp/src/groupby/hash/compute_groupby.hpp delete mode 100644 cpp/src/groupby/hash/compute_groupby_null.cu delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu rename cpp/src/groupby/hash/{sparse_to_dense_results.cuh => sparse_to_dense_results.hpp} (57%) delete mode 100644 cpp/src/groupby/hash/sparse_to_dense_results_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0d9529ef58d..663f2210ef4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -316,14 +316,11 @@ add_library( src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu - src/groupby/hash/compute_groupby_null.cu - src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/hash/sparse_to_dense_results.cu - src/groupby/hash/sparse_to_dense_results_null.cu src/groupby/hash/var_hash_functor.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 7965d0891a7..9643567a825 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -13,11 +13,110 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once -#include "compute_groupby.cuh" +#include "compute_single_pass_aggs.cuh" +// #include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include namespace cudf::groupby::detail::hash { +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. 
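+ * (For example, a MEAN request is flattened into SUM and COUNT_VALID
+ * aggregations; if SUM is also requested directly, the cached sparse SUM
+ * result can be reused rather than recomputed. This is illustrative of the
+ * intent; the exact reuse depends on the aggregations requested.)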
+ * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + */ +template +std::unique_ptr
compute_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + Equal const& d_row_equal, + row_hash_t const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + cuco::extent{num_keys}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first + auto gather_map = compute_single_pass_aggs( + keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(keys, + requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + skip_key_rows_with_nulls, + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, @@ -28,4 +127,14 @@ template std::unique_ptr
compute_groupby( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh deleted file mode 100644 index 146900f61eb..00000000000 --- a/cpp/src/groupby/hash/compute_groupby.cuh +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "compute_single_pass_aggs.cuh" -// #include "compute_single_pass_aggs.hpp" -#include "sparse_to_dense_results.cuh" -#include "var_hash_functor.cuh" - -#include -#include -#include -#include - -#include - -namespace cudf::groupby::detail::hash { - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -template -std::unique_ptr
compute_groupby( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, - Equal const& d_row_equal, - cudf::experimental::row::hash::device_row_hasher const& d_row_hash, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const set = cuco::static_set{ - cuco::extent{num_keys}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_row_equal, - probing_scheme_t{d_row_hash}, - cuco::thread_scope_device, - cuco::storage{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - skip_key_rows_with_nulls, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); -} - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp new file mode 100644 index 00000000000..358c81365a0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "helpers.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. 
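+ * (For instance, VARIANCE and STD are, roughly speaking, finalized in a
+ * second pass that reads the single-pass SUM and COUNT results; see
+ * var_hash_functor.cuh.)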
+ * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool skip_key_rows_with_nulls, + Equal const& d_row_equal, + row_hash_t const& d_row_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu deleted file mode 100644 index 1f9707902cc..00000000000 --- a/cpp/src/groupby/hash/compute_groupby_null.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "compute_groupby.cuh" - -namespace cudf::groupby::detail::hash { - -template std::unique_ptr
compute_groupby( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, - nullable_row_comparator_t const& d_row_equal, - row_hash_t const& d_row_hash, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index 2770dc2a84c..b5c68ea639a 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,342 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" +#include "compute_aggregations.hpp" +// #include "compute_single_pass_aggs.hpp" +#include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" +#include "single_pass_functors.cuh" -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +template +// TODO pass block +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + // TODO: un-init + bool inserted; + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + __syncthreads(); + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + auto const input_idx = shared_set_indices[cur_idx]; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = + *global_set.insert_and_find(input_idx).first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
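+ *
+ * Rough sketch of the resulting mappings (illustrative pseudocode only;
+ * the exact logic lives in find_local_mapping/find_global_mapping above):
+ *
+ *   for each input row r handled by this block:
+ *     local_mapping_index[r] = dense rank of r's key within the block
+ *                              (0 .. cardinality - 1, via the shared set)
+ *   if cardinality < GROUPBY_CARDINALITY_THRESHOLD:
+ *     for each block-local rank s:
+ *       global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + s] =
+ *         *global_set.insert_and_find(shared_set_indices[s]).first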
+ */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + // TODO: indices inserted in each shared memory set + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { + find_local_mapping(cur_idx, + num_input_rows, + shared_insert_ref, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + + block.sync(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); + } + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +auto create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? 
false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + } + return sparse_table; +} +} // namespace + +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream) +{ + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto const num_input_rows = keys.num_rows(); + + auto row_bitmask = + skip_rows_with_nulls + ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} using global_set_t = cuco::static_set, @@ -30,20 +359,6 @@ using global_set_t = cuco::static_set, cuco::storage>; -template void extract_populated_keys( - global_set_t const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - -template auto create_sparse_results_table( - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - global_set_t const& global_set, - 
rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, @@ -52,7 +367,19 @@ template rmm::device_uvector compute_single_pass_aggs, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + nullable_global_set_t& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh deleted file mode 100644 index 051259bf9f4..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "compute_aggregations.hpp" -// #include "compute_single_pass_aggs.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include - -namespace cudf::groupby::detail::hash { - -template -// TODO pass block -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - SetType shared_set, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* cardinality, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - // TODO: un-init - bool inserted; - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. 
- __syncthreads(); - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - auto const input_idx = shared_set_indices[cur_idx]; - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = - *global_set.insert_and_find(input_idx).first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. - */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - // TODO: indices inserted in each shared memory set - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_t{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, - num_input_rows, - shared_insert_ref, - row_bitmask, - skip_rows_with_nulls, - &cardinality, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - - block.sync(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, 
GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} - -template -void extract_populated_keys(SetType const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); -} - -// make table that will hold sparse results -template -auto create_sparse_results_table(cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - GlobalSetType const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform(flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - auto const nullable = - (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = - (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto const col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - cudf::table sparse_table(std::move(sparse_columns)); - // If no direct aggregations, initialize the sparse table - // only for the keys inserted in global hash set - if (!direct_aggregations) { - auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); - extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); - } - // Else initialize the whole table - else { - cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); - } - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - SetType& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream) -{ - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - auto const num_input_rows = keys.num_rows(); - - auto row_bitmask = - skip_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table - rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, - stream); - rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); - - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); - - if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 848ace94ff9..6cbea9fcd3c 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -25,10 +24,7 @@ #include #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` @@ -39,11 +35,6 @@ rmm::device_uvector compute_single_pass_aggs( cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, - bool keys_have_nulls, - null_policy include_null_keys, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream); - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu deleted file mode 100644 index e566c2c5d27..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "compute_single_pass_aggs.cuh" -#include "helpers.cuh" - -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { - -using global_set_t = cuco::static_set, - cuda::thread_scope_device, - nullable_row_comparator_t, - probing_scheme_t, - cudf::detail::cuco_allocator, - cuco::storage>; - -template void extract_populated_keys( - global_set_t const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - -template auto create_sparse_results_table( - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - global_set_t const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - -template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - global_set_t& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream); - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 62434bf5fd2..b307b8a8d1f 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "compute_groupby.cuh" +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" #include "helpers.cuh" diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index 760926afa13..a416e2124ce 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,55 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once -#include "helpers.cuh" -#include "sparse_to_dense_results.cuh" +#include "hash_compound_agg_finalizer.cuh" + +#include +#include +#include +#include +#include + +#include +#include namespace cudf::groupby::detail::hash { +/** + * @brief Gather sparse results into dense using `gather_map` and add to + * `dense_cache` + * + * @see groupby_null_templated() + */ +template +void sparse_to_dense_results(table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; + bitmask_type const* row_bitmask_ptr = + skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} template void sparse_to_dense_results(table_view const& keys, host_span requests, @@ -29,4 +73,15 @@ template void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +template void sparse_to_dense_results( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.hpp similarity index 57% rename from cpp/src/groupby/hash/sparse_to_dense_results.cuh rename to cpp/src/groupby/hash/sparse_to_dense_results.hpp index b89fc308e6e..bfdc42953ad 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cuh +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -15,18 +15,16 @@ */ #pragma once -#include "compute_single_pass_aggs.cuh" -#include "hash_compound_agg_finalizer.cuh" -#include "var_hash_functor.cuh" - -#include #include -#include -#include -#include +#include +#include +#include +#include -namespace cudf::groupby::detail::hash { +#include +#include +namespace cudf::groupby::detail::hash { /** * @brief Gather sparse results into dense using `gather_map` and add to * `dense_cache` @@ -42,25 +40,5 @@ void sparse_to_dense_results(table_view const& keys, SetType set, bool skip_key_rows_with_nulls, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - + rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu deleted file mode 100644 index b6820f7f6db..00000000000 --- a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.cuh" -#include "sparse_to_dense_results.cuh" - -namespace cudf::groupby::detail::hash { - -template void sparse_to_dense_results( - table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - nullable_hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace cudf::groupby::detail::hash From dec49a828409709ce1f585d470d3138802164be4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 12:50:35 -0700 Subject: [PATCH 046/135] Header cleanups --- cpp/src/groupby/hash/compute_aggregations.cu | 1 - cpp/src/groupby/hash/compute_groupby.cu | 7 +++---- cpp/src/groupby/hash/sparse_to_dense_results.cu | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 54f37c7f397..8b559ffc1be 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -21,7 +21,6 @@ #include "single_pass_functors.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 9643567a825..4aa03d17999 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -13,17 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once -#include "compute_single_pass_aggs.cuh" -// #include "compute_single_pass_aggs.hpp" +#include "compute_single_pass_aggs.hpp" #include "helpers.cuh" -#include "sparse_to_dense_results.cuh" +#include "sparse_to_dense_results.hpp" #include "var_hash_functor.cuh" #include #include #include +#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index a416e2124ce..7f7290141f9 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once #include "hash_compound_agg_finalizer.cuh" +#include "helpers.cuh" #include +#include #include #include #include From 777400978907f9d88ba9f172a8f1b82d9e8118a5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 14:06:20 -0700 Subject: [PATCH 047/135] More cleanups for hash_compound_agg_finalizer --- cpp/CMakeLists.txt | 1 - cpp/src/groupby/hash/compute_groupby.cu | 2 +- .../hash/hash_compound_agg_finalizer.cu | 177 +++++++++++++++- .../hash/hash_compound_agg_finalizer.cuh | 199 ------------------ .../hash/hash_compound_agg_finalizer.hpp | 69 ++++++ cpp/src/groupby/hash/helpers.cuh | 20 +- .../groupby/hash/sparse_to_dense_results.cu | 2 +- cpp/src/groupby/hash/var_hash_functor.cu | 26 --- cpp/src/groupby/hash/var_hash_functor.cuh | 2 + 9 files changed, 264 insertions(+), 234 deletions(-) delete mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp delete mode 100644 cpp/src/groupby/hash/var_hash_functor.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 663f2210ef4..cd92e086329 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -321,7 +321,6 @@ add_library( src/groupby/hash/groupby.cu src/groupby/hash/hash_compound_agg_finalizer.cu src/groupby/hash/sparse_to_dense_results.cu - src/groupby/hash/var_hash_functor.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 4aa03d17999..1eb208c588d 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -82,7 +82,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); - auto const set = cuco::static_set{ + auto set = cuco::static_set{ cuco::extent{num_keys}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index e7a7af92f15..119ac8cf6fd 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -14,10 +14,185 @@ * limitations under the License. */ -#include "hash_compound_agg_finalizer.cuh" +#include "hash_compound_agg_finalizer.hpp" #include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? 
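/* A worked micro-example of the sentinel trick described above (values made
 * up for illustration): for MIN on a strings column
 *
 *   col                = ["b", "a", <null>]   // one group spanning all rows
 *   sparse ARGMIN slot = 1                    // row index of the minimum
 *
 * while a group containing only nulls keeps ARGMIN_SENTINEL (-1) in its slot;
 * gathering `col` through that index column with NULLIFY turns the
 * out-of-bounds -1 into a null output element, which is exactly the MIN
 * result we need. */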
cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = 
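/* STD is never computed directly: it reuses the VARIANCE visitor with the
 * same ddof and finishes with an elementwise square root, i.e. per group
 *
 *   var = (sum(x_i^2) - n * mean^2) / (n - ddof),   std = sqrt(var)
 *
 * which is why the SQRT unary operation below runs on the already-dense
 * variance result. */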
dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} template class hash_compound_agg_finalizer; template class hash_compound_agg_finalizer; diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh deleted file mode 100644 index 1c40b77b5a1..00000000000 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cuh +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "var_hash_functor.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cudf::groupby::detail::hash { - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, 
stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..16cbe92511f --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 650b936372d..c1dd68c2b78 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -70,10 +70,20 @@ using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_ cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; -using hash_set_ref_t = cuco:: - static_set_ref>, cuco::op::find_tag, >; - -using nullable_hash_set_ref_t = cuco:: - static_set_ref>, cuco::op::find_tag, >; +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + cuco::op::find_tag>; + +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + cuco::op::find_tag>; } // namespace cudf::groupby::detail::hash diff --git 
a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index 7f7290141f9..af61173fb6a 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "hash_compound_agg_finalizer.cuh" +#include "hash_compound_agg_finalizer.hpp" #include "helpers.cuh" #include diff --git a/cpp/src/groupby/hash/var_hash_functor.cu b/cpp/src/groupby/hash/var_hash_functor.cu deleted file mode 100644 index 4881f4ed85e..00000000000 --- a/cpp/src/groupby/hash/var_hash_functor.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "helpers.cuh" -#include "var_hash_functor.cuh" - -namespace cudf::groupby::detail::hash { - -// explicit template instantiation to reduce build time -template struct var_hash_functor; -template struct var_hash_functor; - -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/var_hash_functor.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh index 98668d0cb45..abcd57263f4 100644 --- a/cpp/src/groupby/hash/var_hash_functor.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -16,6 +16,8 @@ #pragma once +#include "helpers.cuh" + #include #include #include From b4422c0427237d04970f26ec01e40f140f79e723 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 15:58:40 -0700 Subject: [PATCH 048/135] Separate create_sparse_results_table --- cpp/CMakeLists.txt | 1 + .../groupby/hash/compute_single_pass_aggs.cu | 79 +---------- .../hash/create_sparse_results_table.cu | 125 ++++++++++++++++++ .../hash/create_sparse_results_table.hpp | 42 ++++++ cpp/src/groupby/hash/helpers.cuh | 17 +++ 5 files changed, 187 insertions(+), 77 deletions(-) create mode 100644 cpp/src/groupby/hash/create_sparse_results_table.cu create mode 100644 cpp/src/groupby/hash/create_sparse_results_table.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cd92e086329..7f90fb388dc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,6 +317,7 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu src/groupby/hash/hash_compound_agg_finalizer.cu diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index b5c68ea639a..465d58ed9ef 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -15,7 +15,8 @@ */ #include "compute_aggregations.hpp" -// #include "compute_single_pass_aggs.hpp" +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" #include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" #include "single_pass_functors.cuh" @@ -172,66 +173,6 @@ int 
max_occupancy_grid_size(Kernel kernel, cudf::size_type n) auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); return std::min(grid_size, num_blocks); } - -template -void extract_populated_keys(SetType const& key_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); -} - -// make table that will hold sparse results -template -auto create_sparse_results_table(cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, - bool direct_aggregations, - GlobalSetType const& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform(flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - auto const nullable = - (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto mask_flag = - (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto const col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - cudf::table sparse_table(std::move(sparse_columns)); - // If no direct aggregations, initialize the sparse table - // only for the keys inserted in global hash set - if (!direct_aggregations) { - auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); - extract_populated_keys(global_set, populated_keys, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - populated_keys.size(), - initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); - } - // Else initialize the whole table - else { - cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); - } - return sparse_table; -} } // namespace /** @@ -351,14 +292,6 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } -using global_set_t = cuco::static_set, - cuda::thread_scope_device, - row_comparator_t, - probing_scheme_t, - cudf::detail::cuco_allocator, - cuco::storage>; - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, @@ -367,14 +300,6 @@ template rmm::device_uvector compute_single_pass_aggs, - cuda::thread_scope_device, - nullable_row_comparator_t, - probing_scheme_t, - cudf::detail::cuco_allocator, - cuco::storage>; - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..7ae0184528d --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
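/* Restating the policy the lambda above encodes for each sparse output
 * column (a summary of the existing branches, not new behavior):
 *
 *   COUNT_VALID / COUNT_ALL   -> non-nullable; a count always exists
 *   VARIANCE / STD            -> starts ALL_NULL; filled by a later pass
 *   all other aggregations    -> ALL_NULL iff the input column has nulls
 *
 * Dictionary inputs aggregate over the decoded keys, hence the keys' type is
 * used to pick the target type below. */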
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + } + return sparse_table; +} + +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..2daa88289c0 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +// make table that will hold sparse results +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector aggs, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index c1dd68c2b78..651a6a2014a 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -70,6 +71,22 @@ using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_ cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + using hash_set_ref_t = cuco::static_set_ref< cudf::size_type, cuda::thread_scope_device, From 8ce4cda47cb6e9010f121b7cfda853884bfef22e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 17:48:28 -0700 Subject: [PATCH 049/135] Add groupby multi-aggs test --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/multi_aggs_tests.cpp | 115 +++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 cpp/tests/groupby/multi_aggs_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..66d70c0c7cf 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -137,6 +137,7 @@ ConfigureTest( groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp + groupby/multi_aggs_tests.cpp groupby/nth_element_tests.cpp groupby/nunique_tests.cpp groupby/product_scan_tests.cpp diff --git a/cpp/tests/groupby/multi_aggs_tests.cpp b/cpp/tests/groupby/multi_aggs_tests.cpp new file mode 100644 index 00000000000..ae491a8f796 --- /dev/null +++ b/cpp/tests/groupby/multi_aggs_tests.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf::test::iterators; + +namespace { +template +std::unique_ptr create_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity, + Elements elements) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + std::vector> src_cols(num_columns); + for (int idx = 0; idx < num_columns; idx++) { + if (include_validity) { + src_cols[idx] = + cudf::test::fixed_width_column_wrapper(elements, elements + num_rows, valids); + } else { + src_cols[idx] = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); + } + } + std::vector> columns(num_columns); + std::transform(src_cols.begin(), + src_cols.end(), + columns.begin(), + [](cudf::test::fixed_width_column_wrapper& in) { + auto ret = in.release(); + // pre-cache the null count + [[maybe_unused]] auto const nulls = ret->has_nulls(); + return ret; + }); + return std::make_unique(std::move(columns)); +} + +template +std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows) +{ + auto rand_elements = + cudf::detail::make_counting_transform_iterator(0, [](T i) { return rand(); }); + return create_fixed_table(num_columns, num_rows, false, rand_elements); +} +} // namespace + +template +struct groupby_multi_aggs_test : public cudf::test::BaseFixture {}; + +template +std::vector convert(std::initializer_list in) +{ + std::vector out(std::cbegin(in), std::cend(in)); + return out; +} + +using supported_types = cudf::test::Concat>; +TYPED_TEST_SUITE(groupby_multi_aggs_test, supported_types); +using K = int32_t; + +TYPED_TEST(groupby_multi_aggs_test, basic) +{ + using V = TypeParam; + + auto constexpr num_cols = 3'000; + auto constexpr num_rows = 100'000; + auto keys = create_random_fixed_table(1, num_rows); + + auto vals = create_random_fixed_table(num_cols, num_rows); + + std::vector requests; + for (auto i = 0; i < num_cols; i++) { + requests.emplace_back(); + + requests[i].values = vals->get_column(i).view(); + requests[i].aggregations.push_back( + std::move(cudf::make_mean_aggregation())); + requests[i].aggregations.push_back( + std::move(cudf::make_min_aggregation())); + requests[i].aggregations.push_back( + std::move(cudf::make_max_aggregation())); + requests[i].aggregations.push_back( + std::move(cudf::make_count_aggregation())); + } + + cudf::groupby::groupby gb_obj{keys->view()}; + + auto result = gb_obj.aggregate(requests, cudf::test::get_default_stream()); +} From 06cf48f5bfb7b1e26e2ce88fa8388e76954e0f3d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 18:12:23 -0700 Subject: [PATCH 050/135] Further separate compute_single_pass_aggs --- cpp/CMakeLists.txt | 1 + .../groupby/hash/compute_single_pass_aggs.cu | 284 +---------------- .../groupby/hash/compute_single_pass_aggs.cuh | 295 ++++++++++++++++++ .../hash/compute_single_pass_aggs_null.cu | 28 ++ 4 files changed, 325 insertions(+), 283 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7f90fb388dc..4bc37eb212c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,6 +317,7 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu + 
src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index 465d58ed9ef..f8b0f65b92f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -14,284 +14,10 @@ * limitations under the License. */ -#include "compute_aggregations.hpp" +#include "compute_single_pass_aggs.cuh" #include "compute_single_pass_aggs.hpp" -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include namespace cudf::groupby::detail::hash { -namespace { -template -// TODO pass block -__device__ void find_local_mapping(cudf::size_type cur_idx, - cudf::size_type num_input_rows, - SetType shared_set, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* cardinality, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx; - // TODO: un-init - bool inserted; - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - auto const result = shared_set.insert_and_find(cur_idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. - __syncthreads(); - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cudf::size_type cur_idx, - SetType global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - auto const input_idx = shared_set_indices[cur_idx]; - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = - *global_set.insert_and_find(input_idx).first; -} - -/* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
- */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - // TODO: indices inserted in each shared memory set - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_t{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - - __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, - num_input_rows, - shared_insert_ref, - row_bitmask, - skip_rows_with_nulls, - &cardinality, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - - block.sync(); - } - - // Insert unique keys from shared to global hash set - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} -} // namespace - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - SetType& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream) -{ - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - auto const num_input_rows = keys.num_rows(); - - auto row_bitmask = - skip_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table - rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table - rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, - stream); - rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); - - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); - - if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - template rmm::device_uvector compute_single_pass_aggs( cudf::table_view const& keys, cudf::host_span requests, @@ -299,12 +25,4 @@ template rmm::device_uvector compute_single_pass_aggs compute_single_pass_aggs( - cudf::table_view const& keys, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - nullable_global_set_t& global_set, - bool skip_rows_with_nulls, - rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git 
a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh new file mode 100644 index 00000000000..d74b7ac4aa0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_aggregations.hpp" +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +template +// TODO pass block +__device__ void find_local_mapping(cudf::size_type cur_idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx; + // TODO: un-init + bool inserted; + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + auto const result = shared_set.insert_and_find(cur_idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = cur_idx; + local_mapping_index[cur_idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + __syncthreads(); + if (cur_idx < num_input_rows and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + // element was already in set + if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cudf::size_type cur_idx, + SetType global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + auto const input_idx = shared_set_indices[cur_idx]; + global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = + *global_set.insert_and_find(input_idx).first; +} + +/* + * Inserts keys into the shared memory hash set, and stores the row index of the local + * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a + * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without + * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to + * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
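 *
 * Schematically, the two-level mapping built here is
 *
 *   input row i --local_mapping_index[i]--> slot s in the block's
 *                                           shared-memory set
 *   slot s --global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + s]-->
 *                                           row in the global sparse table
 *
 * so the aggregation pass can fold each block's pre-aggregates into the
 * global result without re-hashing any keys.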
+ */ +template +CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, + cudf::size_type num_input_rows, + WindowExtent window_extent, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + // TODO: indices inserted in each shared memory set + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ typename SetRef::window_type windows[window_extent.value()]; + auto storage = SetRef::storage_ref_type(window_extent, windows); + auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + {}, + storage); + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); + cur_idx - block.thread_rank() < num_input_rows; + cur_idx += stride) { + find_local_mapping(cur_idx, + num_input_rows, + shared_insert_ref, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + + block.sync(); + } + + // Insert unique keys from shared to global hash set + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; + cur_idx += block.num_threads()) { + find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); + } + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} +} // namespace + +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + SetType& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream) +{ + // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy + auto constexpr shared_set_capacity = + static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); + using extent_type = cuco::extent; + using shared_set_type = cuco::static_set, + cuco::storage>; + using shared_set_ref_type = typename shared_set_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + auto const num_input_rows = keys.num_rows(); + + auto row_bitmask = + skip_rows_with_nulls + ? 
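/* Two notes on the setup above, and a roadmap for what follows: the capacity
 * factor 1.43 is roughly 1 / 0.7, i.e. the shared-memory set is sized so that
 * GROUPBY_SHM_MAX_ELEMENTS distinct keys stay at or below a 70% load factor.
 * The driver then proceeds in three steps: size the mapping kernel by
 * occupancy, launch it to fill the local/global mapping indices and per-block
 * cardinalities, and finally run the aggregation pass, falling back to direct
 * global-memory aggregation for any block whose cardinality overflowed the
 * shared-memory set. */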
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + // 'local_mapping_index' maps from the global row index of the input table to the row index of + // the local pre-aggregate table + rmm::device_uvector local_mapping_index(num_input_rows, stream); + // 'global_mapping_index' maps from the local pre-aggregate table to the row index of + // global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + rmm::device_scalar direct_aggregations(false, stream); + compute_mapping_indices + <<>>(global_set_ref, + num_input_rows, + window_extent, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data()); + stream.synchronize(); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu new file mode 100644 index 00000000000..b88f1a952d5 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_single_pass_aggs.cuh" +#include "compute_single_pass_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_single_pass_aggs( + cudf::table_view const& keys, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + nullable_global_set_t& global_set, + bool skip_rows_with_nulls, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash From 4a952982d9c557138f1983cb79a3cd1f74485c3d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Sep 2024 19:23:24 -0700 Subject: [PATCH 051/135] test --- cpp/src/groupby/hash/compute_aggregations.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 8b559ffc1be..75d89af1313 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -234,6 +234,9 @@ constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) size_t dynamic_shmem_size = 0; + std::cout << "### active_blocks_per_sm: " << active_blocks_per_sm << " grid_size: " << grid_size + << "\n"; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); From 4b247b1f323b506f6511bcb7ba37a6b31b0f92b4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 17:07:18 -0700 Subject: [PATCH 052/135] Renaming + minor cleanups --- cpp/src/groupby/hash/compute_aggregations.cu | 51 ++++++++++--------- cpp/src/groupby/hash/compute_aggregations.hpp | 4 +- .../groupby/hash/compute_single_pass_aggs.cuh | 2 + .../hash/create_sparse_results_table.cu | 10 ++-- .../hash/create_sparse_results_table.hpp | 2 +- 5 files changed, 38 insertions(+), 31 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 75d89af1313..0a47d14d140 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -15,6 +15,7 @@ */ #include "compute_aggregations.hpp" +#include "create_sparse_results_table.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" #include "shared_memory_aggregator.cuh" @@ -69,12 +70,12 @@ __device__ void initialize_shared_memory_aggregates(int col_start, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, cudf::size_type cardinality, - cudf::aggregation::Kind const* aggs) + cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - aggs[col_idx], + d_agg_kinds[col_idx], initialize_shmem{}, s_aggregates_pointer[col_idx], idx, @@ -92,7 +93,7 @@ __device__ void compute_pre_aggregrates(int col_start, cudf::size_type* 
local_mapping_index, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) + cudf::aggregation::Kind const* d_agg_kinds) { // TODO grid_1d utility for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; @@ -104,7 +105,7 @@ __device__ void compute_pre_aggregrates(int col_start, auto input_col = input_values.column(col_idx); cudf::detail::dispatch_type_and_aggregation(input_col.type(), - aggs[col_idx], + d_agg_kinds[col_idx], shmem_element_aggregator{}, s_aggregates_pointer[col_idx], map_idx, @@ -124,7 +125,7 @@ __device__ void compute_final_aggregates(int col_start, cudf::size_type* global_mapping_index, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* aggs) + cudf::aggregation::Kind const* d_agg_kinds) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; @@ -132,7 +133,7 @@ __device__ void compute_final_aggregates(int col_start, auto output_col = output_values.column(col_idx); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - aggs[col_idx], + d_agg_kinds[col_idx], gmem_element_aggregator{}, output_col, out_idx, @@ -146,17 +147,17 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, - int total_agg_size, - int pointer_size) +CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -195,7 +196,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, s_aggregates_pointer, s_aggregates_valid_pointer, cardinality, - aggs); + d_agg_kinds); block.sync(); compute_pre_aggregrates(col_start, col_end, @@ -206,7 +207,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, local_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, - aggs); + d_agg_kinds); block.sync(); compute_final_aggregates(col_start, col_end, @@ -216,7 +217,7 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, global_mapping_index, s_aggregates_pointer, s_aggregates_valid_pointer, - aggs); + d_agg_kinds); block.sync(); } } @@ -253,10 +254,12 @@ void compute_aggregations(int grid_size, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, rmm::cuda_stream_view stream) { - auto const shmem_size = 
compute_shared_memory_size(compute_aggs_kernel, grid_size); + auto const shmem_size = compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -264,7 +267,7 @@ void compute_aggregations(int grid_size, round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( + compute_d_agg_kinds_kernel<<>>( num_input_rows, row_bitmask, skip_rows_with_nulls, @@ -273,7 +276,7 @@ void compute_aggregations(int grid_size, block_cardinality, input_values, output_values, - aggs, + d_agg_kinds, shmem_agg_size, shmem_agg_pointer_size); } diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 87c37158cd0..badf8079875 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -35,7 +35,9 @@ void compute_aggregations(int grid_size, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* aggs, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index d74b7ac4aa0..51c131b59eb 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -263,7 +263,9 @@ rmm::device_uvector compute_single_pass_aggs( block_cardinality.data(), *d_values, *d_sparse_table, + flattened_values, d_agg_kinds.data(), + agg_kinds, stream); if (direct_aggregations.value(stream)) { diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 7ae0184528d..fa3e1b3a2ba 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -48,7 +48,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, @@ -58,7 +58,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values std::vector> sparse_columns; std::transform(flattened_values.begin(), flattened_values.end(), - aggs.begin(), + agg_kinds.begin(), std::back_inserter(sparse_columns), [stream](auto const& col, auto const& agg) { auto const nullable = @@ -89,7 +89,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values // Else initialize the whole table else { cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); } return sparse_table; } @@ -107,7 +107,7 @@ template void extract_populated_keys( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - 
std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, global_set_t const& global_set, rmm::device_uvector& populated_keys, @@ -116,7 +116,7 @@ template cudf::table create_sparse_results_table( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, nullable_global_set_t const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 2daa88289c0..f2810bd0235 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -34,7 +34,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector aggs, + std::vector agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, From 90597288981a8813e9497a57939b8fe603472bed Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 17:07:48 -0700 Subject: [PATCH 053/135] Remove unused code --- cpp/src/groupby/hash/compute_aggregations.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 0a47d14d140..dda4c1c5773 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -230,14 +230,8 @@ constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - CUDF_EXPECTS(active_blocks_per_sm >= 1, "active_blocks_per_sm must be larger than 1"); - CUDF_EXPECTS(grid_size >= 1, "grid_size must be larger than 1"); - size_t dynamic_shmem_size = 0; - std::cout << "### active_blocks_per_sm: " << active_blocks_per_sm << " grid_size: " << grid_size - << "\n"; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); From 87312faf8d34f8cebf6b10bacb9c7482af9e3b16 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 17:48:47 -0700 Subject: [PATCH 054/135] Make compute_aggregations return sparse table --- cpp/src/groupby/hash/compute_aggregations.cu | 76 +++++++++++++++---- cpp/src/groupby/hash/compute_aggregations.hpp | 29 +++---- .../groupby/hash/compute_single_pass_aggs.cuh | 42 +++++----- 3 files changed, 98 insertions(+), 49 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index dda4c1c5773..7861884562a 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -239,22 +239,36 @@ constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size) } // namespace -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* 
d_agg_kinds, - std::vector const& agg_kinds, - rmm::cuda_stream_view stream) +template +cudf::table compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) { auto const shmem_size = compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + direct_aggregations, + global_set, + populated_keys, + stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto output_values = *d_sparse_table; + // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto const shmem_agg_pointer_size = @@ -273,6 +287,42 @@ void compute_aggregations(int grid_size, d_agg_kinds, shmem_agg_size, shmem_agg_pointer_size); + + return sparse_table; } +template cudf::table compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + global_set_t& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + nullable_global_set_t& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index badf8079875..bcb996b645d 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -26,18 +26,21 @@ namespace cudf::groupby::detail::hash { -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - rmm::cuda_stream_view stream); +template +cudf::table compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool 
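+                                 // when true, rows whose key contains a null
+                                 // are filtered out via row_bitmask and never
+                                 // contribute to any aggregate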
skip_rows_with_nulls,
+                                 cudf::size_type* local_mapping_index,
+                                 cudf::size_type* global_mapping_index,
+                                 cudf::size_type* block_cardinality,
+                                 cudf::table_device_view input_values,
+                                 cudf::table_view const& flattened_values,
+                                 cudf::aggregation::Kind const* d_agg_kinds,
+                                 std::vector<cudf::aggregation::Kind> const& agg_kinds,
+                                 bool direct_aggregations,
+                                 SetType& global_set,
+                                 rmm::device_uvector<cudf::size_type>& populated_keys,
+                                 rmm::cuda_stream_view stream);
 } // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
index 51c131b59eb..3372d88e714 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
@@ -242,31 +242,27 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
   auto const d_agg_kinds = cudf::detail::make_device_uvector_async(
     agg_kinds, stream, rmm::mr::get_current_device_resource());
-  // make table that will hold sparse results
-  cudf::table sparse_table = create_sparse_results_table(flattened_values,
-                                                         d_agg_kinds.data(),
-                                                         agg_kinds,
-                                                         direct_aggregations.value(stream),
-                                                         global_set,
-                                                         populated_keys,
-                                                         stream);
+
   // prepare to launch kernel to do the actual aggregation
-  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
-  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_values = table_device_view::create(flattened_values, stream);
 
-  compute_aggregations(grid_size,
-                       num_input_rows,
-                       static_cast<bitmask_type const*>(row_bitmask.data()),
-                       skip_rows_with_nulls,
-                       local_mapping_index.data(),
-                       global_mapping_index.data(),
-                       block_cardinality.data(),
-                       *d_values,
-                       *d_sparse_table,
-                       flattened_values,
-                       d_agg_kinds.data(),
-                       agg_kinds,
-                       stream);
+  cudf::table sparse_table =
+    compute_aggregations(grid_size,
+                         num_input_rows,
+                         static_cast<bitmask_type const*>(row_bitmask.data()),
+                         skip_rows_with_nulls,
+                         local_mapping_index.data(),
+                         global_mapping_index.data(),
+                         block_cardinality.data(),
+                         *d_values,
+                         flattened_values,
+                         d_agg_kinds.data(),
+                         agg_kinds,
+                         direct_aggregations.value(stream),
+                         global_set,
+                         populated_keys,
+                         stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
 
   if (direct_aggregations.value(stream)) {
     auto const stride = GROUPBY_BLOCK_SIZE * grid_size;

From bb7187dea240bcb9c6c6bddec37aa914bae6f23b Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Sun, 29 Sep 2024 18:17:11 -0700
Subject: [PATCH 055/135] Add rollback if encountering CUDA errors

---
 cpp/src/groupby/hash/compute_aggregations.cu  | 59 +++++++++++--------
 cpp/src/groupby/hash/compute_aggregations.hpp | 33 ++++++-----
 .../groupby/hash/compute_single_pass_aggs.cuh | 16 ++++-
 3 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu
index 7861884562a..d2d7aa83568 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cu
+++ b/cpp/src/groupby/hash/compute_aggregations.cu
@@ -225,38 +225,44 @@ CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows,
 constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; }
 
 template <typename Kernel>
-constexpr size_t compute_shared_memory_size(Kernel kernel, int grid_size)
+constexpr std::pair<cudaError_t, size_t> compute_shared_memory_size(Kernel kernel,
+                                                                    int grid_size) noexcept
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
 
   size_t
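  // cudaOccupancyAvailableDynamicSMemPerBlock (queried below) reports how much
  // dynamic shared memory each block may use when active_blocks_per_sm blocks
  // are resident on an SM; on failure the new code clears the error state and
  // returns the status to the caller, which then falls back to direct
  // (global-memory) aggregation instead of throwing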
dynamic_shmem_size = 0; - CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); + auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + if (status != cudaSuccess) { cudaGetLastError(); } + return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; } } // namespace template -cudf::table compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) +std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) { - auto const shmem_size = compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + auto const [status, shmem_size] = + compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + + if (status != cudaSuccess) { direct_aggregations = true; } // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, @@ -266,8 +272,11 @@ cudf::table compute_aggregations(int grid_size, global_set, populated_keys, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto output_values = *d_sparse_table; + + if (status != cudaSuccess) { return {status, sparse_table}; } + + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto output_values = *d_sparse_table; // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation @@ -288,10 +297,10 @@ cudf::table compute_aggregations(int grid_size, shmem_agg_size, shmem_agg_pointer_size); - return sparse_table; + return {status, sparse_table}; } -template cudf::table compute_aggregations( +template std::pair compute_aggregations( int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, @@ -308,7 +317,7 @@ template cudf::table compute_aggregations( rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); -template cudf::table compute_aggregations( +template std::pair compute_aggregations( int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index bcb996b645d..1c382f01195 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -22,25 +22,28 @@ #include +#include + #include namespace cudf::groupby::detail::hash { template -cudf::table 
compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); +std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 3372d88e714..25c8aed957c 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -246,7 +246,7 @@ rmm::device_uvector compute_single_pass_aggs( // prepare to launch kernel to do the actual aggregation auto d_values = table_device_view::create(flattened_values, stream); - cudf::table sparse_table = + auto [status, sparse_table] = compute_aggregations(grid_size, num_input_rows, static_cast(row_bitmask.data()), @@ -264,7 +264,19 @@ rmm::device_uvector compute_single_pass_aggs( stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - if (direct_aggregations.value(stream)) { + if (status != cudaSuccess) { + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } else if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), From 5f05ca72e20e8d658dbb35ab1a614321f640aab9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 29 Sep 2024 18:21:33 -0700 Subject: [PATCH 056/135] Add explicit instantiations for compute_aggregations --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/hash/compute_aggregations.cu | 302 +---------------- cpp/src/groupby/hash/compute_aggregations.cuh | 303 ++++++++++++++++++ .../groupby/hash/compute_aggregations_null.cu | 37 +++ 4 files changed, 342 insertions(+), 301 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh create mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4bc37eb212c..743c8cd1f7c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,6 +315,7 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu diff --git 
a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index d2d7aa83568..8e70a3a77f0 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -14,292 +14,10 @@ * limitations under the License. */ +#include "compute_aggregations.cuh" #include "compute_aggregations.hpp" -#include "create_sparse_results_table.hpp" -#include "global_memory_aggregator.cuh" -#include "helpers.cuh" -#include "shared_memory_aggregator.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include namespace cudf::groupby::detail::hash { -namespace { -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - d_agg_kinds[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - d_agg_kinds[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); - } - } - } -} - -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view 
output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - d_agg_kinds[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) -{ - auto const block = cooperative_groups::this_thread_block(); - auto const cardinality = block_cardinality[block.group_index().x]; - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - - auto const num_cols = output_values.num_columns(); - - __shared__ int col_start; - __shared__ int col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - - if (block.thread_rank() == 0) { - col_start = 0; - col_end = 0; - } - block.sync(); - - while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - d_agg_kinds); - block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - } -} - -constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -constexpr std::pair compute_shared_memory_size(Kernel kernel, - int grid_size) noexcept -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size = 0; - - auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); - if (status != cudaSuccess) { cudaGetLastError(); } - return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; -} - -} // namespace - -template -std::pair compute_aggregations( - int 
grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const [status, shmem_size] = - compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); - - if (status != cudaSuccess) { direct_aggregations = true; } - - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds, - agg_kinds, - direct_aggregations, - global_set, - populated_keys, - stream); - - if (status != cudaSuccess) { return {status, sparse_table}; } - - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto output_values = *d_sparse_table; - - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_d_agg_kinds_kernel<<>>( - num_input_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - d_agg_kinds, - shmem_agg_size, - shmem_agg_pointer_size); - - return {status, sparse_table}; -} - template std::pair compute_aggregations( int grid_size, cudf::size_type num_input_rows, @@ -316,22 +34,4 @@ template std::pair compute_aggregations( global_set_t& global_set, rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); - -template std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - nullable_global_set_t& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..a7da0ec6e85 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_aggregations.hpp" +#include "create_sparse_results_table.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +namespace { +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // TODO grid_1d utility + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } + } +} + +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = 
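+      // each block owns a GROUPBY_SHM_MAX_ELEMENTS-wide slice of
+      // global_mapping_index, so this block's cur_idx-th local slot maps to
+      // the global sparse-table row fetched here: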
global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + block.sync(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + d_agg_kinds); + block.sync(); + compute_pre_aggregrates(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +template +constexpr std::pair compute_shared_memory_size(Kernel kernel, + int grid_size) noexcept +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size = 0; + + auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + if (status != cudaSuccess) { cudaGetLastError(); } + return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; +} + +} // namespace + +template +std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + 
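+  // d_agg_kinds is the device-resident copy of the aggregation kinds used for
+  // per-row dispatch inside the kernel, while the host-side agg_kinds vector
+  // below is what create_sparse_results_table consumes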
cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + SetType& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const [status, shmem_size] = + compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); + + if (status != cudaSuccess) { direct_aggregations = true; } + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + direct_aggregations, + global_set, + populated_keys, + stream); + + if (status != cudaSuccess) { return {status, sparse_table}; } + + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto output_values = *d_sparse_table; + + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto const shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_d_agg_kinds_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + d_agg_kinds, + shmem_agg_size, + shmem_agg_pointer_size); + + return {status, sparse_table}; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu new file mode 100644 index 00000000000..d2c2a5f5830 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template std::pair compute_aggregations( + int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + bool direct_aggregations, + nullable_global_set_t& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash From 2e30d9ba039f0cda763cd11ddd2fcacd826698e2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 11:16:32 -0700 Subject: [PATCH 057/135] Clean up the shmem agg determination logic --- cpp/CMakeLists.txt | 1 - cpp/src/groupby/hash/compute_aggregations.cu | 273 +++++++++++++++- cpp/src/groupby/hash/compute_aggregations.cuh | 303 ------------------ cpp/src/groupby/hash/compute_aggregations.hpp | 32 +- .../groupby/hash/compute_aggregations_null.cu | 37 --- .../groupby/hash/compute_single_pass_aggs.cuh | 74 +++-- 6 files changed, 311 insertions(+), 409 deletions(-) delete mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh delete mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 743c8cd1f7c..4bc37eb212c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,7 +315,6 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu - src/groupby/hash/compute_aggregations_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 8e70a3a77f0..0b1008493f4 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -14,24 +14,263 @@ * limitations under the License. 
*/ -#include "compute_aggregations.cuh" #include "compute_aggregations.hpp" +#include "create_sparse_results_table.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include namespace cudf::groupby::detail::hash { -template std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - global_set_t& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); +namespace { +__device__ void calculate_columns_to_aggregate(int& col_start, + int& col_end, + cudf::mutable_table_device_view output_values, + int num_input_cols, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggregates, + cudf::size_type cardinality, + int total_agg_size) +{ + if (threadIdx.x == 0) { + col_start = col_end; + int bytes_allocated = 0; + int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + int next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + int next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; + } + } +} + +__device__ void initialize_shared_memory_aggregates(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + s_aggregates_pointer[col_idx], + idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +__device__ void compute_pre_aggregrates(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // TODO grid_1d utility + for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; + cur_idx += blockDim.x * gridDim.x) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { + auto map_idx = local_mapping_index[cur_idx]; + + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto input_col = input_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + 
s_aggregates_pointer[col_idx], + map_idx, + s_aggregates_valid_pointer[col_idx], + input_col, + cur_idx); + } + } + } +} + +__device__ void compute_final_aggregates(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { + auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto output_col = output_values.column(col_idx); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + output_col, + out_idx, + input_values.column(col_idx), + s_aggregates_pointer[col_idx], + cur_idx, + s_aggregates_valid_pointer[col_idx]); + } + } +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ int col_start; + __shared__ int col_end; + extern __shared__ std::byte shared_set_aggregates[]; + std::byte** s_aggregates_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size); + bool** s_aggregates_valid_pointer = + reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + block.sync(); + initialize_shared_memory_aggregates(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + d_agg_kinds); + block.sync(); + compute_pre_aggregrates(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + compute_final_aggregates(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); + block.sync(); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +} // namespace + +constexpr std::pair can_use_shmem_aggs(int grid_size) noexcept +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size = 0; + + auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, 
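/*
 * For context on the query being made here:
 * cudaOccupancyAvailableDynamicSMemPerBlock reports how much dynamic shared
 * memory each block could be given when active_blocks_per_sm blocks of
 * GROUPBY_BLOCK_SIZE threads are resident per SM. Only half of that figure
 * is kept just below, rounded down to a multiple of 8, presumably as an
 * alignment-friendly safety margin. A standalone sketch of the same call:
 *
 *   size_t avail = 0;
 *   cudaError_t err = cudaOccupancyAvailableDynamicSMemPerBlock(
 *     &avail, some_kernel, blocks_per_sm, threads_per_block);
 *   if (err != cudaSuccess) { cudaGetLastError(); }  // clear, then fall back
 *
 * (some_kernel, blocks_per_sm, and threads_per_block stand in for the
 * actual arguments used here.)
 */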
compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); + auto const success = status == cudaSuccess; + if (!success) { cudaGetLastError(); } + + return {success, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; +} + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + size_t shmem_size, + rmm::cuda_stream_view stream) +{ + // For each aggregation, need two pointers to arrays in shmem + // One where the aggregation is performed, one indicating the validity of the aggregation + auto const shmem_agg_pointer_size = + round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + compute_aggs_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + d_agg_kinds, + shmem_agg_size, + shmem_agg_pointer_size); +} } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh deleted file mode 100644 index a7da0ec6e85..00000000000 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
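// The host wrapper above hands compute_aggs_kernel one dynamic shared-memory
// allocation carved into three consecutive regions, matching the
// reinterpret_casts at the top of the kernel:
//
//   [ agg value/validity storage: shmem_agg_size bytes ]
//   [ s_aggregates_pointer[]:       pointer_size bytes ]
//   [ s_aggregates_valid_pointer[]: pointer_size bytes ]
//
// i.e. the two pointer tables live at byte offsets total_agg_size and
// total_agg_size + pointer_size inside the extern __shared__ block.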
- */ -#pragma once - -#include "compute_aggregations.hpp" -#include "create_sparse_results_table.hpp" -#include "global_memory_aggregator.cuh" -#include "helpers.cuh" -#include "shared_memory_aggregator.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include - -namespace cudf::groupby::detail::hash { -namespace { -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, - cudf::mutable_table_device_view output_values, - int num_input_cols, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, - cudf::size_type cardinality, - int total_agg_size) -{ - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } - } -} - -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { - cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), - d_agg_kinds[col_idx], - initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; - - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_col.type(), - d_agg_kinds[col_idx], - shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - cur_idx); - } - } - } -} - -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) -{ - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = 
global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); - - cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), - d_agg_kinds[col_idx], - gmem_element_aggregator{}, - output_col, - out_idx, - input_values.column(col_idx), - s_aggregates_pointer[col_idx], - cur_idx, - s_aggregates_valid_pointer[col_idx]); - } - } -} - -/* Takes the local_mapping_index and global_mapping_index to compute - * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_d_agg_kinds_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) -{ - auto const block = cooperative_groups::this_thread_block(); - auto const cardinality = block_cardinality[block.group_index().x]; - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } - - auto const num_cols = output_values.num_columns(); - - __shared__ int col_start; - __shared__ int col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); - - if (block.thread_rank() == 0) { - col_start = 0; - col_end = 0; - } - block.sync(); - - while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); - block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - d_agg_kinds); - block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); - block.sync(); - } -} - -constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - -template -constexpr std::pair compute_shared_memory_size(Kernel kernel, - int grid_size) noexcept -{ - auto const active_blocks_per_sm = - cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); - - size_t dynamic_shmem_size = 0; - - auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); - if (status != cudaSuccess) { cudaGetLastError(); } - return {status, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; -} - -} // namespace - -template -std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - 
cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream) -{ - auto const [status, shmem_size] = - compute_shared_memory_size(compute_d_agg_kinds_kernel, grid_size); - - if (status != cudaSuccess) { direct_aggregations = true; } - - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds, - agg_kinds, - direct_aggregations, - global_set, - populated_keys, - stream); - - if (status != cudaSuccess) { return {status, sparse_table}; } - - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto output_values = *d_sparse_table; - - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); - // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_d_agg_kinds_kernel<<>>( - num_input_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - input_values, - output_values, - d_agg_kinds, - shmem_agg_size, - shmem_agg_pointer_size); - - return {status, sparse_table}; -} -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 1c382f01195..f01d9f24c66 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -28,22 +27,19 @@ namespace cudf::groupby::detail::hash { -template -std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - SetType& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); +std::pair can_use_shmem_aggs(int grid_size) noexcept; + +void compute_aggregations(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + size_t shmem_size, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu deleted file mode 100644 index d2c2a5f5830..00000000000 --- a/cpp/src/groupby/hash/compute_aggregations_null.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "compute_aggregations.cuh" -#include "compute_aggregations.hpp" - -namespace cudf::groupby::detail::hash { -template std::pair compute_aggregations( - int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - bool direct_aggregations, - nullable_global_set_t& global_set, - rmm::device_uvector& populated_keys, - rmm::cuda_stream_view stream); -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 25c8aed957c..b534e9b8f1e 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -243,28 +243,22 @@ rmm::device_uvector compute_single_pass_aggs( auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); + auto const [uses_shmem_aggs, shmem_size] = can_use_shmem_aggs(grid_size); - auto [status, sparse_table] = - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - stream); + // make table that will hold sparse results + cudf::table sparse_table = + create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + uses_shmem_aggs ? 
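/*
 * The ternary being assembled here chooses how the sparse table is
 * initialized: when the shared-memory path is unavailable, every row must
 * take the direct (global-atomic) path, so the flag is forced to true;
 * otherwise the device-computed direct_aggregations flag is honored. The
 * restructured control flow is, in outline:
 *
 *   if (!uses_shmem_aggs) -> row-wise direct aggregation over all rows
 *   else                  -> shared-memory kernel, then a direct pass
 *                            (compute_direct_aggregates) only if some
 *                            blocks exceeded the cardinality threshold
 */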
direct_aggregations.value(stream) : true, + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - if (status != cudaSuccess) { + if (!uses_shmem_aggs) { thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -276,20 +270,34 @@ rmm::device_uvector compute_single_pass_aggs( static_cast(row_bitmask.data()), skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); - } else if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); + } else { + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + shmem_size, + stream); + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } } // Add results back to sparse_results cache From c24247553284582bf1d6c3fd4abe313788cc28a9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 11:23:22 -0700 Subject: [PATCH 058/135] Fix mismatch --- cpp/src/groupby/hash/compute_aggregations.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index 0b1008493f4..aec6f39501e 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -226,7 +226,7 @@ constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * } // namespace -constexpr std::pair can_use_shmem_aggs(int grid_size) noexcept +std::pair can_use_shmem_aggs(int grid_size) noexcept { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); From 30e572e1fa4841eaab69f62be111838d21fe6cbd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:00:01 -0700 Subject: [PATCH 059/135] Clean up device aggregators --- .../groupby/hash/global_memory_aggregator.cuh | 113 ++++++------------ .../groupby/hash/shared_memory_aggregator.cuh | 112 +++++++---------- 2 files changed, 79 insertions(+), 146 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 9f38750060b..abf8bd71483 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -25,29 +25,23 @@ namespace cudf::groupby::detail::hash { -template +template struct update_target_element_gmem { __device__ void 
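/*
 * The diff starting here (patch 059) drops the target_has_nulls and
 * source_has_nulls non-type template parameters from every aggregator
 * specialization, cutting the instantiation count per Source/kind pair by
 * 4x. Guards of the form
 *
 *   if (source_has_nulls and source_null[source_index]) { return; }
 *
 * become unconditional runtime checks, and later in the series (patches
 * 063 and 064) they are hoisted into the dispatching functors altogether.
 */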
operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_null) const { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -57,7 +51,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; @@ -65,16 +59,14 @@ struct update_target_element_gmem< cudf::detail::atomic_min(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -84,23 +76,21 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_min(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -110,22 +100,20 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_max(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -135,7 +123,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; @@ -143,16 +131,14 @@ struct 
update_target_element_gmem< cudf::detail::atomic_max(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -162,23 +148,21 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t>() && cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -188,14 +172,14 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; @@ -207,7 +191,6 @@ struct update_target_element_gmem< * dictionary. 
* */ -template struct update_target_from_dictionary_gmem { template {}( + update_target_element_gmem{}( target, target_index, source_column, source, source_index, source_null); } template +template struct update_target_element_gmem< dictionary32, k, - target_has_nulls, - source_has_nulls, std::enable_if_t> { __device__ void operator()(mutable_column_device_view target, @@ -264,12 +242,12 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } dispatch_type_and_aggregation( source_column.child(cudf::dictionary_column_view::keys_column_index).type(), k, - update_target_from_dictionary_gmem{}, + update_target_from_dictionary_gmem{}, target, target_index, source_column, @@ -280,11 +258,9 @@ struct update_target_element_gmem< }; // The shared memory will already have it squared -template +template struct update_target_element_gmem()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -293,7 +269,7 @@ struct update_target_element_gmem; Target* source_casted = reinterpret_cast(source); @@ -301,15 +277,13 @@ struct update_target_element_gmem(target_index), value); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -318,25 +292,23 @@ struct update_target_element_gmem; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_mul(&target.element(target_index), static_cast(source_casted[source_index])); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; // Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and // non-fixed point column -template +template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -356,12 +328,10 @@ struct update_target_element_gmem< }; // TODO: VALID and ALL have same code -template +template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, @@ -380,12 +350,10 @@ struct update_target_element_gmem< } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -395,7 +363,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmax_index = source_casted[source_index]; @@ -409,15 +377,13 @@ struct update_target_element_gmem< } } - if (target_has_nulls and target.is_null(target_index)) { 
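/*
 * The ARGMAX merge above uses a publish-then-compete CAS pattern: the first
 * atomic_cas tries to install this candidate row index in place of
 * ARGMAX_SENTINEL, and if some index was already published, the loop keeps
 * re-attempting the swap while the candidate's value beats the currently
 * installed winner's. A scalar analogue (names illustrative):
 *
 *   auto old = atomic_cas(&slot, SENTINEL, mine);
 *   if (old != SENTINEL)
 *     while (value[mine] > value[old])
 *       old = atomic_cas(&slot, old, mine);
 *
 * ARGMIN below is identical with the comparison reversed.
 */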
target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template +template struct update_target_element_gmem< Source, cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, @@ -427,7 +393,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_has_nulls and source_null[source_index]) { return; } + if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmin_index = source_casted[source_index]; @@ -441,11 +407,10 @@ struct update_target_element_gmem< } } - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; -template struct gmem_element_aggregator { template __device__ void operator()(cudf::mutable_column_device_view target, @@ -455,7 +420,7 @@ struct gmem_element_aggregator { cudf::size_type source_index, bool* source_null) const noexcept { - update_target_element_gmem{}( + update_target_element_gmem{}( target, target_index, source_column, source, source_index, source_null); } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index ef46c9b4cb4..624a56710d5 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -25,28 +25,22 @@ namespace cudf::groupby::detail::hash { -template +template struct update_target_element_shmem { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, - cudf::size_type source_index) const noexcept + cudf::size_type source_index) const { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, @@ -55,23 +49,21 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, @@ -80,7 +72,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; @@ -89,16 +81,14 @@ struct update_target_element_shmem< DeviceTarget* 
target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, @@ -107,22 +97,20 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_max(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, @@ -131,7 +119,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; @@ -142,16 +130,14 @@ struct update_target_element_shmem< cudf::detail::atomic_max(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, @@ -160,23 +146,21 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - target_has_nulls, - source_has_nulls, std::enable_if_t>() && cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, @@ -185,7 +169,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; @@ -196,11 +180,10 @@ struct update_target_element_shmem< 
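/*
 * A condensed restatement of the casting discipline used throughout these
 * shared-memory specializations: the target is a raw std::byte buffer, not
 * a column view, so each functor recovers a typed slot before the atomic,
 * and fixed-point types go through their device storage representation:
 *
 *   using Target  = cudf::detail::target_type_t<Source, k>;
 *   using Storage = cudf::device_storage_type_t<Target>;
 *   auto* slot    = reinterpret_cast<Storage*>(target) + target_index;
 *
 * No new behavior is implied; this only summarizes the casts above.
 */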
cudf::detail::atomic_add(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template struct update_target_from_dictionary_shmem { template {}( + update_target_element_shmem{}( target, target_index, target_null, source, source_index); } template +template struct update_target_element_shmem< dictionary32, k, - target_has_nulls, - source_has_nulls, std::enable_if_t> { __device__ void operator()(std::byte* target, @@ -240,12 +221,12 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, - update_target_from_dictionary_shmem{}, + update_target_from_dictionary_shmem{}, target, target_index, target_null, @@ -254,11 +235,9 @@ struct update_target_element_shmem< } }; -template +template struct update_target_element_shmem()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -266,22 +245,20 @@ struct update_target_element_shmem; Target* target_casted = reinterpret_cast(target); auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target_casted[target_index], value * value); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -289,23 +266,21 @@ struct update_target_element_shmem; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_mul(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -313,7 +288,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); @@ -321,12 +296,10 @@ struct update_target_element_shmem< } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, @@ -342,12 +315,10 @@ struct update_target_element_shmem< } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, @@ -356,7 +327,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and 
source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); @@ -368,16 +339,14 @@ struct update_target_element_shmem< } } - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template +template struct update_target_element_shmem< Source, cudf::aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, @@ -386,7 +355,7 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source_has_nulls and source.is_null(source_index)) { return; } + if (source.is_null(source_index)) { return; } using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); @@ -398,11 +367,10 @@ struct update_target_element_shmem< } } - if (target_has_nulls and target_null[target_index]) { target_null[target_index] = false; } + if (target_null[target_index]) { target_null[target_index] = false; } } }; -template struct shmem_element_aggregator { template __device__ void operator()(std::byte* target, @@ -411,7 +379,7 @@ struct shmem_element_aggregator { cudf::column_device_view source, cudf::size_type source_index) const noexcept { - update_target_element_shmem{}( + update_target_element_shmem{}( target, target_index, target_null, source, source_index); } }; From 0916fe79c2f9c0acf8720d794c585f7458c0c7fd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:08:01 -0700 Subject: [PATCH 060/135] Header cleanups --- cpp/src/groupby/hash/groupby.cu | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b307b8a8d1f..c206da91375 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -19,21 +19,13 @@ #include "helpers.cuh" #include -#include -#include -#include -#include #include #include -#include #include -#include #include #include -#include #include #include -#include #include #include #include @@ -42,11 +34,10 @@ #include -#include -#include - +#include #include #include +#include namespace cudf::groupby::detail::hash { namespace { From e7ff94dfff54b0b511dae3b4c4642ac61599de97 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:16:37 -0700 Subject: [PATCH 061/135] More header cleanups --- cpp/src/groupby/hash/compute_groupby.cu | 4 ---- cpp/src/groupby/hash/groupby.cu | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 1eb208c588d..9021846f71e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -19,12 +19,9 @@ #include "sparse_to_dense_results.hpp" #include "var_hash_functor.cuh" -#include #include -#include #include #include -#include #include #include #include @@ -38,7 +35,6 @@ #include namespace cudf::groupby::detail::hash { - /** * @brief Computes groupby using hash table. 
* diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c206da91375..03b1a40d224 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -28,11 +28,11 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -41,7 +41,6 @@ namespace cudf::groupby::detail::hash { namespace { - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. From d01f0a2e7e48cdafb85b94c47805de97259434fa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 15:27:12 -0700 Subject: [PATCH 062/135] Switch to cuda::std utilities for device APIs --- .../groupby/hash/global_memory_aggregator.cuh | 62 ++++++++++--------- .../groupby/hash/shared_memory_aggregator.cuh | 62 ++++++++++--------- 2 files changed, 68 insertions(+), 56 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index abf8bd71483..f8baf7d84ba 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf::groupby::detail::hash { template @@ -42,8 +44,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -67,8 +69,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -91,8 +93,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -114,8 +116,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -139,8 +141,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -163,8 +165,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::SUM, - std::enable_if_t>() && - cudf::is_fixed_point()>> { + cuda::std::enable_if_t>() && + cudf::is_fixed_point()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view 
source_column, @@ -194,7 +196,7 @@ struct update_target_element_gmem< struct update_target_from_dictionary_gmem { template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source_column, @@ -207,7 +209,7 @@ struct update_target_from_dictionary_gmem { } template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source_column, @@ -233,8 +235,8 @@ template struct update_target_element_gmem< dictionary32, k, - std::enable_if_t> { + cuda::std::enable_if_t> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source_column, @@ -259,9 +261,10 @@ struct update_target_element_gmem< // The shared memory will already have it squared template -struct update_target_element_gmem()>> { +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -282,9 +285,10 @@ struct update_target_element_gmem -struct update_target_element_gmem()>> { +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -309,7 +313,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_VALID, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -332,7 +337,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::COUNT_ALL, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -354,8 +360,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::ARGMAX, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -384,8 +390,8 @@ template struct update_target_element_gmem< Source, cudf::aggregation::ARGMIN, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 624a56710d5..3f1b4f01375 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf::groupby::detail::hash { template @@ -41,8 +43,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && 
cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -64,8 +66,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -89,8 +91,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -111,8 +113,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -138,8 +140,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -161,8 +163,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::SUM, - std::enable_if_t>() && - cudf::is_fixed_point()>> { + cuda::std::enable_if_t>() && + cudf::is_fixed_point()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -187,7 +189,7 @@ struct update_target_element_shmem< struct update_target_from_dictionary_shmem { template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -199,7 +201,7 @@ struct update_target_from_dictionary_shmem { } template ()>* = nullptr> + cuda::std::enable_if_t()>* = nullptr> __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -213,8 +215,8 @@ template struct update_target_element_shmem< dictionary32, k, - std::enable_if_t> { + cuda::std::enable_if_t> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -236,9 +238,10 @@ struct update_target_element_shmem< }; template -struct update_target_element_shmem()>> { +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -257,9 +260,10 @@ struct update_target_element_shmem -struct update_target_element_shmem()>> { +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -281,7 +285,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::COUNT_VALID, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -300,7 +305,8 @@ template struct 
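/*
 * These enable_if_t constraints are purely compile-time, so std:: and
 * cuda::std:: behave identically here; the switch to the libcu++ spellings
 * presumably just standardizes what these device-code headers depend on.
 * The resulting constraint shape, sketched with a placeholder kind K:
 *
 *   template <typename Source>
 *   struct update_target_element_shmem<
 *     Source, K, cuda::std::enable_if_t<cudf::is_fixed_width<Source>()>> {
 *     // ...
 *   };
 */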
update_target_element_shmem< Source, cudf::aggregation::COUNT_ALL, - std::enable_if_t()>> { + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -319,8 +325,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::ARGMAX, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, @@ -347,8 +353,8 @@ template struct update_target_element_shmem< Source, cudf::aggregation::ARGMIN, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, From 221bed4efd28b2149aecbc80215fdb0961d9f155 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 16:45:02 -0700 Subject: [PATCH 063/135] Clean up shared aggregator early exit logic --- .../groupby/hash/global_memory_aggregator.cuh | 2 +- .../groupby/hash/shared_memory_aggregator.cuh | 28 ++----------------- cpp/src/groupby/hash/single_pass_functors.cuh | 1 + 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index f8baf7d84ba..636c6e97c28 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -352,7 +352,7 @@ struct update_target_element_gmem< cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); - // It is assumed the output for COUNT_VALID is initialized to be all valid + // It is assumed the output for COUNT_ALL is initialized to be all valid } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 3f1b4f01375..9be2e43eac0 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -51,8 +51,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], @@ -74,8 +72,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; using DeviceSource = cudf::device_storage_type_t; @@ -99,8 +95,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_max(&target_casted[target_index], @@ -121,8 +115,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; @@ -148,8 +140,6 @@ struct update_target_element_shmem< cudf::column_device_view source, 
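/*
 * This commit centralizes the early exit that each specialization used to
 * perform on its own: the dispatching shmem_element_aggregator (at the end
 * of this diff) now checks the source's validity exactly once before
 * dispatch, compiled out for COUNT_ALL because that aggregation must count
 * null rows as well:
 *
 *   if constexpr (k != cudf::aggregation::COUNT_ALL) {
 *     if (source.is_null(source_index)) { return; }
 *   }
 *
 * The following commit (patch 064) applies the same treatment to the
 * global-memory aggregators.
 */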
cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], @@ -171,8 +161,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; using DeviceTarget = cudf::device_storage_type_t; @@ -223,8 +211,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -248,8 +234,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); auto value = static_cast(source.element(source_index)); @@ -270,8 +254,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_mul(&target_casted[target_index], @@ -293,8 +275,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], Target{1}); @@ -333,8 +313,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); auto old = cudf::detail::atomic_cas( @@ -361,8 +339,6 @@ struct update_target_element_shmem< cudf::column_device_view source, cudf::size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); auto old = cudf::detail::atomic_cas( @@ -385,9 +361,11 @@ struct shmem_element_aggregator { cudf::column_device_view source, cudf::size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element_shmem{}( target, target_index, target_null, source, source_index); } }; - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 19ba33e01e3..1a3e761c7ae 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -25,6 +25,7 @@ namespace cudf::groupby::detail::hash { +// TODO: TO BE REMOVED template __device__ constexpr bool is_supported() { From b31f16f38c5e3fb2e813f2f90aecdb8edfe82beb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Sep 2024 16:48:13 -0700 Subject: [PATCH 064/135] Clean up global aggregator early exit logic --- .../groupby/hash/global_memory_aggregator.cuh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff 
--git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 636c6e97c28..053c95b40a4 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -53,8 +53,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } - using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -78,7 +76,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; DeviceType* source_casted = reinterpret_cast(source); @@ -102,7 +99,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); cudf::detail::atomic_max(&target.element(target_index), @@ -125,7 +121,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; @@ -150,7 +145,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -174,7 +168,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; using DeviceType = cudf::device_storage_type_t; @@ -244,8 +237,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } - dispatch_type_and_aggregation( source_column.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -272,7 +263,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -296,7 +286,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -369,7 +358,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmax_index = source_casted[source_index]; @@ -399,7 +387,6 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - if (source_null[source_index]) { return; } using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); auto source_argmin_index = source_casted[source_index]; @@ -426,6 +413,9 @@ struct gmem_element_aggregator { cudf::size_type source_index, bool* source_null) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if 
(source_null[source_index]) { return; } + } update_target_element_gmem{}( target, target_index, source_column, source, source_index, source_null); } From 9cea918d443b1737c4ca3ba18c3ca1cf46822c21 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 10:49:10 -0700 Subject: [PATCH 065/135] Fix merge conflicts --- cpp/src/groupby/hash/single_pass_functors.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 1a3e761c7ae..6d10c8065ca 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -18,7 +18,7 @@ #include "helpers.cuh" -#include +#include #include #include #include @@ -217,7 +217,7 @@ struct compute_direct_aggregates { if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; @@ -293,7 +293,7 @@ struct compute_single_pass_aggs_fn { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; From fe9c212f570a3ef334887cace12338bdb6ea1794 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 12:40:49 -0700 Subject: [PATCH 066/135] Clean up device aggregator early exit logic --- .../detail/aggregation/device_aggregators.cuh | 27 +++---------------- 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 10be5e1d36f..dd92568465b 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -51,8 +51,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -72,8 +70,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -96,8 +92,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -117,8 +111,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -141,8 +133,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; 
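Patches 063, 064, and 066 apply the same refactor three times over: the per-element null check that every aggregation specialization used to repeat is hoisted into the single dispatching functor, where if constexpr removes it at compile time for COUNT_ALL, the one kind that must also count null rows. A minimal sketch of the pattern, using stand-in names (Kind, update_element, aggregate) rather than the libcudf ones, and simplified int columns with a per-row bool null flag:

// Sketch only: stand-in kinds, not cudf::aggregation::Kind.
enum class Kind { SUM, COUNT_ALL };

template <Kind k>
__device__ void update_element(int* target, int value)
{
  if constexpr (k == Kind::SUM) { atomicAdd(target, value); }
  else { atomicAdd(target, 1); }  // COUNT_ALL counts the row unconditionally
}

// The dispatcher owns the one null check; specializations stay guard-free.
template <Kind k>
__device__ void aggregate(int* target, int const* source, bool const* is_null, int i)
{
  if constexpr (k != Kind::COUNT_ALL) {
    if (is_null[i]) { return; }  // compiled out entirely for COUNT_ALL
  }
  update_element<k>(target, source[i]);
}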
cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); @@ -162,8 +152,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -227,8 +215,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -249,8 +235,6 @@ struct update_target_element; auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target.element(target_index), value * value); @@ -267,8 +251,6 @@ struct update_target_element; cudf::detail::atomic_mul(&target.element(target_index), static_cast(source.element(source_index))); @@ -286,8 +268,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), Target{1}); @@ -323,8 +303,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMAX_SENTINEL, source_index); @@ -349,8 +327,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMIN_SENTINEL, source_index); @@ -376,6 +352,9 @@ struct elementwise_aggregator { column_device_view source, size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element{}(target, target_index, source, source_index); } }; From a7a9d75757b88dab1d03419d348a0fd56edb5516 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 16:14:38 -0700 Subject: [PATCH 067/135] Add traits to minimize code duplication --- .../detail/aggregation/device_aggregators.cuh | 99 ++++++------------- 1 file changed, 28 insertions(+), 71 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index dd92568465b..bc370c59296 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -29,6 +29,25 @@ #include namespace cudf::detail { +/// Checks if an aggregation kind needs to operate on the underlying storage type +template +__device__ constexpr bool uses_underlying_type() +{ + return k == aggregation::MIN or k == aggregation::MAX or k == aggregation::SUM; +} + +/// Gets the underlying target type for the given source type and aggregation kind +template +using underlying_target_t = + cuda::std::conditional_t(), + cudf::device_storage_type_t>, + cudf::detail::target_type_t>; + +/// Gets the underlying source type for the given source type and aggregation kind +template +using underlying_source_t = + cuda::std::conditional_t(), cudf::device_storage_type_t, Source>; + template struct update_target_element { __device__ void 
operator()(mutable_column_device_view target, @@ -44,35 +63,14 @@ template struct update_target_element< Source, aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -85,35 +83,14 @@ template struct update_target_element< Source, aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using Target = target_type_t; - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -127,34 +104,14 @@ struct update_target_element< Source, aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { + !cudf::is_timestamp()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; + 
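Patch 067's three traits collapse the duplicated fixed-point and non-fixed-point specializations of MIN, MAX, and SUM into one overload each: uses_underlying_type names the kinds whose atomics must run on the storage representation, and the two aliases resolve to the device storage type only in that case. A host-side sketch of the same layering, with a hypothetical fixed_point64 wrapper standing in for cudf's fixed-point types:

#include <cstdint>
#include <type_traits>

struct fixed_point64 { std::int64_t value; };  // hypothetical wrapper type

template <typename T> struct storage_type { using type = T; };
template <> struct storage_type<fixed_point64> { using type = std::int64_t; };

enum class Kind { SUM, MIN, MAX, COUNT_ALL };

template <Kind k>
constexpr bool uses_underlying_type()
{
  return k == Kind::MIN || k == Kind::MAX || k == Kind::SUM;
}

// One alias replaces the parallel fixed-point / non-fixed-point overloads:
// atomics touch the integer rep for MIN/MAX/SUM, the wrapper otherwise.
template <typename Target, Kind k>
using underlying_target_t =
  std::conditional_t<uses_underlying_type<k>(), typename storage_type<Target>::type, Target>;

static_assert(std::is_same_v<underlying_target_t<fixed_point64, Kind::SUM>, std::int64_t>);
static_assert(std::is_same_v<underlying_target_t<fixed_point64, Kind::COUNT_ALL>, fixed_point64>);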
using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); From cb042ef8c066d095b93d93517bbe31a3f8e6de6b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Oct 2024 16:32:08 -0700 Subject: [PATCH 068/135] Use traits to avoid code duplication --- .../groupby/hash/global_memory_aggregator.cuh | 90 ++----------------- .../groupby/hash/shared_memory_aggregator.cuh | 89 +++--------------- 2 files changed, 18 insertions(+), 161 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 053c95b40a4..62be580fa43 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include #include #include @@ -44,8 +44,7 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -53,31 +52,7 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MIN, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - using DeviceType = cudf::device_storage_type_t; + using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_min(&target.element(target_index), static_cast(source_casted[source_index])); @@ -90,30 +65,7 @@ template struct update_target_element_gmem< Source, cudf::aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::MAX, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -121,9 +73,7 @@ struct update_target_element_gmem< cudf::size_type 
source_index, bool* source_null) const noexcept { - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; + using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_max(&target.element(target_index), static_cast(source_casted[source_index])); @@ -137,7 +87,7 @@ struct update_target_element_gmem< Source, cudf::aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + !cudf::is_timestamp()>> { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, @@ -145,35 +95,11 @@ struct update_target_element_gmem< cudf::size_type source_index, bool* source_null) const noexcept { - using Target = cudf::detail::target_type_t; - - Target* source_casted = reinterpret_cast(source); - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source_casted[source_index])); - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element_gmem< - Source, - cudf::aggregation::SUM, - cuda::std::enable_if_t>() && - cudf::is_fixed_point()>> { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - using Target = cudf::detail::target_type_t; - - using DeviceType = cudf::device_storage_type_t; + using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); cudf::detail::atomic_add(&target.element(target_index), static_cast(source_casted[source_index])); + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 9be2e43eac0..5bea0defe29 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include #include #include @@ -43,38 +43,15 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_min(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MIN, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; 
DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], @@ -87,38 +64,15 @@ template struct update_target_element_shmem< Source, cudf::aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_max(&target_casted[target_index], - static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::MAX, - cuda::std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_max(&target_casted[target_index], @@ -133,38 +87,15 @@ struct update_target_element_shmem< Source, cudf::aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - using Target = cudf::detail::target_type_t; - Target* target_casted = reinterpret_cast(target); - cudf::detail::atomic_add(&target_casted[target_index], - static_cast(source.element(source_index))); - - if (target_null[target_index]) { target_null[target_index] = false; } - } -}; - -template -struct update_target_element_shmem< - Source, - cudf::aggregation::SUM, - cuda::std::enable_if_t>() && - cudf::is_fixed_point()>> { + !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, cudf::size_type target_index, bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { - using Target = cudf::detail::target_type_t; - - using DeviceTarget = cudf::device_storage_type_t; - using DeviceSource = cudf::device_storage_type_t; + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], From ecdd3fddbcf4e90a8e27f4f34f47b7e82901aa7a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Oct 2024 13:36:06 -0700 Subject: [PATCH 069/135] Cannot query shmem with nested type dispatcher --- .../groupby/hash/compute_single_pass_aggs.cuh | 79 +++++++++++++------ .../groupby/hash/global_memory_aggregator.cuh | 2 + .../groupby/hash/shared_memory_aggregator.cuh | 2 + 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index b534e9b8f1e..aad63d33b91 100644 --- 
a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,7 +210,57 @@ rmm::device_uvector compute_single_pass_aggs( ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; + auto const has_dictionary_input = std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { + return cudf::is_dictionary(col.type());}); + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(keys.num_rows(), stream); + + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + if (has_dictionary_input) { + // make table that will hold sparse results + cudf::table sparse_table = + create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + has_dictionary_input, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; + } + auto const grid_size = max_occupancy_grid_size( compute_mapping_indices, num_input_rows); @@ -235,22 +285,12 @@ rmm::device_uvector compute_single_pass_aggs( direct_aggregations.data()); stream.synchronize(); - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); - - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( - agg_kinds, stream, rmm::mr::get_current_device_resource()); - - auto const [uses_shmem_aggs, shmem_size] = can_use_shmem_aggs(grid_size); - // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - uses_shmem_aggs ? 
direct_aggregations.value(stream) : true, + direct_aggregations.value(stream), global_set, populated_keys, stream); @@ -258,19 +298,9 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - if (!uses_shmem_aggs) { - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } else { + auto const [valid, shmem_size] = can_use_shmem_aggs(grid_size); + CUDF_EXPECTS(valid, "this must be usable"); + compute_aggregations(grid_size, num_input_rows, static_cast(row_bitmask.data()), @@ -298,7 +328,6 @@ rmm::device_uvector compute_single_pass_aggs( skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); } - } // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 62be580fa43..cd0bb64c4ee 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -150,6 +150,7 @@ struct update_target_from_dictionary_gmem { * * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` */ +/* template struct update_target_element_gmem< dictionary32, @@ -175,6 +176,7 @@ struct update_target_element_gmem< source_null); } }; +*/ // The shared memory will already have it squared template diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 5bea0defe29..3b85ccf2ead 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -130,6 +130,7 @@ struct update_target_from_dictionary_shmem { } }; +/* template struct update_target_element_shmem< dictionary32, @@ -153,6 +154,7 @@ struct update_target_element_shmem< static_cast(source.element(source_index))); } }; +*/ template struct update_target_element_shmem< From 2a96255de99cf03dcf3d1616480f174ac8dc2493 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 10:48:12 -0700 Subject: [PATCH 070/135] Remove unused overloads --- .../groupby/hash/global_memory_aggregator.cuh | 74 ------------------- .../groupby/hash/shared_memory_aggregator.cuh | 51 ------------- 2 files changed, 125 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index cd0bb64c4ee..08d2c0552b3 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -104,80 +104,6 @@ struct update_target_element_gmem< } }; -/** - * @brief Function object to update a single element in a target column using - * the dictionary key addressed by the specific index. - * - * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a - * dictionary. 
- * - */ -struct update_target_from_dictionary_gmem { - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - update_target_element_gmem{}( - target, target_index, source_column, source, source_index, source_null); - } - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - } -}; - -/** - * @brief Specialization function for dictionary type and aggregations. - * - * The `source` column is a dictionary type. This functor de-references the - * dictionary's keys child column and maps the input source index through - * the dictionary's indices child column to pass to the `update_target_element` - * in the above `update_target_from_dictionary` using the type-dispatcher to - * resolve the keys column type. - * - * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` - */ -/* -template -struct update_target_element_gmem< - dictionary32, - k, - cuda::std::enable_if_t> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source_column, - std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept - { - dispatch_type_and_aggregation( - source_column.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary_gmem{}, - target, - target_index, - source_column, - source, - source_index, - source_null); - } -}; -*/ - // The shared memory will already have it squared template struct update_target_element_gmem< diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 3b85ccf2ead..c5713e4a72e 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -105,57 +105,6 @@ struct update_target_element_shmem< } }; -struct update_target_from_dictionary_shmem { - template ()>* = nullptr> - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - update_target_element_shmem{}( - target, target_index, target_null, source, source_index); - } - template ()>* = nullptr> - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - } -}; - -/* -template -struct update_target_element_shmem< - dictionary32, - k, - cuda::std::enable_if_t> { - __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null, - cudf::column_device_view source, - cudf::size_type source_index) const noexcept - { - dispatch_type_and_aggregation( - source.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary_shmem{}, - target, - target_index, - target_null, - source.child(cudf::dictionary_column_view::keys_column_index), - static_cast(source.element(source_index))); - } -}; -*/ - template struct update_target_element_shmem< Source, From aa30df0e1749eb860ccf23897fc6169b4b9ec29e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 
10:48:25 -0700 Subject: [PATCH 071/135] Formatting --- .../groupby/hash/compute_single_pass_aggs.cuh | 118 +++++++++--------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index aad63d33b91..e8b9ca3ff9b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,8 +210,10 @@ rmm::device_uvector compute_single_pass_aggs( ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; - auto const has_dictionary_input = std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { - return cudf::is_dictionary(col.type());}); + auto const has_dictionary_input = + std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { + return cudf::is_dictionary(col.type()); + }); // 'populated_keys' contains inserted row_indices (keys) of global hash set rmm::device_uvector populated_keys(keys.num_rows(), stream); @@ -221,22 +223,21 @@ rmm::device_uvector compute_single_pass_aggs( auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); if (has_dictionary_input) { - // make table that will hold sparse results - cudf::table sparse_table = - create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - has_dictionary_input, - global_set, - populated_keys, - stream); - - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + has_dictionary_input, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); thrust::for_each_n( rmm::exec_policy(stream), @@ -250,15 +251,15 @@ rmm::device_uvector compute_single_pass_aggs( skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } - return populated_keys; + return populated_keys; } auto const grid_size = max_occupancy_grid_size( @@ -286,14 +287,13 @@ rmm::device_uvector compute_single_pass_aggs( stream.synchronize(); // make table that will hold sparse results - cudf::table sparse_table = - create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - direct_aggregations.value(stream), - global_set, - populated_keys, - 
stream); + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + direct_aggregations.value(stream), + global_set, + populated_keys, + stream); // prepare to launch kernel to do the actual aggregation auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); @@ -301,33 +301,33 @@ rmm::device_uvector compute_single_pass_aggs( auto const [valid, shmem_size] = can_use_shmem_aggs(grid_size); CUDF_EXPECTS(valid, "this must be usable"); - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - shmem_size, - stream); - if (direct_aggregations.value(stream)) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - compute_direct_aggregates{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } + compute_aggregations(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + shmem_size, + stream); + if (direct_aggregations.value(stream)) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + compute_direct_aggregates{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + static_cast(row_bitmask.data()), + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); From c0e1a323162c1a85c1bd5c13e313f1bcc6dec28d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 12:50:06 -0700 Subject: [PATCH 072/135] Fix dict request determination logic --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index e8b9ca3ff9b..37cfd4c0f4f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,9 +210,9 @@ rmm::device_uvector compute_single_pass_aggs( ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; - auto const has_dictionary_input = - std::any_of(keys.begin(), keys.end(), [](cudf::column_view col) { - return cudf::is_dictionary(col.type()); + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); }); // 'populated_keys' contains inserted row_indices (keys) of global hash set @@ -225,12 +225,12 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - if (has_dictionary_input) { + if (has_dictionary_request) { // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - has_dictionary_input, + has_dictionary_request, global_set, populated_keys, stream); From fc5dc018ab7461c3f74fb78013d23e4d49310736 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:06:25 -0700 Subject: [PATCH 073/135] Remove can_use_shmem_aggs logic --- cpp/src/groupby/hash/compute_aggregations.cu | 14 +++++--------- cpp/src/groupby/hash/compute_aggregations.hpp | 7 ------- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 4 ---- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index aec6f39501e..915ede5154b 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -226,19 +226,15 @@ constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * } // namespace -std::pair can_use_shmem_aggs(int grid_size) noexcept +size_t available_shared_memory_size(int grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); size_t dynamic_shmem_size = 0; - - auto const status = cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE); - auto const success = status == cudaSuccess; - if (!success) { cudaGetLastError(); } - - return {success, get_previous_multiple_of_8(0.5 * dynamic_shmem_size)}; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } void compute_aggregations(int grid_size, @@ -251,9 +247,9 @@ void compute_aggregations(int grid_size, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, - size_t shmem_size, rmm::cuda_stream_view stream) { + auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation auto const shmem_agg_pointer_size = diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index f01d9f24c66..d0e8e354d12 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -21,14 +21,8 @@ #include -#include - -#include - namespace cudf::groupby::detail::hash { -std::pair can_use_shmem_aggs(int grid_size) noexcept; - void compute_aggregations(int grid_size, cudf::size_type num_input_rows, bitmask_type 
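Patch 072 is a behavioral fix, not a rename: the earlier predicate scanned the groupby keys for dictionary columns, but the shared-memory fallback is decided by what the kernel aggregates, which is the request values, not the keys. A reduced sketch of the corrected routing, with a stand-in Request type in place of cudf::groupby::aggregation_request:

#include <algorithm>
#include <vector>

struct Request { bool values_are_dictionary; };  // stand-in, not the real type

// Route on the request values; the keys never decide this fallback.
bool needs_global_memory_path(std::vector<Request> const& requests)
{
  return std::any_of(requests.begin(), requests.end(),
                     [](Request const& r) { return r.values_are_dictionary; });
}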
const* row_bitmask, @@ -39,7 +33,6 @@ void compute_aggregations(int grid_size, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, - size_t shmem_size, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 37cfd4c0f4f..73a69f81200 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -298,9 +298,6 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto const [valid, shmem_size] = can_use_shmem_aggs(grid_size); - CUDF_EXPECTS(valid, "this must be usable"); - compute_aggregations(grid_size, num_input_rows, static_cast(row_bitmask.data()), @@ -311,7 +308,6 @@ rmm::device_uvector compute_single_pass_aggs( *d_values, *d_sparse_table, d_agg_kinds.data(), - shmem_size, stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; From c1a421f6bafcdcc0e54c82939850ee165b84923f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:07:55 -0700 Subject: [PATCH 074/135] Remove groupby multi-aggs cpp tests --- cpp/tests/CMakeLists.txt | 1 - cpp/tests/groupby/multi_aggs_tests.cpp | 115 ------------------------- 2 files changed, 116 deletions(-) delete mode 100644 cpp/tests/groupby/multi_aggs_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c3672999eba..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -137,7 +137,6 @@ ConfigureTest( groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp - groupby/multi_aggs_tests.cpp groupby/nth_element_tests.cpp groupby/nunique_tests.cpp groupby/product_scan_tests.cpp diff --git a/cpp/tests/groupby/multi_aggs_tests.cpp b/cpp/tests/groupby/multi_aggs_tests.cpp deleted file mode 100644 index ae491a8f796..00000000000 --- a/cpp/tests/groupby/multi_aggs_tests.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
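Patch 073 replaces the try-and-report can_use_shmem_aggs pair with available_shared_memory_size, which asks the CUDA occupancy API how much dynamic shared memory each block may use at the kernel's achieved residency, then keeps half of the reported budget rounded down to an 8-byte multiple. A standalone sketch of that sizing, with placeholder kernel and block-size arguments and the CUDF_CUDA_TRY error handling omitted:

#include <cuda_runtime.h>
#include <cstddef>

std::size_t usable_dynamic_shmem(void const* kernel, int blocks_per_sm, int block_size)
{
  std::size_t dynamic_shmem = 0;
  // How much dynamic shared memory can each block take while keeping
  // `blocks_per_sm` blocks of `block_size` threads resident on one SM?
  cudaOccupancyAvailableDynamicSMemPerBlock(&dynamic_shmem, kernel, blocks_per_sm, block_size);
  // Keep half the budget (as the patch does) and round down to an 8-byte
  // multiple so the per-column byte arrays stay aligned for wide aggregates.
  return (dynamic_shmem / 2) / 8 * 8;
}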
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -using namespace cudf::test::iterators; - -namespace { -template -std::unique_ptr create_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows, - bool include_validity, - Elements elements) -{ - auto valids = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector> src_cols(num_columns); - for (int idx = 0; idx < num_columns; idx++) { - if (include_validity) { - src_cols[idx] = - cudf::test::fixed_width_column_wrapper(elements, elements + num_rows, valids); - } else { - src_cols[idx] = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - } - } - std::vector> columns(num_columns); - std::transform(src_cols.begin(), - src_cols.end(), - columns.begin(), - [](cudf::test::fixed_width_column_wrapper& in) { - auto ret = in.release(); - // pre-cache the null count - [[maybe_unused]] auto const nulls = ret->has_nulls(); - return ret; - }); - return std::make_unique(std::move(columns)); -} - -template -std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows) -{ - auto rand_elements = - cudf::detail::make_counting_transform_iterator(0, [](T i) { return rand(); }); - return create_fixed_table(num_columns, num_rows, false, rand_elements); -} -} // namespace - -template -struct groupby_multi_aggs_test : public cudf::test::BaseFixture {}; - -template -std::vector convert(std::initializer_list in) -{ - std::vector out(std::cbegin(in), std::cend(in)); - return out; -} - -using supported_types = cudf::test::Concat>; -TYPED_TEST_SUITE(groupby_multi_aggs_test, supported_types); -using K = int32_t; - -TYPED_TEST(groupby_multi_aggs_test, basic) -{ - using V = TypeParam; - - auto constexpr num_cols = 3'000; - auto constexpr num_rows = 100'000; - auto keys = create_random_fixed_table(1, num_rows); - - auto vals = create_random_fixed_table(num_cols, num_rows); - - std::vector requests; - for (auto i = 0; i < num_cols; i++) { - requests.emplace_back(); - - requests[i].values = vals->get_column(i).view(); - requests[i].aggregations.push_back( - std::move(cudf::make_mean_aggregation())); - requests[i].aggregations.push_back( - std::move(cudf::make_min_aggregation())); - requests[i].aggregations.push_back( - std::move(cudf::make_max_aggregation())); - requests[i].aggregations.push_back( - std::move(cudf::make_count_aggregation())); - } - - cudf::groupby::groupby gb_obj{keys->view()}; - - auto result = gb_obj.aggregate(requests, cudf::test::get_default_stream()); -} From 7a7ad6140c40cd9324006cfeba969436ff455470 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:39:59 -0700 Subject: [PATCH 075/135] Renaming for clarity --- cpp/CMakeLists.txt | 2 +- .../groupby/hash/compute_single_pass_aggs.cuh | 24 +-- ...s.cu => compute_single_pass_shmem_aggs.cu} | 154 +++++++++--------- ...hpp => compute_single_pass_shmem_aggs.hpp} | 22 +-- 4 files changed, 101 insertions(+), 101 deletions(-) rename cpp/src/groupby/hash/{compute_aggregations.cu => compute_single_pass_shmem_aggs.cu} (60%) rename cpp/src/groupby/hash/{compute_aggregations.hpp => compute_single_pass_shmem_aggs.hpp} (52%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5c4fd5979dc..ea476f96af4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -315,10 +315,10 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu - src/groupby/hash/compute_aggregations.cu 
src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu + src/groupby/hash/compute_single_pass_shmem_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 73a69f81200..86133605d44 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -15,8 +15,8 @@ */ #pragma once -#include "compute_aggregations.hpp" #include "compute_single_pass_aggs.hpp" +#include "compute_single_pass_shmem_aggs.hpp" #include "create_sparse_results_table.hpp" #include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" @@ -298,17 +298,17 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - compute_aggregations(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); + compute_single_pass_shmem_aggs(grid_size, + num_input_rows, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu similarity index 60% rename from cpp/src/groupby/hash/compute_aggregations.cu rename to cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 915ede5154b..2f41b6b23d5 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "compute_aggregations.hpp" +#include "compute_single_pass_shmem_aggs.hpp" #include "create_sparse_results_table.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" @@ -64,13 +64,13 @@ __device__ void calculate_columns_to_aggregate(int& col_start, } } -__device__ void initialize_shared_memory_aggregates(int col_start, - int col_end, - cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::size_type cardinality, - cudf::aggregation::Kind const* d_agg_kinds) +__device__ void initialize_shared_memory_aggs(int col_start, + int col_end, + cudf::mutable_table_device_view output_values, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { @@ -84,16 +84,16 @@ __device__ void initialize_shared_memory_aggregates(int col_start, } } -__device__ void compute_pre_aggregrates(int col_start, - int col_end, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::table_device_view input_values, - cudf::size_type num_input_rows, - cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) +__device__ void compute_pre_aggregrations(int col_start, + int col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view input_values, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) { // TODO grid_1d utility for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; @@ -117,15 +117,15 @@ __device__ void compute_pre_aggregrates(int col_start, } } -__device__ void compute_final_aggregates(int col_start, - int col_end, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::size_type cardinality, - cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - cudf::aggregation::Kind const* d_agg_kinds) +__device__ void compute_final_aggregations(int col_start, + int col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + std::byte** s_aggregates_pointer, + bool** s_aggregates_valid_pointer, + cudf::aggregation::Kind const* d_agg_kinds) { for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; @@ -147,17 +147,17 @@ __device__ void compute_final_aggregates(int col_start, /* Takes the local_mapping_index and global_mapping_index to compute * pre (shared) and final (global) aggregates*/ -CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) +CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type 
num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + int total_agg_size, + int pointer_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -190,34 +190,34 @@ CUDF_KERNEL void compute_aggs_kernel(cudf::size_type num_rows, cardinality, total_agg_size); block.sync(); - initialize_shared_memory_aggregates(col_start, - col_end, - output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, - cardinality, - d_agg_kinds); + initialize_shared_memory_aggs(col_start, + col_end, + output_values, + s_aggregates_pointer, + s_aggregates_valid_pointer, + cardinality, + d_agg_kinds); block.sync(); - compute_pre_aggregrates(col_start, - col_end, - row_bitmask, - skip_rows_with_nulls, - input_values, - num_rows, - local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); + compute_pre_aggregrations(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); block.sync(); - compute_final_aggregates(col_start, - col_end, - input_values, - output_values, - cardinality, - global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, - d_agg_kinds); + compute_final_aggregations(col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + s_aggregates_pointer, + s_aggregates_valid_pointer, + d_agg_kinds); block.sync(); } } @@ -233,21 +233,21 @@ size_t available_shared_memory_size(int grid_size) size_t dynamic_shmem_size = 0; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( - &dynamic_shmem_size, compute_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream) +void compute_single_pass_shmem_aggs(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream) { auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need two pointers to arrays in shmem @@ -256,7 +256,7 @@ void compute_aggregations(int grid_size, round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; - compute_aggs_kernel<<>>( + single_pass_shmem_aggs_kernel<<>>( num_input_rows, 
row_bitmask, skip_rows_with_nulls, diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp similarity index 52% rename from cpp/src/groupby/hash/compute_aggregations.hpp rename to cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp index d0e8e354d12..033cfa39a8c 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp @@ -23,16 +23,16 @@ namespace cudf::groupby::detail::hash { -void compute_aggregations(int grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream); +void compute_single_pass_shmem_aggs(int grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash From 9a7d432e3267bb351037a93c293a946909cc2569 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Oct 2024 14:44:03 -0700 Subject: [PATCH 076/135] Renaming --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 2f41b6b23d5..9f03b5af08b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,7 +64,7 @@ __device__ void calculate_columns_to_aggregate(int& col_start, } } -__device__ void initialize_shared_memory_aggs(int col_start, +__device__ void initialize_shmem_aggregations(int col_start, int col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, @@ -190,7 +190,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, cardinality, total_agg_size); block.sync(); - initialize_shared_memory_aggs(col_start, + initialize_shmem_aggregations(col_start, col_end, output_values, s_aggregates_pointer, From c81cbdd6b3ce332827bc7c75a9940fcfbde3e8fb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:16:59 -0700 Subject: [PATCH 077/135] Add rollback for insufficient shared memory case --- .../groupby/hash/compute_single_pass_aggs.cuh | 16 +++++++++++----- .../hash/compute_single_pass_shmem_aggs.cu | 11 +++++++---- .../hash/compute_single_pass_shmem_aggs.hpp | 4 ++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 86133605d44..2340595afc8 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -225,12 +225,21 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - if (has_dictionary_request) { + auto const grid_size = max_occupancy_grid_size( + compute_mapping_indices, + num_input_rows); + auto const has_sufficient_shmem = 
available_shared_memory_size(grid_size) > + (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); + auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; + + // Use naive global memory aggregations when there are dictionary columns to aggregate or when + // there is insufficient dynamic shared memory for shared memory aggregations + if (uses_global_aggs) { // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - has_dictionary_request, + uses_global_aggs, global_set, populated_keys, stream); @@ -262,9 +271,6 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); // 'local_mapping_index' maps from the global row index of the input table to the row index of // the local pre-aggregate table rmm::device_uvector local_mapping_index(num_input_rows, stream); diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 9f03b5af08b..70fd2dbc36f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -237,6 +237,8 @@ size_t available_shared_memory_size(int grid_size) return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } +size_t shmem_agg_pointer_size(int num_cols) { return sizeof(void*) * num_cols; } + void compute_single_pass_shmem_aggs(int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream) @@ -252,10 +254,11 @@ void compute_single_pass_shmem_aggs(int grid_size, auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need two pointers to arrays in shmem // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_agg_pointer_size = - round_to_multiple_of_8(sizeof(std::byte*) * output_values.num_columns()); + auto const shmem_pointer_size = shmem_agg_pointer_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - auto const shmem_agg_size = shmem_size - shmem_agg_pointer_size * 2; + CUDF_EXPECTS(shmem_size > shmem_pointer_size * 2, + "Not enough space for shared memory aggregations"); + auto const shmem_agg_size = shmem_size - shmem_pointer_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -267,6 +270,6 @@ void compute_single_pass_shmem_aggs(int grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_agg_pointer_size); + shmem_pointer_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp index 033cfa39a8c..c871752e7e3 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp @@ -23,6 +23,10 @@ namespace cudf::groupby::detail::hash { +size_t available_shared_memory_size(int grid_size); + +size_t shmem_agg_pointer_size(int num_cols); + void compute_single_pass_shmem_aggs(int grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash From ed3e92bcd6a187da4c2d926128f09c0ef6ba3615 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:21:47 -0700 Subject: [PATCH 078/135] Minor cleanups --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-)
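[Editor's note: a minimal, self-contained sketch of the fallback rule introduced above; the helper name and the sizes in the asserts are illustrative assumptions, not cudf API. Only the shape of the test mirrors the diff: two per-column pointer arrays must fit in the available dynamic shared memory, and dictionary requests always take the global-memory path.]

#include <cstddef>

// Hypothetical stand-in for the has_sufficient_shmem / uses_global_aggs logic above.
constexpr bool can_use_shmem_path(std::size_t available_shmem_bytes,
                                  int num_output_cols,
                                  bool has_dictionary_request)
{
  // Two pointer arrays are staged per launch: one addressing each column's
  // aggregate values in shared memory, one addressing its validity flags.
  auto const pointer_bytes = sizeof(void*) * static_cast<std::size_t>(num_output_cols);
  return not has_dictionary_request and available_shmem_bytes > 2 * pointer_bytes;
}

// Assuming 8-byte device pointers:
static_assert(can_use_shmem_path(48 * 1024, 8, false));     // ample shared memory
static_assert(not can_use_shmem_path(64, 8, false));        // 2 * 8 * 8 = 128 > 64 bytes
static_assert(not can_use_shmem_path(48 * 1024, 8, true));  // dictionaries always fall back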
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 2340595afc8..e3c6df48638 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -210,11 +210,6 @@ rmm::device_uvector compute_single_pass_aggs( ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; - auto const has_dictionary_request = std::any_of( - requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { - return cudf::is_dictionary(request.values.type()); - }); - // 'populated_keys' contains inserted row_indices (keys) of global hash set rmm::device_uvector populated_keys(keys.num_rows(), stream); @@ -230,9 +225,13 @@ rmm::device_uvector compute_single_pass_aggs( num_input_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); - auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); + }); - // Use naive global memory aggregations when there are dictionary columns to aggregate or when + // Use naive global memory aggregations when there are dictionary columns to aggregate or // there is insufficient dynamic shared memory for shared memory aggregations if (uses_global_aggs) { From 7c1aa4a8bf37faa7c022d4f02c79e80e82c60b09 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:30:29 -0700 Subject: [PATCH 079/135] Minor fix --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index e3c6df48638..76c8d77ff84 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -225,11 +225,11 @@ rmm::device_uvector compute_single_pass_aggs( num_input_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); - auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; auto const has_dictionary_request = std::any_of( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); }); + auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; // Use naive global memory aggregations when there are dictionary columns to aggregate or // there is insufficient dynamic shared memory for shared memory aggregations if (uses_global_aggs) { From e9766786cebcb91fecfc087d90404bd378657a34 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 14:46:19 -0700 Subject: [PATCH 080/135] Revert custom cuco --- rapids_config.cmake | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rapids_config.cmake b/rapids_config.cmake index 96df5adedac..3a88769f6e7 100644 --- a/rapids_config.cmake +++ b/rapids_config.cmake @@ -11,10 +11,6 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= - -set(rapids-cmake-repo PointKernel/rapids-cmake) -set(rapids-cmake-branch cuco-hash-function) - file(READ "${CMAKE_CURRENT_LIST_DIR}/VERSION" _rapids_version) if(_rapids_version MATCHES [[^([0-9][0-9])\.([0-9][0-9])\.([0-9][0-9])]]) set(RAPIDS_VERSION_MAJOR "${CMAKE_MATCH_1}") From e028fa5d74cf0fad01af24efaf7a9b37ef477f07 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Oct 2024 15:55:29 -0700 Subject: [PATCH 081/135] Set proper ref type on host --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 76c8d77ff84..5210e6db67b 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -123,8 +123,6 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, auto const block = cooperative_groups::this_thread_block(); shared_set.initialize(block); - auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find); - __shared__ cudf::size_type cardinality; if (block.thread_rank() == 0) { cardinality = 0; } block.sync(); @@ -136,7 +134,7 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, cur_idx += stride) { find_local_mapping(cur_idx, num_input_rows, - shared_insert_ref, + shared_set, row_bitmask, skip_rows_with_nulls, &cardinality, @@ -200,7 +198,7 @@ rmm::device_uvector compute_single_pass_aggs( probing_scheme_t, cuco::cuda_allocator, cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type<>; + using shared_set_ref_type = typename shared_set_type::ref_type; auto constexpr window_extent = cuco::make_window_extent(extent_type{}); auto const num_input_rows = keys.num_rows(); From 5ea276cf6488e2c287db410ff5f4884d269fd055 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 15:46:28 -0700 Subject: [PATCH 082/135] Clean up mapping indices calculations --- .../groupby/hash/compute_single_pass_aggs.cuh | 73 +++++++++---------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 5210e6db67b..9a697bfb4e8 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -45,8 +45,8 @@ namespace cudf::groupby::detail::hash { namespace { template -// TODO pass block -__device__ void find_local_mapping(cudf::size_type cur_idx, +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, cudf::size_type num_input_rows, SetType shared_set, bitmask_type const* row_bitmask, @@ -55,48 +55,50 @@ __device__ void find_local_mapping(cudf::size_type cur_idx, cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { - cudf::size_type result_idx; - // TODO: un-init - bool inserted; - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { - auto const result = shared_set.insert_and_find(cur_idx); + cudf::size_type result_idx{}; + bool inserted{}; + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + auto const result = shared_set.insert_and_find(idx); result_idx = *result.first; inserted = result.second; // inserted a new element if (result.second) { auto const shared_set_index = 
atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = cur_idx; - local_mapping_index[cur_idx] = shared_set_index; + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; } } // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. - __syncthreads(); - if (cur_idx < num_input_rows and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) { + block.sync(); + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { // element was already in set - if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; } + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } } } template -__device__ void find_global_mapping(cudf::size_type cur_idx, +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, SetType global_set, cudf::size_type* shared_set_indices, cudf::size_type* global_mapping_index) { - auto const input_idx = shared_set_indices[cur_idx]; - global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] = - *global_set.insert_and_find(input_idx).first; + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + // for a unique key in shared memory hash set, `global_mapping_index` stores + // its match in global hash set + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } } /* - * Inserts keys into the shared memory hash set, and stores the row index of the local - * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a - * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without - * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to - * the global hash set, and save the row index of the global sparse table in `global_mapping_index`. + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
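+ *
+ * Worked illustration (editor's example with made-up indices): if row 42 is the 8th
+ * distinct key found by block 3, then local_mapping_index[42] == 7, and after the
+ * block-to-global insert the row's output slot is
+ * global_mapping_index[3 * GROUPBY_SHM_MAX_ELEMENTS + 7], the same composition the
+ * final aggregation step uses to scatter block-local results into the sparse table.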
*/ template CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, @@ -129,10 +131,11 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, auto const stride = cudf::detail::grid_1d::grid_stride(); - for (auto cur_idx = cudf::detail::grid_1d::global_thread_id(); - cur_idx - block.thread_rank() < num_input_rows; - cur_idx += stride) { - find_local_mapping(cur_idx, + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, num_input_rows, shared_set, row_bitmask, @@ -147,16 +150,12 @@ CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, if (block.thread_rank() == 0) { *direct_aggregations = true; } break; } - - block.sync(); } - // Insert unique keys from shared to global hash set + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - for (auto cur_idx = block.thread_rank(); cur_idx < cardinality; - cur_idx += block.num_threads()) { - find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index); - } + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); } if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } @@ -268,11 +267,9 @@ rmm::device_uvector compute_single_pass_aggs( return populated_keys; } - // 'local_mapping_index' maps from the global row index of the input table to the row index of - // the local pre-aggregate table + // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank rmm::device_uvector local_mapping_index(num_input_rows, stream); - // 'global_mapping_index' maps from the local pre-aggregate table to the row index of - // global aggregate table + // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); From d32b1e7d9159d1d6bfb8ed1d87b16adfba91a278 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 16:24:06 -0700 Subject: [PATCH 083/135] Minor cleanups for find_global_mapping --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 9a697bfb4e8..411bc0a1b1e 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -77,17 +77,17 @@ __device__ void find_local_mapping(cooperative_groups::thread_block const& block } } -template +template __device__ void find_global_mapping(cooperative_groups::thread_block const& block, cudf::size_type cardinality, - SetType global_set, + GlobalSetT global_set, cudf::size_type* shared_set_indices, cudf::size_type* global_mapping_index) { + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { auto const input_idx = shared_set_indices[idx]; - // for a unique key in shared memory hash set, `global_mapping_index` stores - // its match in global hash set global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = 
*global_set.insert_and_find(input_idx).first; } From 32655cf694675141ba526990844432e52f4f8fff Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 16:43:31 -0700 Subject: [PATCH 084/135] Use size_type instead of int --- .../hash/compute_single_pass_shmem_aggs.cu | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 70fd2dbc36f..5bdde0ff832 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -36,36 +36,34 @@ namespace cudf::groupby::detail::hash { namespace { -__device__ void calculate_columns_to_aggregate(int& col_start, - int& col_end, +__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, + cudf::size_type& col_end, cudf::mutable_table_device_view output_values, - int num_input_cols, + cudf::size_type num_input_cols, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, std::byte* shared_set_aggregates, cudf::size_type cardinality, - int total_agg_size) + cudf::size_type total_agg_size) { - if (threadIdx.x == 0) { - col_start = col_end; - int bytes_allocated = 0; - int valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - int next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - int next_col_total_size = valid_col_size + next_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); - bytes_allocated += next_col_total_size; - col_end++; - } + col_start = col_end; + cudf::size_type bytes_allocated = 0; + cudf::size_type valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { + cudf::size_type next_col_size = + round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + cudf::size_type next_col_total_size = valid_col_size + next_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; + s_aggregates_valid_pointer[col_end] = + reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; + col_end++; } } -__device__ void initialize_shmem_aggregations(int col_start, - int col_end, +__device__ void initialize_shmem_aggregations(cudf::size_type col_start, + cudf::size_type col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, @@ -84,8 +82,8 @@ __device__ void initialize_shmem_aggregations(int col_start, } } -__device__ void compute_pre_aggregrations(int col_start, - int col_end, +__device__ void compute_pre_aggregrations(cudf::size_type col_start, + cudf::size_type col_end, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, cudf::table_device_view input_values, @@ -117,8 +115,8 @@ __device__ void compute_pre_aggregrations(int col_start, } } -__device__ void compute_final_aggregations(int col_start, - int col_end, +__device__ void compute_final_aggregations(cudf::size_type col_start, + cudf::size_type col_end, 
cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::size_type cardinality, @@ -156,8 +154,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, - int total_agg_size, - int pointer_size) + cudf::size_type total_agg_size, + cudf::size_type pointer_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -165,8 +163,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, auto const num_cols = output_values.num_columns(); - __shared__ int col_start; - __shared__ int col_end; + __shared__ cudf::size_type col_start; + __shared__ cudf::size_type col_end; extern __shared__ std::byte shared_set_aggregates[]; std::byte** s_aggregates_pointer = reinterpret_cast(shared_set_aggregates + total_agg_size); @@ -180,15 +178,17 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, block.sync(); while (col_end < num_cols) { - calculate_columns_to_aggregate(col_start, - col_end, - output_values, - num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, - cardinality, - total_agg_size); + if (block.thread_rank() == 0) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + s_aggregates_pointer, + s_aggregates_valid_pointer, + shared_set_aggregates, + cardinality, + total_agg_size); + } block.sync(); initialize_shmem_aggregations(col_start, col_end, @@ -226,7 +226,7 @@ constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * } // namespace -size_t available_shared_memory_size(int grid_size) +size_t available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -237,9 +237,9 @@ size_t available_shared_memory_size(int grid_size) return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -size_t shmem_agg_pointer_size(int num_cols) { return sizeof(void*) * num_cols; } +size_t shmem_agg_pointer_size(cudf::size_type num_cols) { return sizeof(void*) * num_cols; } -void compute_single_pass_shmem_aggs(int grid_size, +void compute_single_pass_shmem_aggs(cudf::size_type grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, From 2548871749a63c5f7f76a0783d29e4edda0a4513 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 17:06:24 -0700 Subject: [PATCH 085/135] Renaming + spacing for clarity --- .../hash/compute_single_pass_shmem_aggs.cu | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 5bdde0ff832..1406303a8f4 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -39,7 +39,7 @@ namespace { __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, - cudf::size_type num_input_cols, + cudf::size_type output_size, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, std::byte* shared_set_aggregates, @@ -48,17 +48,22 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, { col_start = col_end; 
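// Worked example (editor's illustration, hypothetical sizes): with cardinality == 128
// and an aggregation value packed as 8 bytes per group, valid_col_size below rounds
// sizeof(bool) * 128 up to 128 and next_col_size rounds 8 * 128 up to 1024, so the
// column is packed only if its 1152-byte footprint still fits in total_agg_size.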
cudf::size_type bytes_allocated = 0; - cudf::size_type valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); - while ((bytes_allocated < total_agg_size) && (col_end < num_input_cols)) { - cudf::size_type next_col_size = + + auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + + while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const next_col_size = round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); - cudf::size_type next_col_total_size = valid_col_size + next_col_size; + auto const next_col_total_size = next_col_size + valid_col_size; + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; s_aggregates_valid_pointer[col_end] = reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + bytes_allocated += next_col_total_size; - col_end++; + ++col_end; } } From 1b09ec14061c66880155719abd7890b495a3bc44 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Oct 2024 17:29:23 -0700 Subject: [PATCH 086/135] Clean up shared memory agg init --- .../groupby/hash/compute_single_pass_shmem_aggs.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 1406303a8f4..0d271115adc 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -67,7 +67,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, } } -__device__ void initialize_shmem_aggregations(cudf::size_type col_start, +__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, cudf::size_type col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, @@ -76,7 +77,7 @@ __device__ void initialize_shmem_aggregations(cudf::size_type col_start, cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - for (auto idx = threadIdx.x; idx < cardinality; idx += blockDim.x) { + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), d_agg_kinds[col_idx], initialize_shmem{}, @@ -85,6 +86,7 @@ __device__ void initialize_shmem_aggregations(cudf::size_type col_start, s_aggregates_valid_pointer[col_idx]); } } + block.sync(); } __device__ void compute_pre_aggregrations(cudf::size_type col_start, @@ -195,14 +197,16 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, total_agg_size); } block.sync(); - initialize_shmem_aggregations(col_start, + + initialize_shmem_aggregations(block, + col_start, col_end, output_values, s_aggregates_pointer, s_aggregates_valid_pointer, cardinality, d_agg_kinds); - block.sync(); + compute_pre_aggregrations(col_start, col_end, row_bitmask, From 56d75fbd14bb82b3f2f6265b0dfac09f2398eaa9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 11:05:01 -0700 Subject: [PATCH 087/135] Move compute_mapping_indices to its own TU to reduce build time --- cpp/CMakeLists.txt | 2 + .../groupby/hash/compute_mapping_indices.cu | 35 ++++ .../groupby/hash/compute_mapping_indices.cuh | 188 ++++++++++++++++++ .../groupby/hash/compute_mapping_indices.hpp | 42 ++++ .../hash/compute_mapping_indices_null.cu | 35 ++++ 
.../groupby/hash/compute_single_pass_aggs.cuh | 177 ++--------------- .../hash/hash_compound_agg_finalizer.cu | 4 +- cpp/src/groupby/hash/helpers.cuh | 16 +- .../groupby/hash/sparse_to_dense_results.cu | 29 ++- 9 files changed, 344 insertions(+), 184 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.cu create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.cuh create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.hpp create mode 100644 cpp/src/groupby/hash/compute_mapping_indices_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ea476f96af4..e4f44a85947 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -316,6 +316,8 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_mapping_indices.cu + src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/compute_single_pass_aggs_null.cu src/groupby/hash/compute_single_pass_shmem_aggs.cu diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu new file mode 100644 index 00000000000..1cbe70d651f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type max_occupancy_grid_size>( + cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh new file mode 100644 index 00000000000..91e7c83a2a2 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_mapping_indices.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + cudf::size_type result_idx{}; + bool inserted{}; + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + auto const result = shared_set.insert_and_find(idx); + result_idx = *result.first; + inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + } + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + block.sync(); + if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + // element was already in set + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, + SetRef global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } +} + +/* + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
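+ *
+ * Illustration (editor's note, using the constants from helpers.cuh): with
+ * GROUPBY_CARDINALITY_THRESHOLD == 128, a block whose distinct-key count reaches the
+ * threshold raises *direct_aggregations and stops filling its shared set; its rows are
+ * then handled by the global-memory aggregation pass, while blocks that stay under the
+ * threshold keep the shared-memory fast path.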
+ */ +template +CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations) +{ + // TODO: indices inserted in each shared memory set + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ cuco::window windows[window_extent.value()]; + + auto raw_set = cuco::static_set_ref{ + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + cuco::thread_scope_block, + cuco::aow_storage_ref{ + window_extent, windows}}; + auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); + + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, + num_input_rows, + shared_set, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { *direct_aggregations = true; } + break; + } + } + + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +int max_occupancy_grid_size(cudf::size_type n) +{ + int max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream) +{ + mapping_indices_kernel<<>>(num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + direct_aggregations); + stream.synchronize(); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp new file mode 100644 index 00000000000..d2cf3450730 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/* + * @brief Computes the maximum number of active blocks of the given kernel that can be executed on + * the underlying device + */ +template +[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu new file mode 100644 index 00000000000..1b04016f9a1 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type +max_occupancy_grid_size>(cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + nullable_hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + bool* direct_aggregations, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 411bc0a1b1e..974b973b1fa 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "compute_mapping_indices.hpp" #include "compute_single_pass_aggs.hpp" #include "compute_single_pass_shmem_aggs.hpp" #include "create_sparse_results_table.hpp" @@ -22,12 +23,8 @@ #include "helpers.cuh" #include "single_pass_functors.cuh" -#include #include -#include #include -#include -#include #include #include #include @@ -40,139 +37,10 @@ #include #include -#include +#include +#include namespace cudf::groupby::detail::hash { -namespace { -template -__device__ void find_local_mapping(cooperative_groups::thread_block const& block, - cudf::size_type idx, - cudf::size_type num_input_rows, - SetType shared_set, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* cardinality, - cudf::size_type* local_mapping_index, - cudf::size_type* shared_set_indices) -{ - cudf::size_type result_idx{}; - bool inserted{}; - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { - auto const result = shared_set.insert_and_find(idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = idx; - local_mapping_index[idx] = shared_set_index; - } - } - // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all - // threads in the thread block. - block.sync(); - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { - // element was already in set - if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } - } -} - -template -__device__ void find_global_mapping(cooperative_groups::thread_block const& block, - cudf::size_type cardinality, - GlobalSetT global_set, - cudf::size_type* shared_set_indices, - cudf::size_type* global_mapping_index) -{ - // for all unique keys in shared memory hash set, stores their matches in - // global hash set to `global_mapping_index` - for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - auto const input_idx = shared_set_indices[idx]; - global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = - *global_set.insert_and_find(input_idx).first; - } -} - -/* - * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given - * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds - * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating - * `global_set` or setting `global_mapping_index`. 
Else, we insert the unique keys found to the - * global hash set, and save the row index of the global sparse table in `global_mapping_index`. - */ -template -CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set, - cudf::size_type num_input_rows, - WindowExtent window_extent, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - bool* direct_aggregations) -{ - // TODO: indices inserted in each shared memory set - __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; - - // Shared set initialization - __shared__ typename SetRef::window_type windows[window_extent.value()]; - auto storage = SetRef::storage_ref_type(window_extent, windows); - auto shared_set = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - global_set.key_eq(), - probing_scheme_t{global_set.hash_function()}, - {}, - storage); - auto const block = cooperative_groups::this_thread_block(); - shared_set.initialize(block); - - __shared__ cudf::size_type cardinality; - if (block.thread_rank() == 0) { cardinality = 0; } - block.sync(); - - auto const stride = cudf::detail::grid_1d::grid_stride(); - - for (auto idx = cudf::detail::grid_1d::global_thread_id(); - idx - block.thread_rank() < num_input_rows; - idx += stride) { - find_local_mapping(block, - idx, - num_input_rows, - shared_set, - row_bitmask, - skip_rows_with_nulls, - &cardinality, - local_mapping_index, - shared_set_indices); - - block.sync(); - - if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } - break; - } - } - - // Insert unique keys from shared to global hash set if block-cardinality - // doesn't exceed the threshold upper-limit - if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { - find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); - } - - if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } -} - -template -int max_occupancy_grid_size(Kernel kernel, cudf::size_type n) -{ - int max_active_blocks{-1}; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); -} -} // namespace - /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` @@ -186,20 +54,6 @@ rmm::device_uvector compute_single_pass_aggs( bool skip_rows_with_nulls, rmm::cuda_stream_view stream) { - // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy - auto constexpr shared_set_capacity = - static_cast(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43); - using extent_type = cuco::extent; - using shared_set_type = cuco::static_set, - cuco::storage>; - using shared_set_ref_type = typename shared_set_type::ref_type; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - auto const num_input_rows = keys.num_rows(); auto row_bitmask = @@ -217,9 +71,7 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size( - compute_mapping_indices, - num_input_rows); + auto const grid_size = max_occupancy_grid_size(num_input_rows); auto const 
has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( @@ -274,17 +126,16 @@ rmm::device_uvector compute_single_pass_aggs( stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); - compute_mapping_indices - <<>>(global_set_ref, - num_input_rows, - window_extent, - static_cast(row_bitmask.data()), - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - direct_aggregations.data()); - stream.synchronize(); + compute_mapping_indices(grid_size, + num_input_rows, + global_set_ref, + static_cast(row_bitmask.data()), + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + direct_aggregations.data(), + stream); // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 119ac8cf6fd..221e63ac121 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -194,7 +194,7 @@ void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation c dense_results->add_result(col, agg, std::move(result)); } -template class hash_compound_agg_finalizer; -template class hash_compound_agg_finalizer; +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 651a6a2014a..f00996b6127 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -45,6 +45,16 @@ CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + /** * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. 
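 * For example, round_to_multiple_of_8(13) returns 16 and round_to_multiple_of_8(16)
 * returns 16, which keeps each per-column shared-memory slice 8-byte aligned.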
*/ @@ -87,20 +97,22 @@ using nullable_global_set_t = cuco::static_set, cuco::storage>; +template using hash_set_ref_t = cuco::static_set_ref< cudf::size_type, cuda::thread_scope_device, row_comparator_t, probing_scheme_t, cuco::aow_storage_ref>, - cuco::op::find_tag>; + Op>; +template using nullable_hash_set_ref_t = cuco::static_set_ref< cudf::size_type, cuda::thread_scope_device, nullable_row_comparator_t, probing_scheme_t, cuco::aow_storage_ref>, - cuco::op::find_tag>; + Op>; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index af61173fb6a..36dc306879e 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -28,12 +28,6 @@ #include namespace cudf::groupby::detail::hash { -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ template void sparse_to_dense_results(table_view const& keys, host_span requests, @@ -64,23 +58,24 @@ void sparse_to_dense_results(table_view const& keys, } } -template void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - hash_set_ref_t set, - bool skip_key_rows_with_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +template void sparse_to_dense_results>( + table_view const& keys, + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bool skip_key_rows_with_nulls, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -template void sparse_to_dense_results( +template void sparse_to_dense_results>( table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - nullable_hash_set_ref_t set, + nullable_hash_set_ref_t set, bool skip_key_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); From ab5ef604bb7245624dae30830169b8ea96a59b56 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 12:26:25 -0700 Subject: [PATCH 088/135] Clean up the shared memory init function --- .../hash/compute_single_pass_shmem_aggs.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 0d271115adc..9e3c62f46ac 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -15,7 +15,6 @@ */ #include "compute_single_pass_shmem_aggs.hpp" -#include "create_sparse_results_table.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" #include "shared_memory_aggregator.cuh" @@ -36,6 +35,9 @@ namespace cudf::groupby::detail::hash { namespace { +// Prepares shared memory data required by each output column, exits if +// no enough memory space to perform the shared memory aggregation for the +// current output column __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, @@ -67,6 +69,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, } } +// Each block initialize its own shared memory 
__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, cudf::size_type col_start, cudf::size_type col_end, cudf::mutable_table_device_view output_values, std::byte** s_aggregates_pointer, bool** s_aggregates_valid_pointer, cudf::size_type cardinality, cudf::aggregation::Kind const* d_agg_kinds) @@ -100,14 +103,13 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, bool** s_aggregates_valid_pointer, cudf::aggregation::Kind const* d_agg_kinds) { - // TODO grid_1d utility - for (auto cur_idx = blockDim.x * blockIdx.x + threadIdx.x; cur_idx < num_input_rows; - cur_idx += blockDim.x * gridDim.x) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx)) { - auto map_idx = local_mapping_index[cur_idx]; + for (auto idx = cudf::detail::grid_1d::global_thread_id(); idx < num_input_rows; + idx += cudf::detail::grid_1d::grid_stride()) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)) { + auto const map_idx = local_mapping_index[idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto input_col = input_values.column(col_idx); + auto const input_col = input_values.column(col_idx); cudf::detail::dispatch_type_and_aggregation(input_col.type(), d_agg_kinds[col_idx], @@ -116,7 +118,7 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, map_idx, s_aggregates_valid_pointer[col_idx], input_col, - cur_idx); + idx); } } } @@ -218,6 +220,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, s_aggregates_valid_pointer, d_agg_kinds); block.sync(); + compute_final_aggregations(col_start, col_end, input_values, From 5bfe6ea4781ea188ca380172039f49444a76e3b1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 12:35:21 -0700 Subject: [PATCH 089/135] Add reminder --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 9e3c62f46ac..3c0a00c4798 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -58,6 +58,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); auto const next_col_total_size = next_col_size + valid_col_size;
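[Editorial aside] PATCH 088 above replaces hand-rolled global indexing with cudf's `grid_1d` helpers. For readers unfamiliar with the idiom, the self-contained kernel below shows the raw CUDA equivalent of a grid-stride loop; it is an illustration, not part of the patch.

```cuda
#include <cstdint>

// Grid-stride loop: each thread starts at its global id and advances by the
// total number of threads in the grid, so a fixed-size grid covers any number
// of rows. This is the pattern grid_1d::global_thread_id()/grid_stride()
// encapsulate.
__global__ void scale_rows(float const* in, float* out, std::int64_t num_rows)
{
  auto const stride = static_cast<std::int64_t>(blockDim.x) * gridDim.x;
  for (auto idx = static_cast<std::int64_t>(blockDim.x) * blockIdx.x + threadIdx.x;
       idx < num_rows;
       idx += stride) {
    out[idx] = 2.0f * in[idx];  // placeholder per-row work
  }
}
```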
+ // TODO: it seems early exit will break the follow-up calculations. To verify if (bytes_allocated + next_col_total_size > total_agg_size) { break; } s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; @@ -107,10 +108,8 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, idx += cudf::detail::grid_1d::grid_stride()) { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)) { auto const map_idx = local_mapping_index[idx]; - for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto const input_col = input_values.column(col_idx); - cudf::detail::dispatch_type_and_aggregation(input_col.type(), d_agg_kinds[col_idx], shmem_element_aggregator{}, From d597ea70edc403563f2fd3c5f7fa6bf1be55b1e7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 13:09:20 -0700 Subject: [PATCH 090/135] Remove unused header --- cpp/src/groupby/groupby.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include From 1e85f08551b0b5f184717f1e2180ff5ceb622098 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:18:40 -0700 Subject: [PATCH 091/135] Renaming + API cleanups --- cpp/src/groupby/hash/compute_groupby.cu | 26 +++++++--- cpp/src/groupby/hash/compute_groupby.hpp | 2 +- .../groupby/hash/compute_single_pass_aggs.cu | 5 +- .../groupby/hash/compute_single_pass_aggs.cuh | 51 ++++++++----------- .../groupby/hash/compute_single_pass_aggs.hpp | 7 +-- .../hash/compute_single_pass_aggs_null.cu | 5 +- .../groupby/hash/sparse_to_dense_results.cu | 8 +-- .../groupby/hash/sparse_to_dense_results.hpp | 2 +- 8 files changed, 56 insertions(+), 50 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 9021846f71e..377d0361bd1 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -65,21 +65,21 @@ template std::unique_ptr
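[Editorial aside] The compute_groupby changes below keep the same overall single-pass scheme: each input row index is inserted into a cuco::static_set keyed by row equality, and the row's values are aggregated into the sparse output slot of whichever row index represents its key. A minimal sketch of that insert-and-aggregate step, with `SetRef` and `aggregate_row` as illustrative placeholders rather than the patch's actual types:

```cuda
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>

// Sketch of the per-row body of the single-pass aggregation functor. SetRef
// stands in for a cuco::static_set device ref taken with
// cuco::op::insert_and_find; aggregate_row stands in for the dispatched
// per-column aggregation into the sparse results table.
template <typename SetRef, typename AggregateRowFn>
__device__ void insert_and_aggregate(SetRef set_ref,
                                     cudf::size_type row_idx,
                                     cudf::bitmask_type const* row_bitmask,
                                     bool skip_rows_with_nulls,
                                     AggregateRowFn aggregate_row)
{
  if (skip_rows_with_nulls and not cudf::bit_is_set(row_bitmask, row_idx)) { return; }
  // Insert this row's index; if an equal key row was inserted earlier,
  // `slot` points at that representative index instead.
  auto const [slot, inserted] = set_ref.insert_and_find(row_idx);
  aggregate_row(*slot /* target row in sparse table */, row_idx /* source row */);
}
```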
compute_groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, Equal const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); + auto const num_rows = static_cast(keys.num_rows()); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); auto set = cuco::static_set{ - cuco::extent{num_keys}, + cuco::extent{num_rows}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, @@ -89,9 +89,19 @@ std::unique_ptr
compute_groupby(table_view const& keys, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; + auto row_bitmask = + skip_rows_with_nulls + ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs( - keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream); + auto gather_map = compute_single_pass_aggs(num_rows, + static_cast(row_bitmask.data()), + requests, + &sparse_results, + set, + skip_rows_with_nulls, + stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, @@ -100,7 +110,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, cache, gather_map, set.ref(cuco::find), - skip_key_rows_with_nulls, + skip_rows_with_nulls, stream, mr); @@ -116,7 +126,7 @@ template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, @@ -126,7 +136,7 @@ template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, nullable_row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp index 358c81365a0..a11c1db4262 100644 --- a/cpp/src/groupby/hash/compute_groupby.hpp +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -59,7 +59,7 @@ template std::unique_ptr compute_groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, Equal const& d_row_equal, row_hash_t const& d_row_hash, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu index f8b0f65b92f..8ba78653957 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,8 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, global_set_t& global_set, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 974b973b1fa..fab5887d7b8 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include @@ -47,22 +47,16 @@ namespace cudf::groupby::detail::hash { */ template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, bool skip_rows_with_nulls, rmm::cuda_stream_view stream) { - auto const num_input_rows = keys.num_rows(); - - auto row_bitmask = - skip_rows_with_nulls - ? 
cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(keys.num_rows(), stream); + rmm::device_uvector populated_keys(num_rows, stream); // flatten the aggs to a table that can be operated on by aggregate_row auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); @@ -71,7 +65,7 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size(num_input_rows); + auto const grid_size = max_occupancy_grid_size(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( @@ -96,16 +90,15 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - static_cast(row_bitmask.data()), - skip_rows_with_nulls}); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_rows, + hash::compute_single_pass_aggs_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + row_bitmask, + skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); // Add results back to sparse_results cache @@ -120,16 +113,16 @@ rmm::device_uvector compute_single_pass_aggs( } // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank - rmm::device_uvector local_mapping_index(num_input_rows, stream); + rmm::device_uvector local_mapping_index(num_rows, stream); // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); compute_mapping_indices(grid_size, - num_input_rows, + num_rows, global_set_ref, - static_cast(row_bitmask.data()), + row_bitmask, skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), @@ -150,8 +143,8 @@ rmm::device_uvector compute_single_pass_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); compute_single_pass_shmem_aggs(grid_size, - num_input_rows, - static_cast(row_bitmask.data()), + num_rows, + row_bitmask, skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), @@ -164,14 +157,14 @@ rmm::device_uvector compute_single_pass_aggs( auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - keys.num_rows(), + num_rows, compute_direct_aggregates{global_set_ref, *d_values, *d_sparse_table, d_agg_kinds.data(), block_cardinality.data(), stride, - static_cast(row_bitmask.data()), + row_bitmask, skip_rows_with_nulls}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index 6cbea9fcd3c..a0d2452d39f 100644 --- 
a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #pragma once #include +#include #include -#include #include #include @@ -31,7 +31,8 @@ namespace cudf::groupby::detail::hash { */ template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, SetType& global_set, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu index b88f1a952d5..be7c667766c 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,8 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_single_pass_aggs( - cudf::table_view const& keys, + int64_t num_rows, + bitmask_type const* row_bitmask, cudf::host_span requests, cudf::detail::result_cache* sparse_results, nullable_global_set_t& global_set, diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index 36dc306879e..e960cc1f4e0 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -35,14 +35,14 @@ void sparse_to_dense_results(table_view const& keys, cudf::detail::result_cache* dense_results, device_span gather_map, SetType set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto row_bitmask = cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; + skip_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; for (auto const& request : requests) { auto const& agg_v = request.aggregations; @@ -65,7 +65,7 @@ template void sparse_to_dense_results>( cudf::detail::result_cache* dense_results, device_span gather_map, hash_set_ref_t set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -76,7 +76,7 @@ template void sparse_to_dense_results>( cudf::detail::result_cache* dense_results, device_span gather_map, nullable_hash_set_ref_t set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp index bfdc42953ad..2c14cc1e7f6 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.hpp +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -38,7 +38,7 @@ void sparse_to_dense_results(table_view const& keys, cudf::detail::result_cache* dense_results, device_span gather_map, SetType set, - bool skip_key_rows_with_nulls, + bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace cudf::groupby::detail::hash From 80f92752599c1dfa016b39951e1252d1add2494b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:28:09 -0700 Subject: [PATCH 092/135] Get rid of redundant bitmask calculation --- cpp/src/groupby/hash/compute_groupby.cu | 2 +- cpp/src/groupby/hash/sparse_to_dense_results.cu | 15 ++++----------- cpp/src/groupby/hash/sparse_to_dense_results.hpp | 4 ++-- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 377d0361bd1..95bf74c9e84 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -104,7 +104,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, stream); // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, + sparse_to_dense_results(static_cast(row_bitmask.data()), requests, &sparse_results, cache, diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index e960cc1f4e0..adba5dee8f5 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -29,7 +28,7 @@ namespace cudf::groupby::detail::hash { template -void sparse_to_dense_results(table_view const& keys, +void sparse_to_dense_results(bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, @@ -39,11 +38,6 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bitmask_type const* row_bitmask_ptr = - skip_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - for (auto const& request : requests) { auto const& agg_v = request.aggregations; auto const& col = request.values; @@ -51,7 +45,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -59,7 +53,7 @@ void sparse_to_dense_results(table_view const& keys, } template void sparse_to_dense_results>( - table_view const& keys, + bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, @@ -70,7 +64,7 @@ template void sparse_to_dense_results>( rmm::device_async_resource_ref mr); template void sparse_to_dense_results>( - table_view const& keys, + bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, @@ -79,5 +73,4 @@ template void sparse_to_dense_results>( bool skip_rows_with_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp index 2c14cc1e7f6..31fc02e7a38 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.hpp +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -16,8 +16,8 @@ #pragma once #include +#include #include -#include #include #include @@ -32,7 +32,7 @@ namespace cudf::groupby::detail::hash { * @see groupby_null_templated() */ template -void sparse_to_dense_results(table_view const& keys, +void sparse_to_dense_results(bitmask_type const* row_bitmask, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, From 5baa2cf1a140710339ded54d741e2c4b42289195 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:31:47 -0700 Subject: [PATCH 093/135] Add missing header --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index fab5887d7b8..ca2256d78dd 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -33,9 +33,11 @@ #include #include +#include #include #include +#include #include #include From 53e0e00ca5edf16d9a2a4e51411a1bfc64a7bc81 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:44:59 -0700 Subject: [PATCH 094/135] Add missing header --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index ca2256d78dd..4e41429bd46 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -32,6 +32,7 @@ #include #include +#include #include #include From 57a450af4110f83194e3eed7e952093f4111968f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 14:49:46 -0700 Subject: [PATCH 095/135] Clean up headers --- cpp/src/groupby/hash/compute_groupby.cu | 3 ++- cpp/src/groupby/hash/compute_mapping_indices.cuh | 1 - cpp/src/groupby/hash/compute_mapping_indices.hpp | 1 - cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 1 - cpp/src/groupby/hash/compute_single_pass_aggs.hpp | 1 - cpp/src/groupby/hash/sparse_to_dense_results.cu | 1 - cpp/src/groupby/hash/sparse_to_dense_results.hpp | 1 - 7 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 95bf74c9e84..7565e8ecfbb 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -91,7 +92,7 @@ std::unique_ptr
compute_groupby(table_view const& keys, auto row_bitmask = skip_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first : rmm::device_buffer{}; // Compute all single pass aggs first diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 91e7c83a2a2..dd369a123ca 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -19,7 +19,6 @@ #include "helpers.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index d2cf3450730..d8047f9a5d8 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index 4e41429bd46..fb199c28b21 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -24,7 +24,6 @@ #include "single_pass_functors.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp index a0d2452d39f..7dda9d4c4be 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu index adba5dee8f5..eb037e69937 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.cu +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -18,7 +18,6 @@ #include "helpers.cuh" #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp index 31fc02e7a38..0b8975d235c 100644 --- a/cpp/src/groupby/hash/sparse_to_dense_results.hpp +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include From 5b92cd01167e462f83b04a58a0c04298581c99d3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 10:10:47 -0700 Subject: [PATCH 096/135] Minor cleanup on ref type determination --- cpp/src/groupby/hash/compute_single_pass_aggs.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index fb199c28b21..e55f095f764 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -67,7 +67,8 @@ rmm::device_uvector compute_single_pass_aggs( auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size(num_rows); + auto const grid_size = + max_occupancy_grid_size>(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( From 98aa46829a1aaafebd10d7608b03c11ef7e0deab Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 10:57:54 -0700 Subject: [PATCH 097/135] Add device 
num_bitmask_words device utility --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 3c0a00c4798..5fa818df2ae 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -35,6 +35,16 @@ namespace cudf::groupby::detail::hash { namespace { +/// Computes number of *actual* bitmask_type elements needed +__device__ constexpr size_type num_bitmask_words(size_type number_of_bits) +{ + // TODO: This duplicates `cudf::num_bitmask_words`. Converting it into + // a public host-device utility will require non-trivial effort, so the + // cleanup will be addressed in a separate PR. + return cudf::util::div_rounding_up_safe(number_of_bits, + detail::size_in_bits()); +} + // Prepares shared memory data required by each output column, exits if // no enough memory space to perform the shared memory aggregation for the // current output column From 8be8d158928ffc85adad904885bc6ce7f424f724 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 10:59:14 -0700 Subject: [PATCH 098/135] Fix a minor bug determining column C++ type --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 5fa818df2ae..7f4d8030dd4 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,8 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); while (bytes_allocated < total_agg_size && col_end < output_size) { - auto const next_col_size = - round_to_multiple_of_8(sizeof(output_values.column(col_end).type()) * cardinality); + auto const next_col_size = round_to_multiple_of_8( + sizeof(cudf::id_to_type(output_values.column(col_end).type().id())) * cardinality); auto const next_col_total_size = next_col_size + valid_col_size; // TODO: it seems early exit will break the followup calculatons. 
To verify From d3c465ba7dc18e0ea6f6876485fc849d5cd3c803 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 11:31:21 -0700 Subject: [PATCH 099/135] Bug fix: use type_dispatcher --- .../hash/compute_single_pass_shmem_aggs.cu | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 7f4d8030dd4..ed8e4d1d756 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -35,6 +36,16 @@ namespace cudf::groupby::detail::hash { namespace { +/// Functor used by type dispatcher returning the size of the underlying C++ type +struct size_of_functor { + template + __device__ constexpr cudf::size_type operator()() + { + return sizeof(T); + } +}; + +/* /// Computes number of *actual* bitmask_type elements needed __device__ constexpr size_type num_bitmask_words(size_type number_of_bits) { @@ -42,8 +53,9 @@ __device__ constexpr size_type num_bitmask_words(size_type number_of_bits) // a public host-device utility will require non-trivial effort, so the // cleanup will be addressed in a separate PR. return cudf::util::div_rounding_up_safe(number_of_bits, - detail::size_in_bits()); + cudf::detail::size_in_bits()); } +*/ // Prepares shared memory data required by each output column, exits if // no enough memory space to perform the shared memory aggregation for the @@ -64,8 +76,9 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const col_idx = col_end; auto const next_col_size = round_to_multiple_of_8( - sizeof(cudf::id_to_type(output_values.column(col_end).type().id())) * cardinality); + cudf::type_dispatcher(output_values.column(col_idx).type(), size_of_functor{}) * cardinality); auto const next_col_total_size = next_col_size + valid_col_size; // TODO: it seems early exit will break the followup calculatons. 
To verify From f109b814e1cb74fd340de7e0f930c3afcf79032b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 11:47:05 -0700 Subject: [PATCH 100/135] Pass block to compute_final_aggregations --- .../groupby/hash/compute_single_pass_shmem_aggs.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index ed8e4d1d756..7df3de93b34 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -146,7 +146,8 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, } } -__device__ void compute_final_aggregations(cudf::size_type col_start, +__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, cudf::size_type col_end, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, @@ -156,8 +157,8 @@ __device__ void compute_final_aggregations(cudf::size_type col_start, bool** s_aggregates_valid_pointer, cudf::aggregation::Kind const* d_agg_kinds) { - for (auto cur_idx = threadIdx.x; cur_idx < cardinality; cur_idx += blockDim.x) { - auto out_idx = global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx]; + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto out_idx = global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto output_col = output_values.column(col_idx); @@ -168,10 +169,11 @@ __device__ void compute_final_aggregations(cudf::size_type col_start, out_idx, input_values.column(col_idx), s_aggregates_pointer[col_idx], - cur_idx, + idx, s_aggregates_valid_pointer[col_idx]); } } + block.sync(); } /* Takes the local_mapping_index and global_mapping_index to compute @@ -243,7 +245,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, s_aggregates_valid_pointer, d_agg_kinds); block.sync(); - compute_final_aggregations(col_start, + compute_final_aggregations(block, + col_start, col_end, input_values, output_values, @@ -252,7 +255,6 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, s_aggregates_pointer, s_aggregates_valid_pointer, d_agg_kinds); - block.sync(); } } From 280db67bab7a38c62f9bc8c934b4bed49f89cf57 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 13:44:13 -0700 Subject: [PATCH 101/135] Cleanup: use offsets instead of pointers to save memory space --- .../groupby/hash/compute_single_pass_aggs.cuh | 2 +- .../hash/compute_single_pass_shmem_aggs.cu | 130 ++++++++++-------- .../hash/compute_single_pass_shmem_aggs.hpp | 2 +- .../groupby/hash/global_memory_aggregator.cuh | 48 +++---- .../groupby/hash/shared_memory_aggregator.cuh | 38 ++--- cpp/src/groupby/hash/single_pass_functors.cuh | 30 ++-- 6 files changed, 129 insertions(+), 121 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh index e55f095f764..94c7f4b59c7 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh @@ -70,7 +70,7 @@ rmm::device_uvector compute_single_pass_aggs( auto const grid_size = max_occupancy_grid_size>(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > - (shmem_agg_pointer_size(flattened_values.num_columns()) * 2); +
(shmem_offsets_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 7df3de93b34..1a984d6f100 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,9 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, cudf::size_type output_size, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, - std::byte* shared_set_aggregates, + cudf::size_type* target_offsets, + cudf::size_type* target_mask_offsets, cudf::size_type cardinality, cudf::size_type total_agg_size) { @@ -84,9 +83,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, // TODO: it seems early exit will break the followup calculatons. To verify if (bytes_allocated + next_col_total_size > total_agg_size) { break; } - s_aggregates_pointer[col_end] = shared_set_aggregates + bytes_allocated; - s_aggregates_valid_pointer[col_end] = - reinterpret_cast(shared_set_aggregates + bytes_allocated + next_col_size); + target_offsets[col_end] = bytes_allocated; + target_mask_offsets[col_end] = bytes_allocated + next_col_size; bytes_allocated += next_col_total_size; ++col_end; @@ -98,19 +96,22 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c cudf::size_type col_start, cudf::size_type col_end, cudf::mutable_table_device_view output_values, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggs, + cudf::size_type* target_offsets, + cudf::size_type* target_mask_offsets, cudf::size_type cardinality, cudf::aggregation::Kind const* d_agg_kinds) { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + std::byte* target = reinterpret_cast(shared_set_aggs + target_offsets[col_idx]); + bool* target_mask = reinterpret_cast(shared_set_aggs + target_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), d_agg_kinds[col_idx], initialize_shmem{}, - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); + target, + target_mask, + idx); } } block.sync(); @@ -120,27 +121,32 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, cudf::size_type col_end, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, - cudf::table_device_view input_values, + cudf::table_device_view source, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggs, + cudf::size_type* target_offsets, + cudf::size_type* target_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) { - for (auto idx = cudf::detail::grid_1d::global_thread_id(); idx < num_input_rows; - idx += cudf::detail::grid_1d::grid_stride()) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)) { - auto const map_idx = local_mapping_index[idx]; + for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows; + source_idx += cudf::detail::grid_1d::grid_stride()) { + if 
(not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) { + auto const target_idx = local_mapping_index[source_idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto const input_col = input_values.column(col_idx); - cudf::detail::dispatch_type_and_aggregation(input_col.type(), + auto const source_col = source.column(col_idx); + + std::byte* target = reinterpret_cast(shared_set_aggs + target_offsets[col_idx]); + bool* target_mask = reinterpret_cast(shared_set_aggs + target_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(source_col.type(), d_agg_kinds[col_idx], shmem_element_aggregator{}, - s_aggregates_pointer[col_idx], - map_idx, - s_aggregates_valid_pointer[col_idx], - input_col, - idx); + target, + target_mask, + target_idx, + source_col, + source_idx); } } } @@ -150,27 +156,33 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons cudf::size_type col_start, cudf::size_type col_end, cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, + cudf::mutable_table_device_view target, cudf::size_type cardinality, cudf::size_type* global_mapping_index, - std::byte** s_aggregates_pointer, - bool** s_aggregates_valid_pointer, + std::byte* shared_set_aggs, + cudf::size_type* agg_res_offsets, + cudf::size_type* agg_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) { + // Aggregate shared memory sources to global memory targets for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - auto out_idx = global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; + auto const target_idx = + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { - auto output_col = output_values.column(col_idx); + auto target_col = target.column(col_idx); + + std::byte* source = reinterpret_cast(shared_set_aggs + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shared_set_aggs + agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), d_agg_kinds[col_idx], gmem_element_aggregator{}, - output_col, - out_idx, + target_col, + target_idx, input_values.column(col_idx), - s_aggregates_pointer[col_idx], - idx, - s_aggregates_valid_pointer[col_idx]); + source, + source_mask, + idx); } } block.sync(); @@ -188,7 +200,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, cudf::size_type total_agg_size, - cudf::size_type pointer_size) + cudf::size_type offsets_size) { auto const block = cooperative_groups::this_thread_block(); auto const cardinality = block_cardinality[block.group_index().x]; @@ -198,11 +210,12 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, __shared__ cudf::size_type col_start; __shared__ cudf::size_type col_end; - extern __shared__ std::byte shared_set_aggregates[]; - std::byte** s_aggregates_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size); - bool** s_aggregates_valid_pointer = - reinterpret_cast(shared_set_aggregates + total_agg_size + pointer_size); + extern __shared__ std::byte shared_set_aggs[]; + + cudf::size_type* target_offsets = + reinterpret_cast(shared_set_aggs + total_agg_size); + cudf::size_type* target_mask_offsets = + reinterpret_cast(shared_set_aggs + total_agg_size + offsets_size); if (block.thread_rank() == 0) { 
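  // [Editorial note, not part of the original patch] The dynamic shared memory
  // window set up just above is laid out as three consecutive regions, sized
  // on the host as total_agg_size + 2 * offsets_size bytes:
  //
  //   |<------ total_agg_size ------>|<- offsets_size ->|<--- offsets_size --->|
  //   | per-column values and masks  |  target_offsets  |  target_mask_offsets |
  //
  // A column's value slice starts at shared_set_aggs + target_offsets[col_idx],
  // and its validity slice at shared_set_aggs + target_mask_offsets[col_idx].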
col_start = 0; @@ -216,9 +229,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_end, output_values, num_cols, - s_aggregates_pointer, - s_aggregates_valid_pointer, - shared_set_aggregates, + target_offsets, + target_mask_offsets, cardinality, total_agg_size); } @@ -228,8 +240,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_start, col_end, output_values, - s_aggregates_pointer, - s_aggregates_valid_pointer, + shared_set_aggs, + target_offsets, + target_mask_offsets, cardinality, d_agg_kinds); @@ -240,8 +253,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, input_values, num_rows, local_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, + shared_set_aggs, + target_offsets, + target_mask_offsets, d_agg_kinds); block.sync(); @@ -252,8 +266,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, output_values, cardinality, global_mapping_index, - s_aggregates_pointer, - s_aggregates_valid_pointer, + shared_set_aggs, + target_offsets, + target_mask_offsets, d_agg_kinds); } } @@ -273,7 +288,7 @@ size_t available_shared_memory_size(cudf::size_type grid_size) return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); } -size_t shmem_agg_pointer_size(cudf::size_type num_cols) { return sizeof(void*) * num_cols; } +size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } void compute_single_pass_shmem_aggs(cudf::size_type grid_size, cudf::size_type num_input_rows, @@ -288,13 +303,12 @@ void compute_single_pass_shmem_aggs(cudf::size_type grid_size, rmm::cuda_stream_view stream) { auto const shmem_size = available_shared_memory_size(grid_size); - // For each aggregation, need two pointers to arrays in shmem - // One where the aggregation is performed, one indicating the validity of the aggregation - auto const shmem_pointer_size = shmem_agg_pointer_size(output_values.num_columns()); + // For each aggregation, need one offset determining where the aggregation is + // performed, another indicating the validity of the aggregation + auto const offsets_size = shmem_offsets_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(shmem_size > shmem_pointer_size * 2, - "No enough space for shared memory aggregations"); - auto const shmem_agg_size = shmem_size - shmem_pointer_size * 2; + CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); + auto const shmem_agg_size = shmem_size - offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -306,6 +320,6 @@ void compute_single_pass_shmem_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_pointer_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp index c871752e7e3..73db4750a1f 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp @@ -25,7 +25,7 @@ namespace cudf::groupby::detail::hash { size_t available_shared_memory_size(int grid_size); -size_t shmem_agg_pointer_size(int num_cols); +size_t shmem_offsets_size(int num_cols); void compute_single_pass_shmem_aggs(int grid_size, cudf::size_type num_input_rows, diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh 
b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 08d2c0552b3..89394790117 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -33,8 +33,8 @@ struct update_target_element_gmem { cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const + bool* source_mask, + cudf::size_type source_index) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -49,8 +49,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); @@ -70,8 +70,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); @@ -92,8 +92,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; DeviceType* source_casted = reinterpret_cast(source); @@ -114,8 +114,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -137,8 +137,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -162,8 +162,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -186,8 +186,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -209,8 +209,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -238,8 +238,8 @@ struct update_target_element_gmem< cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type 
source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; Target* source_casted = reinterpret_cast(source); @@ -264,14 +264,14 @@ struct gmem_element_aggregator { cudf::size_type target_index, cudf::column_device_view source_column, std::byte* source, - cudf::size_type source_index, - bool* source_null) const noexcept + bool* source_mask, + cudf::size_type source_index) const noexcept { if constexpr (k != cudf::aggregation::COUNT_ALL) { - if (source_null[source_index]) { return; } + if (source_mask[source_index]) { return; } } update_target_element_gmem{}( - target, target_index, source_column, source, source_index, source_null); + target, target_index, source_column, source, source_mask, source_index); } }; diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index c5713e4a72e..f4be32ed723 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -30,8 +30,8 @@ namespace cudf::groupby::detail::hash { template struct update_target_element_shmem { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const { @@ -45,8 +45,8 @@ struct update_target_element_shmem< cudf::aggregation::MIN, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -56,7 +56,7 @@ struct update_target_element_shmem< DeviceTarget* target_casted = reinterpret_cast(target); cudf::detail::atomic_min(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -66,8 +66,8 @@ struct update_target_element_shmem< cudf::aggregation::MAX, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -78,7 +78,7 @@ struct update_target_element_shmem< cudf::detail::atomic_max(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -89,8 +89,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_timestamp()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -101,7 +101,7 @@ struct update_target_element_shmem< cudf::detail::atomic_add(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -111,8 +111,8 @@ struct update_target_element_shmem< cudf::aggregation::SUM_OF_SQUARES, cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type 
target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -121,7 +121,7 @@ struct update_target_element_shmem< auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target_casted[target_index], value * value); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -131,8 +131,8 @@ struct update_target_element_shmem< cudf::aggregation::PRODUCT, cuda::std::enable_if_t()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -141,7 +141,7 @@ struct update_target_element_shmem< cudf::detail::atomic_mul(&target_casted[target_index], static_cast(source.element(source_index))); - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -152,8 +152,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -170,8 +170,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -190,8 +190,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -205,7 +205,7 @@ struct update_target_element_shmem< } } - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; @@ -216,8 +216,8 @@ struct update_target_element_shmem< cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -231,15 +231,15 @@ struct update_target_element_shmem< } } - if (target_null[target_index]) { target_null[target_index] = false; } + if (target_mask[target_index]) { target_mask[target_index] = false; } } }; struct shmem_element_aggregator { template __device__ void operator()(std::byte* target, + bool* target_mask, cudf::size_type target_index, - bool* target_null, cudf::column_device_view source, cudf::size_type source_index) const noexcept { @@ -247,7 +247,7 @@ struct shmem_element_aggregator { if (source.is_null(source_index)) { return; } } update_target_element_shmem{}( - target, target_index, target_null, source, source_index); + target, target_mask, target_index, source, source_index); } }; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 6d10c8065ca..93b2bff8990 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh 
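[Editorial aside] Before the single_pass_functors.cuh hunk below: every shared-memory aggregator above follows the same two-step pattern, an atomic read-modify-write on the target slot followed by clearing a "still null" flag. A self-contained sketch of the MIN case; the names are illustrative, and `cuda::atomic_ref` stands in for the `cudf::detail::atomic_*` helpers the patch uses:

```cuda
#include <cuda/atomic>

// MIN-with-validity update: the target slot was initialized to the identity
// value (e.g. INT_MAX for MIN), so fetch_min folds the new value in
// atomically. The mask write is a benign race: every thread that contributed
// a value writes false, and all writers agree.
__device__ void atomic_min_with_mask(int* target, bool* target_mask,
                                     int target_index, int value)
{
  cuda::atomic_ref<int, cuda::thread_scope_block> slot{target[target_index]};
  slot.fetch_min(value, cuda::memory_order_relaxed);
  if (target_mask[target_index]) { target_mask[target_index] = false; }
}
```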
@@ -74,41 +74,35 @@ __device__ T get_identity() template struct initialize_target_element { __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null) const noexcept + bool* target_mask, + cudf::size_type idx) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } }; -// TODO: are the conditions correctly checked? template struct initialize_target_element()>> { __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null) const noexcept + bool* target_mask, + cudf::size_type idx) const noexcept { - using DeviceType = cudf::device_storage_type_t; - DeviceType* target_casted = reinterpret_cast(target); - target_casted[target_index] = get_identity(); + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); - if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) { - target_null[target_index] = false; - } else { - target_null[target_index] = true; - } + target_casted[idx] = get_identity(); + target_mask[idx] = !(k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID); } }; struct initialize_shmem { template + // TODO naming __device__ void operator()(std::byte* target, - cudf::size_type target_index, - bool* target_null) const noexcept + bool* target_mask, + cudf::size_type idx) const noexcept { - // TODO: typecasting work for every datatype - - initialize_target_element{}(target, target_index, target_null); + initialize_target_element{}(target, target_mask, idx); } }; From 8a0551e9c235906ad70438606fcd57a3686999e3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 14:10:32 -0700 Subject: [PATCH 102/135] Rename for clarity --- .../hash/compute_single_pass_shmem_aggs.cu | 74 ++++++++++--------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 1a984d6f100..444cfdf4d79 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -64,8 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, cudf::size_type output_size, - cudf::size_type* target_offsets, - cudf::size_type* target_mask_offsets, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, cudf::size_type cardinality, cudf::size_type total_agg_size) { @@ -83,8 +83,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, // TODO: it seems early exit will break the followup calculatons. 
 .../hash/compute_single_pass_shmem_aggs.cu    | 74 ++++++++++---------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
index 1a984d6f100..444cfdf4d79 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
@@ -64,8 +64,8 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                                                cudf::size_type& col_end,
                                                cudf::mutable_table_device_view output_values,
                                                cudf::size_type output_size,
-                                               cudf::size_type* target_offsets,
-                                               cudf::size_type* target_mask_offsets,
+                                               cudf::size_type* shmem_agg_res_offsets,
+                                               cudf::size_type* shmem_agg_mask_offsets,
                                                cudf::size_type cardinality,
                                                cudf::size_type total_agg_size)
 {
@@ -83,8 +83,8 @@
     // TODO: it seems early exit will break the follow-up calculations. To be verified
     if (bytes_allocated + next_col_total_size > total_agg_size) { break; }
 
-    target_offsets[col_end]      = bytes_allocated;
-    target_mask_offsets[col_end] = bytes_allocated + next_col_size;
+    shmem_agg_res_offsets[col_end]  = bytes_allocated;
+    shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
 
     bytes_allocated += next_col_total_size;
     ++col_end;
@@ -96,16 +96,18 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c
                                               cudf::size_type col_start,
                                               cudf::size_type col_end,
                                               cudf::mutable_table_device_view output_values,
-                                              std::byte* shared_set_aggs,
-                                              cudf::size_type* target_offsets,
-                                              cudf::size_type* target_mask_offsets,
+                                              std::byte* shmem_agg_storage,
+                                              cudf::size_type* shmem_agg_res_offsets,
+                                              cudf::size_type* shmem_agg_mask_offsets,
                                               cudf::size_type cardinality,
                                               cudf::aggregation::Kind const* d_agg_kinds)
 {
   for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
     for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
-      std::byte* target = reinterpret_cast<std::byte*>(shared_set_aggs + target_offsets[col_idx]);
-      bool* target_mask = reinterpret_cast<bool*>(shared_set_aggs + target_mask_offsets[col_idx]);
+      std::byte* target =
+        reinterpret_cast<std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+      bool* target_mask =
+        reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
       cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(),
                                                   d_agg_kinds[col_idx],
                                                   initialize_shmem{},
@@ -124,11 +126,12 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start,
                                           cudf::table_device_view source,
                                           cudf::size_type num_input_rows,
                                           cudf::size_type* local_mapping_index,
-                                          std::byte* shared_set_aggs,
-                                          cudf::size_type* target_offsets,
-                                          cudf::size_type* target_mask_offsets,
+                                          std::byte* shmem_agg_storage,
+                                          cudf::size_type* shmem_agg_res_offsets,
+                                          cudf::size_type* shmem_agg_mask_offsets,
                                           cudf::aggregation::Kind const* d_agg_kinds)
 {
+  // Aggregates global memory sources to shared memory targets
   for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows;
        source_idx += cudf::detail::grid_1d::grid_stride()) {
     if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) {
@@ -136,8 +139,10 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start,
       for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
         auto const source_col = source.column(col_idx);
 
-        std::byte* target = reinterpret_cast<std::byte*>(shared_set_aggs + target_offsets[col_idx]);
-        bool* target_mask = reinterpret_cast<bool*>(shared_set_aggs + target_mask_offsets[col_idx]);
+        std::byte* target =
+          reinterpret_cast<std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+        bool* target_mask =
+          reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
 
         cudf::detail::dispatch_type_and_aggregation(source_col.type(),
                                                     d_agg_kinds[col_idx],
@@ -159,20 +164,21 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons
                                            cudf::mutable_table_device_view target,
                                            cudf::size_type cardinality,
                                            cudf::size_type* global_mapping_index,
-                                           std::byte* shared_set_aggs,
+                                           std::byte* shmem_agg_storage,
                                            cudf::size_type* agg_res_offsets,
                                            cudf::size_type* agg_mask_offsets,
                                            cudf::aggregation::Kind const* d_agg_kinds)
 {
-  // Aggregate shared memory sources to global memory targets
+  // Aggregates shared memory sources to global memory targets
   for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
     auto const target_idx =
global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto target_col = target.column(col_idx); - std::byte* source = reinterpret_cast(shared_set_aggs + agg_res_offsets[col_idx]); - bool* source_mask = reinterpret_cast(shared_set_aggs + agg_mask_offsets[col_idx]); + std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), d_agg_kinds[col_idx], @@ -210,12 +216,12 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, __shared__ cudf::size_type col_start; __shared__ cudf::size_type col_end; - extern __shared__ std::byte shared_set_aggs[]; + extern __shared__ std::byte shmem_agg_storage[]; - cudf::size_type* target_offsets = - reinterpret_cast(shared_set_aggs + total_agg_size); - cudf::size_type* target_mask_offsets = - reinterpret_cast(shared_set_aggs + total_agg_size + offsets_size); + cudf::size_type* shmem_agg_res_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size); + cudf::size_type* shmem_agg_mask_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size + offsets_size); if (block.thread_rank() == 0) { col_start = 0; @@ -229,8 +235,8 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_end, output_values, num_cols, - target_offsets, - target_mask_offsets, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, cardinality, total_agg_size); } @@ -240,9 +246,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, col_start, col_end, output_values, - shared_set_aggs, - target_offsets, - target_mask_offsets, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, cardinality, d_agg_kinds); @@ -253,9 +259,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, input_values, num_rows, local_mapping_index, - shared_set_aggs, - target_offsets, - target_mask_offsets, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, d_agg_kinds); block.sync(); @@ -266,9 +272,9 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, output_values, cardinality, global_mapping_index, - shared_set_aggs, - target_offsets, - target_mask_offsets, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, d_agg_kinds); } } From 5c493008eb6274299a74bad14d83f354aa23efaf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 15:54:53 -0700 Subject: [PATCH 103/135] Minor improvement to reduce build time --- cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu index 444cfdf4d79..cd1759a10ea 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu @@ -75,9 +75,11 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); while (bytes_allocated < total_agg_size && col_end < output_size) { - auto const col_idx = col_end; - auto const next_col_size = round_to_multiple_of_8( - cudf::type_dispatcher(output_values.column(col_idx).type(), size_of_functor{}) * cardinality); + auto const col_idx = col_end; + auto const 
next_col_size =
+      round_to_multiple_of_8(cudf::type_dispatcher(
+                               output_values.column(col_idx).type(), size_of_functor{}) *
+                             cardinality);
     auto const next_col_total_size = next_col_size + valid_col_size;
 
     // TODO: it seems early exit will break the follow-up calculations. To be verified

From 99010b301a767418e85a16d305383ac7a485b815 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 16:42:59 -0700
Subject: [PATCH 104/135] Use mask logic instead of null logic

---
 cpp/src/groupby/hash/global_memory_aggregator.cuh | 14 ++------------
 cpp/src/groupby/hash/shared_memory_aggregator.cuh | 15 ++++++++-------
 cpp/src/groupby/hash/single_pass_functors.cuh     |  7 ++++++-
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh
index 89394790117..4c682ad6fae 100644
--- a/cpp/src/groupby/hash/global_memory_aggregator.cuh
+++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh
@@ -33,7 +33,6 @@ struct update_target_element_gmem {
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
@@ -49,7 +48,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using DeviceType = cudf::detail::underlying_target_t;
@@ -70,7 +68,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using DeviceType = cudf::detail::underlying_target_t;
@@ -92,7 +89,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using DeviceType = cudf::detail::underlying_target_t;
@@ -114,7 +110,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -137,7 +132,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -162,7 +156,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -186,7 +179,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -209,7 +201,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
     using Target = cudf::detail::target_type_t;
@@ -238,7 +229,6 @@ struct update_target_element_gmem<
                              cudf::size_type target_index,
                              cudf::column_device_view source_column,
                              std::byte* source,
-                             bool* source_mask,
                              cudf::size_type source_index) const noexcept
{
     using Target = cudf::detail::target_type_t;
@@ -268,10 +258,10 @@ struct gmem_element_aggregator {
                              cudf::size_type source_index) const noexcept
   {
     if constexpr (k != cudf::aggregation::COUNT_ALL) {
-      if (source_mask[source_index]) { return; }
+      if (!source_mask[source_index]) { return; }
     }
     update_target_element_gmem{}(
-      target, target_index, source_column, source, source_mask, source_index);
+      target, target_index, source_column, source, source_index);
   }
 };

diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh
index f4be32ed723..32248025fe2 100644
--- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh
+++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh
@@ -56,7 +56,8 @@ struct update_target_element_shmem<
     DeviceTarget* target_casted = reinterpret_cast<DeviceTarget*>(target);
     cudf::detail::atomic_min(&target_casted[target_index],
                              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -78,7 +79,7 @@ struct update_target_element_shmem<
     cudf::detail::atomic_max(&target_casted[target_index],
                              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -101,7 +102,7 @@ struct update_target_element_shmem<
     cudf::detail::atomic_add(&target_casted[target_index],
                              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -121,7 +122,7 @@ struct update_target_element_shmem<
     auto value = static_cast<Target>(source.element<Source>(source_index));
     cudf::detail::atomic_add(&target_casted[target_index], value * value);
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -141,7 +142,7 @@ struct update_target_element_shmem<
     cudf::detail::atomic_mul(&target_casted[target_index],
                              static_cast<Target>(source.element<Source>(source_index)));
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -205,7 +206,7 @@ struct update_target_element_shmem<
       }
     }
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };
 
@@ -231,7 +232,7 @@ struct update_target_element_shmem<
       }
     }
 
-    if (target_mask[target_index]) { target_mask[target_index] = false; }
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
   }
 };

diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 93b2bff8990..5c788522ac9 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -91,7 +91,12 @@ struct initialize_target_element(target);
 
     target_casted[idx] = get_identity<DeviceType>();
-    target_mask[idx] = !(k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID);
+
+    if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) {
+      target_mask[idx] = true;
+    } else {
+      target_mask[idx] = false;
+    }
   }
 };

From d071662e234a3b40213c7e7b57ccd2f86ed96801 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 17:03:58 -0700
Subject: [PATCH 105/135] Minor header cleanup

---
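Series note carried over from the previous patch: the per-element flag now follows cudf's bitmask polarity, true means valid. Targets start invalid (except COUNT_*, which are always valid) and the first aggregated non-null source element marks them valid. An illustrative helper equivalent to the pattern repeated in those hunks; the branch-before-write presumably avoids redundant shared-memory stores once a target is already valid:

    // Sketch only, not library code.
    __device__ inline void mark_target_valid(bool* target_mask, int target_index)
    {
      if (!target_mask[target_index]) { target_mask[target_index] = true; }
    }
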
 cpp/src/groupby/hash/var_hash_functor.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/src/groupby/hash/var_hash_functor.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh
index e02c322c68f..bb55cc9188c 100644
--- a/cpp/src/groupby/hash/var_hash_functor.cuh
+++ b/cpp/src/groupby/hash/var_hash_functor.cuh
@@ -15,8 +15,6 @@
  */
 #pragma once
 
-#include "helpers.cuh"
-
 #include
 #include
 #include

From 4e2c2cc8f4d66901536a9282df577a072d1f0edb Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 19:08:48 -0700
Subject: [PATCH 106/135] Remove unused code + clean up null check

---
 .../groupby/hash/compute_single_pass_shmem_aggs.cu | 12 ------------
 cpp/src/groupby/hash/global_memory_aggregator.cuh  |  5 ++---
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
index cd1759a10ea..314adf336a5 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
@@ -45,18 +45,6 @@ struct size_of_functor {
   }
 };
 
-/*
-/// Computes number of *actual* bitmask_type elements needed
-__device__ constexpr size_type num_bitmask_words(size_type number_of_bits)
-{
-  // TODO: This duplicates `cudf::num_bitmask_words`. Converting it into
-  // a public host-device utility will require non-trivial effort, so the
-  // cleanup will be addressed in a separate PR.
-  return cudf::util::div_rounding_up_safe(number_of_bits,
-                                          cudf::detail::size_in_bits());
-}
-*/
-
 // Prepares shared memory data required by each output column, exits if
 // there is not enough memory space to perform the shared memory aggregation
 // for the current output column
diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh
index 4c682ad6fae..5747a10ed1b 100644
--- a/cpp/src/groupby/hash/global_memory_aggregator.cuh
+++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh
@@ -257,9 +257,8 @@ struct gmem_element_aggregator {
                              bool* source_mask,
                              cudf::size_type source_index) const noexcept
   {
-    if constexpr (k != cudf::aggregation::COUNT_ALL) {
-      if (!source_mask[source_index]) { return; }
-    }
+    if (!source_mask[source_index]) { return; }
+
     update_target_element_gmem{}(
       target, target_index, source_column, source, source_index);
   }

From c2514f6147ac13e53f68f04220f2356b58fa4a3f Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 14 Oct 2024 19:30:28 -0700
Subject: [PATCH 107/135] Use cuda::std::byte on device

---
 .../hash/compute_single_pass_shmem_aggs.cu    | 23 +++++++++---------
 .../groupby/hash/global_memory_aggregator.cuh | 23 +++++++++---------
 .../groupby/hash/shared_memory_aggregator.cuh | 24 ++++++++++---------
 cpp/src/groupby/hash/single_pass_functors.cuh |  8 ++++---
 4 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
index 314adf336a5..9874e2f7444 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu
@@ -31,8 +31,7 @@
 
 #include
 #include
-
-#include
+#include
 
 namespace cudf::groupby::detail::hash {
 namespace {
@@ -86,7 +85,7 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c
                                               cudf::size_type col_start,
                                               cudf::size_type col_end,
                                               cudf::mutable_table_device_view output_values,
-                                              std::byte* shmem_agg_storage,
+                                              cuda::std::byte* shmem_agg_storage,
                                               cudf::size_type*
shmem_agg_res_offsets, cudf::size_type* shmem_agg_mask_offsets, cudf::size_type cardinality, @@ -94,8 +93,8 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - std::byte* target = - reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); bool* target_mask = reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), @@ -116,7 +115,7 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, cudf::table_device_view source, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, - std::byte* shmem_agg_storage, + cuda::std::byte* shmem_agg_storage, cudf::size_type* shmem_agg_res_offsets, cudf::size_type* shmem_agg_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) @@ -129,8 +128,8 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto const source_col = source.column(col_idx); - std::byte* target = - reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); bool* target_mask = reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); @@ -154,7 +153,7 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons cudf::mutable_table_device_view target, cudf::size_type cardinality, cudf::size_type* global_mapping_index, - std::byte* shmem_agg_storage, + cuda::std::byte* shmem_agg_storage, cudf::size_type* agg_res_offsets, cudf::size_type* agg_mask_offsets, cudf::aggregation::Kind const* d_agg_kinds) @@ -166,8 +165,8 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto target_col = target.column(col_idx); - std::byte* source = - reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + cuda::std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), @@ -206,7 +205,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, __shared__ cudf::size_type col_start; __shared__ cudf::size_type col_end; - extern __shared__ std::byte shmem_agg_storage[]; + extern __shared__ cuda::std::byte shmem_agg_storage[]; cudf::size_type* shmem_agg_res_offsets = reinterpret_cast(shmem_agg_storage + total_agg_size); diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index 5747a10ed1b..fa4190491e9 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -23,6 +23,7 @@ #include #include +#include #include namespace cudf::groupby::detail::hash { @@ -32,7 +33,7 @@ struct update_target_element_gmem { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { 
CUDF_UNREACHABLE("Invalid source type and aggregation combination."); @@ -47,7 +48,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; @@ -67,7 +68,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; @@ -88,7 +89,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using DeviceType = cudf::detail::underlying_target_t; @@ -109,7 +110,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -131,7 +132,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -155,7 +156,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -178,7 +179,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -200,7 +201,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -228,7 +229,7 @@ struct update_target_element_gmem< __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, cudf::size_type source_index) const noexcept { using Target = cudf::detail::target_type_t; @@ -253,7 +254,7 @@ struct gmem_element_aggregator { __device__ void operator()(cudf::mutable_column_device_view target, cudf::size_type target_index, cudf::column_device_view source_column, - std::byte* source, + cuda::std::byte* source, bool* source_mask, cudf::size_type source_index) const noexcept { diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index 32248025fe2..c5bdfe253ea 100644 --- 
a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -23,13 +23,14 @@ #include #include +#include #include namespace cudf::groupby::detail::hash { template struct update_target_element_shmem { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -44,7 +45,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::MIN, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -66,7 +67,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::MAX, cuda::std::enable_if_t() && cudf::has_atomic_support()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -89,7 +90,7 @@ struct update_target_element_shmem< cudf::aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && !cudf::is_timestamp()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -111,7 +112,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::SUM_OF_SQUARES, cuda::std::enable_if_t()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -131,7 +132,7 @@ struct update_target_element_shmem< Source, cudf::aggregation::PRODUCT, cuda::std::enable_if_t()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -152,12 +153,13 @@ struct update_target_element_shmem< cudf::aggregation::COUNT_VALID, cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, cudf::size_type source_index) const noexcept { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor using Target = cudf::detail::target_type_t; Target* target_casted = reinterpret_cast(target); cudf::detail::atomic_add(&target_casted[target_index], Target{1}); @@ -170,7 +172,7 @@ struct update_target_element_shmem< cudf::aggregation::COUNT_ALL, cuda::std::enable_if_t< cudf::detail::is_valid_aggregation()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -190,7 +192,7 @@ struct update_target_element_shmem< cudf::aggregation::ARGMAX, cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -216,7 +218,7 @@ struct update_target_element_shmem< cudf::aggregation::ARGMIN, cuda::std::enable_if_t() and cudf::is_relationally_comparable()>> { - __device__ void 
operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, @@ -238,7 +240,7 @@ struct update_target_element_shmem< struct shmem_element_aggregator { template - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type target_index, cudf::column_device_view source, diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 5c788522ac9..eaa71ae6d8c 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf::groupby::detail::hash { // TODO: TO BE REMOVED @@ -73,7 +75,7 @@ __device__ T get_identity() template struct initialize_target_element { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept { @@ -83,7 +85,7 @@ struct initialize_target_element { template struct initialize_target_element()>> { - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept { @@ -103,7 +105,7 @@ struct initialize_target_element // TODO naming - __device__ void operator()(std::byte* target, + __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept { From 51114c9b57bc81adf84e5fee72962606e9b89779 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:43:00 -0700 Subject: [PATCH 108/135] Revert agg details --- .../cudf/detail/aggregation/aggregation.cuh | 21 +---- .../detail/aggregation/device_aggregators.cuh | 91 ++++++++++++++++--- 2 files changed, 77 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 91d25d99c1d..de53e7586cd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -170,7 +170,7 @@ struct identity_initializer { } template - constexpr T get_identity() + T get_identity() { if (k == aggregation::ARGMAX || k == aggregation::ARGMIN) { if constexpr (cudf::is_timestamp()) @@ -186,25 +186,6 @@ struct identity_initializer { } public: - template - __device__ std::enable_if_t(), void> operator()( - cudf::mutable_column_device_view target, cudf::size_type target_index) - { - using DeviceType = device_storage_type_t; - using ElementType = - cuda::std::conditional_t() && !cudf::is_fixed_point(), - Target, - DeviceType>; - target.element(target_index) = get_identity(); - } - - template - __device__ std::enable_if_t(), void> operator()( - cudf::mutable_column_device_view target, cudf::size_type target_index) - { - CUDF_UNREACHABLE("Unsupported aggregation for initializing values"); - } - template std::enable_if_t(), void> operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index bc370c59296..204eee49a2a 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -50,12 +49,31 @@ using underlying_source_t = template struct update_target_element { + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + using Target = target_type_t; + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } } }; @@ -63,14 +81,16 @@ template struct update_target_element< Source, aggregation::MIN, - cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using DeviceTarget = cudf::detail::underlying_target_t; - using DeviceSource = cudf::detail::underlying_source_t; + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -83,14 +103,35 @@ template struct update_target_element< Source, aggregation::MAX, - cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using DeviceTarget = cudf::detail::underlying_target_t; - using DeviceSource = cudf::detail::underlying_source_t; + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -104,14 +145,34 @@ struct update_target_element< Source, aggregation::SUM, cuda::std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_timestamp()>> { + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + 
cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - using DeviceTarget = cudf::detail::underlying_target_t; - using DeviceSource = cudf::detail::underlying_source_t; + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); @@ -142,10 +203,10 @@ struct update_target_from_dictionary { template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept { } }; From a3c6eb24e4ec3561a1fbd39e289bc8b7e03521b2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:48:47 -0700 Subject: [PATCH 109/135] Fetch trunk aggregators --- .../groupby/hash/global_memory_aggregator.cuh | 29 ++++++++++++------- .../groupby/hash/shared_memory_aggregator.cuh | 23 ++++++++++----- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh index fa4190491e9..50e89c727ff 100644 --- a/cpp/src/groupby/hash/global_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -13,28 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include #include #include #include -#include #include #include #include namespace cudf::groupby::detail::hash { - template struct update_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index, - cudf::column_device_view source_column, - cuda::std::byte* source, - cudf::size_type source_index) const noexcept + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -169,7 +166,6 @@ struct update_target_element_gmem< } }; -// TODO: VALID and ALL have same code template struct update_target_element_gmem< Source, @@ -249,6 +245,18 @@ struct update_target_element_gmem< } }; +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. 
+ */ struct gmem_element_aggregator { template __device__ void operator()(cudf::mutable_column_device_view target, @@ -258,11 +266,12 @@ struct gmem_element_aggregator { bool* source_mask, cudf::size_type source_index) const noexcept { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid if (!source_mask[source_index]) { return; } update_target_element_gmem{}( target, target_index, source_column, source, source_index); } }; - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh index c5bdfe253ea..9cbeeb34b86 100644 --- a/cpp/src/groupby/hash/shared_memory_aggregator.cuh +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -13,28 +13,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include #include #include #include -#include #include #include #include namespace cudf::groupby::detail::hash { - template struct update_target_element_shmem { - __device__ void operator()(cuda::std::byte* target, - bool* target_mask, - cudf::size_type target_index, - cudf::column_device_view source, - cudf::size_type source_index) const + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -238,6 +232,18 @@ struct update_target_element_shmem< } }; +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ struct shmem_element_aggregator { template __device__ void operator()(cuda::std::byte* target, @@ -246,6 +252,7 @@ struct shmem_element_aggregator { cudf::column_device_view source, cudf::size_type source_index) const noexcept { + // Check nullability for all aggregation kinds but `COUNT_ALL` if constexpr (k != cudf::aggregation::COUNT_ALL) { if (source.is_null(source_index)) { return; } } From a8f8ab3519ddd1750ac40576bab71551a802cad3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:52:53 -0700 Subject: [PATCH 110/135] Fetch trunk hash_compound_agg_finalizer --- cpp/src/groupby/hash/hash_compound_agg_finalizer.cu | 5 ++--- cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 221e63ac121..37a61c1a22c 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
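Between the hunks here: `hash_compound_agg_finalizer` runs the compound aggregations (VARIANCE, STD) as a post-pass, with `var_hash_functor` accumulating per-group squared deviations through the hash set. A rough sketch of the finalization math only (assumed semantics, not the functor itself; with ddof = 1 this is the sample variance, and STD is its square root):

    #include <limits>

    // Sketch: per-group variance from the accumulated sum of squared deviations.
    double finalize_variance(double sum_sq_dev, int group_count, int ddof)
    {
      return (group_count - ddof > 0) ? sum_sq_dev / (group_count - ddof)
                                      : std::numeric_limits<double>::quiet_NaN();
    }
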
@@ -173,7 +173,7 @@ void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::var_aggregation c
   cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream);
 
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator(0),
     col.size(),
     var_hash_functor{
@@ -196,5 +196,4 @@ void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::std_aggregation c
 
 template class hash_compound_agg_finalizer>;
 template class hash_compound_agg_finalizer>;
-
 } // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
index 16cbe92511f..8bee1a92c40 100644
--- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
+++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 9746891d80374589ee1ee0847f9882206dbb70c1 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Sat, 19 Oct 2024 14:54:52 -0700
Subject: [PATCH 111/135] Fetch trunk groupby

---
 cpp/src/groupby/hash/groupby.cu | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index c2947316c9f..30e1d52fdbf 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -32,7 +32,6 @@
 #include
 #include
-#include
 #include
 #include
 
@@ -80,7 +79,7 @@ constexpr bool array_contains(std::array const& haystack, T needle)
  * @return true `t` is valid for a hash based groupby
  * @return false `t` is invalid for a hash based groupby
  */
-constexpr bool is_hash_aggregation(aggregation::Kind t)
+bool constexpr is_hash_aggregation(aggregation::Kind t)
 {
   return array_contains(hash_aggregations, t);
 }
@@ -88,8 +87,8 @@ std::unique_ptr<table>
dispatch_groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool keys_have_nulls, - null_policy include_null_keys, + bool const keys_have_nulls, + null_policy const include_null_keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -105,11 +104,11 @@ std::unique_ptr
dispatch_groupby(table_view const& keys, if (cudf::detail::has_nested_columns(keys)) { auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); return compute_groupby( - keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); return compute_groupby( - keys, requests, cache, skip_rows_with_nulls, d_row_equal, d_row_hash, stream, mr); + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } } // namespace From 91c75a2185c140ae8eb763154c76615a2cb18591 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 15:01:07 -0700 Subject: [PATCH 112/135] Fetch trunk compute_groupby --- cpp/src/groupby/hash/compute_groupby.cu | 86 ++++++++----------- cpp/src/groupby/hash/compute_groupby.hpp | 24 ++++-- .../hash/create_sparse_results_table.hpp | 11 +++ 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 7565e8ecfbb..bd2e5c8148e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,74 +14,50 @@ * limitations under the License. */ +#include "compute_groupby.hpp" #include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "sparse_to_dense_results.hpp" -#include "var_hash_functor.cuh" +#include #include #include #include #include #include -#include #include #include #include +#include #include #include +#include #include namespace cudf::groupby::detail::hash { -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -template +template std::unique_ptr
compute_groupby(table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, Equal const& d_row_equal, - row_hash_t const& d_row_hash, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { // convert to int64_t to avoid potential overflow with large `keys` - auto const num_rows = static_cast(keys.num_rows()); + auto const num_keys = static_cast(keys.num_rows()); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); - auto set = cuco::static_set{ - cuco::extent{num_rows}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% occupancy + auto const set = cuco::static_set{ + num_keys, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, probing_scheme_t{d_row_hash}, @@ -96,22 +72,25 @@ std::unique_ptr
compute_groupby(table_view const& keys, : rmm::device_buffer{}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs(num_rows, - static_cast(row_bitmask.data()), - requests, - &sparse_results, - set, - skip_rows_with_nulls, - stream); + compute_single_pass_aggs(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set.ref(cuco::insert_and_find), + requests, + &sparse_results, + stream); + + // Extract the populated indices from the hash set and create a gather map. + // Gathering using this map from sparse results will give dense results. + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(static_cast(row_bitmask.data()), - requests, + sparse_to_dense_results(requests, &sparse_results, cache, gather_map, set.ref(cuco::find), - skip_rows_with_nulls, + static_cast(row_bitmask.data()), stream, mr); @@ -123,24 +102,29 @@ std::unique_ptr
compute_groupby(table_view const& keys, mr); } -template std::unique_ptr
compute_groupby( +template rmm::device_uvector extract_populated_keys( + global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template rmm::device_uvector extract_populated_keys( + nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -template std::unique_ptr
compute_groupby( +template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, nullable_row_comparator_t const& d_row_equal, row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp index a11c1db4262..77243dc0a4f 100644 --- a/cpp/src/groupby/hash/compute_groupby.hpp +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,6 @@ */ #pragma once -#include "helpers.cuh" - #include #include #include @@ -54,15 +52,27 @@ namespace cudf::groupby::detail::hash { * requested in `requests`, we gather sparse results into a column of dense * results using the aforementioned index vector. Dense results are stored into * the in/out parameter `cache`. + * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys */ -template +template std::unique_ptr compute_groupby(table_view const& keys, host_span requests, - cudf::detail::result_cache* cache, bool skip_rows_with_nulls, Equal const& d_row_equal, - row_hash_t const& d_row_hash, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index f2810bd0235..6e667228045 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -25,6 +25,17 @@ #include namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. 
+ *
+ * @tparam SetType Type of the key hash set
+ *
+ * @param key_set Key hash set
+ * TODO
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return An array of unique keys contained in `key_set`
+ */
 template
 void extract_populated_keys(SetType const& key_set,
                             rmm::device_uvector<cudf::size_type>& populated_keys,

From 17072b0b8b1490825d42cda1291ef2df73dc99a1 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 21 Oct 2024 10:12:47 -0700
Subject: [PATCH 113/135] Make mask const

---
 cpp/src/groupby/hash/create_sparse_results_table.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
index 5db4249740a..bc32e306b3f 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.cu
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -66,7 +66,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values
                         ? false
                         : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
                            agg == cudf::aggregation::STD);
-      auto mask_flag =
+      auto const mask_flag =
         (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
       auto const col_type = cudf::is_dictionary(col.type())
                               ? cudf::dictionary_column_view(col).keys().type()

From 4672734fba0cf8d799e39107aab99488a27c731a Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Mon, 21 Oct 2024 10:16:08 -0700
Subject: [PATCH 114/135] Use size_type instead of int

---
 cpp/src/groupby/hash/compute_mapping_indices.cuh        | 4 ++--
 cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp | 6 +++---
 cpp/src/groupby/hash/single_pass_functors.cuh           | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
index dd369a123ca..fa080709fd0 100644
--- a/cpp/src/groupby/hash/compute_mapping_indices.cuh
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -152,9 +152,9 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
 }
 
 template
-int max_occupancy_grid_size(cudf::size_type n)
+cudf::size_type max_occupancy_grid_size(cudf::size_type n)
 {
-  int max_active_blocks{-1};
+  cudf::size_type max_active_blocks{-1};
   CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0));
   auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors();
diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp
index 73db4750a1f..2fdb590324c 100644
--- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp
@@ -23,11 +23,11 @@
 
 namespace cudf::groupby::detail::hash {
 
-size_t available_shared_memory_size(int grid_size);
+size_t available_shared_memory_size(cudf::size_type grid_size);
 
-size_t shmem_offsets_size(int num_cols);
+size_t shmem_offsets_size(cudf::size_type num_cols);
 
-void compute_single_pass_shmem_aggs(int grid_size,
+void compute_single_pass_shmem_aggs(cudf::size_type grid_size,
                                     cudf::size_type num_input_rows,
                                     bitmask_type const* row_bitmask,
                                     bool skip_rows_with_nulls,
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 9d8cc0ad73b..b36bdd32af5 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -188,7 +188,7 @@ struct compute_direct_aggregates
 {
   cudf::mutable_table_device_view output_values;
   cudf::aggregation::Kind const* __restrict__ aggs;
   cudf::size_type* block_cardinality;
-  int stride;
+  cudf::size_type stride;
   bitmask_type const* __restrict__ row_bitmask;
   bool skip_rows_with_nulls;
 
@@ -197,7 +197,7 @@
                             cudf::mutable_table_device_view output_values,
                             cudf::aggregation::Kind const* aggs,
                             cudf::size_type* block_cardinality,
-                            int stride,
+                            cudf::size_type stride,
                             bitmask_type const* row_bitmask,
                             bool skip_rows_with_nulls)
     : set(set),
@@ -213,7 +213,7 @@
   __device__ void operator()(cudf::size_type i)
   {
-    int block_id = (i % stride) / GROUPBY_BLOCK_SIZE;
+    auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE;
     if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and
         (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) {
       auto const result = set.insert_and_find(i);
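Note on `compute_direct_aggregates` above: it is the escape hatch for high-cardinality blocks. Blocks whose mapping pass saw at least `GROUPBY_CARDINALITY_THRESHOLD` distinct keys skip shared memory entirely, and their rows are aggregated straight into the global sparse table via `set.insert_and_find(i)`. A host-side sketch of the per-row rule only (the constants here are assumed values, not the shipped ones):

    constexpr int GROUPBY_BLOCK_SIZE            = 128;  // assumed
    constexpr int GROUPBY_CARDINALITY_THRESHOLD = 128;  // assumed

    // True when row `row` belongs to a block that must take the direct
    // global-memory path instead of the shared-memory fast path.
    bool aggregates_directly(int row, int stride, int const* block_cardinality)
    {
      int const block_id = (row % stride) / GROUPBY_BLOCK_SIZE;
      return block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD;
    }
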
From f8220d937ded20a76851ecbebe34cb87a94cc087 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Tue, 22 Oct 2024 12:57:26 -0700
Subject: [PATCH 115/135] Move global agg to its own TU + renaming

---
 cpp/CMakeLists.txt                            |   7 +-
 ...e_pass_aggs.cu => compute_aggregations.cu} |   6 +-
 ...pass_aggs.cuh => compute_aggregations.cuh} | 103 +++++++---------
 ...pass_aggs.hpp => compute_aggregations.hpp} |   2 +-
 ...s_null.cu => compute_aggregations_null.cu} |   6 +-
 .../hash/compute_global_memory_aggs.cu        | 111 ++++++++++++++++++
 .../hash/compute_global_memory_aggs.hpp       |  42 +++++++
 cpp/src/groupby/hash/compute_groupby.cu       |  16 +--
 ..._aggs.cu => compute_shared_memory_aggs.cu} |  24 ++--
 ...ggs.hpp => compute_shared_memory_aggs.hpp} |  22 ++--
 10 files changed, 236 insertions(+), 103 deletions(-)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.cu => compute_aggregations.cu} (85%)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.cuh => compute_aggregations.cuh} (64%)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.hpp => compute_aggregations.hpp} (95%)
 rename cpp/src/groupby/hash/{compute_single_pass_aggs_null.cu => compute_aggregations_null.cu} (84%)
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cu
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.hpp
 rename cpp/src/groupby/hash/{compute_single_pass_shmem_aggs.cu => compute_shared_memory_aggs.cu} (94%)
 rename cpp/src/groupby/hash/{compute_single_pass_shmem_aggs.hpp => compute_shared_memory_aggs.hpp} (55%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ebd2b2b6d8c..57bcc2df604 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -371,9 +371,10 @@ add_library(
   src/groupby/hash/compute_groupby.cu
   src/groupby/hash/compute_mapping_indices.cu
   src/groupby/hash/compute_mapping_indices_null.cu
-  src/groupby/hash/compute_single_pass_aggs.cu
-  src/groupby/hash/compute_single_pass_aggs_null.cu
-  src/groupby/hash/compute_single_pass_shmem_aggs.cu
+  src/groupby/hash/compute_aggregations.cu
+  src/groupby/hash/compute_aggregations_null.cu
+  src/groupby/hash/compute_global_memory_aggs.cu
+  src/groupby/hash/compute_shared_memory_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_aggregations.cu
similarity index 85%
rename from cpp/src/groupby/hash/compute_single_pass_aggs.cu
rename to cpp/src/groupby/hash/compute_aggregations.cu
index 04519edf791..cac6c2224f0 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++
b/cpp/src/groupby/hash/compute_aggregations.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" namespace cudf::groupby::detail::hash { -template rmm::device_uvector compute_single_pass_aggs( +template rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh similarity index 64% rename from cpp/src/groupby/hash/compute_single_pass_aggs.cuh rename to cpp/src/groupby/hash/compute_aggregations.cuh index 6929d04dba1..8117b3fe0fa 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -15,9 +15,10 @@ */ #pragma once +#include "compute_aggregations.hpp" +#include "compute_global_memory_aggs.hpp" #include "compute_mapping_indices.hpp" -#include "compute_single_pass_aggs.hpp" -#include "compute_single_pass_shmem_aggs.hpp" +#include "compute_shared_memory_aggs.hpp" #include "create_sparse_results_table.hpp" #include "flatten_single_pass_aggs.hpp" #include "helpers.cuh" @@ -35,12 +36,12 @@ #include #include -#include #include #include #include #include +#include namespace cudf::groupby::detail::hash { /** @@ -48,7 +49,7 @@ namespace cudf::groupby::detail::hash { * over the data and stores the results in `sparse_results` */ template -rmm::device_uvector compute_single_pass_aggs( +rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, @@ -57,16 +58,11 @@ rmm::device_uvector compute_single_pass_aggs( cudf::detail::result_cache* sparse_results, rmm::cuda_stream_view stream) { - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(num_rows, stream); - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - auto const grid_size = max_occupancy_grid_size>(num_rows); auto const has_sufficient_shmem = available_shared_memory_size(grid_size) > @@ -75,46 +71,26 @@ rmm::device_uvector compute_single_pass_aggs( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); }); - auto const uses_global_aggs = has_dictionary_request or !has_sufficient_shmem; - - // Use naive global memory aggregations when there are dictionary columns to aggregagte or - // there is no sufficient dynamic shared memory for shared memory aggregations - if (uses_global_aggs) { - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds.data(), - agg_kinds, - uses_global_aggs, - global_set, - populated_keys, - stream); - - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - - 
thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_rows, - hash::compute_single_pass_aggs_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - row_bitmask, - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; + auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem; + + // Performs naive global memory aggregations when the workload is not compatible with shared + // memory, such as when aggregating dictionary columns or when there is insufficient dynamic + // shared memory for shared memory aggregations. + if (!is_shared_memory_compatible) { + return compute_global_memory_aggs(num_rows, + skip_rows_with_nulls, + row_bitmask, + flattened_values, + d_agg_kinds.data(), + agg_kinds, + global_set, + aggs, + sparse_results, + stream); } + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank rmm::device_uvector local_mapping_index(num_rows, stream); // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table @@ -122,6 +98,9 @@ rmm::device_uvector compute_single_pass_aggs( stream); rmm::device_uvector block_cardinality(grid_size, stream); rmm::device_scalar direct_aggregations(false, stream); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + compute_mapping_indices(grid_size, num_rows, global_set_ref, @@ -145,21 +124,21 @@ rmm::device_uvector compute_single_pass_aggs( auto d_values = table_device_view::create(flattened_values, stream); auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - compute_single_pass_shmem_aggs(grid_size, - num_rows, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index.data(), - global_mapping_index.data(), - block_cardinality.data(), - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - stream); + compute_shared_memory_aggs(grid_size, + num_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); if (direct_aggregations.value(stream)) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, num_rows, compute_direct_aggregates{global_set_ref, *d_values, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp similarity index 95% rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp rename to cpp/src/groupby/hash/compute_aggregations.hpp index e409b3ff685..829c3c808b0 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -29,7 +29,7 @@ namespace cudf::groupby::detail::hash { * over the data and stores the results in `sparse_results` */ template -rmm::device_uvector compute_single_pass_aggs( 
+rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu similarity index 84% rename from cpp/src/groupby/hash/compute_single_pass_aggs_null.cu rename to cpp/src/groupby/hash/compute_aggregations_null.cu index 135ba4188f2..1d7184227ea 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "compute_single_pass_aggs.cuh" -#include "compute_single_pass_aggs.hpp" +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" namespace cudf::groupby::detail::hash { -template rmm::device_uvector compute_single_pass_aggs( +template rmm::device_uvector compute_aggregations( int64_t num_rows, bool skip_rows_with_nulls, bitmask_type const* row_bitmask, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu new file mode 100644 index 00000000000..ad0cfbb6e12 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} + +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); + +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp new file mode 100644 index 00000000000..0777b9ffd93 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index fd416710439..e1dbf2a3d9e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -14,8 +14,8 @@ * limitations under the License. */ +#include "compute_aggregations.hpp" #include "compute_groupby.hpp" -#include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "sparse_to_dense_results.hpp" @@ -71,13 +71,13 @@ std::unique_ptr
compute_groupby(table_view const& keys, : rmm::device_buffer{}; // Compute all single pass aggs first - auto gather_map = compute_single_pass_aggs(num_keys, - skip_rows_with_nulls, - static_cast(row_bitmask.data()), - set, - requests, - &sparse_results, - stream); + auto gather_map = compute_aggregations(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set, + requests, + &sparse_results, + stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(requests, diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu similarity index 94% rename from cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu rename to cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 9874e2f7444..9b479eae037 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "compute_single_pass_shmem_aggs.hpp" +#include "compute_shared_memory_aggs.hpp" #include "global_memory_aggregator.cuh" #include "helpers.cuh" #include "shared_memory_aggregator.cuh" @@ -285,17 +285,17 @@ size_t available_shared_memory_size(cudf::size_type grid_size) size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } -void compute_single_pass_shmem_aggs(cudf::size_type grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream) +void compute_shared_memory_aggs(cudf::size_type grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream) { auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need one offset determining where the aggregation is diff --git a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp similarity index 55% rename from cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp rename to cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 2fdb590324c..7dc2b448a60 100644 --- a/cpp/src/groupby/hash/compute_single_pass_shmem_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -27,16 +27,16 @@ size_t available_shared_memory_size(cudf::size_type grid_size); size_t shmem_offsets_size(cudf::size_type num_cols); -void compute_single_pass_shmem_aggs(cudf::size_type grid_size, - cudf::size_type num_input_rows, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, - cudf::size_type* local_mapping_index, - cudf::size_type* global_mapping_index, - cudf::size_type* block_cardinality, - cudf::table_device_view input_values, - cudf::mutable_table_device_view output_values, - cudf::aggregation::Kind const* d_agg_kinds, - rmm::cuda_stream_view stream); +void compute_shared_memory_aggs(cudf::size_type grid_size, + cudf::size_type num_input_rows, + bitmask_type const* 
row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash From aeef28bff91d620733c5cedb8609e3f4c0125c7a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Oct 2024 13:42:34 -0700 Subject: [PATCH 116/135] Rename for clarity --- cpp/src/groupby/hash/compute_aggregations.cuh | 14 +++++++++----- cpp/src/groupby/hash/single_pass_functors.cuh | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 8117b3fe0fa..83dce5813ac 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -97,7 +97,9 @@ rmm::device_uvector compute_aggregations( rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, stream); rmm::device_uvector block_cardinality(grid_size, stream); - rmm::device_scalar direct_aggregations(false, stream); + + // Flag indicating whether a global memory aggregation fallback is required or not + rmm::device_scalar needs_global_memory_fallback(false, stream); auto global_set_ref = global_set.ref(cuco::op::insert_and_find); @@ -109,14 +111,16 @@ rmm::device_uvector compute_aggregations( local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), - direct_aggregations.data(), + needs_global_memory_fallback.data(), stream); + auto const needs_fallback = needs_global_memory_fallback.value(stream); + // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, d_agg_kinds.data(), agg_kinds, - direct_aggregations.value(stream), + needs_fallback, global_set, populated_keys, stream); @@ -135,12 +139,12 @@ rmm::device_uvector compute_aggregations( *d_sparse_table, d_agg_kinds.data(), stream); - if (direct_aggregations.value(stream)) { + if (needs_fallback) { auto const stride = GROUPBY_BLOCK_SIZE * grid_size; thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::counting_iterator{0}, num_rows, - compute_direct_aggregates{global_set_ref, + global_memory_fallback_fn{global_set_ref, *d_values, *d_sparse_table, d_agg_kinds.data(), diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index b36bdd32af5..dc43dbb7179 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -182,7 +182,7 @@ struct initialize_sparse_table { }; template -struct compute_direct_aggregates { +struct global_memory_fallback_fn { SetType set; cudf::table_device_view input_values; cudf::mutable_table_device_view output_values; @@ -192,7 +192,7 @@ struct compute_direct_aggregates { bitmask_type const* __restrict__ row_bitmask; bool skip_rows_with_nulls; - compute_direct_aggregates(SetType set, + global_memory_fallback_fn(SetType set, cudf::table_device_view input_values, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* aggs, From 6b323f0d2991f61077c74be9f5c1d740681de2f2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Oct 2024 14:16:44 -0700 Subject: [PATCH 117/135] Rename direct_aggregations as needs_global_memory_fallback --- .../groupby/hash/compute_mapping_indices.cu | 2 +- 
.../groupby/hash/compute_mapping_indices.cuh | 23 ++++++++++--------- .../groupby/hash/compute_mapping_indices.hpp | 2 +- .../hash/compute_mapping_indices_null.cu | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu index 1cbe70d651f..5b746b87a14 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices> cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations, + bool* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index fa080709fd0..0ff567c28f0 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -95,7 +95,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations) + bool* needs_global_memory_fallback) { // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; @@ -137,7 +137,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, block.sync(); if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *direct_aggregations = true; } + if (block.thread_rank() == 0) { *needs_global_memory_fallback = true; } break; } } @@ -171,17 +171,18 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations, + bool* needs_global_memory_fallback, rmm::cuda_stream_view stream) { - mapping_indices_kernel<<>>(num, - global_set, - row_bitmask, - skip_rows_with_nulls, - local_mapping_index, - global_mapping_index, - block_cardinality, - direct_aggregations); + mapping_indices_kernel<<>>( + num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + needs_global_memory_fallback); stream.synchronize(); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index d8047f9a5d8..b4eb2597118 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -36,6 +36,6 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* direct_aggregations, + bool* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu index 1b04016f9a1..cfccd0a0009 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices_null.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices Date: Tue, 22 Oct 2024 15:49:04 -0700 Subject: [PATCH 118/135] Use atomic_flag to avoid UB --- cpp/src/groupby/hash/compute_aggregations.cuh | 11 
+++++++++-- cpp/src/groupby/hash/compute_mapping_indices.cu | 2 +- cpp/src/groupby/hash/compute_mapping_indices.cuh | 7 ++++--- cpp/src/groupby/hash/compute_mapping_indices.hpp | 4 +++- cpp/src/groupby/hash/compute_mapping_indices_null.cu | 2 +- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 83dce5813ac..9df9779f209 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -99,7 +100,7 @@ rmm::device_uvector compute_aggregations( rmm::device_uvector block_cardinality(grid_size, stream); // Flag indicating whether a global memory aggregation fallback is required or not - rmm::device_scalar needs_global_memory_fallback(false, stream); + rmm::device_scalar needs_global_memory_fallback(stream); auto global_set_ref = global_set.ref(cuco::op::insert_and_find); @@ -114,7 +115,13 @@ rmm::device_uvector compute_aggregations( needs_global_memory_fallback.data(), stream); - auto const needs_fallback = needs_global_memory_fallback.value(stream); + cuda::std::atomic_flag h_needs_fallback; + CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, + needs_global_memory_fallback.data(), + sizeof(cuda::std::atomic_flag), + cudaMemcpyDefault, + stream.value())); + auto const needs_fallback = h_needs_fallback.test(); // make table that will hold sparse results cudf::table sparse_table = create_sparse_results_table(flattened_values, diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu index 5b746b87a14..519d7cd2f1c 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices> cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback, + cuda::std::atomic_flag* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 0ff567c28f0..c5f542b7905 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -27,6 +27,7 @@ #include #include +#include #include @@ -95,7 +96,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback) + cuda::std::atomic_flag* needs_global_memory_fallback) { // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; @@ -137,7 +138,7 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, block.sync(); if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { - if (block.thread_rank() == 0) { *needs_global_memory_fallback = true; } + if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); } break; } } @@ -171,7 +172,7 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback, + cuda::std::atomic_flag* needs_global_memory_fallback, rmm::cuda_stream_view stream) { 
mapping_indices_kernel<<>>( diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index b4eb2597118..473ad99e650 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -19,6 +19,8 @@ #include +#include + namespace cudf::groupby::detail::hash { /* * @brief Computes the maximum number of active blocks of the given kernel that can be executed on @@ -36,6 +38,6 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, - bool* needs_global_memory_fallback, + cuda::std::atomic_flag* needs_global_memory_fallback, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu index cfccd0a0009..81c3c9e456f 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices_null.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -30,6 +30,6 @@ template void compute_mapping_indices Date: Tue, 22 Oct 2024 16:21:35 -0700 Subject: [PATCH 119/135] Cleanups --- .../groupby/hash/compute_mapping_indices.cuh | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index c5f542b7905..0c7897d0f19 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -43,23 +44,27 @@ __device__ void find_local_mapping(cooperative_groups::thread_block const& block cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { - cudf::size_type result_idx{}; - bool inserted{}; - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { - auto const result = shared_set.insert_and_find(idx); - result_idx = *result.first; - inserted = result.second; - // inserted a new element - if (result.second) { - auto const shared_set_index = atomicAdd(cardinality, 1); - shared_set_indices[shared_set_index] = idx; - local_mapping_index[idx] = shared_set_index; + auto const is_valid_input = + idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + auto const [result_idx, inserted] = [&]() { + if (is_valid_input) { + auto const result = shared_set.insert_and_find(idx); + auto const matched_idx = *result.first; + auto const inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + return cuda::std::pair{matched_idx, inserted}; } - } + return cuda::std::pair{0, false}; // dummy values + }(); // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all // threads in the thread block. 
block.sync(); - if (idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx))) { + if (is_valid_input) { // element was already in set if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } } @@ -98,7 +103,6 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, cudf::size_type* block_cardinality, cuda::std::atomic_flag* needs_global_memory_fallback) { - // TODO: indices inserted in each shared memory set __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; // Shared set initialization From e3726e3c1b572edf53b75da350baad643d33bfc9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Oct 2024 16:42:25 -0700 Subject: [PATCH 120/135] Further split compute_global_memory_aggs --- cpp/CMakeLists.txt | 1 + .../hash/compute_global_memory_aggs.cu | 83 +---------------- .../hash/compute_global_memory_aggs.cuh | 89 +++++++++++++++++++ .../hash/compute_global_memory_aggs_null.cu | 32 +++++++ 4 files changed, 124 insertions(+), 81 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs_null.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 57bcc2df604..fb098031f7d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -374,6 +374,7 @@ add_library( src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_aggregations_null.cu src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu src/groupby/hash/compute_shared_memory_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index ad0cfbb6e12..6025686953e 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -14,77 +14,10 @@ * limitations under the License. 
*/ -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" namespace cudf::groupby::detail::hash { -template -rmm::device_uvector compute_global_memory_aggs( - cudf::size_type num_rows, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - SetType& global_set, - std::vector>& aggregations, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream) -{ - auto constexpr uses_global_memory_aggs = true; - // 'populated_keys' contains inserted row_indices (keys) of global hash set - rmm::device_uvector populated_keys(num_rows, stream); - - // make table that will hold sparse results - cudf::table sparse_table = create_sparse_results_table(flattened_values, - d_agg_kinds, - agg_kinds, - uses_global_memory_aggs, - global_set, - populated_keys, - stream); - - // prepare to launch kernel to do the actual aggregation - auto d_values = table_device_view::create(flattened_values, stream); - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator{0}, - num_rows, - hash::compute_single_pass_aggs_fn{ - global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggregations.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); - } - - return populated_keys; -} - template rmm::device_uvector compute_global_memory_aggs( cudf::size_type num_rows, bool skip_rows_with_nulls, @@ -96,16 +29,4 @@ template rmm::device_uvector compute_global_memory_aggs>& aggregations, cudf::detail::result_cache* sparse_results, rmm::cuda_stream_view stream); - -template rmm::device_uvector compute_global_memory_aggs( - cudf::size_type num_rows, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - cudf::table_view const& flattened_values, - cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, - nullable_global_set_t& global_set, - std::vector>& aggregations, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh new file mode 100644 index 00000000000..00db149c6d9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_global_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu new file mode 100644 index 00000000000..209e2b7f20a --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash From 3775ec85e06ace02c40c2c9c25866a2b22cf24cd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:27:30 -0700 Subject: [PATCH 121/135] Remove unused code --- cpp/src/groupby/hash/single_pass_functors.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index dc43dbb7179..abf29f098af 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -173,7 +173,6 @@ struct initialize_sparse_table { for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), aggs[col_idx], - // cudf::detail::identity_initializer{}, initialize_gmem{}, sparse_table.column(col_idx), key_idx); From 8dd5535bb9f73f48eb7c43631167c45fd5bcbebc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:42:50 -0700 Subject: [PATCH 122/135] Sync to make sure the data is valid --- cpp/src/groupby/hash/compute_aggregations.cuh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 9df9779f209..519e9f55eaf 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -121,6 +121,7 @@ rmm::device_uvector compute_aggregations( sizeof(cuda::std::atomic_flag), cudaMemcpyDefault, stream.value())); + stream.synchronize(); auto const needs_fallback = h_needs_fallback.test(); // make table that will hold sparse results @@ -146,21 +147,6 @@ rmm::device_uvector compute_aggregations( *d_sparse_table, d_agg_kinds.data(), stream); - if (needs_fallback) { - auto const stride = GROUPBY_BLOCK_SIZE * grid_size; - thrust::for_each_n(rmm::exec_policy_nosync(stream), - thrust::counting_iterator{0}, - num_rows, - global_memory_fallback_fn{global_set_ref, - *d_values, - *d_sparse_table, - d_agg_kinds.data(), - block_cardinality.data(), - stride, - row_bitmask, - skip_rows_with_nulls}); - extract_populated_keys(global_set, populated_keys, stream); - } // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); From 59accf609e175f4f721c78b3d9515e37ae1cbb6e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:51:35 -0700 Subject: [PATCH 123/135] Add comments --- cpp/src/groupby/hash/compute_aggregations.cuh | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 519e9f55eaf..71db8d10e7d 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -148,6 +148,26 @@ rmm::device_uvector compute_aggregations( d_agg_kinds.data(), stream); + // The shared memory groupby is designed so that each thread block can 
handle up to 128 unique + // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store + // the temporary aggregation results. In these situations, we must fall back to a global memory + // aggregator to process the remaining aggregation requests. + if (needs_fallback) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + global_memory_fallback_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + row_bitmask, + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { From 8be28d02540d972434a0bba85eee7e8f4fa60243 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 15:55:53 -0700 Subject: [PATCH 124/135] Add comments --- cpp/src/groupby/hash/compute_aggregations.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 71db8d10e7d..68f5a8434f1 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -116,6 +116,8 @@ rmm::device_uvector compute_aggregations( stream); cuda::std::atomic_flag h_needs_fallback; + // Cannot use `device_scalar::value` as it requires a copy constructor, which + // `atomic_flag` doesn't have. CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, needs_global_memory_fallback.data(), sizeof(cuda::std::atomic_flag), From 91da22e5cf297a291fe1a5a2076ba076e055cf22 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:02:41 -0700 Subject: [PATCH 125/135] Remove redundant sync --- cpp/src/groupby/hash/compute_mapping_indices.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index 0c7897d0f19..d353830780f 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -188,6 +188,5 @@ void compute_mapping_indices(cudf::size_type grid_size, global_mapping_index, block_cardinality, needs_global_memory_fallback); - stream.synchronize(); } } // namespace cudf::groupby::detail::hash From cf289d10dee2b07565064fd566328dd2d916cc2d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:18:53 -0700 Subject: [PATCH 126/135] Add CUDF_UNREACHABLE instead of silent break + remove outdated comments --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 +++-- cpp/src/groupby/hash/helpers.cuh | 2 -- cpp/src/groupby/hash/single_pass_functors.cuh | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 9b479eae037..3e961fa1d76 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -69,8 +69,9 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cardinality); auto const next_col_total_size = next_col_size + valid_col_size; - // TODO: it seems early exit will break the followup calculatons. 
To verify - if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + if (bytes_allocated + next_col_total_size > total_agg_size) { + CUDF_UNREACHABLE("No enough memory space for shared memory aggregations"); + } shmem_agg_res_offsets[col_end] = bytes_allocated; shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 0d117ca35b3..7879518e660 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,8 +23,6 @@ #include namespace cudf::groupby::detail::hash { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index abf29f098af..28c6ba717f1 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -103,7 +103,6 @@ struct initialize_target_element - // TODO naming __device__ void operator()(cuda::std::byte* target, bool* target_mask, cudf::size_type idx) const noexcept From a1d139aad9e461400de5225e9ffee83aded7af96 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:20:10 -0700 Subject: [PATCH 127/135] Add doc --- cpp/src/groupby/hash/create_sparse_results_table.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 416a5cff0d1..8155ce852e0 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -34,7 +34,7 @@ namespace cudf::groupby::detail::hash { * @tparam SetType Type of the key hash set * * @param key_set Key hash set - * TODO + * @param populated_keys Array of unique keys * @param stream CUDA stream used for device memory operations and kernel launches * @return An array of unique keys contained in `key_set` */ From f9f201a76f4e1edae93b5ea2be87e38ccba7ba0e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 28 Oct 2024 10:22:36 -0700 Subject: [PATCH 128/135] Fix leftover --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0f9d5118e13..ce147a988ea 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -376,7 +376,6 @@ add_library( src/groupby/hash/compute_mapping_indices.cu src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_shared_memory_aggs.cu - src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu From fef1ca8a08ba90bbf9b298ca9756903786c6ca44 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 17:51:27 -0700 Subject: [PATCH 129/135] Renaming for clarity + add missing func --- cpp/src/groupby/hash/compute_aggregations.cuh | 4 ++-- .../groupby/hash/compute_shared_memory_aggs.cu | 15 ++++++++++----- .../groupby/hash/compute_shared_memory_aggs.hpp | 4 +++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index f730ef96c5f..e8b29a0e7a8 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ 
-66,9 +66,9 @@ rmm::device_uvector compute_aggregations( auto const grid_size = max_occupancy_grid_size>(num_rows); - auto const available_shmem_size = available_shared_memory_size(grid_size); + auto const available_shmem_size = get_available_shared_memory_size(grid_size); auto const has_sufficient_shmem = - available_shmem_size > (shmem_offsets_size(flattened_values.num_columns()) * 2); + available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2); auto const has_dictionary_request = std::any_of( requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { return cudf::is_dictionary(request.values.type()); diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 12c02a1865e..3371b667be7 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -275,7 +275,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, } } // namespace -std::size_t available_shared_memory_size(cudf::size_type grid_size) +std::size_t get_available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -287,6 +287,11 @@ std::size_t available_shared_memory_size(cudf::size_type grid_size) ALIGNMENT); } +std::size_t compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} + void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, cudf::size_type num_input_rows, @@ -302,11 +307,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, { // For each aggregation, need one offset determining where the aggregation is // performed, another indicating the validity of the aggregation - auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); + auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2, + CUDF_EXPECTS(available_shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); - auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2; + auto const shmem_agg_size = available_shmem_size - offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -318,6 +323,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_offsets_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 23b9858afa3..65b658a021d 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -22,7 +22,9 @@ #include namespace cudf::groupby::detail::hash { -std::size_t available_shared_memory_size(cudf::size_type grid_size); +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); + +std::size_t compute_shmem_offsets_size(cudf::size_type num_cols); void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, From 8ccd81789af6f0719c82c8c194a22aaa97c48d86 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 18:29:18 -0700 Subject: [PATCH 130/135] Minor fix --- 
cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 3371b667be7..a47ba27558b 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -74,9 +74,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, ALIGNMENT); auto const next_col_total_size = next_col_size + valid_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { - CUDF_UNREACHABLE("Not enough memory for shared memory aggregations"); - } + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } shmem_agg_res_offsets[col_end] = bytes_allocated; shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; From 0c315f8bbe8f696b1a3d55e28935e1faa48ca11e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 4 Nov 2024 13:03:41 -0800 Subject: [PATCH 131/135] Update comments --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index a47ba27558b..c15d8d44127 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -47,9 +47,8 @@ struct size_of_functor { /// Shared memory data alignment CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; -// Prepares shared memory data required by each output column, exits if -// no enough memory space to perform the shared memory aggregation for the -// current output column +// Allocates shared memory required for output columns. Exits if there is insufficient memory to +// perform shared memory aggregation for the current output column. 
__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, From 7131c9f7985f6e5d733217fce64c61d1413194bf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Nov 2024 13:25:11 -0800 Subject: [PATCH 132/135] Apply suggestions from code review Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/groupby/hash/single_pass_functors.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index e22998b01dd..7a9a95f3059 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -108,8 +108,8 @@ struct initialize_shmem { template struct initialize_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view target, - cudf::size_type target_index) const noexcept + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } From 5c6b33c3390448da9bf27d2e7b7b8a2cb4de8337 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Nov 2024 13:37:35 -0800 Subject: [PATCH 133/135] Make compute_shmem_offsets_size constexpr --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 ----- cpp/src/groupby/hash/compute_shared_memory_aggs.hpp | 5 ++++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index c15d8d44127..f0361ccced2 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -284,11 +284,6 @@ std::size_t get_available_shared_memory_size(cudf::size_type grid_size) ALIGNMENT); } -std::size_t compute_shmem_offsets_size(cudf::size_type num_cols) -{ - return sizeof(cudf::size_type) * num_cols; -} - void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, cudf::size_type num_input_rows, diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 65b658a021d..346956cdab0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -24,7 +24,10 @@ namespace cudf::groupby::detail::hash { std::size_t get_available_shared_memory_size(cudf::size_type grid_size); -std::size_t compute_shmem_offsets_size(cudf::size_type num_cols); +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, From b05fab40a19c093e087f0cf55e4b8db99ca056fb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Nov 2024 18:03:53 -0800 Subject: [PATCH 134/135] Formatting --- cpp/src/groupby/hash/single_pass_functors.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 7a9a95f3059..572098c75f8 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -108,8 +108,7 @@ struct initialize_shmem { template struct initialize_target_element_gmem { - __device__ void operator()(cudf::mutable_column_device_view, - cudf::size_type) const noexcept + __device__ 
void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } From 96fbaa97c18988f60ba148c86c2f95add3e8e598 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 7 Nov 2024 17:07:17 -0800 Subject: [PATCH 135/135] Update cpp/src/groupby/hash/single_pass_functors.cuh --- cpp/src/groupby/hash/single_pass_functors.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 572098c75f8..048c9252773 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
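The fallback-signal pattern that patches 117 through 124 converge on can be read in isolation. The stand-alone sketch below is illustrative only and is not part of the patch series: the kernel name detect_overflow, the flag variables, and the per-block cardinality inputs are simplified stand-ins for mapping_indices_kernel, needs_global_memory_fallback, and GROUPBY_CARDINALITY_THRESHOLD, and the zero-fill of the flag's storage is an assumption of this sketch. It shows why cuda::std::atomic_flag replaces the earlier bool*: any number of thread blocks may raise the flag concurrently, and test_and_set() makes those concurrent writes well-defined, whereas simultaneous non-atomic stores to a bool are a data race. It also mirrors the host-side readback adopted in patch 122, which cannot go through device_scalar::value because atomic_flag has no copy constructor.

// Illustrative sketch (not part of the patch series): signaling a device-side
// condition back to the host with cuda::std::atomic_flag.
#include <cuda/std/atomic>

#include <cstdio>

__global__ void detect_overflow(int const* block_cardinality,
                                int threshold,
                                cuda::std::atomic_flag* needs_fallback)
{
  // One thread per block publishes its block's condition. Many blocks may
  // call test_and_set() concurrently; unlike plain stores to a bool, this
  // atomic read-modify-write is well-defined.
  if (threadIdx.x == 0 && block_cardinality[blockIdx.x] >= threshold) {
    needs_fallback->test_and_set();
  }
}

int main()
{
  int const h_cardinality[4] = {10, 50, 200, 30};  // hypothetical per-block counts
  int* d_cardinality{};
  cudaMalloc(&d_cardinality, sizeof(h_cardinality));
  cudaMemcpy(d_cardinality, h_cardinality, sizeof(h_cardinality), cudaMemcpyHostToDevice);

  cuda::std::atomic_flag* d_flag{};
  cudaMalloc(&d_flag, sizeof(cuda::std::atomic_flag));
  // Assumption of this sketch: an all-zero byte pattern is the cleared state.
  cudaMemset(d_flag, 0, sizeof(cuda::std::atomic_flag));

  detect_overflow<<<4, 32>>>(d_cardinality, 128, d_flag);

  // atomic_flag is not copyable, so copy its raw storage back to the host;
  // cudaMemcpy to pageable host memory blocks until the copy (and the prior
  // kernel on the default stream) has completed, so the flag is safe to read.
  cuda::std::atomic_flag h_flag;
  cudaMemcpy(&h_flag, d_flag, sizeof(cuda::std::atomic_flag), cudaMemcpyDefault);
  std::printf("needs global memory fallback: %s\n", h_flag.test() ? "yes" : "no");

  cudaFree(d_cardinality);
  cudaFree(d_flag);
  return 0;
}

Block 2's cardinality (200) exceeds the threshold of 128, so the program prints "needs global memory fallback: yes"; with all counts below the threshold it prints "no". The same decision drives compute_aggregations above: when the flag reads true, rows belonging to over-threshold blocks are re-aggregated through global_memory_fallback_fn in global memory instead of the shared memory path.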