Skip to content

Commit

Permalink
Add cardinality control for groupby benchmarks with flat types (#15134)
Browse files Browse the repository at this point in the history
Contributes to #15114

This PR adds cardinality control to `group_max`, `group_nunique` and `group_rank` benchmarks.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15134
  • Loading branch information
PointKernel authored Mar 8, 2024
1 parent 69952b0 commit b08dd9b
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 22 deletions.
53 changes: 42 additions & 11 deletions cpp/benchmarks/groupby/group_max.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,30 @@
#include <nvbench/nvbench.cuh>

template <typename Type>
void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
void groupby_max_helper(nvbench::state& state,
cudf::size_type num_rows,
cudf::size_type cardinality,
double null_probability)
{
auto const size = static_cast<cudf::size_type>(state.get_int64("num_rows"));

auto const keys = [&] {
data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
data_profile const profile =
data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
}();

auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
builder.null_probability(null_freq);
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
if (null_probability > 0) {
builder.null_probability(null_probability);
} else {
builder.no_validity();
}
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
return create_random_column(
cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
}();

auto keys_view = keys->view();
Expand All @@ -55,13 +60,39 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });

auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

/**
 * @brief nvbench entry point for the groupby max benchmark.
 *
 * Pulls the `cardinality`, `num_rows` and `null_probability` axis values out of
 * `state` and hands them to `groupby_max_helper<Type>`.
 *
 * @tparam Type Value type supplied by the benchmark's type axis
 * @param state nvbench state from which the axis values are read
 */
template <typename Type>
void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
{
  // Both integer axes are fetched with get_int64 and narrowed to cudf::size_type.
  auto const read_size_axis = [&state](char const* axis_name) {
    return static_cast<cudf::size_type>(state.get_int64(axis_name));
  };

  auto const group_cardinality = read_size_axis("cardinality");
  auto const total_rows        = read_size_axis("num_rows");
  auto const null_fraction     = state.get_float64("null_probability");

  groupby_max_helper<Type>(state, total_rows, group_cardinality, null_fraction);
}

/**
 * @brief nvbench entry point that varies only the `cardinality` axis.
 *
 * The row count is pinned at 20'000'000 and the null probability at 0; the
 * `cardinality` axis value is read from `state`. All three are forwarded to
 * `groupby_max_helper<Type>`.
 *
 * @tparam Type Value type supplied by the benchmark's type axis
 * @param state nvbench state from which the `cardinality` axis is read
 */
template <typename Type>
void bench_groupby_max_cardinality(nvbench::state& state, nvbench::type_list<Type>)
{
  // Fixed inputs for this benchmark; only cardinality comes from an axis.
  constexpr cudf::size_type fixed_num_rows = 20'000'000;
  constexpr double no_nulls                = 0.0;

  auto const requested_cardinality = state.get_int64("cardinality");
  auto const group_cardinality     = static_cast<cudf::size_type>(requested_cardinality);

  groupby_max_helper<Type>(state, fixed_num_rows, group_cardinality, no_nulls);
}

// Registers bench_groupby_max under the name "groupby_max" for the value types
// int32_t, int64_t, float and double.
// Axes (all three are read by bench_groupby_max):
//   - "cardinality": the single value 0
//   - "num_rows": declared through the power-of-two axis helper with entries
//     12, 18 and 24
//   - "null_probability": 0, 0.1 and 0.9
// NOTE(review): cardinality 0 presumably means "no cardinality limit" in the
// data profile -- confirm against the benchmark data generator.
NVBENCH_BENCH_TYPES(bench_groupby_max,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
.set_name("groupby_max")
.add_int64_axis("cardinality", {0})
.add_int64_power_of_two_axis("num_rows", {12, 18, 24})
.add_float64_axis("null_probability", {0, 0.1, 0.9});

// Registers bench_groupby_max_cardinality under the name
// "groupby_max_cardinality" for int32_t values only.
// The only axis is "cardinality", swept from 10 up to 10'000'000; the row count
// and null probability are constants inside bench_groupby_max_cardinality.
NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t>))
.set_name("groupby_max_cardinality")
.add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000});
19 changes: 13 additions & 6 deletions cpp/benchmarks/groupby/group_nunique.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -39,17 +39,23 @@ auto make_aggregation_request_vector(cudf::column_view const& values, Args&&...
template <typename Type>
void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const size = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));

auto const keys = [&] {
data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
data_profile profile =
data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, size);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
}();

auto const vals = [&] {
data_profile profile = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
data_profile profile =
data_profile_builder()
.cardinality(cardinality)
.distribution(cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, size);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
profile.set_null_probability(null_freq);
} else {
Expand All @@ -71,4 +77,5 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
// Registers bench_groupby_nunique under the name "groupby_nunique" for the
// value types int32_t and int64_t.
// Axes:
//   - "num_rows": declared through the power-of-two axis helper with entries
//     12, 16, 20 and 24
//   - "cardinality": the single value 0
//   - "null_probability": 0 and 0.5
// NOTE(review): cardinality 0 presumably means "no cardinality limit" in the
// data profile -- confirm against the benchmark data generator.
NVBENCH_BENCH_TYPES(bench_groupby_nunique, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t>))
.set_name("groupby_nunique")
.add_int64_power_of_two_axis("num_rows", {12, 16, 20, 24})
.add_int64_axis("cardinality", {0})
.add_float64_axis("null_probability", {0, 0.5});
12 changes: 7 additions & 5 deletions cpp/benchmarks/groupby/group_rank.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,10 +31,12 @@ static void nvbench_groupby_rank(nvbench::state& state,

bool const is_sorted = state.get_int64("is_sorted");
cudf::size_type const column_size = state.get_int64("data_size");
constexpr int num_groups = 100;
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));

data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
dtype, distribution_id::UNIFORM, 0, num_groups);
data_profile const profile = data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(dtype, distribution_id::UNIFORM, 0, column_size);

auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile);

Expand Down Expand Up @@ -100,5 +102,5 @@ NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods))
10000000, // 10M
100000000, // 100M
})

.add_int64_axis("cardinality", {0})
.add_int64_axis("is_sorted", {0, 1});

0 comments on commit b08dd9b

Please sign in to comment.