Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add throughput metrics for REDUCTION_BENCH/REDUCTION_NVBENCH benchmarks #16126

Merged
merged 7 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@ target_include_directories(
# Use an OBJECT library so we compile these helper source files only once
add_library(
cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
synchronization/synchronization.cpp io/cuio_common.cpp
synchronization/synchronization.cpp
io/cuio_common.cpp
common/table_utilities.cpp
common/benchmark_utilities.cpp
common/nvbench_utilities.cpp
)
target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
add_custom_command(
Expand Down
25 changes: 25 additions & 0 deletions cpp/benchmarks/common/benchmark_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "benchmark_utilities.hpp"

// Reports the total number of items processed to the benchmark state:
// the per-iteration item count multiplied by state.iterations().
void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration)
{
  auto const total_items = state.iterations() * items_processed_per_iteration;
  state.SetItemsProcessed(total_items);
}

// Reports the total number of bytes processed to the benchmark state:
// the per-iteration byte count multiplied by state.iterations().
void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration)
{
  auto const total_bytes = state.iterations() * bytes_processed_per_iteration;
  state.SetBytesProcessed(total_bytes);
}
41 changes: 41 additions & 0 deletions cpp/benchmarks/common/benchmark_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <benchmark/benchmark.h>

/**
 * @brief Sets the number of items processed during the benchmark.
 *
 * Calls ::benchmark::State::SetItemsProcessed() with
 * ::benchmark::State::iterations() * items_processed_per_iteration, so that callers
 * do not have to repeat that multiplication in every benchmark.
 *
 * @param state the benchmark state
 * @param items_processed_per_iteration number of items processed per iteration
 */
void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration);

/**
 * @brief Sets the number of bytes processed during the benchmark.
 *
 * Calls ::benchmark::State::SetBytesProcessed() with
 * ::benchmark::State::iterations() * bytes_processed_per_iteration, so that callers
 * do not have to repeat that multiplication in every benchmark.
 *
 * @param state the benchmark state
 * @param bytes_processed_per_iteration number of bytes processed per iteration
 */
void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration);
61 changes: 61 additions & 0 deletions cpp/benchmarks/common/nvbench_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nvbench_utilities.hpp"

#include <nvbench/nvbench.cuh>

// This function is copied over from
// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224.
//
// It reads the "nv/cold/time/gpu/mean" summary from `state` and derives from it:
//  - an item-rate summary, when the state's element count is non-zero;
//  - a global-memory byte-rate summary and a bandwidth-utilization summary,
//    when the state's global memory read/write byte count is non-zero.
void set_throughputs(nvbench::state& state)
{
  double const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");

  // Item throughput.
  auto const element_count = state.get_element_count();
  if (element_count != 0) {
    auto& item_rate = state.add_summary("nv/cold/bw/item_rate");
    item_rate.set_string("name", "Elem/s");
    item_rate.set_string("hint", "item_rate");
    item_rate.set_string("description", "Number of input elements processed per second");
    item_rate.set_float64("value", static_cast<double>(element_count) / mean_gpu_time);
  }

  // Global memory bandwidth; nothing more to report when no bytes were declared.
  auto const rw_bytes = state.get_global_memory_rw_bytes();
  if (rw_bytes == 0) { return; }

  double const used_bandwidth = static_cast<double>(rw_bytes) / mean_gpu_time;

  {
    auto& byte_rate = state.add_summary("nv/cold/bw/global/bytes_per_second");
    byte_rate.set_string("name", "GlobalMem BW");
    byte_rate.set_string("hint", "byte_rate");
    byte_rate.set_string(
      "description",
      "Number of bytes read/written per second to the CUDA device's global memory");
    byte_rate.set_float64("value", used_bandwidth);
  }

  {
    double const peak_bandwidth =
      static_cast<double>(state.get_device()->get_global_memory_bus_bandwidth());

    auto& utilization = state.add_summary("nv/cold/bw/global/utilization");
    utilization.set_string("name", "BWUtil");
    utilization.set_string("hint", "percentage");
    utilization.set_string(
      "description",
      "Global device memory utilization as a percentage of the device's peak bandwidth");
    utilization.set_float64("value", used_bandwidth / peak_bandwidth);
  }
}
31 changes: 31 additions & 0 deletions cpp/benchmarks/common/nvbench_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace nvbench {
struct state;
}

/**
 * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the
 * nvbench results summary.
 *
 * This function can be used to work around a known nvbench issue: throughput statistics
 * must be added before the nvbench::state.exec() call, otherwise they are not printed
 * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details.
 *
 * @param state The nvbench state to which the throughput summaries are added
 */
void set_throughputs(nvbench::state& state);
41 changes: 41 additions & 0 deletions cpp/benchmarks/common/table_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "table_utilities.hpp"

#include <cudf/transform.hpp>
#include <cudf/reduction.hpp>

// Estimates the size of a single column in bytes by wrapping it in a
// one-column table and delegating to the table_view overload. The column is
// owned by the temporary table and is released when this function returns.
int64_t estimate_size(std::unique_ptr<cudf::column> column)
{
  std::vector<std::unique_ptr<cudf::column>> single_column;
  single_column.push_back(std::move(column));

  cudf::table wrapper{std::move(single_column)};
  auto const wrapper_view = wrapper.view();
  return estimate_size(wrapper_view);
}
jihoonson marked this conversation as resolved.
Show resolved Hide resolved

// Estimates the size of a table in bytes.
//
// The per-row sizes reported by cudf::row_bit_count() are summed and the total
// number of bits is converted to bytes, rounding up so that a partially used
// trailing byte is still counted.
//
// @param view The table to estimate
// @return The estimated size in bytes; 0 for a table without rows
int64_t estimate_size(cudf::table_view const& view)
{
  // A sum reduction over an empty column yields an invalid (null) scalar whose
  // value must not be read, so handle the empty table up front.
  if (view.num_rows() == 0) { return 0; }

  // Compute the size in bits for each row.
  auto const row_sizes = cudf::row_bit_count(view);

  // Accumulate the row sizes into a single INT64 sum.
  auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
  cudf::data_type const sum_dtype{cudf::type_id::INT64};
  auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype);
  auto const total_size_in_bits =
    static_cast<cudf::numeric_scalar<int64_t>*>(total_size_scalar.get())->value();

  // Convert bits to bytes with integer arithmetic, rounding up. This avoids the
  // round trip through double and does not drop a partial trailing byte.
  int64_t const whole_bytes = total_size_in_bits / 8;
  return (total_size_in_bits % 8 != 0) ? whole_bytes + 1 : whole_bytes;
}
41 changes: 41 additions & 0 deletions cpp/benchmarks/common/table_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/table/table_view.hpp>

/**
 * @brief Estimates the column size in bytes.
 *
 * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
 * and accumulates them, the returned estimate may be an inexact approximation in some
 * cases. See cudf::row_bit_count() for more details.
 *
 * @param column The column whose size is estimated. Ownership is transferred to this
 * function, so the column is no longer available to the caller afterwards.
 * @return The estimated size of the column in bytes
 */
int64_t estimate_size(std::unique_ptr<cudf::column> column);
jihoonson marked this conversation as resolved.
Show resolved Hide resolved

/**
 * @brief Estimates the table size in bytes.
 *
 * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
 * and accumulates them, the returned estimate may be an inexact approximation in some
 * cases. See cudf::row_bit_count() for more details.
 *
 * @param view The table view whose size is estimated
 * @return The estimated size of the table in bytes
 */
int64_t estimate_size(cudf::table_view const& view);
10 changes: 8 additions & 2 deletions cpp/benchmarks/reduction/anyall.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,9 @@
* limitations under the License.
*/

#include <benchmarks/common/benchmark_utilities.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/table_utilities.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

Expand All @@ -34,14 +36,18 @@ void BM_reduction_anyall(benchmark::State& state,
auto const dtype = cudf::type_to_id<type>();
data_profile const profile = data_profile_builder().no_validity().distribution(
dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 0 : 100);
auto const values = create_random_column(dtype, row_count{column_size}, profile);
auto values = create_random_column(dtype, row_count{column_size}, profile);

cudf::data_type output_dtype{cudf::type_id::BOOL8};

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::reduce(*values, *agg, output_dtype);
}

// The benchmark takes a column and produces one scalar.
set_items_processed(state, column_size + 1);
set_bytes_processed(state, estimate_size(std::move(values)) + cudf::size_of(output_dtype));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
set_bytes_processed(state, estimate_size(std::move(values)) + cudf::size_of(output_dtype));
set_bytes_processed(state, estimate_size(*values) + cudf::size_of(output_dtype));

similarly at other places.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think values->view() is more clear but leave it up to you if you'd rather use *values

Copy link
Contributor Author

@jihoonson jihoonson Jun 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @karthikeyann, thanks for the review. I just want to better understand your comment. You seem to be suggesting to pass a column_view instead of moving the column. This has been done in 40804e2. Or, are you suggesting to use the *?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just saw David's comment above. I also find values->view() more explicit and clear, so would like to keep this pattern unless you feel strongly about it.

}

#define concat(a, b, c) a##b##c
Expand Down
10 changes: 9 additions & 1 deletion cpp/benchmarks/reduction/dictionary.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include <benchmarks/common/benchmark_utilities.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>
Expand Down Expand Up @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state,
cuda_event_timer timer(state, true);
auto result = cudf::reduce(*values, *agg, output_dtype);
}

// The benchmark takes a column and produces one scalar.
set_items_processed(state, column_size + 1);

// We don't set the metrics for the size read/written because row_bit_count() doesn't
// support the dictionary type yet, and therefore neither does estimate_size().
// See https://github.com/rapidsai/cudf/issues/16121 for details.
}

#define concat(a, b, c) a##b##c
Expand Down
15 changes: 11 additions & 4 deletions cpp/benchmarks/reduction/minmax.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,9 @@
* limitations under the License.
*/

#include <benchmarks/common/benchmark_utilities.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/table_utilities.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

Expand All @@ -28,14 +30,19 @@ template <typename type>
void BM_reduction(benchmark::State& state)
{
cudf::size_type const column_size{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
auto const input_column =
create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity());
auto const dtype_id = cudf::type_to_id<type>();
auto input_column =
create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity());

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::minmax(*input_column);
}

// The benchmark takes a column and produces two scalars.
set_items_processed(state, column_size + 2);
cudf::data_type dtype = cudf::data_type{dtype_id};
set_bytes_processed(state, estimate_size(std::move(input_column)) + 2 * cudf::size_of(dtype));
}

#define concat(a, b, c) a##b##c
Expand Down
Loading
Loading