From a78432184f20f7acf493eaa8d1928cfee29d1771 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Fri, 4 Oct 2024 16:19:37 +0100 Subject: [PATCH] Switched BINARY_OP Benchmarks from GoogleBench to NVBench (#16963) This merge request switches the Benchmarking solution for the BINARY_OP benchmarks from GoogleBench to NVBench Authors: - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/16963 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/binaryop/binaryop.cpp | 65 ++++++------------- cpp/benchmarks/binaryop/compiled_binaryop.cpp | 47 ++++++-------- 3 files changed, 40 insertions(+), 74 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 1e13bf176c1..b8a53cd8bd9 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -334,7 +334,7 @@ ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..7d267a88764 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,14 @@ */ #include -#include -#include #include #include #include +#include + #include -#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,13 +32,10 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state) cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } - - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + }); } #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..bc0ff69bce9 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const table_size = static_cast(state.get_int64("table_size")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. -#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d