Switched BINARY_OP Benchmarks from GoogleBench to NVBench (#16963)

This merge request switches the benchmarking framework for the BINARY_OP benchmarks from GoogleBench to NVBench.

Authors:
- Basit Ayantunde (https://github.com/lamarrr)

Approvers:
- Nghia Truong (https://github.com/ttnghia)
- Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: #16963
rapidsai · Oct 4, 2024 · a784321
1 parent 2fa2e6a
commit a784321
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 74 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -334,7 +334,7 @@ ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
-ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
+ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------

diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,15 +15,14 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/binaryop.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
-#include <vector>
 
 // This set of benchmarks is designed to be a comparison for the AST benchmarks
 
@@ -33,23 +32,21 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns>
-class BINARYOP : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns>
-static void BM_binaryop_transform(benchmark::State& state)
+static void BM_binaryop_transform(nvbench::state& state)
 {
- auto const table_size{static_cast<cudf::size_type>(state.range(0))};
- auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+ auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+ auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
  // Create table data
  auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
  auto const source_table = create_sequence_table(
  cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
  cudf::table_view table{*source_table};
 
- // Execute benchmark
- for (auto _ : state) {
- cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
+ // Use the number of bytes read from global memory
+ state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+
+ state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
  // Execute tree that chains additions like (((a + b) + c) + d)
  auto const op = cudf::binary_operator::ADD;
  auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
@@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state)
  result = cudf::binary_operation(result->view(), col, op, result_data_type);
  });
  }
- }
-
- // Use the number of bytes read from global memory
- state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
- (tree_levels + 1) * sizeof(key_type));
+ });
 }
 
 #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
- BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \
- (::benchmark::State & st) { BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); }
+ \
+ static void name(::nvbench::state& st) \
+ { \
+ BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); \
+ } \
+ NVBENCH_BENCH(name) \
+ .add_int64_axis("tree_levels", {1, 2, 5, 10}) \
+ .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
  int32_t,
@@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
  double,
  TreeType::IMBALANCED_LEFT,
  false);
-
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
- auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
- auto operation_counts = std::vector<cudf::size_type>{1, 2, 5, 10};
- for (auto const& row_count : row_counts) {
- for (auto const& operation_count : operation_counts) {
- b->Args({row_count, operation_count});
- }
- }
-}
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique)
- ->Apply(CustomRanges)
- ->Unit(benchmark::kMillisecond)
- ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse)
- ->Apply(CustomRanges)
- ->Unit(benchmark::kMillisecond)
- ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique)
- ->Apply(CustomRanges)
- ->Unit(benchmark::kMillisecond)
- ->UseManualTime();
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -15,20 +15,18 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/binaryop.hpp>
 
-class COMPILED_BINARYOP : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
-void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
+void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
- auto const column_size{static_cast<cudf::size_type>(state.range(0))};
+ auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
 
  auto const source_table = create_random_table(
- {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{column_size});
+ {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
 
  auto lhs = cudf::column_view(source_table->get_column(0));
  auto rhs = cudf::column_view(source_table->get_column(1));
@@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
  // Call once for hot cache.
  cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
- for (auto _ : state) {
- cuda_event_timer timer(state, true);
- cudf::binary_operation(lhs, rhs, binop, output_dtype);
- }
-
  // use number of bytes read and written to global memory
- state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
- (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut)));
+ state.add_global_memory_reads<TypeLhs>(table_size);
+ state.add_global_memory_reads<TypeRhs>(table_size);
+ state.add_global_memory_reads<TypeOut>(table_size);
+
+ state.exec(nvbench::exec_tag::sync,
+ [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
 }
 
+#define BM_STRINGIFY(a) #a
+
 // TODO tparam boolean for null.
-#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \
- BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \
- (::benchmark::State & st) \
- { \
- BM_compiled_binaryop<lhs, rhs, tout>(st, cudf::binary_operator::bop); \
- } \
- BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \
- ->Unit(benchmark::kMicrosecond) \
- ->UseManualTime() \
- ->Arg(10000) /* 10k */ \
- ->Arg(100000) /* 100k */ \
- ->Arg(1000000) /* 1M */ \
- ->Arg(10000000) /* 10M */ \
- ->Arg(100000000); /* 100M */
+#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \
+ static void name(::nvbench::state& st) \
+ { \
+ ::BM_compiled_binaryop<lhs, rhs, tout>(st, ::cudf::binary_operator::bop); \
+ } \
+ NVBENCH_BENCH(name) \
+ .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \
+ .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d