From cdf4edea51fb469e74951f098200b997a182e853 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Fri, 27 Sep 2024 21:00:38 +0100 Subject: [PATCH] switched ast benchmarks from googlebench to nvbench --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/ast/transform.cpp | 50 +++++++------------ .../synchronization/synchronization.cpp | 31 ++++++------ .../synchronization/synchronization.hpp | 6 +++ 4 files changed, 43 insertions(+), 46 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..d99689befdc 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_BENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..6a6ea96b54b 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -15,14 +15,18 @@ */ #include -#include #include #include #include +#include + #include +#include +#include + #include #include #include @@ -35,13 +39,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("TableSize")); + auto const tree_levels = 
static_cast(state.get_int64("TreeLevels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -86,38 +87,25 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()}); cudf::compute_column(table, expression_tree_root); - } + }); // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.add_global_memory_reads(static_cast(state.get_summaries().size()) * table_size * + (tree_levels + 1) * sizeof(key_type)); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("TableSize", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("TreeLevels", {1, 5, 10}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git 
a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp index 5993bb23542..fae60f4e28d 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -21,26 +21,29 @@ #include #include +void flush_device_L2_cache(rmm::cuda_stream_view stream) +{ + int current_device = 0; + CUDF_CUDA_TRY(cudaGetDevice(&current_device)); + + int l2_cache_bytes = 0; + CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); + + if (l2_cache_bytes > 0) { + int const memset_value = 0; + rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); + CUDF_CUDA_TRY( + cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); + } +} + cuda_event_timer::cuda_event_timer(benchmark::State& state, bool flush_l2_cache, rmm::cuda_stream_view stream) : stream(stream), p_state(&state) { // flush all of L2$ - if (flush_l2_cache) { - int current_device = 0; - CUDF_CUDA_TRY(cudaGetDevice(&current_device)); - - int l2_cache_bytes = 0; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - int const memset_value = 0; - rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); - CUDF_CUDA_TRY( - cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); - } - } + if (flush_l2_cache) { flush_device_L2_cache(stream); } CUDF_CUDA_TRY(cudaEventCreate(&start)); CUDF_CUDA_TRY(cudaEventCreate(&stop)); diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index cc3bf828d60..69f3230fa59 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -69,6 +69,12 @@ #include #include +/** + * @brief clears the L2$ by cudaMemset'ing a buffer of L2$ size + * @param stream CUDA stream used for device memory operations and kernel 
launches + */ +void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream()); + class cuda_event_timer { public: /**