Skip to content

Commit

Permalink
Switched AST benchmarks from Google Benchmark to NVBench
Browse files Browse the repository at this point in the history
  • Loading branch information
lamarrr committed Sep 27, 2024
1 parent 6b3d57d commit cdf4ede
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 46 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)

# ##################################################################################################
# * ast benchmark ---------------------------------------------------------------------------------
ConfigureBench(AST_BENCH ast/transform.cpp)
ConfigureNVBench(AST_BENCH ast/transform.cpp)

# ##################################################################################################
# * binaryop benchmark ----------------------------------------------------------------------------
Expand Down
50 changes: 19 additions & 31 deletions cpp/benchmarks/ast/transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/transform.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/exec_tag.cuh>
#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <list>
#include <memory>
Expand All @@ -35,13 +39,10 @@ enum class TreeType {
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
static void BM_ast_transform(benchmark::State& state)
static void BM_ast_transform(nvbench::state& state)
{
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
auto const table_size = static_cast<cudf::size_type>(state.get_int64("TableSize"));
auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("TreeLevels"));

// Create table data
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
Expand Down Expand Up @@ -86,38 +87,25 @@ static void BM_ast_transform(benchmark::State& state)

auto const& expression_tree_root = expressions.back();

// Execute benchmark
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()});
cudf::compute_column(table, expression_tree_root);
}
});

// Use the number of bytes read from global memory
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
(tree_levels + 1) * sizeof(key_type));
}

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
state.add_global_memory_reads(static_cast<int64_t>(state.get_summaries().size()) * table_size *
(tree_levels + 1) * sizeof(key_type));
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) \
static void name(::nvbench::state& st) \
{ \
BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
} \
BENCHMARK_REGISTER_F(AST, name) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("TableSize", {100'000, 1'000'000, 10'000'000, 100'000'000}) \
.add_int64_axis("TreeLevels", {1, 5, 10})

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
Expand Down
31 changes: 17 additions & 14 deletions cpp/benchmarks/synchronization/synchronization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,29 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

/**
 * Flushes the L2 cache of the current device.
 *
 * Queries the current device's L2 cache size, allocates a device buffer of that
 * many bytes on `stream`, and enqueues an asynchronous memset of zeros over it.
 * Does nothing further when the reported L2 size is not positive.
 *
 * @param stream Stream on which the buffer is allocated and the memset is enqueued.
 */
void flush_device_L2_cache(rmm::cuda_stream_view stream)
{
  int device_id = 0;
  CUDF_CUDA_TRY(cudaGetDevice(&device_id));

  int l2_size = 0;
  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, device_id));

  // Nothing to flush when the device reports no L2 cache.
  if (l2_size <= 0) { return; }

  // Scratch allocation sized to the whole L2; writing it displaces cached lines.
  // NOTE(review): presumably released when `scratch` goes out of scope — confirm
  // against rmm::device_buffer semantics.
  rmm::device_buffer scratch(l2_size, stream);
  CUDF_CUDA_TRY(cudaMemsetAsync(scratch.data(), 0, l2_size, stream.value()));
}

cuda_event_timer::cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream)
: stream(stream), p_state(&state)
{
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
int const memset_value = 0;
rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
CUDF_CUDA_TRY(
cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
}
}
if (flush_l2_cache) { flush_device_L2_cache(stream); }

CUDF_CUDA_TRY(cudaEventCreate(&start));
CUDF_CUDA_TRY(cudaEventCreate(&stop));
Expand Down
6 changes: 6 additions & 0 deletions cpp/benchmarks/synchronization/synchronization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@
#include <benchmark/benchmark.h>
#include <driver_types.h>

/**
 * @brief Clears the L2 cache of the current device by issuing an asynchronous
 * memset over a temporary device buffer whose size equals the L2 cache size.
 *
 * @param stream CUDA stream used for device memory operations and kernel launches;
 * defaults to `cudf::get_default_stream()`.
 */
void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream());

class cuda_event_timer {
public:
/**
Expand Down

0 comments on commit cdf4ede

Please sign in to comment.