From 30362d08210fea19ea4890bc04e9a8a4ff332ec5 Mon Sep 17 00:00:00 2001
From: Ivo Dilov
Date: Thu, 7 Dec 2023 14:45:04 +0000
Subject: [PATCH 1/4] Introduce sorting/merging google benchmarks

- Adds google benchmarks and a benchmarks target to run them
- Adds specific benchmarks for:
  - Sorting a segment with few and many columns
  - Merging sorted segments (either interleaved or ordered)
---
 .gitmodules                                  |  3 +
 cpp/arcticdb/CMakeLists.txt                  | 16 ++++
 cpp/arcticdb/column_store/memory_segment.hpp |  2 +-
 .../test/benchmark_memory_segment.cpp        | 43 +++++++++
 .../processing/test/benchmark_clause.cpp     | 90 +++++++++++++++++++
 cpp/third_party/CMakeLists.txt               |  2 +
 cpp/third_party/benchmark                    |  1 +
 environment_unix.yml                         |  1 +
 8 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp
 create mode 100644 cpp/arcticdb/processing/test/benchmark_clause.cpp
 create mode 160000 cpp/third_party/benchmark

diff --git a/.gitmodules b/.gitmodules
index 7c009e8039..2aba11272b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,9 @@
 [submodule "cpp/third_party/rapidcheck"]
 	path = cpp/third_party/rapidcheck
 	url = https://github.com/emil-e/rapidcheck.git
+[submodule "cpp/third_party/benchmark"]
+	path = cpp/third_party/benchmark
+	url = https://github.com/google/benchmark.git
 [submodule "cpp/vcpkg"]
 	path = cpp/vcpkg
 	url = https://github.com/microsoft/vcpkg.git
diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt
index dd6b2931a3..222ee96775 100644
--- a/cpp/arcticdb/CMakeLists.txt
+++ b/cpp/arcticdb/CMakeLists.txt
@@ -79,6 +79,7 @@ else()
     find_package(robin_hood REQUIRED)
     find_package(LMDB REQUIRED)
     find_package(LMDBXX REQUIRED)
+    find_package(benchmark REQUIRED)
 
     if(${BUILD_WITH_REMOTERY})
         find_package(Remotery REQUIRED)
@@ -857,6 +858,21 @@ if(${TEST})
     gtest_discover_tests(test_unit_arcticdb PROPERTIES DISCOVERY_TIMEOUT 60)
 
+    set(benchmark_srcs
+        column_store/test/benchmark_memory_segment.cpp
+        processing/test/benchmark_clause.cpp)
+
+    add_executable(benchmarks ${benchmark_srcs})
+
+    target_link_libraries(benchmarks
+        PUBLIC
+            benchmark::benchmark
+            benchmark::benchmark_main
+            ${COMMON_PUBLIC_TEST_LIBRARIES}
+        PRIVATE
+            ${AWSSDK_LINK_LIBRARIES}
+    )
+
     set(rapidcheck_srcs
         column_store/test/rapidcheck_column_store.cpp
         column_store/test/rapidcheck_chunked_buffer.cpp
diff --git a/cpp/arcticdb/column_store/memory_segment.hpp b/cpp/arcticdb/column_store/memory_segment.hpp
index 612b0b1de1..4d49023634 100644
--- a/cpp/arcticdb/column_store/memory_segment.hpp
+++ b/cpp/arcticdb/column_store/memory_segment.hpp
@@ -385,7 +385,7 @@ class SegmentInMemory {
         impl_->sort(column);
     }
 
-    SegmentInMemory clone() {
+    SegmentInMemory clone() const {
         return SegmentInMemory(std::make_shared(impl_->clone()));
     }
 
diff --git a/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp b/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp
new file mode 100644
index 0000000000..bd8dd8e734
--- /dev/null
+++ b/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp
@@ -0,0 +1,43 @@
+/* Copyright 2023 Man Group Operations Limited
+ *
+ * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
+ *
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ */ + +#include + +#include +#include +#include + +#include + +using namespace arcticdb; + +// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x + +SegmentInMemory get_segment_for_bm(const StreamId &id, size_t num_rows, size_t num_columns){ + auto fields = std::vector(num_columns); + auto data_types = std::vector{DataType::UINT8, DataType::UINT64, DataType::FLOAT64, DataType::ASCII_FIXED64}; + for (size_t i=0; i(id, fields, num_rows, 0, 0); + return test_frame.segment_; +} + +static void BM_sort(benchmark::State& state) { + auto segment = get_segment_for_bm("test", state.range(0), state.range(1)); + std::random_device rng; + std::mt19937 urng(rng()); + std::shuffle(segment.begin(), segment.end(), urng); + for (auto _ : state) { + state.PauseTiming(); + auto temp = segment.clone(); + state.ResumeTiming(); + temp.sort("time"); + } +} + +BENCHMARK(BM_sort)->Args({100'000, 100})->Args({1'000'000, 1}); \ No newline at end of file diff --git a/cpp/arcticdb/processing/test/benchmark_clause.cpp b/cpp/arcticdb/processing/test/benchmark_clause.cpp new file mode 100644 index 0000000000..9af614e7cb --- /dev/null +++ b/cpp/arcticdb/processing/test/benchmark_clause.cpp @@ -0,0 +1,90 @@ +/* Copyright 2023 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. + */ + +#include + +#include +#include +#include +#include +#include + +using namespace arcticdb; + +// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x + +SegmentInMemory get_segment_for_merge(const StreamId &id, size_t num_rows, size_t start, size_t step){ + auto segment = SegmentInMemory{ + get_test_descriptor(id, { + scalar_field(DataType::UINT8, "column") + }), + num_rows + }; + auto& index_col = segment.column(0); + auto& value_col = segment.column(1); + for (auto i=0u; i(start + i*step)); + value_col.push_back(static_cast(i)); + } + segment.set_row_data(num_rows-1); + return segment; +} + +void time_merge_on_segments(const std::vector &segments, benchmark::State& state){ + // Pauses the timing while setting up the merge clause to only time the merging itself + state.PauseTiming(); + auto component_manager = std::make_shared(); + Composite entity_ids; + for (auto& segment : segments){ + auto proc_unit = ProcessingUnit{segment.clone()}; + entity_ids.push_back(push_entities(component_manager, std::move(proc_unit))); + } + + auto stream_id = StreamId("Merge"); + StreamDescriptor descriptor{}; + descriptor.add_field(FieldRef{make_scalar_type(DataType::NANOSECONDS_UTC64),"time"}); + MergeClause merge_clause{TimeseriesIndex{"time"}, DenseColumnPolicy{}, stream_id, descriptor}; + merge_clause.set_component_manager(component_manager); + state.ResumeTiming(); + + auto _ = merge_clause.process(std::move(entity_ids)); +} + +static void BM_merge_interleaved(benchmark::State& state){ + const auto num_segs = state.range(0); + const auto num_rows = state.range(1); + std::vector segments; + for (auto i = 0u; i segments; + for (auto i = 0u; iArgs({10'000, 100}); +BENCHMARK(BM_merge_ordered)->Args({10'000, 100}); \ No newline at end of file diff --git a/cpp/third_party/CMakeLists.txt b/cpp/third_party/CMakeLists.txt index e320c9e4d4..7978729bef 100644 --- a/cpp/third_party/CMakeLists.txt +++ b/cpp/third_party/CMakeLists.txt @@ -6,4 +6,6 
@@ if(NOT ${ARCTICDB_USING_CONDA}) add_subdirectory(msgpack-c) add_subdirectory(Remotery) add_subdirectory(lmdbcxx) + set(BENCHMARK_ENABLE_TESTING OFF) + add_subdirectory(benchmark EXCLUDE_FROM_ALL) endif() diff --git a/cpp/third_party/benchmark b/cpp/third_party/benchmark new file mode 160000 index 0000000000..1e96bb0ab5 --- /dev/null +++ b/cpp/third_party/benchmark @@ -0,0 +1 @@ +Subproject commit 1e96bb0ab5e758861f5bbbd4edbd0a8d9a2a7cae diff --git a/environment_unix.yml b/environment_unix.yml index e31da0bb6a..9606d6165e 100644 --- a/environment_unix.yml +++ b/environment_unix.yml @@ -8,6 +8,7 @@ dependencies: - cmake - gtest - gflags + - benchmark - doxygen - boost-cpp - grpcio From 5b8addffbc471645fc7886aa7c9a0566a7ab49ca Mon Sep 17 00:00:00 2001 From: Ivo Dilov Date: Tue, 19 Dec 2023 11:42:09 +0200 Subject: [PATCH 2/4] New sorting benchmarks for ordered and sparse data - The sparse benchmark tests sort_external on sparse columns. The column we sort by is not sparse (requirement on the sort). --- .../test/benchmark_memory_segment.cpp | 85 ++++++++++++++++--- 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp b/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp index bd8dd8e734..a0b6ac14de 100644 --- a/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp +++ b/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp @@ -17,21 +17,78 @@ using namespace arcticdb; // run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x -SegmentInMemory get_segment_for_bm(const StreamId &id, size_t num_rows, size_t num_columns){ +std::vector get_sparse_bits(size_t num_rows, size_t num_set){ + auto sparse_bits = std::vector(num_rows, false); + std::fill(sparse_bits.begin(), sparse_bits.begin()+num_set, true); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(sparse_bits.begin(), sparse_bits.end(), g); + return sparse_bits; +} + +std::vector get_random_permutation(size_t num_rows){ + auto result = std::vector(num_rows); + std::iota(result.begin(), result.end(), 1); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(result.begin(), result.end(), g); + return result; +} + +SegmentInMemory get_shuffled_segment(const StreamId &id, size_t num_rows, size_t num_columns, std::optional non_index_sparse_percent = std::nullopt){ auto fields = std::vector(num_columns); - auto data_types = std::vector{DataType::UINT8, DataType::UINT64, DataType::FLOAT64, DataType::ASCII_FIXED64}; - for (size_t i=0; i(id, fields), + num_rows, + false, + non_index_sparse_percent.has_value() + }; + + for (auto i=0u; i<=num_columns; ++i){ + auto& column = segment.column(i); + auto values = get_random_permutation(num_rows); + // We ensure the column we're sorting by is NOT sparse. As of 2023/12 sorting by sparse columns is not supported. + auto num_set = non_index_sparse_percent.has_value() && i!=0 ? 
+ size_t(num_rows * (1-non_index_sparse_percent.value())) : + num_rows; + auto has_value = get_sparse_bits(num_rows, num_set); + for (auto j=0u; j(id, fields, num_rows, 0, 0); - return test_frame.segment_; } -static void BM_sort(benchmark::State& state) { - auto segment = get_segment_for_bm("test", state.range(0), state.range(1)); - std::random_device rng; - std::mt19937 urng(rng()); - std::shuffle(segment.begin(), segment.end(), urng); +static void BM_sort_sparse(benchmark::State& state) { + auto segment = get_shuffled_segment("test", state.range(0), state.range(1), 0.5); for (auto _ : state) { state.PauseTiming(); auto temp = segment.clone(); @@ -40,4 +97,8 @@ static void BM_sort(benchmark::State& state) { } } -BENCHMARK(BM_sort)->Args({100'000, 100})->Args({1'000'000, 1}); \ No newline at end of file +// The {100k, 100} puts more weight on the sort_external part of the sort +// where the {1M, 1} puts more weight on the create_jive_table part. +BENCHMARK(BM_sort_shuffled)->Args({100'000, 100})->Args({1'000'000, 1}); +BENCHMARK(BM_sort_ordered)->Args({100'000, 100}); +BENCHMARK(BM_sort_sparse)->Args({100'000, 100}); From b969b9f4d08ec121731fc421adc40547d889bd7d Mon Sep 17 00:00:00 2001 From: Ivo Dilov Date: Wed, 20 Dec 2023 14:14:49 +0200 Subject: [PATCH 3/4] Fix windows build --- cpp/arcticdb/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index 222ee96775..753c010bd1 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -871,6 +871,7 @@ if(${TEST}) ${COMMON_PUBLIC_TEST_LIBRARIES} PRIVATE ${AWSSDK_LINK_LIBRARIES} + arcticdb_core_static ) set(rapidcheck_srcs From 20e42972a6d89e1d15d6457df9af8340b57b774f Mon Sep 17 00:00:00 2001 From: Ivo Dilov Date: Wed, 20 Dec 2023 19:43:01 +0200 Subject: [PATCH 4/4] Code review changes --- .../test/benchmark_memory_segment.cpp | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp b/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp index a0b6ac14de..69613fc011 100644 --- a/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp +++ b/cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp @@ -17,25 +17,23 @@ using namespace arcticdb; // run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x -std::vector get_sparse_bits(size_t num_rows, size_t num_set){ +std::vector get_sparse_bits(size_t num_rows, size_t num_set, std::mt19937 g){ auto sparse_bits = std::vector(num_rows, false); std::fill(sparse_bits.begin(), sparse_bits.begin()+num_set, true); - std::random_device rd; - std::mt19937 g(rd()); std::shuffle(sparse_bits.begin(), sparse_bits.end(), g); return sparse_bits; } -std::vector get_random_permutation(size_t num_rows){ +std::vector get_random_permutation(size_t num_rows, std::mt19937 g){ auto result = std::vector(num_rows); std::iota(result.begin(), result.end(), 1); - std::random_device rd; - std::mt19937 g(rd()); std::shuffle(result.begin(), result.end(), g); return result; } -SegmentInMemory get_shuffled_segment(const StreamId &id, size_t num_rows, size_t num_columns, std::optional non_index_sparse_percent = std::nullopt){ +SegmentInMemory get_shuffled_segment(const StreamId& id, size_t num_rows, size_t num_columns, std::optional sparsity_percentage = std::nullopt){ + // We use a seed to get the same shuffled segment for given arguments. 
+    std::mt19937 g(0);
     auto fields = std::vector(num_columns);
     for (auto i=0u; i