Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce sorting and merging google benchmarks #1138

Merged
merged 4 commits into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
[submodule "cpp/third_party/rapidcheck"]
path = cpp/third_party/rapidcheck
url = https://github.com/emil-e/rapidcheck.git
[submodule "cpp/third_party/benchmark"]
path = cpp/third_party/benchmark
url = https://github.com/google/benchmark.git
[submodule "cpp/vcpkg"]
path = cpp/vcpkg
url = https://github.com/microsoft/vcpkg.git
17 changes: 17 additions & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ else()
find_package(robin_hood REQUIRED)
find_package(LMDB REQUIRED)
find_package(LMDBXX REQUIRED)
find_package(benchmark REQUIRED)

if(${BUILD_WITH_REMOTERY})
find_package(Remotery REQUIRED)
Expand Down Expand Up @@ -857,6 +858,22 @@ if(${TEST})

gtest_discover_tests(test_unit_arcticdb PROPERTIES DISCOVERY_TIMEOUT 60)

# Google-benchmark micro-benchmarks; see the benchmark_*.cpp sources for the
# suggested command-line flags. Built inside the if(${TEST}) section, so the
# benchmarks target is only configured when tests are enabled.
set(benchmark_srcs
column_store/test/benchmark_memory_segment.cpp
processing/test/benchmark_clause.cpp)

add_executable(benchmarks ${benchmark_srcs})

target_link_libraries(benchmarks
PUBLIC
# benchmark::benchmark_main supplies main() so the sources only declare benchmarks.
benchmark::benchmark
benchmark::benchmark_main
${COMMON_PUBLIC_TEST_LIBRARIES}
PRIVATE
${AWSSDK_LINK_LIBRARIES}
arcticdb_core_static
)

set(rapidcheck_srcs
column_store/test/rapidcheck_column_store.cpp
column_store/test/rapidcheck_chunked_buffer.cpp
Expand Down
2 changes: 1 addition & 1 deletion cpp/arcticdb/column_store/memory_segment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ class SegmentInMemory {
impl_->sort(column);
}

SegmentInMemory clone() {
// Deep copy: clones the underlying impl into a fresh shared_ptr, so the
// returned segment is fully independent of this one. const-qualified so
// read-only segments (e.g. benchmark fixtures) can be cloned.
SegmentInMemory clone() const {
    return SegmentInMemory(std::make_shared<SegmentInMemoryImpl>(impl_->clone()));
}

Expand Down
103 changes: 103 additions & 0 deletions cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/* Copyright 2023 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
*/

#include <benchmark/benchmark.h>

#include <arcticdb/column_store/memory_segment.hpp>
#include <arcticdb/stream/test/stream_test_common.hpp>
#include <folly/container/Enumerate.h>

#include <algorithm>

using namespace arcticdb;

// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x

// Builds a boolean presence mask of length num_rows with exactly num_set
// entries set to true, at positions chosen uniformly at random.
//
// The engine is taken by reference: the previous by-value signature copied
// the (~2.5 KB) mt19937 state on every call, so successive calls that passed
// the same lvalue engine all drew from the identical state and produced
// correlated masks. With a reference the caller's seeded engine advances, so
// consecutive calls yield independent (but still deterministic) shuffles.
std::vector<bool> get_sparse_bits(size_t num_rows, size_t num_set, std::mt19937& g){
    auto sparse_bits = std::vector<bool>(num_rows, false);
    std::fill(sparse_bits.begin(), sparse_bits.begin()+num_set, true);
    std::shuffle(sparse_bits.begin(), sparse_bits.end(), g);
    return sparse_bits;
}

// Returns the values 1..num_rows in a uniformly random order.
//
// As with get_sparse_bits, the engine is passed by reference rather than by
// value: copying the engine meant every call with the same lvalue engine
// returned the identical permutation (e.g. every column of a shuffled test
// segment). By reference, the seeded engine advances deterministically and
// each call produces a different permutation.
std::vector<uint64_t> get_random_permutation(size_t num_rows, std::mt19937& g){
    auto result = std::vector<uint64_t>(num_rows);
    std::iota(result.begin(), result.end(), 1);
    std::shuffle(result.begin(), result.end(), g);
    return result;
}

// Builds a deterministic test segment with a timeseries index plus
// [num_columns] UINT64 data columns, each holding a shuffled permutation of
// 1..num_rows. When [sparsity_percentage] is set, that fraction of values is
// left unset in every data column (the index column always stays dense).
SegmentInMemory get_shuffled_segment(const StreamId& id, size_t num_rows, size_t num_columns, std::optional<float> sparsity_percentage = std::nullopt){
    // We use a seed to get the same shuffled segment for given arguments.
    std::mt19937 g(0);
    auto fields = std::vector<FieldRef>(num_columns);
    for (auto i=0u; i<num_columns; ++i){
        fields[i] = scalar_field(DataType::UINT64, "column_"+std::to_string(i));
    }
    // Final ctor arg marks the segment as sparse-capable only when sparsity
    // was requested.
    auto segment = SegmentInMemory{
        get_test_descriptor<stream::TimeseriesIndex>(id, fields),
        num_rows,
        false,
        sparsity_percentage.has_value()
    };

    // <= is deliberate: the descriptor presumably contributes one extra index
    // column at position 0 on top of the num_columns data fields (the i!=0
    // check below treats column 0 as the sort/index column) — confirm against
    // get_test_descriptor.
    for (auto i=0u; i<=num_columns; ++i){
        auto& column = segment.column(i);
        auto values = get_random_permutation(num_rows, g);
        // We ensure the column we're sorting by is NOT sparse. As of 2023/12 sorting by sparse columns is not supported.
        auto num_set = num_rows;
        if (i!=0 && sparsity_percentage.has_value()){
            num_set = size_t(num_rows * (1-sparsity_percentage.value()));
        }
        auto has_value = get_sparse_bits(num_rows, num_set, g);
        // Only write the positions selected by the sparsity mask.
        for (auto j=0u; j<num_rows; ++j){
            if (has_value[j]){
                column.set_scalar(j, values[j]);
            }
        }
    }
    segment.set_row_data(num_rows-1);

    return segment;
}

// Times SegmentInMemory::sort on fully shuffled data. The clone happens with
// the timer paused so each iteration sorts a fresh, unsorted copy.
static void BM_sort_shuffled(benchmark::State& state) {
    auto source = get_shuffled_segment("test", state.range(0), state.range(1));
    for (auto _ : state) {
        state.PauseTiming();
        auto working_copy = source.clone();
        state.ResumeTiming();
        working_copy.sort("time");
    }
}

// Times SegmentInMemory::sort on already-sorted data: the fixture is sorted
// once up front, so every timed iteration sorts an ordered clone.
static void BM_sort_ordered(benchmark::State& state) {
    auto source = get_shuffled_segment("test", state.range(0), state.range(1));
    source.sort("time");
    for (auto _ : state) {
        state.PauseTiming();
        auto working_copy = source.clone();
        state.ResumeTiming();
        working_copy.sort("time");
    }
}

// Times SegmentInMemory::sort when half of the values in each non-index
// column are unset (sparsity_percentage = 0.5).
static void BM_sort_sparse(benchmark::State& state) {
    auto source = get_shuffled_segment("test", state.range(0), state.range(1), 0.5);
    for (auto _ : state) {
        state.PauseTiming();
        auto working_copy = source.clone();
        state.ResumeTiming();
        working_copy.sort("time");
    }
}

// Args are {num_rows, num_columns}, forwarded to get_shuffled_segment via
// state.range(0)/state.range(1).
// The {100k, 100} puts more weight on the sort_external part of the sort
// where the {1M, 1} puts more weight on the create_jive_table part.
BENCHMARK(BM_sort_shuffled)->Args({100'000, 100})->Args({1'000'000, 1});
BENCHMARK(BM_sort_ordered)->Args({100'000, 100});
BENCHMARK(BM_sort_sparse)->Args({100'000, 100});
90 changes: 90 additions & 0 deletions cpp/arcticdb/processing/test/benchmark_clause.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/* Copyright 2023 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
*/

#include <benchmark/benchmark.h>

#include <arcticdb/processing/clause.hpp>
#include <arcticdb/util/test/generators.hpp>
#include <arcticdb/column_store/memory_segment.hpp>
#include <folly/futures/Future.h>
#include <arcticdb/pipeline/frame_slice.hpp>

using namespace arcticdb;

// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x

// Builds a timeseries segment named [id] with a single UINT8 "column" field.
// Index values start at [start] and advance by [step]; callers pick start and
// step per segment to control how much the merged inputs interleave.
SegmentInMemory get_segment_for_merge(const StreamId &id, size_t num_rows, size_t start, size_t step){
    auto segment = SegmentInMemory{
        get_test_descriptor<stream::TimeseriesIndex>(id, {
            scalar_field(DataType::UINT8, "column")
        }),
        num_rows
    };
    // Column 0 holds the timeseries index, column 1 the declared data field.
    auto& index_col = segment.column(0);
    auto& value_col = segment.column(1);
    for (auto i=0u; i<num_rows; ++i){
        index_col.push_back(static_cast<int64_t>(start + i*step));
        // NOTE(review): uint8_t wraps for num_rows > 255 — values look like a
        // throwaway payload here, but confirm if exact values ever matter.
        value_col.push_back(static_cast<uint8_t>(i));
    }
    segment.set_row_data(num_rows-1);
    return segment;
}

// Runs a MergeClause over clones of [segments], timing only the merge itself.
// Must be invoked from inside an active benchmark timing loop: the timer is
// paused while the inputs are cloned and the clause is wired up, and resumed
// just before process() runs.
void time_merge_on_segments(const std::vector<SegmentInMemory> &segments, benchmark::State& state){
    // Pauses the timing while setting up the merge clause to only time the merging itself
    state.PauseTiming();
    auto component_manager = std::make_shared<ComponentManager>();
    Composite<EntityIds> entity_ids;
    for (auto& segment : segments){
        // Clone so repeated benchmark iterations all consume identical,
        // unmerged inputs.
        auto proc_unit = ProcessingUnit{segment.clone()};
        entity_ids.push_back(push_entities(component_manager, std::move(proc_unit)));
    }

    auto stream_id = StreamId("Merge");
    StreamDescriptor descriptor{};
    descriptor.add_field(FieldRef{make_scalar_type(DataType::NANOSECONDS_UTC64),"time"});
    MergeClause merge_clause{TimeseriesIndex{"time"}, DenseColumnPolicy{}, stream_id, descriptor};
    merge_clause.set_component_manager(component_manager);
    state.ResumeTiming();

    // Result is discarded — only the wall time of process() matters here.
    auto _ = merge_clause.process(std::move(entity_ids));
}

// Worst case for merging: a step size equal to the segment count plus
// staggered starts makes every segment contribute a row in turn, so the
// inputs interleave completely.
static void BM_merge_interleaved(benchmark::State& state){
    const auto segment_count = state.range(0);
    const auto rows_per_segment = state.range(1);
    std::vector<SegmentInMemory> segments;
    segments.reserve(segment_count);
    for (int64_t idx = 0; idx < segment_count; ++idx){
        segments.emplace_back(get_segment_for_merge(
            "merge_" + std::to_string(idx), rows_per_segment, idx, segment_count));
    }

    for (auto _ : state){
        time_merge_on_segments(segments, state);
    }
}

// Best case for merging: start offsets of idx * segment_count keep each
// segment's index values ahead of the previous segment's (for the registered
// arg shapes), so the inputs merge completely in order.
static void BM_merge_ordered(benchmark::State& state){
    const auto segment_count = state.range(0);
    const auto rows_per_segment = state.range(1);
    std::vector<SegmentInMemory> segments;
    segments.reserve(segment_count);
    for (int64_t idx = 0; idx < segment_count; ++idx){
        segments.emplace_back(get_segment_for_merge(
            "merge_" + std::to_string(idx), rows_per_segment, idx * segment_count, 1));
    }

    for (auto _ : state){
        time_merge_on_segments(segments, state);
    }
}

// Args are {number_of_segments, rows_per_segment}, read via state.range(0)
// and state.range(1).
BENCHMARK(BM_merge_interleaved)->Args({10'000, 100});
BENCHMARK(BM_merge_ordered)->Args({10'000, 100});
2 changes: 2 additions & 0 deletions cpp/third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ if(NOT ${ARCTICDB_USING_CONDA})
add_subdirectory(msgpack-c)
add_subdirectory(Remotery)
add_subdirectory(lmdbcxx)
# Skip google-benchmark's own test suite; EXCLUDE_FROM_ALL keeps its targets
# out of the default build unless something links against them.
set(BENCHMARK_ENABLE_TESTING OFF)
add_subdirectory(benchmark EXCLUDE_FROM_ALL)
endif()
1 change: 1 addition & 0 deletions cpp/third_party/benchmark
Submodule benchmark added at 1e96bb
1 change: 1 addition & 0 deletions environment_unix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- cmake
- gtest
- gflags
- benchmark
poodlewars marked this conversation as resolved.
Show resolved Hide resolved
- doxygen
- boost-cpp
- grpcio
Expand Down
Loading