Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce sorting and merging google benchmarks #1138

Merged
merged 4 commits into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
[submodule "cpp/third_party/rapidcheck"]
path = cpp/third_party/rapidcheck
url = https://github.com/emil-e/rapidcheck.git
[submodule "cpp/third_party/benchmark"]
path = cpp/third_party/benchmark
url = https://github.com/google/benchmark.git
[submodule "cpp/vcpkg"]
path = cpp/vcpkg
url = https://github.com/microsoft/vcpkg.git
17 changes: 17 additions & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ else()
find_package(robin_hood REQUIRED)
find_package(LMDB REQUIRED)
find_package(LMDBXX REQUIRED)
find_package(benchmark REQUIRED)

if(${BUILD_WITH_REMOTERY})
find_package(Remotery REQUIRED)
Expand Down Expand Up @@ -857,6 +858,22 @@ if(${TEST})

gtest_discover_tests(test_unit_arcticdb PROPERTIES DISCOVERY_TIMEOUT 60)

# Google-benchmark micro-benchmarks; see the benchmark_*.cpp sources for the
# suggested command-line flags. Built inside the if(${TEST}) section, so the
# benchmarks target is only configured when tests are enabled.
set(benchmark_srcs
column_store/test/benchmark_memory_segment.cpp
processing/test/benchmark_clause.cpp)

add_executable(benchmarks ${benchmark_srcs})

target_link_libraries(benchmarks
PUBLIC
# benchmark::benchmark_main supplies main() so the sources only declare benchmarks.
benchmark::benchmark
benchmark::benchmark_main
${COMMON_PUBLIC_TEST_LIBRARIES}
PRIVATE
${AWSSDK_LINK_LIBRARIES}
arcticdb_core_static
)

set(rapidcheck_srcs
column_store/test/rapidcheck_column_store.cpp
column_store/test/rapidcheck_chunked_buffer.cpp
Expand Down
2 changes: 1 addition & 1 deletion cpp/arcticdb/column_store/memory_segment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ class SegmentInMemory {
impl_->sort(column);
}

SegmentInMemory clone() {
// Deep copy: clones the underlying impl into a fresh shared_ptr, so the
// returned segment is fully independent of this one. const-qualified so
// read-only segments (e.g. benchmark fixtures) can be cloned.
SegmentInMemory clone() const {
    return SegmentInMemory(std::make_shared<SegmentInMemoryImpl>(impl_->clone()));
}

Expand Down
103 changes: 103 additions & 0 deletions cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/* Copyright 2023 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
*/

#include <benchmark/benchmark.h>

#include <arcticdb/column_store/memory_segment.hpp>
#include <arcticdb/stream/test/stream_test_common.hpp>
#include <folly/container/Enumerate.h>

#include <algorithm>

using namespace arcticdb;

// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x

// Builds a boolean presence mask of length num_rows with exactly num_set
// entries set to true, at positions chosen uniformly at random.
//
// The engine is taken by reference: the previous by-value signature copied
// the (~2.5 KB) mt19937 state on every call, so successive calls that passed
// the same lvalue engine all drew from the identical state and produced
// correlated masks. With a reference the caller's seeded engine advances, so
// consecutive calls yield independent (but still deterministic) shuffles.
std::vector<bool> get_sparse_bits(size_t num_rows, size_t num_set, std::mt19937& g){
    auto sparse_bits = std::vector<bool>(num_rows, false);
    std::fill(sparse_bits.begin(), sparse_bits.begin()+num_set, true);
    std::shuffle(sparse_bits.begin(), sparse_bits.end(), g);
    return sparse_bits;
}

// Returns the values 1..num_rows in a uniformly random order.
//
// As with get_sparse_bits, the engine is passed by reference rather than by
// value: copying the engine meant every call with the same lvalue engine
// returned the identical permutation (e.g. every column of a shuffled test
// segment). By reference, the seeded engine advances deterministically and
// each call produces a different permutation.
std::vector<uint64_t> get_random_permutation(size_t num_rows, std::mt19937& g){
    auto result = std::vector<uint64_t>(num_rows);
    std::iota(result.begin(), result.end(), 1);
    std::shuffle(result.begin(), result.end(), g);
    return result;
}

// Builds a deterministic test segment with a timeseries index plus
// [num_columns] UINT64 data columns, each holding a shuffled permutation of
// 1..num_rows. When [sparsity_percentage] is set, that fraction of values is
// left unset in every data column (the index column always stays dense).
SegmentInMemory get_shuffled_segment(const StreamId& id, size_t num_rows, size_t num_columns, std::optional<float> sparsity_percentage = std::nullopt){
    // We use a seed to get the same shuffled segment for given arguments.
    std::mt19937 g(0);
    auto fields = std::vector<FieldRef>(num_columns);
    for (auto i=0u; i<num_columns; ++i){
        fields[i] = scalar_field(DataType::UINT64, "column_"+std::to_string(i));
    }
    // Final ctor arg marks the segment as sparse-capable only when sparsity
    // was requested.
    auto segment = SegmentInMemory{
        get_test_descriptor<stream::TimeseriesIndex>(id, fields),
        num_rows,
        false,
        sparsity_percentage.has_value()
    };

    // <= is deliberate: the descriptor presumably contributes one extra index
    // column at position 0 on top of the num_columns data fields (the i!=0
    // check below treats column 0 as the sort/index column) — confirm against
    // get_test_descriptor.
    for (auto i=0u; i<=num_columns; ++i){
        auto& column = segment.column(i);
        auto values = get_random_permutation(num_rows, g);
        // We ensure the column we're sorting by is NOT sparse. As of 2023/12 sorting by sparse columns is not supported.
        auto num_set = num_rows;
        if (i!=0 && sparsity_percentage.has_value()){
            num_set = size_t(num_rows * (1-sparsity_percentage.value()));
        }
        auto has_value = get_sparse_bits(num_rows, num_set, g);
        // Only write the positions selected by the sparsity mask.
        for (auto j=0u; j<num_rows; ++j){
            if (has_value[j]){
                column.set_scalar(j, values[j]);
            }
        }
    }
    segment.set_row_data(num_rows-1);

    return segment;
}

// Times SegmentInMemory::sort on fully shuffled data. The clone happens with
// the timer paused so each iteration sorts a fresh, unsorted copy.
static void BM_sort_shuffled(benchmark::State& state) {
    auto source = get_shuffled_segment("test", state.range(0), state.range(1));
    for (auto _ : state) {
        state.PauseTiming();
        auto working_copy = source.clone();
        state.ResumeTiming();
        working_copy.sort("time");
    }
}

// Times SegmentInMemory::sort on already-sorted data: the fixture is sorted
// once up front, so every timed iteration sorts an ordered clone.
static void BM_sort_ordered(benchmark::State& state) {
    auto source = get_shuffled_segment("test", state.range(0), state.range(1));
    source.sort("time");
    for (auto _ : state) {
        state.PauseTiming();
        auto working_copy = source.clone();
        state.ResumeTiming();
        working_copy.sort("time");
    }
}

// Times SegmentInMemory::sort when half of the values in each non-index
// column are unset (sparsity_percentage = 0.5).
static void BM_sort_sparse(benchmark::State& state) {
    auto source = get_shuffled_segment("test", state.range(0), state.range(1), 0.5);
    for (auto _ : state) {
        state.PauseTiming();
        auto working_copy = source.clone();
        state.ResumeTiming();
        working_copy.sort("time");
    }
}

// Args are {num_rows, num_columns}, forwarded to get_shuffled_segment via
// state.range(0)/state.range(1).
// The {100k, 100} puts more weight on the sort_external part of the sort
// where the {1M, 1} puts more weight on the create_jive_table part.
BENCHMARK(BM_sort_shuffled)->Args({100'000, 100})->Args({1'000'000, 1});
BENCHMARK(BM_sort_ordered)->Args({100'000, 100});
BENCHMARK(BM_sort_sparse)->Args({100'000, 100});
90 changes: 90 additions & 0 deletions cpp/arcticdb/processing/test/benchmark_clause.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/* Copyright 2023 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
*/

#include <benchmark/benchmark.h>

#include <arcticdb/processing/clause.hpp>
#include <arcticdb/util/test/generators.hpp>
#include <arcticdb/column_store/memory_segment.hpp>
#include <folly/futures/Future.h>
#include <arcticdb/pipeline/frame_slice.hpp>

using namespace arcticdb;

// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x

// Builds a timeseries segment named [id] with a single UINT8 "column" field.
// Index values start at [start] and advance by [step]; callers pick start and
// step per segment to control how much the merged inputs interleave.
SegmentInMemory get_segment_for_merge(const StreamId &id, size_t num_rows, size_t start, size_t step){
    auto segment = SegmentInMemory{
        get_test_descriptor<stream::TimeseriesIndex>(id, {
            scalar_field(DataType::UINT8, "column")
        }),
        num_rows
    };
    // Column 0 holds the timeseries index, column 1 the declared data field.
    auto& index_col = segment.column(0);
    auto& value_col = segment.column(1);
    for (auto i=0u; i<num_rows; ++i){
        index_col.push_back(static_cast<int64_t>(start + i*step));
        // NOTE(review): uint8_t wraps for num_rows > 255 — values look like a
        // throwaway payload here, but confirm if exact values ever matter.
        value_col.push_back(static_cast<uint8_t>(i));
    }
    segment.set_row_data(num_rows-1);
    return segment;
}

// Runs a MergeClause over clones of [segments], timing only the merge itself.
// Must be invoked from inside an active benchmark timing loop: the timer is
// paused while the inputs are cloned and the clause is wired up, and resumed
// just before process() runs.
void time_merge_on_segments(const std::vector<SegmentInMemory> &segments, benchmark::State& state){
    // Pauses the timing while setting up the merge clause to only time the merging itself
    state.PauseTiming();
    auto component_manager = std::make_shared<ComponentManager>();
    Composite<EntityIds> entity_ids;
    for (auto& segment : segments){
        // Clone so repeated benchmark iterations all consume identical,
        // unmerged inputs.
        auto proc_unit = ProcessingUnit{segment.clone()};
        entity_ids.push_back(push_entities(component_manager, std::move(proc_unit)));
    }

    auto stream_id = StreamId("Merge");
    StreamDescriptor descriptor{};
    descriptor.add_field(FieldRef{make_scalar_type(DataType::NANOSECONDS_UTC64),"time"});
    MergeClause merge_clause{TimeseriesIndex{"time"}, DenseColumnPolicy{}, stream_id, descriptor};
    merge_clause.set_component_manager(component_manager);
    state.ResumeTiming();

    // Result is discarded — only the wall time of process() matters here.
    auto _ = merge_clause.process(std::move(entity_ids));
}

// Worst case for merging: a step size equal to the segment count plus
// staggered starts makes every segment contribute a row in turn, so the
// inputs interleave completely.
static void BM_merge_interleaved(benchmark::State& state){
    const auto segment_count = state.range(0);
    const auto rows_per_segment = state.range(1);
    std::vector<SegmentInMemory> segments;
    segments.reserve(segment_count);
    for (int64_t idx = 0; idx < segment_count; ++idx){
        segments.emplace_back(get_segment_for_merge(
            "merge_" + std::to_string(idx), rows_per_segment, idx, segment_count));
    }

    for (auto _ : state){
        time_merge_on_segments(segments, state);
    }
}

// Best case for merging: start offsets of idx * segment_count keep each
// segment's index values ahead of the previous segment's (for the registered
// arg shapes), so the inputs merge completely in order.
static void BM_merge_ordered(benchmark::State& state){
    const auto segment_count = state.range(0);
    const auto rows_per_segment = state.range(1);
    std::vector<SegmentInMemory> segments;
    segments.reserve(segment_count);
    for (int64_t idx = 0; idx < segment_count; ++idx){
        segments.emplace_back(get_segment_for_merge(
            "merge_" + std::to_string(idx), rows_per_segment, idx * segment_count, 1));
    }

    for (auto _ : state){
        time_merge_on_segments(segments, state);
    }
}

// Args are {number_of_segments, rows_per_segment}, read via state.range(0)
// and state.range(1).
BENCHMARK(BM_merge_interleaved)->Args({10'000, 100});
BENCHMARK(BM_merge_ordered)->Args({10'000, 100});
2 changes: 2 additions & 0 deletions cpp/third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ if(NOT ${ARCTICDB_USING_CONDA})
add_subdirectory(msgpack-c)
add_subdirectory(Remotery)
add_subdirectory(lmdbcxx)
# Skip google-benchmark's own test suite; EXCLUDE_FROM_ALL keeps its targets
# out of the default build unless something links against them.
set(BENCHMARK_ENABLE_TESTING OFF)
add_subdirectory(benchmark EXCLUDE_FROM_ALL)
endif()
1 change: 1 addition & 0 deletions cpp/third_party/benchmark
Submodule benchmark added at 1e96bb
1 change: 1 addition & 0 deletions environment_unix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- cmake
- gtest
- gflags
- benchmark
poodlewars marked this conversation as resolved.
Show resolved Hide resolved
- doxygen
- boost-cpp
- grpcio
Expand Down
Loading