Skip to content

Commit

Permalink
New sorting benchmarks for ordered and sparse data
Browse files Browse the repository at this point in the history
- The sparse data benchmark is disabled for now because sorting by
  sparse columns is not yet supported.
  • Loading branch information
IvoDD committed Dec 19, 2023
1 parent ece90a8 commit b08012d
Showing 1 changed file with 65 additions and 12 deletions.
77 changes: 65 additions & 12 deletions cpp/arcticdb/column_store/test/benchmark_memory_segment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,71 @@ using namespace arcticdb;

// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x

SegmentInMemory get_segment_for_bm(const StreamId &id, size_t num_rows, size_t num_columns){
// Builds a presence mask of `num_rows` bits with `num_set` of them true, at shuffled positions.
//
// num_rows: length of the returned mask.
// num_set:  number of bits to set; values larger than `num_rows` are clamped to `num_rows`.
// Returns the shuffled mask. The engine is seeded with a constant, so the sequence of masks
// produced by successive calls is the same on every run of the program.
std::vector<bool> get_sparse_bits(size_t num_rows, size_t num_set){
    // Clamp first: filling `num_set` bits when num_set > num_rows would write past end().
    const auto set_count = std::min(num_set, num_rows);
    auto sparse_bits = std::vector<bool>(num_rows, false);
    std::fill_n(sparse_bits.begin(), set_count, true);

    // std::random_shuffle was removed in C++17; std::shuffle with an explicit engine replaces it.
    // The engine is function-local state and is not synchronised for concurrent callers.
    static std::mt19937_64 engine{0x5eedb175ULL};
    std::shuffle(sparse_bits.begin(), sparse_bits.end(), engine);
    return sparse_bits;
}

// Returns the values 1..num_rows (inclusive) in shuffled order.
//
// num_rows: number of elements to generate; 0 yields an empty vector.
// The engine is seeded with a constant, so the sequence of permutations produced by
// successive calls is the same on every run of the program.
std::vector<uint64_t> get_random_permutation(size_t num_rows){
    auto result = std::vector<uint64_t>(num_rows);
    // Seed iota with a uint64_t so the running counter has the element type rather than int.
    std::iota(result.begin(), result.end(), uint64_t{1});

    // std::random_shuffle was removed in C++17; std::shuffle with an explicit engine replaces it.
    // The engine is function-local state and is not synchronised for concurrent callers.
    static std::mt19937_64 engine{0x9e3779b9ULL};
    std::shuffle(result.begin(), result.end(), engine);
    return result;
}

// Builds an in-memory segment for the sort benchmarks, filled with shuffled UINT64 data.
//
// id:             stream id handed to get_test_descriptor.
// num_rows:       row count passed to the SegmentInMemory constructor and used as the length of each column's data.
// num_columns:    number of "column_<i>" fields declared in the descriptor.
// sparse_percent: when set, only num_rows * (1 - sparse_percent) rows of each column receive a value;
//                 when empty, every row receives one. Presumably a fraction in [0, 1] despite the name — confirm.
// Returns the populated segment.
SegmentInMemory get_shuffled_segment(const StreamId &id, size_t num_rows, size_t num_columns, std::optional<float> sparse_percent = std::nullopt){
auto fields = std::vector<FieldRef>(num_columns);
// NOTE(review): the next three lines look like the superseded mixed-type field setup, shown here next to
// its replacement by the diff view; the `for` they open is never closed — confirm which version is live.
auto data_types = std::vector<DataType>{DataType::UINT8, DataType::UINT64, DataType::FLOAT64, DataType::ASCII_FIXED64};
for (size_t i=0; i<num_columns; ++i){
fields[i] = scalar_field(data_types[i%data_types.size()], "column_"+std::to_string(i));
// Every field is declared as UINT64 and named "column_<i>".
for (auto i=0u; i<num_columns; ++i){
fields[i] = scalar_field(DataType::UINT64, "column_"+std::to_string(i));
}
// The last constructor argument is true only when sparse_percent was supplied
// (presumably the flag that allows sparse columns — confirm against the SegmentInMemory constructor).
auto segment = SegmentInMemory{
get_test_descriptor<stream::TimeseriesIndex>(id, fields),
num_rows,
false,
sparse_percent.has_value()
};

// Number of rows per column that actually receive a value.
auto num_set = sparse_percent.has_value() ? size_t(num_rows * (1-sparse_percent.value())) : num_rows;
// `<=` visits num_columns + 1 columns; presumably the extra one is the index column that the
// timeseries descriptor adds ahead of the declared fields — confirm.
for (auto i=0u; i<=num_columns; ++i){
auto& column = segment.column(i);
// Each column gets its own permutation of 1..num_rows and its own presence mask.
auto values = get_random_permutation(num_rows);
auto has_value = get_sparse_bits(num_rows, num_set);
for (auto j=0u; j<num_rows; ++j){
if (has_value[j]){
column.set_scalar(j, values[j]);
}
}
}
// NOTE(review): num_rows - 1 wraps around for num_rows == 0 because num_rows is unsigned.
segment.set_row_data(num_rows-1);

return segment;
}

// Benchmark: sort a shuffled segment by its "time" column.
// state.range(0) is forwarded as the row count and state.range(1) as the column count.
static void BM_sort_shuffled(benchmark::State& state) {
    const auto row_count = state.range(0);
    const auto column_count = state.range(1);
    auto shuffled = get_shuffled_segment("test", row_count, column_count);

    for (auto _ : state) {
        // Take the copy with the timer paused so only the sort call is measured
        // and each iteration starts from the same unsorted data.
        state.PauseTiming();
        auto scratch = shuffled.clone();
        state.ResumeTiming();

        scratch.sort("time");
    }
}

// Benchmark: sort a segment that is already ordered by its "time" column.
// state.range(0) is forwarded as the row count and state.range(1) as the column count.
static void BM_sort_ordered(benchmark::State& state) {
auto segment = get_shuffled_segment("test", state.range(0), state.range(1));
// Sort once up front so every timed sort below runs on already-sorted input.
segment.sort("time");
for (auto _ : state) {
// Clone with the timer paused so only the sort call is measured.
state.PauseTiming();
auto temp = segment.clone();
state.ResumeTiming();
temp.sort("time");
}
// NOTE(review): the two lines below look like leftovers of the removed get_segment_for_bm helper kept by
// the diff view: `id`, `fields` and `num_rows` are not declared in this function, and a void function
// cannot return a value — confirm they are not part of the live code.
auto test_frame = get_test_frame<stream::TimeseriesIndex>(id, fields, num_rows, 0, 0);
return test_frame.segment_;
}

static void BM_sort(benchmark::State& state) {
auto segment = get_segment_for_bm("test", state.range(0), state.range(1));
std::random_device rng;
std::mt19937 urng(rng());
std::shuffle(segment.begin(), segment.end(), urng);
[[maybe_unused]] static void BM_sort_sparse(benchmark::State& state) {
auto segment = get_shuffled_segment("test", state.range(0), state.range(1), 0.2);
for (auto _ : state) {
state.PauseTiming();
auto temp = segment.clone();
Expand All @@ -40,4 +90,7 @@ static void BM_sort(benchmark::State& state) {
}
}

BENCHMARK(BM_sort)->Args({100'000, 100})->Args({1'000'000, 1});
// Each Args pair is {num_rows, num_columns}: the benchmarks forward state.range(0) and state.range(1)
// to get_shuffled_segment in that order.
BENCHMARK(BM_sort_shuffled)->Args({100'000, 100})->Args({1'000'000, 1});
BENCHMARK(BM_sort_ordered)->Args({100'000, 100})->Args({1'000'000, 1});
//TODO: Enable sparse benchmark when sorting sparse columns is supported
//BENCHMARK(BM_sort_sparse)->Args({100'000, 100})->Args({1'000'000, 1});

0 comments on commit b08012d

Please sign in to comment.