From 3a5944057886828a767a47a056c926058e9c7d7c Mon Sep 17 00:00:00 2001 From: binwei Date: Sat, 30 Apr 2022 16:32:25 +0800 Subject: [PATCH 01/19] merge master and branch shuffle_opt_fillbyreducer. To submit PR to upstream Implemented fill by reducer --- native-sql-engine/cpp/CMakeLists.txt | 3 + .../src/benchmarks/shuffle_split_benchmark.cc | 499 ++++++++++-------- native-sql-engine/cpp/src/shuffle/splitter.cc | 167 +++++- native-sql-engine/cpp/src/shuffle/splitter.h | 4 + 4 files changed, 436 insertions(+), 237 deletions(-) diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt index a1301fd1d..fe7e989ee 100644 --- a/native-sql-engine/cpp/CMakeLists.txt +++ b/native-sql-engine/cpp/CMakeLists.txt @@ -1,6 +1,9 @@ cmake_minimum_required(VERSION 3.16) project(spark_columnar_plugin) +#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) +add_definitions(-DPROCESSROW) + #add_compile_options(-g) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index cd81ef877..ec1416641 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -41,24 +41,12 @@ namespace shuffle { const int batch_buffer_size = 32768; const int split_buffer_size = 8192; -class BenchmarkShuffleSplit : public ::benchmark::Fixture { +class BenchmarkShuffleSplit { public: - BenchmarkShuffleSplit() { - file_name = - "/mnt/DP_disk1/lineitem/" - "part-00025-356249a2-c285-42b9-8a18-5b10be61e0c4-c000.snappy.parquet"; - + BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); - std::cout << schema->ToString() << std::endl; - const auto& fields = schema->fields(); - for (const auto& field : fields) { - if (field->name() == "l_orderkey") { - auto node = gandiva::TreeExprBuilder::MakeField(field); - expr_vector.push_back(gandiva::TreeExprBuilder::MakeExpression( - std::move(node), arrow::field("res_" + field->name(), field->type()))); - } - } } + void GetRecordBatchReader(const std::string& input_file) { std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; std::shared_ptr record_batch_reader; @@ -89,11 +77,97 @@ class BenchmarkShuffleSplit : public ::benchmark::Fixture { for (int i = 0; i < num_columns; ++i) { column_indices.push_back(i); } + const auto& fields = schema->fields(); + for (const auto& field : fields) { + if (field->name() == "l_orderkey") { + auto node = gandiva::TreeExprBuilder::MakeField(field); + expr_vector.push_back(gandiva::TreeExprBuilder::MakeExpression( + std::move(node), arrow::field("res_" + field->name(), field->type()))); + } + } } - void SetUp(const ::benchmark::State& state) {} + void operator()(benchmark::State& state) { + SetCPU(state.thread_index()); + arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1); + + const int num_partitions = state.range(0); + + auto options = SplitOptions::Defaults(); + options.compression_type = compression_type; + options.buffer_size = split_buffer_size; + options.buffered_write = true; + options.offheap_per_task = 128 * 1024 * 1024 * 1024L; + options.prefer_spill = true; + options.write_schema = false; + + std::shared_ptr splitter; + int64_t elapse_read = 0; + int64_t num_batches = 0; + int64_t num_rows = 0; + int64_t split_time = 0; + + Do_Split(splitter, elapse_read, num_batches, num_rows, split_time, + num_partitions, options, state); + + auto fs = std::make_shared(); + fs->DeleteFile(splitter->DataFile()); + + state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes())); + + state.counters["rowgroups"] = + benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["columns"] = + benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["batches"] = benchmark::Counter( + num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["num_rows"] = benchmark::Counter( + num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["num_partitions"] = benchmark::Counter( + num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["batch_buffer_size"] = + benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + state.counters["split_buffer_size"] = + benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + + state.counters["bytes_spilled"] = + benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + state.counters["bytes_written"] = + benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + state.counters["bytes_raw"] = + benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + state.counters["bytes_spilled"] = + benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + + state.counters["parquet_parse"] = benchmark::Counter( + elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["compute_pid_time"] = + benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["write_time"] = + benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["spill_time"] = + benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["compress_time"] = + benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + + split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() - + splitter->TotalCompressTime() - splitter->TotalWriteTime(); + state.counters["split_time"] = benchmark::Counter( + split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - void TearDown(const ::benchmark::State& state) {} + } protected: long SetCPU(uint32_t cpuindex) { @@ -102,9 +176,9 @@ class BenchmarkShuffleSplit : public ::benchmark::Fixture { CPU_SET(cpuindex, &cs); return sched_setaffinity(0, sizeof(cs), &cs); } - virtual void Do_Split(const std::shared_ptr& splitter, int64_t& elapse_read, + virtual void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - benchmark::State& state) {} + const int num_partitions, SplitOptions options, benchmark::State& state) {} protected: std::string file_name; @@ -116,232 +190,124 @@ class BenchmarkShuffleSplit : public ::benchmark::Fixture { parquet::ArrowReaderProperties properties; }; -BENCHMARK_DEFINE_F(BenchmarkShuffleSplit, CacheScan)(benchmark::State& state) { - SetCPU(state.thread_index()); - arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1); +class BenchmarkShuffleSplit_CacheScan_Benchmark: public BenchmarkShuffleSplit{ +public: +BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){} - const int num_partitions = state.range(0); +protected: + void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, + int64_t& num_batches, int64_t& num_rows, int64_t& split_time, + const int num_partitions, SplitOptions options, benchmark::State& state) { + + std::vector local_column_indices; + local_column_indices.push_back(0); + local_column_indices.push_back(1); + local_column_indices.push_back(2); + local_column_indices.push_back(4); + local_column_indices.push_back(5); + local_column_indices.push_back(6); + local_column_indices.push_back(7); + + std::shared_ptr local_schema; + local_schema = std::make_shared(*schema.get()); + + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3)); + + if(state.thread_index() == 0) + std::cout << local_schema->ToString() << std::endl; - auto options = SplitOptions::Defaults(); - options.compression_type = compression_type; - options.buffer_size = split_buffer_size; - options.buffered_write = true; - options.offheap_per_task = 128 * 1024 * 1024 * 1024L; - options.prefer_spill = true; - options.write_schema = false; + ARROW_ASSIGN_OR_THROW( + splitter, Splitter::Make("rr", local_schema, num_partitions, options)); + + std::shared_ptr record_batch; - std::shared_ptr splitter; + std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; + std::shared_ptr record_batch_reader; + ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( + arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties, + &parquet_reader)); - if (!expr_vector.empty()) { - ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions, - expr_vector, std::move(options))); - } else { - ARROW_ASSIGN_OR_THROW( - splitter, Splitter::Make("rr", schema, num_partitions, std::move(options))); - } + std::vector> batches; + ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, local_column_indices, + &record_batch_reader)); + do { + TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); - std::shared_ptr record_batch; - int64_t elapse_read = 0; - int64_t num_batches = 0; - int64_t num_rows = 0; - int64_t split_time = 0; - - std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; - std::shared_ptr record_batch_reader; - ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties, - &parquet_reader)); - - std::vector> batches; - ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices, - &record_batch_reader)); - do { - TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); - - if (record_batch) { - batches.push_back(record_batch); - num_batches += 1; - num_rows += record_batch->num_rows(); + if (record_batch) { + batches.push_back(record_batch); + num_batches += 1; + num_rows += record_batch->num_rows(); + } + } while (record_batch); + std::cout << "parquet parse done elapsed time " << elapse_read/1000000 << " ms " << std::endl; + std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl; + + for (auto _ : state) { + for_each(batches.begin(), batches.end(), + [&splitter, &split_time](std::shared_ptr& record_batch) { + TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); + }); } - } while (record_batch); - for (auto _ : state) { - for_each(batches.begin(), batches.end(), - [&splitter, &split_time](std::shared_ptr& record_batch) { - TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); - }); + TIME_NANO_OR_THROW(split_time, splitter->Stop()); } - TIME_NANO_OR_THROW(split_time, splitter->Stop()); - - auto fs = std::make_shared(); - fs->DeleteFile(splitter->DataFile()); - - state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes())); - - state.counters["rowgroups"] = - benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["columns"] = - benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["batches"] = benchmark::Counter( - num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_rows"] = benchmark::Counter( - num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_partitions"] = benchmark::Counter( - num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["batch_buffer_size"] = - benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["split_buffer_size"] = - benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - - state.counters["bytes_spilled"] = - benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_written"] = - benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_raw"] = - benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_spilled"] = - benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - - state.counters["parquet_parse"] = benchmark::Counter( - elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["compute_pid_time"] = - benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["write_time"] = - benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["spill_time"] = - benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["compress_time"] = - benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - - split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() - - splitter->TotalCompressTime() - splitter->TotalWriteTime(); - state.counters["split_time"] = benchmark::Counter( - split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); -} - -BENCHMARK_DEFINE_F(BenchmarkShuffleSplit, IterateScan)(benchmark::State& state) { - SetCPU(state.thread_index()); - - arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1); - - const int num_partitions = state.range(0); - - auto options = SplitOptions::Defaults(); - options.compression_type = compression_type; - options.buffer_size = split_buffer_size; - options.buffered_write = true; - options.offheap_per_task = 128 * 1024 * 1024 * 1024L; - options.prefer_spill = true; - options.write_schema = false; - - std::shared_ptr splitter; - - if (!expr_vector.empty()) { - ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions, - expr_vector, std::move(options))); - } else { - ARROW_ASSIGN_OR_THROW( - splitter, Splitter::Make("rr", schema, num_partitions, std::move(options))); - } - int64_t elapse_read = 0; - int64_t num_batches = 0; - int64_t num_rows = 0; - int64_t split_time = 0; +}; - std::shared_ptr record_batch; - std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; - std::shared_ptr record_batch_reader; - ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties, - &parquet_reader)); +class BenchmarkShuffleSplit_IterateScan_Benchmark: public BenchmarkShuffleSplit{ +public: +BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){} - for (auto _ : state) { - std::vector> batches; - ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices, - &record_batch_reader)); - TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); - while (record_batch) { - num_batches += 1; - num_rows += record_batch->num_rows(); - TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); +protected: + void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, + int64_t& num_batches, int64_t& num_rows, int64_t& split_time, + const int num_partitions, SplitOptions options, benchmark::State& state) { + + if(state.thread_index() == 0) + std::cout << schema->ToString() << std::endl; + + if (!expr_vector.empty()) { + ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions, + expr_vector, std::move(options))); + } else { + ARROW_ASSIGN_OR_THROW( + splitter, Splitter::Make("rr", schema, num_partitions, std::move(options))); + } + + std::shared_ptr record_batch; + + std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; + std::shared_ptr record_batch_reader; + ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( + arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties, + &parquet_reader)); + + for (auto _ : state) { + std::vector> batches; + ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices, + &record_batch_reader)); TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); + while (record_batch) { + num_batches += 1; + num_rows += record_batch->num_rows(); + TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); + TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); + } } + TIME_NANO_OR_THROW(split_time, splitter->Stop()); } - TIME_NANO_OR_THROW(split_time, splitter->Stop()); - - auto fs = std::make_shared(); - fs->DeleteFile(splitter->DataFile()); - - state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes())); - - state.counters["rowgroups"] = - benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["columns"] = - benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["batches"] = benchmark::Counter( - num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_rows"] = benchmark::Counter( - num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_partitions"] = benchmark::Counter( - num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["batch_buffer_size"] = - benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["split_buffer_size"] = - benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - - state.counters["bytes_spilled"] = - benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_written"] = - benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_raw"] = - benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_spilled"] = - benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - - state.counters["parquet_parse"] = benchmark::Counter( - elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["compute_pid_time"] = - benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["write_time"] = - benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["spill_time"] = - benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["compress_time"] = - benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - - split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() - - splitter->TotalCompressTime() - splitter->TotalWriteTime(); - state.counters["split_time"] = benchmark::Counter( - split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); -} +}; /*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan)->Iterations(1) ->Args({96*2, arrow::Compression::FASTPFOR}) @@ -370,14 +336,79 @@ BENCHMARK_DEFINE_F(BenchmarkShuffleSplit, IterateScan)(benchmark::State& state) ->Threads(16) ->Threads(24) ->Unit(benchmark::kSecond);*/ -BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, IterateScan) - ->Iterations(1) - ->Args({96 * 16, arrow::Compression::FASTPFOR}) - ->Threads(24) +/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan) + ->Iterations(1000000) + ->Args({512, arrow::Compression::FASTPFOR}) + ->Threads(1) ->ReportAggregatesOnly(false) ->MeasureProcessCPUTime() - ->Unit(benchmark::kSecond); + ->Unit(benchmark::kSecond);*/ } // namespace shuffle } // namespace sparkcolumnarplugin -BENCHMARK_MAIN(); +int main(int argc, char** argv) { + + uint32_t iterations=1; + uint32_t partitions=512; + uint32_t threads=1; + std::string datafile; + + for (int i=0;iIterations(iterations) + ->Args({partitions, arrow::Compression::FASTPFOR}) + ->Threads(threads) + ->ReportAggregatesOnly(false) + ->MeasureProcessCPUTime() + ->Unit(benchmark::kSecond); + +/* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark bck(datafile); + + benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) + ->Iterations(1) + ->Args({96*2, arrow::Compression::FASTPFOR}) + ->Args({96*4, arrow::Compression::FASTPFOR}) + ->Args({96*8, arrow::Compression::FASTPFOR}) + ->Args({96*16, arrow::Compression::FASTPFOR}) + ->Args({96*32, arrow::Compression::FASTPFOR}) + ->Threads(24) + ->Unit(benchmark::kSecond); + + benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) + ->Iterations(1) + ->Args({4096, arrow::Compression::FASTPFOR}) + ->Threads(1) + ->Threads(2) + ->Threads(4) + ->Threads(8) + ->Threads(16) + ->Threads(24) + ->Unit(benchmark::kSecond); +*/ + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); +} \ No newline at end of file diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index e739bd04f..798668dde 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -28,20 +28,45 @@ #include #include +#include +#include +#include #include "shuffle/utils.h" #include "utils/macros.h" +#include -#if defined(COLUMNAR_PLUGIN_USE_AVX512) +/*#if defined(COLUMNAR_PLUGIN_USE_AVX512) #include #else #include #endif +*/ namespace sparkcolumnarplugin { namespace shuffle { using arrow::internal::checked_cast; + + + +template +std::string __m128i_toString(const __m128i var) { + std::stringstream sstr; + T values[16/sizeof(T)]; + std::memcpy(values,&var,sizeof(values)); //See discussion below + if (sizeof(T) == 1) { + for (unsigned int i = 0; i < sizeof(__m128i); i++) { //C++11: Range for also possible + sstr << std::hex << (int) values[i] << " " << std::dec; + } + } else { + for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T); i++) { //C++11: Range for also possible + sstr << std::hex << values[i] << " " << std::dec; + } + } + return sstr.str(); +} + SplitOptions SplitOptions::Defaults() { return SplitOptions(); } #if defined(COLUMNAR_PLUGIN_USE_AVX512) inline __m256i CountPartitionIdOccurrence(const std::vector& partition_id, @@ -293,6 +318,7 @@ arrow::Status Splitter::Init() { partition_cached_recordbatch_size_.resize(num_partitions_); partition_lengths_.resize(num_partitions_); raw_partition_lengths_.resize(num_partitions_); + reducer_offset_offset_.resize(num_partitions_ + 1); for (int i = 0; i < column_type_id_.size(); ++i) { switch (column_type_id_[i]->id()) { @@ -815,6 +841,26 @@ arrow::Result Splitter::SpillLargestPartition(int64_t* size) { } arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { +#ifdef PROCESSROW + + reducer_offsets_.resize(rb.num_rows()); + + reducer_offset_offset_[0] = 0; + for (auto pid = 1; pid <= num_partitions_; pid++) { + reducer_offset_offset_[pid] = + reducer_offset_offset_[pid - 1] + partition_id_cnt_[pid - 1]; + } + for (auto row = 0; row < rb.num_rows(); row++) { + auto pid = partition_id_[row]; + reducer_offsets_[reducer_offset_offset_[pid]] = row; + _mm_prefetch(reducer_offsets_.data() + reducer_offset_offset_[pid] + 32, _MM_HINT_T0); + reducer_offset_offset_[pid]++; + } + std::transform(reducer_offset_offset_.begin(), std::prev(reducer_offset_offset_.end()), + partition_id_cnt_.begin(), reducer_offset_offset_.begin(), + [](uint16_t x, int16_t y) { return x - y; }); + +#endif // for the first input record batch, scan binary arrays and large binary // arrays to get their empirical sizes @@ -922,6 +968,27 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) auto src_addr = const_cast(rb.column_data(col_idx)->buffers[1]->data()); switch (arrow::bit_width(column_type_id_[col_idx]->id())) { +#ifdef PROCESSROW +// assume batch size = 32k; reducer# = 4K; row/reducer = 8 +#define PROCESS(_CTYPE) \ + std::transform(partition_buffer_idx_offset_.begin(), \ + partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ + partition_buffer_idx_offset_.begin(), \ + [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); }); \ + for (auto pid = 0; pid < num_partitions_; pid++) { \ + auto dst_pid_base = \ + reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/ \ + auto r = reducer_offset_offset_[pid]; /*8k*/ \ + auto size = reducer_offset_offset_[pid + 1]; \ + for (r; r < size; r++) { \ + auto src_offset = reducer_offsets_[r]; /*16k*/ \ + *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[src_offset]; /*64k*/ \ + _mm_prefetch(&(src_addr)[src_offset * sizeof(_CTYPE) + 64], _MM_HINT_T2); \ + dst_pid_base += 1; \ + } \ + } \ + break; +#else #define PROCESS(_CTYPE) \ std::transform(partition_buffer_idx_offset_.begin(), \ partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ @@ -932,9 +999,10 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); \ *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[row]; \ partition_buffer_idx_offset_[pid] += sizeof(_CTYPE); \ - _mm_prefetch(&dst_pid_base[1], _MM_HINT_T0); \ + _mm_prefetch(&dst_pid_base[64 / sizeof(_CTYPE)], _MM_HINT_T0); \ } \ break; +#endif case 8: PROCESS(uint8_t) case 16: @@ -942,9 +1010,93 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) case 32: PROCESS(uint32_t) case 64: +#ifdef PROCESSAVX + std::transform(partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), + partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), + [](uint8_t* x, int16_t y) { return x+y*sizeof(uint64_t); }); + for (auto pid = 0; pid < num_partitions_; pid++) + { + auto dst_pid_base = reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ + auto r = reducer_offset_offset_[pid]; /*8k*/ + auto size = reducer_offset_offset_[pid+1]; +#if 1 + for (r; r 0); r++) + { + auto src_offset = reducer_offsets_[r]; /*16k*/ + *dst_pid_base = reinterpret_cast(src_addr)[src_offset]; /*64k*/ + _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2); + dst_pid_base+=1; + } +#if 0 + for (r; r+4(src_addr)[src_offset]; /*64k*/ + _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2); + dst_pid_base+=1; + } + } + break; +#else PROCESS(uint64_t) +#endif + #undef PROCESS case 128: // arrow::Decimal128Type::type_id +#ifdef PROCESSROW + // assume batch size = 32k; reducer# = 4K; row/reducer = 8 + std::transform( + partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), + partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), + [](uint8_t* x, int16_t y) { return x + y * 16; }); + for (auto pid = 0; pid < num_partitions_; pid++) { + auto dst_pid_base = + reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ + auto r = reducer_offset_offset_[pid]; /*8k*/ + auto size = reducer_offset_offset_[pid + 1]; + for (r; r < size; r++) { + auto src_offset = reducer_offsets_[r]; /*16k*/ + *dst_pid_base = + reinterpret_cast(src_addr)[src_offset << 1]; /*128k*/ + *(dst_pid_base + 1) = + reinterpret_cast(src_addr)[src_offset << 1 | 1]; /*128k*/ + _mm_prefetch(&(src_addr)[src_offset * 16 + 64], _MM_HINT_T2); + dst_pid_base += 2; + } + } + break; +#else std::transform( partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), @@ -960,6 +1112,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) _MM_HINT_T0); } break; +#endif case 1: // arrow::BooleanType::type_id: partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size()); std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(), @@ -1159,6 +1312,7 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& if (rb.column_data(col_idx)->GetNullCount() == 0 && column_has_null_[col_idx] == true) { // if the input record batch doesn't have null, set validity to True + // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] access for (auto pid = 0; pid < num_partitions_; ++pid) { if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) { arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid], @@ -1406,7 +1560,14 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch& for (auto i = 0; i < num_rows; ++i) { // positive mod auto pid = pid_arr->Value(i) % num_partitions_; - if (pid < 0) pid = (pid + num_partitions_) % num_partitions_; + //force to generate ASM + __asm__ ( + "lea (%[num_partitions],%[pid],1),%[tmp]\n" + "test %[pid],%[pid]\n" + "cmovs %[tmp],%[pid]\n" + : [pid] "+r"(pid) + : [num_partitions]"r"(num_partitions_),[tmp]"r"(0) + ); partition_id_[i] = pid; partition_id_cnt_[pid]++; } diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index 0dfac2f8c..2fb4bb3d4 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -226,6 +226,10 @@ class Splitter { // updated for each input record batch // col std::vector partition_id_; + // [num_rows] + std::vector reducer_offsets_; + // [num_partitions] + std::vector reducer_offset_offset_; // col std::vector partition_id_cnt_; From 94b733dd463118886e98cfd287a50c14d309f55f Mon Sep 17 00:00:00 2001 From: binwei Date: Sat, 30 Apr 2022 16:42:21 +0800 Subject: [PATCH 02/19] format code --- .../src/benchmarks/shuffle_split_benchmark.cc | 217 +++++++++--------- native-sql-engine/cpp/src/shuffle/splitter.cc | 129 ++++++----- .../src/third_party/parallel_hashmap/btree.h | 21 +- .../src/third_party/parallel_hashmap/phmap.h | 12 +- .../third_party/parallel_hashmap/phmap_base.h | 36 +-- 5 files changed, 204 insertions(+), 211 deletions(-) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index ec1416641..d2bffe36a 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -43,10 +43,8 @@ const int split_buffer_size = 8192; class BenchmarkShuffleSplit { public: - BenchmarkShuffleSplit(std::string file_name) { - GetRecordBatchReader(file_name); - } - + BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); } + void GetRecordBatchReader(const std::string& input_file) { std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; std::shared_ptr record_batch_reader; @@ -107,8 +105,8 @@ class BenchmarkShuffleSplit { int64_t num_rows = 0; int64_t split_time = 0; - Do_Split(splitter, elapse_read, num_batches, num_rows, split_time, - num_partitions, options, state); + Do_Split(splitter, elapse_read, num_batches, num_rows, split_time, num_partitions, + options, state); auto fs = std::make_shared(); fs->DeleteFile(splitter->DataFile()); @@ -117,56 +115,57 @@ class BenchmarkShuffleSplit { state.counters["rowgroups"] = benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); + benchmark::Counter::OneK::kIs1000); state.counters["columns"] = benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); + benchmark::Counter::OneK::kIs1000); state.counters["batches"] = benchmark::Counter( num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); state.counters["num_rows"] = benchmark::Counter( num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_partitions"] = benchmark::Counter( - num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["num_partitions"] = + benchmark::Counter(num_partitions, benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); state.counters["batch_buffer_size"] = benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); + benchmark::Counter::OneK::kIs1024); state.counters["split_buffer_size"] = benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); + benchmark::Counter::OneK::kIs1024); state.counters["bytes_spilled"] = benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); + benchmark::Counter::OneK::kIs1024); state.counters["bytes_written"] = benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); + benchmark::Counter::OneK::kIs1024); state.counters["bytes_raw"] = benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); + benchmark::Counter::OneK::kIs1024); state.counters["bytes_spilled"] = benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); + benchmark::Counter::OneK::kIs1024); state.counters["parquet_parse"] = benchmark::Counter( elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["compute_pid_time"] = - benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); + state.counters["compute_pid_time"] = benchmark::Counter( + splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); state.counters["write_time"] = benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); + benchmark::Counter::OneK::kIs1000); state.counters["spill_time"] = benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); + benchmark::Counter::OneK::kIs1000); state.counters["compress_time"] = benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); + benchmark::Counter::OneK::kIs1000); - split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() - - splitter->TotalCompressTime() - splitter->TotalWriteTime(); + split_time = split_time - splitter->TotalSpillTime() - + splitter->TotalComputePidTime() - splitter->TotalCompressTime() - + splitter->TotalWriteTime(); state.counters["split_time"] = benchmark::Counter( - split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - + split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); } protected: @@ -178,7 +177,8 @@ class BenchmarkShuffleSplit { } virtual void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - const int num_partitions, SplitOptions options, benchmark::State& state) {} + const int num_partitions, SplitOptions options, + benchmark::State& state) {} protected: std::string file_name; @@ -190,16 +190,15 @@ class BenchmarkShuffleSplit { parquet::ArrowReaderProperties properties; }; +class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { + public: + BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename) + : BenchmarkShuffleSplit(filename) {} -class BenchmarkShuffleSplit_CacheScan_Benchmark: public BenchmarkShuffleSplit{ -public: -BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){} - -protected: + protected: void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, - int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - const int num_partitions, SplitOptions options, benchmark::State& state) { - + int64_t& num_batches, int64_t& num_rows, int64_t& split_time, + const int num_partitions, SplitOptions options, benchmark::State& state) { std::vector local_column_indices; local_column_indices.push_back(0); local_column_indices.push_back(1); @@ -208,7 +207,7 @@ BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffle local_column_indices.push_back(5); local_column_indices.push_back(6); local_column_indices.push_back(7); - + std::shared_ptr local_schema; local_schema = std::make_shared(*schema.get()); @@ -222,23 +221,22 @@ BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffle ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8)); ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3)); - if(state.thread_index() == 0) - std::cout << local_schema->ToString() << std::endl; + if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl; + + ARROW_ASSIGN_OR_THROW(splitter, + Splitter::Make("rr", local_schema, num_partitions, options)); - ARROW_ASSIGN_OR_THROW( - splitter, Splitter::Make("rr", local_schema, num_partitions, options)); - std::shared_ptr record_batch; std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; std::shared_ptr record_batch_reader; ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties, - &parquet_reader)); + arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), + properties, &parquet_reader)); std::vector> batches; - ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, local_column_indices, - &record_batch_reader)); + ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader( + row_group_indices, local_column_indices, &record_batch_reader)); do { TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); @@ -248,38 +246,36 @@ BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffle num_rows += record_batch->num_rows(); } } while (record_batch); - std::cout << "parquet parse done elapsed time " << elapse_read/1000000 << " ms " << std::endl; + std::cout << "parquet parse done elapsed time " << elapse_read / 1000000 << " ms " + << std::endl; std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl; for (auto _ : state) { - for_each(batches.begin(), batches.end(), - [&splitter, &split_time](std::shared_ptr& record_batch) { - TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); - }); + for_each( + batches.begin(), batches.end(), + [&splitter, &split_time](std::shared_ptr& record_batch) { + TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); + }); } TIME_NANO_OR_THROW(split_time, splitter->Stop()); } - - }; +class BenchmarkShuffleSplit_IterateScan_Benchmark : public BenchmarkShuffleSplit { + public: + BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename) + : BenchmarkShuffleSplit(filename) {} -class BenchmarkShuffleSplit_IterateScan_Benchmark: public BenchmarkShuffleSplit{ -public: -BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){} - -protected: + protected: void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, - int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - const int num_partitions, SplitOptions options, benchmark::State& state) { - - if(state.thread_index() == 0) - std::cout << schema->ToString() << std::endl; + int64_t& num_batches, int64_t& num_rows, int64_t& split_time, + const int num_partitions, SplitOptions options, benchmark::State& state) { + if (state.thread_index() == 0) std::cout << schema->ToString() << std::endl; if (!expr_vector.empty()) { ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions, - expr_vector, std::move(options))); + expr_vector, std::move(options))); } else { ARROW_ASSIGN_OR_THROW( splitter, Splitter::Make("rr", schema, num_partitions, std::move(options))); @@ -290,13 +286,13 @@ BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuff std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; std::shared_ptr record_batch_reader; ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties, - &parquet_reader)); + arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), + properties, &parquet_reader)); for (auto _ : state) { std::vector> batches; - ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices, - &record_batch_reader)); + ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader( + row_group_indices, column_indices, &record_batch_reader)); TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); while (record_batch) { num_batches += 1; @@ -347,26 +343,20 @@ BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuff } // namespace sparkcolumnarplugin int main(int argc, char** argv) { - - uint32_t iterations=1; - uint32_t partitions=512; - uint32_t threads=1; + uint32_t iterations = 1; + uint32_t partitions = 512; + uint32_t threads = 1; std::string datafile; - for (int i=0;iIterations(iterations) - ->Args({partitions, arrow::Compression::FASTPFOR}) - ->Threads(threads) - ->ReportAggregatesOnly(false) - ->MeasureProcessCPUTime() - ->Unit(benchmark::kSecond); - -/* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark bck(datafile); - - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) - ->Iterations(1) - ->Args({96*2, arrow::Compression::FASTPFOR}) - ->Args({96*4, arrow::Compression::FASTPFOR}) - ->Args({96*8, arrow::Compression::FASTPFOR}) - ->Args({96*16, arrow::Compression::FASTPFOR}) - ->Args({96*32, arrow::Compression::FASTPFOR}) - ->Threads(24) + ->Iterations(iterations) + ->Args({partitions, arrow::Compression::FASTPFOR}) + ->Threads(threads) + ->ReportAggregatesOnly(false) + ->MeasureProcessCPUTime() ->Unit(benchmark::kSecond); - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) - ->Iterations(1) - ->Args({4096, arrow::Compression::FASTPFOR}) - ->Threads(1) - ->Threads(2) - ->Threads(4) - ->Threads(8) - ->Threads(16) - ->Threads(24) - ->Unit(benchmark::kSecond); -*/ + /* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark + bck(datafile); + + benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) + ->Iterations(1) + ->Args({96*2, arrow::Compression::FASTPFOR}) + ->Args({96*4, arrow::Compression::FASTPFOR}) + ->Args({96*8, arrow::Compression::FASTPFOR}) + ->Args({96*16, arrow::Compression::FASTPFOR}) + ->Args({96*32, arrow::Compression::FASTPFOR}) + ->Threads(24) + ->Unit(benchmark::kSecond); + + benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) + ->Iterations(1) + ->Args({4096, arrow::Compression::FASTPFOR}) + ->Threads(1) + ->Threads(2) + ->Threads(4) + ->Threads(8) + ->Threads(16) + ->Threads(24) + ->Unit(benchmark::kSecond); + */ benchmark::Initialize(&argc, argv); benchmark::RunSpecifiedBenchmarks(); diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 798668dde..812dc4516 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -25,16 +25,16 @@ #include #include #include +#include -#include -#include -#include #include +#include #include +#include +#include #include "shuffle/utils.h" #include "utils/macros.h" -#include /*#if defined(COLUMNAR_PLUGIN_USE_AVX512) #include @@ -47,24 +47,23 @@ namespace sparkcolumnarplugin { namespace shuffle { using arrow::internal::checked_cast; - - - template std::string __m128i_toString(const __m128i var) { - std::stringstream sstr; - T values[16/sizeof(T)]; - std::memcpy(values,&var,sizeof(values)); //See discussion below - if (sizeof(T) == 1) { - for (unsigned int i = 0; i < sizeof(__m128i); i++) { //C++11: Range for also possible - sstr << std::hex << (int) values[i] << " " << std::dec; - } - } else { - for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T); i++) { //C++11: Range for also possible - sstr << std::hex << values[i] << " " << std::dec; - } + std::stringstream sstr; + T values[16 / sizeof(T)]; + std::memcpy(values, &var, sizeof(values)); // See discussion below + if (sizeof(T) == 1) { + for (unsigned int i = 0; i < sizeof(__m128i); i++) { // C++11: Range for also + // possible + sstr << std::hex << (int)values[i] << " " << std::dec; + } + } else { + for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T); + i++) { // C++11: Range for also possible + sstr << std::hex << values[i] << " " << std::dec; } - return sstr.str(); + } + return sstr.str(); } SplitOptions SplitOptions::Defaults() { return SplitOptions(); } @@ -1011,22 +1010,22 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) PROCESS(uint32_t) case 64: #ifdef PROCESSAVX - std::transform(partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), - partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x+y*sizeof(uint64_t); }); - for (auto pid = 0; pid < num_partitions_; pid++) - { - auto dst_pid_base = reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ - auto r = reducer_offset_offset_[pid]; /*8k*/ - auto size = reducer_offset_offset_[pid+1]; + std::transform( + partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), + partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), + [](uint8_t* x, int16_t y) { return x + y * sizeof(uint64_t); }); + for (auto pid = 0; pid < num_partitions_; pid++) { + auto dst_pid_base = + reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ + auto r = reducer_offset_offset_[pid]; /*8k*/ + auto size = reducer_offset_offset_[pid + 1]; #if 1 - for (r; r 0); r++) - { - auto src_offset = reducer_offsets_[r]; /*16k*/ - *dst_pid_base = reinterpret_cast(src_addr)[src_offset]; /*64k*/ - _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2); - dst_pid_base+=1; - } + for (r; r < size && (((uint64_t)dst_pid_base & 0x1f) > 0); r++) { + auto src_offset = reducer_offsets_[r]; /*16k*/ + *dst_pid_base = reinterpret_cast(src_addr)[src_offset]; /*64k*/ + _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2); + dst_pid_base += 1; + } #if 0 for (r; r+4(src_addr)[src_offset]; /*64k*/ - _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2); - dst_pid_base+=1; - } - } + for (r; r + 2 < size; r += 2) { + __m128i src_offset_2x = + _mm_cvtsi32_si128(*((int32_t*)(reducer_offsets_.data() + r))); + src_offset_2x = _mm_shufflelo_epi16(src_offset_2x, 0x98); + + __m128i src_2x = + _mm_i32gather_epi64((const long long int*)src_addr, src_offset_2x, 8); + _mm_store_si128((__m128i*)dst_pid_base, src_2x); + //_mm_stream_si128((__m128i*)dst_pid_base,src_2x); + + _mm_prefetch( + &(src_addr)[(uint32_t)reducer_offsets_[r] * sizeof(uint64_t) + 64], + _MM_HINT_T2); + _mm_prefetch( + &(src_addr)[(uint32_t)reducer_offsets_[r + 1] * sizeof(uint64_t) + 64], + _MM_HINT_T2); + dst_pid_base += 2; + } +#endif + for (r; r < size; r++) { + auto src_offset = reducer_offsets_[r]; /*16k*/ + *dst_pid_base = reinterpret_cast(src_addr)[src_offset]; /*64k*/ + _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2); + dst_pid_base += 1; + } + } break; #else PROCESS(uint64_t) @@ -1075,7 +1078,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) #undef PROCESS case 128: // arrow::Decimal128Type::type_id #ifdef PROCESSROW - // assume batch size = 32k; reducer# = 4K; row/reducer = 8 + // assume batch size = 32k; reducer# = 4K; row/reducer = 8 std::transform( partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), @@ -1312,7 +1315,8 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& if (rb.column_data(col_idx)->GetNullCount() == 0 && column_has_null_[col_idx] == true) { // if the input record batch doesn't have null, set validity to True - // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] access + // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] + // access for (auto pid = 0; pid < num_partitions_; ++pid) { if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) { arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid], @@ -1560,14 +1564,13 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch& for (auto i = 0; i < num_rows; ++i) { // positive mod auto pid = pid_arr->Value(i) % num_partitions_; - //force to generate ASM - __asm__ ( + // force to generate ASM + __asm__( "lea (%[num_partitions],%[pid],1),%[tmp]\n" "test %[pid],%[pid]\n" "cmovs %[tmp],%[pid]\n" : [pid] "+r"(pid) - : [num_partitions]"r"(num_partitions_),[tmp]"r"(0) - ); + : [num_partitions] "r"(num_partitions_), [tmp] "r"(0)); partition_id_[i] = pid; partition_id_cnt_[pid]++; } diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h index b9b0d94da..24c2d145b 100644 --- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h +++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h @@ -661,9 +661,9 @@ constexpr bool do_less_than_comparison(const Compare& compare, const K& x, const // SFINAE prevents implicit conversions to int (such as from bool). template ::value, int> = 0> constexpr phmap::weak_ordering compare_result_as_ordering(const Int c) { - return c < 0 - ? phmap::weak_ordering::less - : c == 0 ? phmap::weak_ordering::equivalent : phmap::weak_ordering::greater; + return c < 0 ? phmap::weak_ordering::less + : c == 0 ? phmap::weak_ordering::equivalent + : phmap::weak_ordering::greater; } constexpr phmap::weak_ordering compare_result_as_ordering(const phmap::weak_ordering c) { return c; @@ -685,9 +685,9 @@ template < int> = 0> constexpr phmap::weak_ordering do_three_way_comparison(const Compare& compare, const K& x, const LK& y) { - return compare(x, y) ? phmap::weak_ordering::less - : compare(y, x) ? phmap::weak_ordering::greater - : phmap::weak_ordering::equivalent; + return compare(x, y) ? phmap::weak_ordering::less + : compare(y, x) ? phmap::weak_ordering::greater + : phmap::weak_ordering::equivalent; } } // namespace compare_internal @@ -1063,11 +1063,10 @@ class btree_node { // Compute how many values we can fit onto a leaf node taking into account // padding. constexpr static size_type NodeTargetValues(const int begin, const int end) { - return begin == end - ? begin - : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize - ? NodeTargetValues(begin, (begin + end) / 2) - : NodeTargetValues((begin + end) / 2 + 1, end); + return begin == end ? begin + : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize + ? NodeTargetValues(begin, (begin + end) / 2) + : NodeTargetValues((begin + end) / 2 + 1, end); } enum { diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h index 4628cca30..05d227a43 100644 --- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h +++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h @@ -2156,13 +2156,13 @@ class raw_hash_map : public raw_hash_set { // incomplete types as values, as in unordered_map. // MappedReference<> may be a non-reference type. template - using MappedReference = decltype( - P::value(std::addressof(std::declval()))); + using MappedReference = decltype(P::value( + std::addressof(std::declval()))); // MappedConstReference<> may be a non-reference type. template - using MappedConstReference = decltype( - P::value(std::addressof(std::declval()))); + using MappedConstReference = decltype(P::value( + std::addressof(std::declval()))); using KeyArgImpl = KeyArg::value && IsTransparent::value>; @@ -3409,8 +3409,8 @@ class parallel_hash_map // incomplete types as values, as in unordered_map. // MappedReference<> may be a non-reference type. template - using MappedReference = decltype( - P::value(std::addressof(std::declval()))); + using MappedReference = decltype(P::value( + std::addressof(std::declval()))); // MappedConstReference<> may be a non-reference type. template diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h index 3b3b6b120..0f4e6375d 100644 --- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h +++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h @@ -826,8 +826,8 @@ struct Invoker { // The result type of Invoke. template -using InvokeT = decltype( - Invoker::type::Invoke(std::declval(), std::declval()...)); +using InvokeT = decltype(Invoker::type::Invoke(std::declval(), + std::declval()...)); // Invoke(f, args...) is an implementation of INVOKE(f, args...) from section // [func.require] of the C++ standard. @@ -1002,9 +1002,10 @@ constexpr T&& forward( namespace utility_internal { // Helper method for expanding tuple into a called method. template -auto apply_helper(Functor&& functor, Tuple&& t, index_sequence) -> decltype( - phmap::base_internal::Invoke(phmap::forward(functor), - std::get(phmap::forward(t))...)) { +auto apply_helper(Functor&& functor, Tuple&& t, index_sequence) + -> decltype(phmap::base_internal::Invoke( + phmap::forward(functor), + std::get(phmap::forward(t))...)) { return phmap::base_internal::Invoke(phmap::forward(functor), std::get(phmap::forward(t))...); } @@ -1887,19 +1888,18 @@ class optional_assign_base { template constexpr copy_traits get_ctor_copy_traits() { - return std::is_copy_constructible::value - ? copy_traits::copyable - : std::is_move_constructible::value ? copy_traits::movable - : copy_traits::non_movable; + return std::is_copy_constructible::value ? copy_traits::copyable + : std::is_move_constructible::value ? copy_traits::movable + : copy_traits::non_movable; } template constexpr copy_traits get_assign_copy_traits() { return phmap::is_copy_assignable::value && std::is_copy_constructible::value ? copy_traits::copyable - : phmap::is_move_assignable::value && std::is_move_constructible::value - ? copy_traits::movable - : copy_traits::non_movable; + : phmap::is_move_assignable::value && std::is_move_constructible::value + ? copy_traits::movable + : copy_traits::non_movable; } // Whether T is constructible or convertible from optional. @@ -2421,9 +2421,9 @@ constexpr optional make_optional(std::initializer_list il, Args&&... args) template constexpr auto operator==(const optional& x, const optional& y) -> decltype(optional_internal::convertible_to_bool(*x == *y)) { - return static_cast(x) != static_cast(y) - ? false - : static_cast(x) == false ? true : static_cast(*x == *y); + return static_cast(x) != static_cast(y) ? false + : static_cast(x) == false ? true + : static_cast(*x == *y); } // Returns: If bool(x) != bool(y), true; otherwise, if bool(x) == false, false; @@ -2431,9 +2431,9 @@ constexpr auto operator==(const optional& x, const optional& y) template constexpr auto operator!=(const optional& x, const optional& y) -> decltype(optional_internal::convertible_to_bool(*x != *y)) { - return static_cast(x) != static_cast(y) - ? true - : static_cast(x) == false ? false : static_cast(*x != *y); + return static_cast(x) != static_cast(y) ? true + : static_cast(x) == false ? false + : static_cast(*x != *y); } // Returns: If !y, false; otherwise, if !x, true; otherwise *x < *y. template From d7ce830e78b985607f9d2dd611e476dfdf3f9a50 Mon Sep 17 00:00:00 2001 From: binwei Date: Sat, 30 Apr 2022 22:42:25 +0800 Subject: [PATCH 03/19] Allocate large block of memory then slice to each buffer --- native-sql-engine/cpp/CMakeLists.txt | 6 +- native-sql-engine/cpp/src/shuffle/splitter.cc | 81 +++++++++++++------ native-sql-engine/cpp/src/shuffle/splitter.h | 8 +- .../cpp/src/tests/shuffle_split_test.cc | 2 +- 4 files changed, 69 insertions(+), 28 deletions(-) diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt index fe7e989ee..48a923614 100644 --- a/native-sql-engine/cpp/CMakeLists.txt +++ b/native-sql-engine/cpp/CMakeLists.txt @@ -1,10 +1,10 @@ cmake_minimum_required(VERSION 3.16) project(spark_columnar_plugin) -#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) -add_definitions(-DPROCESSROW) +add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) +#add_definitions(-DPROCESSROW) -#add_compile_options(-g) +add_compile_options(-g) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(root_directory ${PROJECT_BINARY_DIR}) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 812dc4516..62e31df65 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -47,6 +47,11 @@ namespace sparkcolumnarplugin { namespace shuffle { using arrow::internal::checked_cast; +#ifndef SPLIT_BUFFER_SIZE +//by default, allocate 8M block, 2M page size +#define SPLIT_BUFFER_SIZE 8*1024*1024 +#endif + template std::string __m128i_toString(const __m128i var) { std::stringstream sstr; @@ -401,6 +406,36 @@ arrow::Status Splitter::Init() { tiny_bach_write_options_.codec, arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED)); + //Allocate first buffer for split reducer + ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( + SPLIT_BUFFER_SIZE, + options_.memory_pool)); + combine_buffer_->Resize(0, /*shrink_to_fit =*/false); + + return arrow::Status::OK(); +} +arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr& buffer, uint32_t size) +{ + // if size is already larger than buffer pool size, allocate it directly + //make size 64byte aligned + auto reminder = size & 0x3f; + size+=(64-reminder) & ((reminder==0)-1); + + if (size > SPLIT_BUFFER_SIZE ) + { + ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer( + size, options_.memory_pool)); + return arrow::Status::OK(); + }else if (combine_buffer_->capacity() - combine_buffer_->size() < size) + { + //memory pool is not enough + ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( + SPLIT_BUFFER_SIZE, + options_.memory_pool)); + } + buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size); + + combine_buffer_->Resize(combine_buffer_->size() + size); return arrow::Status::OK(); } @@ -576,15 +611,13 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer default: { auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; if (buffers[0] != nullptr) { - buffers[0]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false); + buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1); } if (buffers[1] != nullptr) { if (column_type_id_[i]->id() == arrow::BooleanType::type_id) - buffers[1]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false); + buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1); else - buffers[1]->Resize( - num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3), - /*shrink_to_fit =*/false); + buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); } if (reset_buffers) { @@ -642,12 +675,14 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n auto binary_idx = 0; auto large_binary_idx = 0; auto list_idx = 0; + auto total_size = 0; std::vector> new_binary_builders; std::vector> new_large_binary_builders; std::vector> new_list_builders; - std::vector> new_value_buffers; - std::vector> new_validity_buffers; + std::vector> new_value_buffers; + std::vector> new_validity_buffers; + for (auto i = 0; i < num_fields; ++i) { switch (column_type_id_[i]->id()) { case arrow::BinaryType::type_id: @@ -688,30 +723,30 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n case arrow::NullType::type_id: break; default: { - std::shared_ptr value_buffer; + try{ + std::shared_ptr value_buffer; if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { - ARROW_ASSIGN_OR_RAISE(value_buffer, arrow::AllocateResizableBuffer( - arrow::BitUtil::BytesForBits(new_size), - options_.memory_pool)); + auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK( status ); } else { - ARROW_ASSIGN_OR_RAISE( - value_buffer, - arrow::AllocateResizableBuffer( - new_size * (arrow::bit_width(column_type_id_[i]->id()) / 8), - options_.memory_pool)); + auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3)); + ARROW_RETURN_NOT_OK( status ); + } new_value_buffers.push_back(std::move(value_buffer)); if (input_fixed_width_has_null_[fixed_width_idx]) { - std::shared_ptr validity_buffer; - ARROW_ASSIGN_OR_RAISE( - validity_buffer, - arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size), - options_.memory_pool)); + std::shared_ptr validity_buffer; + auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK( status ); new_validity_buffers.push_back(std::move(validity_buffer)); } else { new_validity_buffers.push_back(nullptr); } fixed_width_idx++; + }catch(const std::exception& e) + { + std::cout << "exception captured " << e.what() << std::endl; + } break; } } @@ -746,10 +781,10 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n break; default: partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = - const_cast(new_value_buffers[fixed_width_idx]->data()); + new_value_buffers[fixed_width_idx]->mutable_data(); if (input_fixed_width_has_null_[fixed_width_idx]) { partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = - const_cast(new_validity_buffers[fixed_width_idx]->data()); + new_validity_buffers[fixed_width_idx]->mutable_data(); } else { partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; } diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index 2fb4bb3d4..1c1c8e2da 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -138,6 +138,8 @@ class Splitter { arrow::Status SplitListArray(const arrow::RecordBatch& rb); + arrow::Status AllocateBufferFromPool(std::shared_ptr& buffer, uint32_t size); + template ::ArrayType, typename BuilderType = typename arrow::TypeTraits::BuilderType> arrow::Status AppendBinary( @@ -188,7 +190,7 @@ class Splitter { // col partid std::vector> partition_fixed_width_value_addrs_; // col partid - std::vector>>> + std::vector>>> partition_fixed_width_buffers_; // col partid std::vector>> @@ -198,6 +200,10 @@ class Splitter { partition_large_binary_builders_; std::vector>> partition_list_builders_; // col partid + + //slice the buffer for each reducer's column, in this way we can combine into large page + std::shared_ptr combine_buffer_; + // partid std::vector>> partition_cached_recordbatch_; diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc index 1f12742cd..715364a6d 100644 --- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc +++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc @@ -431,7 +431,7 @@ TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) { } TEST_F(SplitterTest, TestSpillLargestPartition) { - std::shared_ptr pool = std::make_shared(4000000); + std::shared_ptr pool = std::make_shared(9*1024*1024); // pool = std::make_shared(pool.get()); int32_t num_partitions = 2; From 7bdec939608ae5c6870bbba1672affc4dd2711d4 Mon Sep 17 00:00:00 2001 From: binwei Date: Sun, 1 May 2022 17:07:13 +0800 Subject: [PATCH 04/19] wip, rebase to master --- native-sql-engine/cpp/CMakeLists.txt | 4 +- .../src/benchmarks/shuffle_split_benchmark.cc | 45 ++++++++++++++ native-sql-engine/cpp/src/shuffle/splitter.cc | 11 +++- .../cpp/src/tests/shuffle_split_test.cc | 58 ++++++++++++++++++- 4 files changed, 112 insertions(+), 6 deletions(-) diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt index 48a923614..e7d14e0c8 100644 --- a/native-sql-engine/cpp/CMakeLists.txt +++ b/native-sql-engine/cpp/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 3.16) project(spark_columnar_plugin) -add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) -#add_definitions(-DPROCESSROW) +#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) +add_definitions(-DPROCESSROW) add_compile_options(-g) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index d2bffe36a..ce4e88b62 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -41,6 +41,50 @@ namespace shuffle { const int batch_buffer_size = 32768; const int split_buffer_size = 8192; + +class MyLoggingMemoryPool : public MemoryPool { + public: + explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {} + ~MyLoggingMemoryPool() override = default; + + Status Allocate(int64_t size, uint8_t** out) override { + Status s = pool_->Allocate(size, out); + std::cout << "Allocate: size = " << size << std::endl; + return s; + } + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override + { + Status s = pool_->Reallocate(old_size, new_size, ptr); + std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size + << std::endl; + return s; + } + + void Free(uint8_t* buffer, int64_t size) override{ + pool_->Free(buffer, size); + std::cout << "Free: size = " << size << std::endl; + } + + int64_t bytes_allocated() const override{ + int64_t nb_bytes = pool_->bytes_allocated(); + std::cout << "bytes_allocated: " << nb_bytes << std::endl; + return nb_bytes; + } + + int64_t max_memory() const override{ + int64_t mem = pool_->max_memory(); + std::cout << "max_memory: " << mem << std::endl; + return mem; + } + + std::string backend_name() const override{ + return pool_->backend_name(); + } + + private: + MemoryPool* pool_; +}; + class BenchmarkShuffleSplit { public: BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); } @@ -188,6 +232,7 @@ class BenchmarkShuffleSplit { std::shared_ptr schema; std::vector> expr_vector; parquet::ArrowReaderProperties properties; + }; class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 62e31df65..a5e3ca932 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -432,10 +432,11 @@ arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr& b ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( SPLIT_BUFFER_SIZE, options_.memory_pool)); + combine_buffer_->Resize(0, /*shrink_to_fit = */ false); } buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size); - combine_buffer_->Resize(combine_buffer_->size() + size); + combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false); return arrow::Status::OK(); } @@ -489,6 +490,7 @@ arrow::Status Splitter::Stop() { data_file_os_ = fout; } + std::cout << " cache record batch " << std::endl; // stop PartitionWriter and collect metrics for (auto pid = 0; pid < num_partitions_; ++pid) { RETURN_NOT_OK(CacheRecordBatch(pid, true)); @@ -508,11 +510,15 @@ arrow::Status Splitter::Stop() { partition_lengths_[pid] = 0; } } + this->combine_buffer_.reset(); // close data file output Stream RETURN_NOT_OK(data_file_os_->Close()); EVAL_END("write", options_.thread_id, options_.task_attempt_id) + + + return arrow::Status::OK(); } int64_t batch_nbytes(const arrow::RecordBatch& batch) { @@ -527,6 +533,7 @@ int64_t batch_nbytes(const arrow::RecordBatch& batch) { continue; } accumulated += buf->size(); + std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl; } } return accumulated; @@ -637,7 +644,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer } } } - + std::cout << " cache record " << std::endl; auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays)); int64_t raw_size = batch_nbytes(batch); diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc index 715364a6d..cc05cd3e1 100644 --- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc +++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc @@ -26,6 +26,20 @@ #include +#include +void print_trace(void) { + char **strings; + size_t i, size; + enum Constexpr { MAX_SIZE = 1024 }; + void *array[MAX_SIZE]; + size = backtrace(array, MAX_SIZE); + strings = backtrace_symbols(array, size); + for (i = 0; i < size; i++) + printf(" %s\n", strings[i]); + puts(""); + free(strings); +} + #include "shuffle/splitter.h" #include "tests/test_utils.h" @@ -42,6 +56,8 @@ class MyMemoryPool : public arrow::MemoryPool { } RETURN_NOT_OK(pool_->Allocate(size, out)); stats_.UpdateAllocatedBytes(size); + std::cout << "Allocate: size = " << size << " addr = " << std::hex << (uint64_t)*out << std::dec << std::endl; + //print_trace(); return arrow::Status::OK(); } @@ -49,14 +65,19 @@ class MyMemoryPool : public arrow::MemoryPool { if (new_size > capacity_) { return Status::OutOfMemory("malloc of size ", new_size, " failed"); } + auto old_ptr = ptr; RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr)); stats_.UpdateAllocatedBytes(new_size - old_size); + std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << (uint64_t)*old_ptr << std::dec << " new_size = " << new_size << " addr = " << std::hex << (uint64_t)*ptr << std::dec << std::endl; + //print_trace(); return arrow::Status::OK(); } void Free(uint8_t* buffer, int64_t size) override { pool_->Free(buffer, size); stats_.UpdateAllocatedBytes(-size); + std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer << std::dec << std::endl; + //print_trace(); } int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } @@ -287,6 +308,39 @@ TEST_F(SplitterTest, TestRoundRobinSplitter) { } } +TEST_F(SplitterTest, TestSplitterMemoryLeak) { + + std::shared_ptr pool = std::make_shared(9*1024*1024); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + split_options_.memory_pool = pool.get(); + + std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", schema_, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); + std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; + + std::cout << "split down " << std::endl; + + ASSERT_NOT_OK(splitter_->Stop()); + std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; + + std::cout << "stopped " << std::endl; + + splitter_.reset(); + std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; + std::cout << "splitter_ killed " << std::endl; + + split_options_.memory_pool = arrow::default_memory_pool(); +} + TEST_F(SplitterTest, TestHashSplitter) { int32_t num_partitions = 2; split_options_.buffer_size = 4; @@ -420,7 +474,7 @@ TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) { int32_t num_partitions = 2; split_options_.buffer_size = 4; - split_options_.memory_pool = pool.get(); + //split_options_.memory_pool = pool.get(); ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("rr", schema_, num_partitions, split_options_)); @@ -436,7 +490,7 @@ TEST_F(SplitterTest, TestSpillLargestPartition) { int32_t num_partitions = 2; split_options_.buffer_size = 4; - split_options_.memory_pool = pool.get(); + //split_options_.memory_pool = pool.get(); split_options_.compression_type = arrow::Compression::UNCOMPRESSED; ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("rr", schema_, num_partitions, split_options_)); From 6b9881e0bbf5ed64d52c700a5bd83383f5a822f4 Mon Sep 17 00:00:00 2001 From: binwei Date: Sun, 1 May 2022 17:09:09 +0800 Subject: [PATCH 05/19] to rebase to master --- native-sql-engine/cpp/CMakeLists.txt | 11 - .../src/benchmarks/shuffle_split_benchmark.cc | 450 ----- native-sql-engine/cpp/src/shuffle/splitter.cc | 1674 ----------------- .../cpp/src/tests/shuffle_split_test.cc | 1139 ----------- 4 files changed, 3274 deletions(-) delete mode 100644 native-sql-engine/cpp/CMakeLists.txt delete mode 100644 native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc delete mode 100644 native-sql-engine/cpp/src/shuffle/splitter.cc delete mode 100644 native-sql-engine/cpp/src/tests/shuffle_split_test.cc diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt deleted file mode 100644 index e7d14e0c8..000000000 --- a/native-sql-engine/cpp/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(spark_columnar_plugin) - -#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) -add_definitions(-DPROCESSROW) - -add_compile_options(-g) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -set(root_directory ${PROJECT_BINARY_DIR}) -add_subdirectory(src) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc deleted file mode 100644 index ce4e88b62..000000000 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -//#include -#include -#include -//#include -#include -#include -#include -#include -#include - -#include - -#include "codegen/code_generator.h" -#include "codegen/code_generator_factory.h" -#include "tests/test_utils.h" - -namespace sparkcolumnarplugin { -namespace shuffle { - -const int batch_buffer_size = 32768; -const int split_buffer_size = 8192; - - -class MyLoggingMemoryPool : public MemoryPool { - public: - explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {} - ~MyLoggingMemoryPool() override = default; - - Status Allocate(int64_t size, uint8_t** out) override { - Status s = pool_->Allocate(size, out); - std::cout << "Allocate: size = " << size << std::endl; - return s; - } - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override - { - Status s = pool_->Reallocate(old_size, new_size, ptr); - std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size - << std::endl; - return s; - } - - void Free(uint8_t* buffer, int64_t size) override{ - pool_->Free(buffer, size); - std::cout << "Free: size = " << size << std::endl; - } - - int64_t bytes_allocated() const override{ - int64_t nb_bytes = pool_->bytes_allocated(); - std::cout << "bytes_allocated: " << nb_bytes << std::endl; - return nb_bytes; - } - - int64_t max_memory() const override{ - int64_t mem = pool_->max_memory(); - std::cout << "max_memory: " << mem << std::endl; - return mem; - } - - std::string backend_name() const override{ - return pool_->backend_name(); - } - - private: - MemoryPool* pool_; -}; - -class BenchmarkShuffleSplit { - public: - BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); } - - void GetRecordBatchReader(const std::string& input_file) { - std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; - std::shared_ptr record_batch_reader; - - std::shared_ptr fs; - std::string file_name; - ARROW_ASSIGN_OR_THROW(fs, arrow::fs::FileSystemFromUriOrPath(input_file, &file_name)) - - ARROW_ASSIGN_OR_THROW(file, fs->OpenInputFile(file_name)); - - properties.set_batch_size(batch_buffer_size); - properties.set_pre_buffer(false); - properties.set_use_threads(false); - - ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), - properties, &parquet_reader)); - - ASSERT_NOT_OK(parquet_reader->GetSchema(&schema)); - - auto num_rowgroups = parquet_reader->num_row_groups(); - - for (int i = 0; i < num_rowgroups; ++i) { - row_group_indices.push_back(i); - } - - auto num_columns = schema->num_fields(); - for (int i = 0; i < num_columns; ++i) { - column_indices.push_back(i); - } - const auto& fields = schema->fields(); - for (const auto& field : fields) { - if (field->name() == "l_orderkey") { - auto node = gandiva::TreeExprBuilder::MakeField(field); - expr_vector.push_back(gandiva::TreeExprBuilder::MakeExpression( - std::move(node), arrow::field("res_" + field->name(), field->type()))); - } - } - } - - void operator()(benchmark::State& state) { - SetCPU(state.thread_index()); - arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1); - - const int num_partitions = state.range(0); - - auto options = SplitOptions::Defaults(); - options.compression_type = compression_type; - options.buffer_size = split_buffer_size; - options.buffered_write = true; - options.offheap_per_task = 128 * 1024 * 1024 * 1024L; - options.prefer_spill = true; - options.write_schema = false; - - std::shared_ptr splitter; - int64_t elapse_read = 0; - int64_t num_batches = 0; - int64_t num_rows = 0; - int64_t split_time = 0; - - Do_Split(splitter, elapse_read, num_batches, num_rows, split_time, num_partitions, - options, state); - - auto fs = std::make_shared(); - fs->DeleteFile(splitter->DataFile()); - - state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes())); - - state.counters["rowgroups"] = - benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["columns"] = - benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["batches"] = benchmark::Counter( - num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_rows"] = benchmark::Counter( - num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["num_partitions"] = - benchmark::Counter(num_partitions, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["batch_buffer_size"] = - benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["split_buffer_size"] = - benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - - state.counters["bytes_spilled"] = - benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_written"] = - benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_raw"] = - benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - state.counters["bytes_spilled"] = - benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1024); - - state.counters["parquet_parse"] = benchmark::Counter( - elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - state.counters["compute_pid_time"] = benchmark::Counter( - splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["write_time"] = - benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["spill_time"] = - benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - state.counters["compress_time"] = - benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads, - benchmark::Counter::OneK::kIs1000); - - split_time = split_time - splitter->TotalSpillTime() - - splitter->TotalComputePidTime() - splitter->TotalCompressTime() - - splitter->TotalWriteTime(); - state.counters["split_time"] = benchmark::Counter( - split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); - } - - protected: - long SetCPU(uint32_t cpuindex) { - cpu_set_t cs; - CPU_ZERO(&cs); - CPU_SET(cpuindex, &cs); - return sched_setaffinity(0, sizeof(cs), &cs); - } - virtual void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, - int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - const int num_partitions, SplitOptions options, - benchmark::State& state) {} - - protected: - std::string file_name; - std::shared_ptr file; - std::vector row_group_indices; - std::vector column_indices; - std::shared_ptr schema; - std::vector> expr_vector; - parquet::ArrowReaderProperties properties; - -}; - -class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { - public: - BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename) - : BenchmarkShuffleSplit(filename) {} - - protected: - void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, - int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - const int num_partitions, SplitOptions options, benchmark::State& state) { - std::vector local_column_indices; - local_column_indices.push_back(0); - local_column_indices.push_back(1); - local_column_indices.push_back(2); - local_column_indices.push_back(4); - local_column_indices.push_back(5); - local_column_indices.push_back(6); - local_column_indices.push_back(7); - - std::shared_ptr local_schema; - local_schema = std::make_shared(*schema.get()); - - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3)); - - if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl; - - ARROW_ASSIGN_OR_THROW(splitter, - Splitter::Make("rr", local_schema, num_partitions, options)); - - std::shared_ptr record_batch; - - std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; - std::shared_ptr record_batch_reader; - ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), - properties, &parquet_reader)); - - std::vector> batches; - ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader( - row_group_indices, local_column_indices, &record_batch_reader)); - do { - TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); - - if (record_batch) { - batches.push_back(record_batch); - num_batches += 1; - num_rows += record_batch->num_rows(); - } - } while (record_batch); - std::cout << "parquet parse done elapsed time " << elapse_read / 1000000 << " ms " - << std::endl; - std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl; - - for (auto _ : state) { - for_each( - batches.begin(), batches.end(), - [&splitter, &split_time](std::shared_ptr& record_batch) { - TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); - }); - } - - TIME_NANO_OR_THROW(split_time, splitter->Stop()); - } -}; - -class BenchmarkShuffleSplit_IterateScan_Benchmark : public BenchmarkShuffleSplit { - public: - BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename) - : BenchmarkShuffleSplit(filename) {} - - protected: - void Do_Split(std::shared_ptr& splitter, int64_t& elapse_read, - int64_t& num_batches, int64_t& num_rows, int64_t& split_time, - const int num_partitions, SplitOptions options, benchmark::State& state) { - if (state.thread_index() == 0) std::cout << schema->ToString() << std::endl; - - if (!expr_vector.empty()) { - ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions, - expr_vector, std::move(options))); - } else { - ARROW_ASSIGN_OR_THROW( - splitter, Splitter::Make("rr", schema, num_partitions, std::move(options))); - } - - std::shared_ptr record_batch; - - std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; - std::shared_ptr record_batch_reader; - ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( - arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), - properties, &parquet_reader)); - - for (auto _ : state) { - std::vector> batches; - ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader( - row_group_indices, column_indices, &record_batch_reader)); - TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); - while (record_batch) { - num_batches += 1; - num_rows += record_batch->num_rows(); - TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); - TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); - } - } - TIME_NANO_OR_THROW(split_time, splitter->Stop()); - } -}; - -/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan)->Iterations(1) - ->Args({96*2, arrow::Compression::FASTPFOR}) - ->Args({96*4, arrow::Compression::FASTPFOR}) - ->Args({96*8, arrow::Compression::FASTPFOR}) - ->Args({96*16, arrow::Compression::FASTPFOR}) - ->Args({96*32, arrow::Compression::FASTPFOR}) - ->Threads(1) - ->Threads(2) - ->Threads(4) - ->Threads(8) - ->Threads(16) - ->Threads(24) - ->Unit(benchmark::kSecond); -*/ -/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, IterateScan)->Iterations(1) - ->Args({96*2, arrow::Compression::FASTPFOR}) - ->Args({96*4, arrow::Compression::FASTPFOR}) - ->Args({96*8, arrow::Compression::FASTPFOR}) - ->Args({96*16, arrow::Compression::FASTPFOR}) - ->Args({96*32, arrow::Compression::FASTPFOR}) - ->Threads(1) - ->Threads(2) - ->Threads(4) - ->Threads(8) - ->Threads(16) - ->Threads(24) - ->Unit(benchmark::kSecond);*/ -/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan) - ->Iterations(1000000) - ->Args({512, arrow::Compression::FASTPFOR}) - ->Threads(1) - ->ReportAggregatesOnly(false) - ->MeasureProcessCPUTime() - ->Unit(benchmark::kSecond);*/ -} // namespace shuffle -} // namespace sparkcolumnarplugin - -int main(int argc, char** argv) { - uint32_t iterations = 1; - uint32_t partitions = 512; - uint32_t threads = 1; - std::string datafile; - - for (int i = 0; i < argc; i++) { - if (strcmp(argv[i], "--iterations") == 0) { - iterations = atol(argv[i + 1]); - } else if (strcmp(argv[i], "--partitions") == 0) { - partitions = atol(argv[i + 1]); - } else if (strcmp(argv[i], "--threads") == 0) { - threads = atol(argv[i + 1]); - } else if (strcmp(argv[i], "--file") == 0) { - datafile = argv[i + 1]; - } - } - std::cout << "iterations = " << iterations << std::endl; - std::cout << "partitions = " << partitions << std::endl; - std::cout << "threads = " << threads << std::endl; - std::cout << "datafile = " << datafile << std::endl; - - sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_CacheScan_Benchmark bck(datafile); - - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::CacheScan", bck) - ->Iterations(iterations) - ->Args({partitions, arrow::Compression::FASTPFOR}) - ->Threads(threads) - ->ReportAggregatesOnly(false) - ->MeasureProcessCPUTime() - ->Unit(benchmark::kSecond); - - /* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark - bck(datafile); - - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) - ->Iterations(1) - ->Args({96*2, arrow::Compression::FASTPFOR}) - ->Args({96*4, arrow::Compression::FASTPFOR}) - ->Args({96*8, arrow::Compression::FASTPFOR}) - ->Args({96*16, arrow::Compression::FASTPFOR}) - ->Args({96*32, arrow::Compression::FASTPFOR}) - ->Threads(24) - ->Unit(benchmark::kSecond); - - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) - ->Iterations(1) - ->Args({4096, arrow::Compression::FASTPFOR}) - ->Threads(1) - ->Threads(2) - ->Threads(4) - ->Threads(8) - ->Threads(16) - ->Threads(24) - ->Unit(benchmark::kSecond); - */ - - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); - benchmark::Shutdown(); -} \ No newline at end of file diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc deleted file mode 100644 index a5e3ca932..000000000 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ /dev/null @@ -1,1674 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "shuffle/splitter.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "shuffle/utils.h" -#include "utils/macros.h" - -/*#if defined(COLUMNAR_PLUGIN_USE_AVX512) -#include -#else -#include -#endif -*/ - -namespace sparkcolumnarplugin { -namespace shuffle { -using arrow::internal::checked_cast; - -#ifndef SPLIT_BUFFER_SIZE -//by default, allocate 8M block, 2M page size -#define SPLIT_BUFFER_SIZE 8*1024*1024 -#endif - -template -std::string __m128i_toString(const __m128i var) { - std::stringstream sstr; - T values[16 / sizeof(T)]; - std::memcpy(values, &var, sizeof(values)); // See discussion below - if (sizeof(T) == 1) { - for (unsigned int i = 0; i < sizeof(__m128i); i++) { // C++11: Range for also - // possible - sstr << std::hex << (int)values[i] << " " << std::dec; - } - } else { - for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T); - i++) { // C++11: Range for also possible - sstr << std::hex << values[i] << " " << std::dec; - } - } - return sstr.str(); -} - -SplitOptions SplitOptions::Defaults() { return SplitOptions(); } -#if defined(COLUMNAR_PLUGIN_USE_AVX512) -inline __m256i CountPartitionIdOccurrence(const std::vector& partition_id, - int32_t row) { - __m128i partid_cnt_low; - __m128i partid_cnt_high; - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - partid_cnt_low = _mm_xor_si128(partid_cnt_low, partid_cnt_low); - - tmp1 = (partition_id[row + 1] ^ partition_id[row]) == 0; - partid_cnt_low = _mm_insert_epi32(partid_cnt_low, tmp1, 1); - - tmp2 = (partition_id[row + 2] ^ partition_id[row]) == 0; - tmp2 += (partition_id[row + 2] ^ partition_id[row + 1]) == 0; - partid_cnt_low = _mm_insert_epi32(partid_cnt_low, tmp2, 2); - - tmp3 = (partition_id[row + 3] ^ partition_id[row]) == 0; - tmp3 += (partition_id[row + 3] ^ partition_id[row + 1]) == 0; - tmp3 += (partition_id[row + 3] ^ partition_id[row + 2]) == 0; - partid_cnt_low = _mm_insert_epi32(partid_cnt_low, tmp3, 3); - - tmp4 = (partition_id[row + 4] ^ partition_id[row]) == 0; - tmp4 += (partition_id[row + 4] ^ partition_id[row + 1]) == 0; - tmp4 += (partition_id[row + 4] ^ partition_id[row + 2]) == 0; - tmp4 += (partition_id[row + 4] ^ partition_id[row + 3]) == 0; - partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp4, 0); - - tmp5 = (partition_id[row + 5] ^ partition_id[row]) == 0; - tmp5 += (partition_id[row + 5] ^ partition_id[row + 1]) == 0; - tmp5 += (partition_id[row + 5] ^ partition_id[row + 2]) == 0; - tmp5 += (partition_id[row + 5] ^ partition_id[row + 3]) == 0; - tmp5 += (partition_id[row + 5] ^ partition_id[row + 4]) == 0; - partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp5, 1); - - tmp6 = (partition_id[row + 6] ^ partition_id[row]) == 0; - tmp6 += (partition_id[row + 6] ^ partition_id[row + 1]) == 0; - tmp6 += (partition_id[row + 6] ^ partition_id[row + 2]) == 0; - tmp6 += (partition_id[row + 6] ^ partition_id[row + 3]) == 0; - tmp6 += (partition_id[row + 6] ^ partition_id[row + 4]) == 0; - tmp6 += (partition_id[row + 6] ^ partition_id[row + 5]) == 0; - partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp6, 2); - - tmp7 = (partition_id[row + 7] ^ partition_id[row]) == 0; - tmp7 += (partition_id[row + 7] ^ partition_id[row + 1]) == 0; - tmp7 += (partition_id[row + 7] ^ partition_id[row + 2]) == 0; - tmp7 += (partition_id[row + 7] ^ partition_id[row + 3]) == 0; - tmp7 += (partition_id[row + 7] ^ partition_id[row + 4]) == 0; - tmp7 += (partition_id[row + 7] ^ partition_id[row + 5]) == 0; - tmp7 += (partition_id[row + 7] ^ partition_id[row + 6]) == 0; - partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp7, 3); - - __m256i partid_cnt_8x = _mm256_castsi128_si256(partid_cnt_low); - partid_cnt_8x = _mm256_inserti128_si256(partid_cnt_8x, partid_cnt_high, 1); - return partid_cnt_8x; -} - -inline void PrefetchDstAddr(__m512i dst_addr_8x, int32_t scale) { - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 0), 0) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 0), 1) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 1), 0) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 1), 1) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 2), 0) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 2), 1) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 3), 0) + scale), - _MM_HINT_T0); - _mm_prefetch( - (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 3), 1) + scale), - _MM_HINT_T0); -} -#endif - -class Splitter::PartitionWriter { - public: - explicit PartitionWriter(Splitter* splitter, int32_t partition_id) - : splitter_(splitter), partition_id_(partition_id) {} - - arrow::Status Spill() { -#ifndef SKIPWRITE - RETURN_NOT_OK(EnsureOpened()); -#endif - RETURN_NOT_OK(WriteRecordBatchPayload(spilled_file_os_.get(), partition_id_)); - ClearCache(); - return arrow::Status::OK(); - } - - arrow::Status WriteCachedRecordBatchAndClose() { - const auto& data_file_os = splitter_->data_file_os_; - ARROW_ASSIGN_OR_RAISE(auto before_write, data_file_os->Tell()); - - if (splitter_->options_.write_schema) { - RETURN_NOT_OK(WriteSchemaPayload(data_file_os.get())); - } - - if (spilled_file_opened_) { - RETURN_NOT_OK(spilled_file_os_->Close()); - RETURN_NOT_OK(MergeSpilled()); - } else { - if (splitter_->partition_cached_recordbatch_size_[partition_id_] == 0) { - return arrow::Status::Invalid("Partition writer got empty partition"); - } - } - - RETURN_NOT_OK(WriteRecordBatchPayload(data_file_os.get(), partition_id_)); - RETURN_NOT_OK(WriteEOS(data_file_os.get())); - ClearCache(); - - ARROW_ASSIGN_OR_RAISE(auto after_write, data_file_os->Tell()); - partition_length = after_write - before_write; - - return arrow::Status::OK(); - } - - // metrics - int64_t bytes_spilled = 0; - int64_t partition_length = 0; - int64_t compress_time = 0; - - private: - arrow::Status EnsureOpened() { - if (!spilled_file_opened_) { - ARROW_ASSIGN_OR_RAISE(spilled_file_, - CreateTempShuffleFile(splitter_->NextSpilledFileDir())); - ARROW_ASSIGN_OR_RAISE(spilled_file_os_, - arrow::io::FileOutputStream::Open(spilled_file_, true)); - spilled_file_opened_ = true; - } - return arrow::Status::OK(); - } - - arrow::Status MergeSpilled() { - ARROW_ASSIGN_OR_RAISE( - auto spilled_file_is_, - arrow::io::MemoryMappedFile::Open(spilled_file_, arrow::io::FileMode::READ)); - // copy spilled data blocks - ARROW_ASSIGN_OR_RAISE(auto nbytes, spilled_file_is_->GetSize()); - ARROW_ASSIGN_OR_RAISE(auto buffer, spilled_file_is_->Read(nbytes)); - RETURN_NOT_OK(splitter_->data_file_os_->Write(buffer)); - - // close spilled file streams and delete the file - RETURN_NOT_OK(spilled_file_is_->Close()); - auto fs = std::make_shared(); - RETURN_NOT_OK(fs->DeleteFile(spilled_file_)); - bytes_spilled += nbytes; - return arrow::Status::OK(); - } - - arrow::Status WriteSchemaPayload(arrow::io::OutputStream* os) { - ARROW_ASSIGN_OR_RAISE(auto payload, splitter_->GetSchemaPayload()); - int32_t metadata_length = 0; // unused - RETURN_NOT_OK(arrow::ipc::WriteIpcPayload( - *payload, splitter_->options_.ipc_write_options, os, &metadata_length)); - return arrow::Status::OK(); - } - - arrow::Status WriteRecordBatchPayload(arrow::io::OutputStream* os, - int32_t partition_id) { - int32_t metadata_length = 0; // unused -#ifndef SKIPWRITE - for (auto& payload : splitter_->partition_cached_recordbatch_[partition_id_]) { - RETURN_NOT_OK(arrow::ipc::WriteIpcPayload( - *payload, splitter_->options_.ipc_write_options, os, &metadata_length)); - payload = nullptr; - } -#endif - return arrow::Status::OK(); - } - - arrow::Status WriteEOS(arrow::io::OutputStream* os) { - // write EOS - constexpr int32_t kZeroLength = 0; - RETURN_NOT_OK(os->Write(&kIpcContinuationToken, sizeof(int32_t))); - RETURN_NOT_OK(os->Write(&kZeroLength, sizeof(int32_t))); - return arrow::Status::OK(); - } - - void ClearCache() { - splitter_->partition_cached_recordbatch_[partition_id_].clear(); - splitter_->partition_cached_recordbatch_size_[partition_id_] = 0; - } - - Splitter* splitter_; - int32_t partition_id_; - std::string spilled_file_; - std::shared_ptr spilled_file_os_; - - bool spilled_file_opened_ = false; -}; - -// ---------------------------------------------------------------------- -// Splitter - -arrow::Result> Splitter::Make( - const std::string& short_name, std::shared_ptr schema, - int num_partitions, const gandiva::ExpressionVector& expr_vector, - SplitOptions options) { - if (short_name == "hash") { - return HashSplitter::Create(num_partitions, std::move(schema), expr_vector, - std::move(options)); - } else if (short_name == "rr") { - return RoundRobinSplitter::Create(num_partitions, std::move(schema), - std::move(options)); - } else if (short_name == "range") { - return FallbackRangeSplitter::Create(num_partitions, std::move(schema), - std::move(options)); - } else if (short_name == "single") { - return RoundRobinSplitter::Create(1, std::move(schema), std::move(options)); - } - return arrow::Status::NotImplemented("Partitioning " + short_name + - " not supported yet."); -} - -arrow::Result> Splitter::Make( - const std::string& short_name, std::shared_ptr schema, - int num_partitions, SplitOptions options) { - return Make(short_name, std::move(schema), num_partitions, {}, std::move(options)); -} - -arrow::Status Splitter::Init() { - const auto& fields = schema_->fields(); - ARROW_ASSIGN_OR_RAISE(column_type_id_, ToSplitterTypeId(schema_->fields())); - - partition_writer_.resize(num_partitions_); - - // pre-computed row count for each partition after the record batch split - partition_id_cnt_.resize(num_partitions_); - // pre-allocated buffer size for each partition, unit is row count - partition_buffer_size_.resize(num_partitions_); - - // start index for each partition when new record batch starts to split - partition_buffer_idx_base_.resize(num_partitions_); - // the offset of each partition during record batch split - partition_buffer_idx_offset_.resize(num_partitions_); - - partition_cached_recordbatch_.resize(num_partitions_); - partition_cached_recordbatch_size_.resize(num_partitions_); - partition_lengths_.resize(num_partitions_); - raw_partition_lengths_.resize(num_partitions_); - reducer_offset_offset_.resize(num_partitions_ + 1); - - for (int i = 0; i < column_type_id_.size(); ++i) { - switch (column_type_id_[i]->id()) { - case arrow::BinaryType::type_id: - case arrow::StringType::type_id: - binary_array_idx_.push_back(i); - break; - case arrow::LargeBinaryType::type_id: - case arrow::LargeStringType::type_id: - large_binary_array_idx_.push_back(i); - break; - case arrow::StructType::type_id: - case arrow::MapType::type_id: - case arrow::LargeListType::type_id: - case arrow::ListType::type_id: - list_array_idx_.push_back(i); - break; - case arrow::NullType::type_id: - break; - default: - fixed_width_array_idx_.push_back(i); - break; - } - } - - auto num_fixed_width = fixed_width_array_idx_.size(); - partition_fixed_width_validity_addrs_.resize(num_fixed_width); - column_has_null_.resize(num_fixed_width, false); - partition_fixed_width_value_addrs_.resize(num_fixed_width); - partition_fixed_width_buffers_.resize(num_fixed_width); - binary_array_empirical_size_.resize(binary_array_idx_.size()); - large_binary_array_empirical_size_.resize(large_binary_array_idx_.size()); - input_fixed_width_has_null_.resize(num_fixed_width, false); - for (auto i = 0; i < num_fixed_width; ++i) { - partition_fixed_width_validity_addrs_[i].resize(num_partitions_, nullptr); - partition_fixed_width_value_addrs_[i].resize(num_partitions_, nullptr); - partition_fixed_width_buffers_[i].resize(num_partitions_); - } - partition_binary_builders_.resize(binary_array_idx_.size()); - for (auto i = 0; i < binary_array_idx_.size(); ++i) { - partition_binary_builders_[i].resize(num_partitions_); - } - partition_large_binary_builders_.resize(large_binary_array_idx_.size()); - for (auto i = 0; i < large_binary_array_idx_.size(); ++i) { - partition_large_binary_builders_[i].resize(num_partitions_); - } - partition_list_builders_.resize(list_array_idx_.size()); - for (auto i = 0; i < list_array_idx_.size(); ++i) { - partition_list_builders_[i].resize(num_partitions_); - } - - ARROW_ASSIGN_OR_RAISE(configured_dirs_, GetConfiguredLocalDirs()); - sub_dir_selection_.assign(configured_dirs_.size(), 0); - - // Both data_file and shuffle_index_file should be set through jni. - // For test purpose, Create a temporary subdirectory in the system temporary - // dir with prefix "columnar-shuffle" - if (options_.data_file.length() == 0) { - ARROW_ASSIGN_OR_RAISE(options_.data_file, CreateTempShuffleFile(configured_dirs_[0])); - } - - auto& ipc_write_options = options_.ipc_write_options; - ipc_write_options.memory_pool = options_.memory_pool; - ipc_write_options.use_threads = false; - - if (options_.compression_type == arrow::Compression::FASTPFOR) { - ARROW_ASSIGN_OR_RAISE(ipc_write_options.codec, - arrow::util::Codec::CreateInt32(arrow::Compression::FASTPFOR)); - - } else if (options_.compression_type == arrow::Compression::LZ4_FRAME) { - ARROW_ASSIGN_OR_RAISE(ipc_write_options.codec, - arrow::util::Codec::Create(arrow::Compression::LZ4_FRAME)); - } else { - ARROW_ASSIGN_OR_RAISE(ipc_write_options.codec, arrow::util::Codec::CreateInt32( - arrow::Compression::UNCOMPRESSED)); - } - - // initialize tiny batch write options - tiny_bach_write_options_ = ipc_write_options; - ARROW_ASSIGN_OR_RAISE( - tiny_bach_write_options_.codec, - arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED)); - - //Allocate first buffer for split reducer - ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( - SPLIT_BUFFER_SIZE, - options_.memory_pool)); - combine_buffer_->Resize(0, /*shrink_to_fit =*/false); - - return arrow::Status::OK(); -} -arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr& buffer, uint32_t size) -{ - // if size is already larger than buffer pool size, allocate it directly - //make size 64byte aligned - auto reminder = size & 0x3f; - size+=(64-reminder) & ((reminder==0)-1); - - if (size > SPLIT_BUFFER_SIZE ) - { - ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer( - size, options_.memory_pool)); - return arrow::Status::OK(); - }else if (combine_buffer_->capacity() - combine_buffer_->size() < size) - { - //memory pool is not enough - ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( - SPLIT_BUFFER_SIZE, - options_.memory_pool)); - combine_buffer_->Resize(0, /*shrink_to_fit = */ false); - } - buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size); - - combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false); - return arrow::Status::OK(); -} - -int64_t Splitter::CompressedSize(const arrow::RecordBatch& rb) { - auto payload = std::make_shared(); - arrow::Status result; - result = - arrow::ipc::GetRecordBatchPayload(rb, options_.ipc_write_options, payload.get()); - if (result.ok()) { - return payload->body_length; - } else { - result.UnknownError("Failed to get the compressed size."); - return -1; - } -} - -arrow::Status Splitter::SetCompressType(arrow::Compression::type compressed_type) { - if (compressed_type == arrow::Compression::FASTPFOR) { - ARROW_ASSIGN_OR_RAISE(options_.ipc_write_options.codec, - arrow::util::Codec::CreateInt32(arrow::Compression::FASTPFOR)); - - } else if (compressed_type == arrow::Compression::LZ4_FRAME) { - ARROW_ASSIGN_OR_RAISE(options_.ipc_write_options.codec, - arrow::util::Codec::Create(arrow::Compression::LZ4_FRAME)); - } else { - ARROW_ASSIGN_OR_RAISE( - options_.ipc_write_options.codec, - arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED)); - } - return arrow::Status::OK(); -} - -arrow::Status Splitter::Split(const arrow::RecordBatch& rb) { - EVAL_START("split", options_.thread_id) - RETURN_NOT_OK(ComputeAndCountPartitionId(rb)); - RETURN_NOT_OK(DoSplit(rb)); - EVAL_END("split", options_.thread_id, options_.task_attempt_id) - return arrow::Status::OK(); -} - -arrow::Status Splitter::Stop() { - EVAL_START("write", options_.thread_id) - // open data file output stream - std::shared_ptr fout; - ARROW_ASSIGN_OR_RAISE(fout, - arrow::io::FileOutputStream::Open(options_.data_file, true)); - if (options_.buffered_write) { - ARROW_ASSIGN_OR_RAISE(data_file_os_, arrow::io::UnlockedBufferedOutputStream::Create( - 16384, options_.memory_pool, fout)); - } else { - data_file_os_ = fout; - } - - std::cout << " cache record batch " << std::endl; - // stop PartitionWriter and collect metrics - for (auto pid = 0; pid < num_partitions_; ++pid) { - RETURN_NOT_OK(CacheRecordBatch(pid, true)); - if (partition_cached_recordbatch_size_[pid] > 0) { - if (partition_writer_[pid] == nullptr) { - partition_writer_[pid] = std::make_shared(this, pid); - } - } - if (partition_writer_[pid] != nullptr) { - const auto& writer = partition_writer_[pid]; - TIME_NANO_OR_RAISE(total_write_time_, writer->WriteCachedRecordBatchAndClose()); - partition_lengths_[pid] = writer->partition_length; - total_bytes_written_ += writer->partition_length; - total_bytes_spilled_ += writer->bytes_spilled; - total_compress_time_ += writer->compress_time; - } else { - partition_lengths_[pid] = 0; - } - } - this->combine_buffer_.reset(); - - // close data file output Stream - RETURN_NOT_OK(data_file_os_->Close()); - - EVAL_END("write", options_.thread_id, options_.task_attempt_id) - - - - return arrow::Status::OK(); -} -int64_t batch_nbytes(const arrow::RecordBatch& batch) { - int64_t accumulated = 0L; - - for (const auto& array : batch.columns()) { - if (array == nullptr || array->data() == nullptr) { - continue; - } - for (const auto& buf : array->data()->buffers) { - if (buf == nullptr) { - continue; - } - accumulated += buf->size(); - std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl; - } - } - return accumulated; -} - -int64_t batch_nbytes(std::shared_ptr batch) { - if (batch == nullptr) { - return 0; - } - return batch_nbytes(*batch); -} - -arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffers) { - static int printed = 0; - - if (partition_buffer_idx_base_[partition_id] > 0) { - // already filled - auto fixed_width_idx = 0; - auto binary_idx = 0; - auto large_binary_idx = 0; - auto list_idx = 0; - auto num_fields = schema_->num_fields(); - auto num_rows = partition_buffer_idx_base_[partition_id]; - auto buffer_sizes = 0; - std::vector> arrays(num_fields); - for (int i = 0; i < num_fields; ++i) { - switch (column_type_id_[i]->id()) { - case arrow::BinaryType::type_id: - case arrow::StringType::type_id: { - auto& builder = partition_binary_builders_[binary_idx][partition_id]; - if (reset_buffers) { - RETURN_NOT_OK(builder->Finish(&arrays[i])); - builder->Reset(); - } else { - auto data_size = builder->value_data_length(); - RETURN_NOT_OK(builder->Finish(&arrays[i])); - builder->Reset(); - RETURN_NOT_OK(builder->Reserve(num_rows)); - RETURN_NOT_OK(builder->ReserveData(data_size)); - } - binary_idx++; - break; - } - case arrow::LargeBinaryType::type_id: - case arrow::LargeStringType::type_id: { - auto& builder = - partition_large_binary_builders_[large_binary_idx][partition_id]; - if (reset_buffers) { - RETURN_NOT_OK(builder->Finish(&arrays[i])); - builder->Reset(); - } else { - auto data_size = builder->value_data_length(); - RETURN_NOT_OK(builder->Finish(&arrays[i])); - builder->Reset(); - RETURN_NOT_OK(builder->Reserve(num_rows)); - RETURN_NOT_OK(builder->ReserveData(data_size)); - } - large_binary_idx++; - break; - } - case arrow::StructType::type_id: - case arrow::MapType::type_id: - case arrow::LargeListType::type_id: - case arrow::ListType::type_id: { - auto& builder = partition_list_builders_[list_idx][partition_id]; - if (reset_buffers) { - RETURN_NOT_OK(builder->Finish(&arrays[i])); - builder->Reset(); - } else { - RETURN_NOT_OK(builder->Finish(&arrays[i])); - builder->Reset(); - RETURN_NOT_OK(builder->Reserve(num_rows)); - } - list_idx++; - break; - } - case arrow::NullType::type_id: { - arrays[i] = arrow::MakeArray(arrow::ArrayData::Make( - arrow::null(), num_rows, {nullptr, nullptr}, num_rows)); - break; - } - default: { - auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; - if (buffers[0] != nullptr) { - buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1); - } - if (buffers[1] != nullptr) { - if (column_type_id_[i]->id() == arrow::BooleanType::type_id) - buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1); - else - buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); - } - - if (reset_buffers) { - arrays[i] = arrow::MakeArray( - arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, - {std::move(buffers[0]), std::move(buffers[1])})); - buffers = {nullptr, nullptr}; - partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = - nullptr; - partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr; - } else { - arrays[i] = arrow::MakeArray(arrow::ArrayData::Make( - schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]})); - } - fixed_width_idx++; - break; - } - } - } - std::cout << " cache record " << std::endl; - auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays)); - int64_t raw_size = batch_nbytes(batch); - - raw_partition_lengths_[partition_id] += raw_size; - auto payload = std::make_shared(); -#ifndef SKIPCOMPRESS - if (num_rows <= options_.batch_compress_threshold) { - TIME_NANO_OR_RAISE(total_compress_time_, - arrow::ipc::GetRecordBatchPayload( - *batch, tiny_bach_write_options_, payload.get())); - } else { - TIME_NANO_OR_RAISE(total_compress_time_, - arrow::ipc::GetRecordBatchPayload( - *batch, options_.ipc_write_options, payload.get())); - } -#else - // for test reason - TIME_NANO_OR_RAISE(total_compress_time_, - arrow::ipc::GetRecordBatchPayload(*batch, tiny_bach_write_options_, - payload.get())); -#endif - - partition_cached_recordbatch_size_[partition_id] += payload->body_length; - partition_cached_recordbatch_[partition_id].push_back(std::move(payload)); - partition_buffer_idx_base_[partition_id] = 0; - } - - return arrow::Status::OK(); -} - -arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t new_size) { - // try to allocate new - auto num_fields = schema_->num_fields(); - auto fixed_width_idx = 0; - auto binary_idx = 0; - auto large_binary_idx = 0; - auto list_idx = 0; - auto total_size = 0; - - std::vector> new_binary_builders; - std::vector> new_large_binary_builders; - std::vector> new_list_builders; - std::vector> new_value_buffers; - std::vector> new_validity_buffers; - - for (auto i = 0; i < num_fields; ++i) { - switch (column_type_id_[i]->id()) { - case arrow::BinaryType::type_id: - case arrow::StringType::type_id: { - auto builder = std::make_shared(options_.memory_pool); - assert(builder != nullptr); - RETURN_NOT_OK(builder->Reserve(new_size)); - RETURN_NOT_OK(builder->ReserveData( - binary_array_empirical_size_[binary_idx] * new_size + 1024)); - new_binary_builders.push_back(std::move(builder)); - binary_idx++; - break; - } - case arrow::LargeBinaryType::type_id: - case arrow::LargeStringType::type_id: { - auto builder = std::make_shared(options_.memory_pool); - assert(builder != nullptr); - RETURN_NOT_OK(builder->Reserve(new_size)); - RETURN_NOT_OK(builder->ReserveData( - large_binary_array_empirical_size_[large_binary_idx] * new_size + 1024)); - new_large_binary_builders.push_back(std::move(builder)); - large_binary_idx++; - break; - } - case arrow::StructType::type_id: - case arrow::MapType::type_id: - case arrow::LargeListType::type_id: - case arrow::ListType::type_id: { - std::unique_ptr array_builder; - RETURN_NOT_OK( - MakeBuilder(options_.memory_pool, column_type_id_[i], &array_builder)); - assert(array_builder != nullptr); - RETURN_NOT_OK(array_builder->Reserve(new_size)); - new_list_builders.push_back(std::move(array_builder)); - list_idx++; - break; - } - case arrow::NullType::type_id: - break; - default: { - try{ - std::shared_ptr value_buffer; - if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { - auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size)); - ARROW_RETURN_NOT_OK( status ); - } else { - auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3)); - ARROW_RETURN_NOT_OK( status ); - - } - new_value_buffers.push_back(std::move(value_buffer)); - if (input_fixed_width_has_null_[fixed_width_idx]) { - std::shared_ptr validity_buffer; - auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); - ARROW_RETURN_NOT_OK( status ); - new_validity_buffers.push_back(std::move(validity_buffer)); - } else { - new_validity_buffers.push_back(nullptr); - } - fixed_width_idx++; - }catch(const std::exception& e) - { - std::cout << "exception captured " << e.what() << std::endl; - } - break; - } - } - } - - // point to newly allocated buffers - fixed_width_idx = binary_idx = large_binary_idx = 0; - list_idx = 0; - for (auto i = 0; i < num_fields; ++i) { - switch (column_type_id_[i]->id()) { - case arrow::BinaryType::type_id: - case arrow::StringType::type_id: - partition_binary_builders_[binary_idx][partition_id] = - std::move(new_binary_builders[binary_idx]); - binary_idx++; - break; - case arrow::LargeBinaryType::type_id: - case arrow::LargeStringType::type_id: - partition_large_binary_builders_[large_binary_idx][partition_id] = - std::move(new_large_binary_builders[large_binary_idx]); - large_binary_idx++; - break; - case arrow::StructType::type_id: - case arrow::MapType::type_id: - case arrow::LargeListType::type_id: - case arrow::ListType::type_id: - partition_list_builders_[list_idx][partition_id] = - std::move(new_list_builders[list_idx]); - list_idx++; - break; - case arrow::NullType::type_id: - break; - default: - partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = - new_value_buffers[fixed_width_idx]->mutable_data(); - if (input_fixed_width_has_null_[fixed_width_idx]) { - partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = - new_validity_buffers[fixed_width_idx]->mutable_data(); - } else { - partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; - } - partition_fixed_width_buffers_[fixed_width_idx][partition_id] = { - std::move(new_validity_buffers[fixed_width_idx]), - std::move(new_value_buffers[fixed_width_idx])}; - fixed_width_idx++; - break; - } - } - partition_buffer_size_[partition_id] = new_size; - return arrow::Status::OK(); -} - -arrow::Status Splitter::AllocateNew(int32_t partition_id, int32_t new_size) { - auto status = AllocatePartitionBuffers(partition_id, new_size); - int32_t retry = 0; - while (status.IsOutOfMemory() && retry < 3) { - // retry allocate - std::cout << status.ToString() << std::endl - << std::to_string(++retry) << " retry to allocate new buffer for partition " - << std::to_string(partition_id) << std::endl; - int64_t spilled_size; - ARROW_ASSIGN_OR_RAISE(auto partition_to_spill, SpillLargestPartition(&spilled_size)); - if (partition_to_spill == -1) { - std::cout << "Failed to allocate new buffer for partition " - << std::to_string(partition_id) << ". No partition buffer to spill." - << std::endl; - return status; - } - status = AllocatePartitionBuffers(partition_id, new_size); - } - if (status.IsOutOfMemory()) { - std::cout << "Failed to allocate new buffer for partition " - << std::to_string(partition_id) << ". Out of memory." << std::endl; - } - return status; -} - -// call from memory management -arrow::Status Splitter::SpillFixedSize(int64_t size, int64_t* actual) { - int64_t current_spilled = 0L; - int32_t try_count = 0; - while (current_spilled < size && try_count < 5) { - try_count++; - int64_t single_call_spilled; - ARROW_ASSIGN_OR_RAISE(int32_t spilled_partition_id, - SpillLargestPartition(&single_call_spilled)) - if (spilled_partition_id == -1) { - break; - } - current_spilled += single_call_spilled; - } - *actual = current_spilled; - return arrow::Status::OK(); -} - -arrow::Status Splitter::SpillPartition(int32_t partition_id) { - if (partition_writer_[partition_id] == nullptr) { - partition_writer_[partition_id] = - std::make_shared(this, partition_id); - } - TIME_NANO_OR_RAISE(total_spill_time_, partition_writer_[partition_id]->Spill()); - return arrow::Status::OK(); -} - -arrow::Result Splitter::SpillLargestPartition(int64_t* size) { - // spill the largest partition - auto max_size = 0; - int32_t partition_to_spill = -1; - for (auto i = 0; i < num_partitions_; ++i) { - if (partition_cached_recordbatch_size_[i] > max_size) { - max_size = partition_cached_recordbatch_size_[i]; - partition_to_spill = i; - } - } - if (partition_to_spill != -1) { - RETURN_NOT_OK(SpillPartition(partition_to_spill)); -#ifdef DEBUG - std::cout << "Spilled partition " << std::to_string(partition_to_spill) << ", " - << std::to_string(max_size) << " bytes released" << std::endl; -#endif - *size = max_size; - } else { - *size = 0; - } - return partition_to_spill; -} - -arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { -#ifdef PROCESSROW - - reducer_offsets_.resize(rb.num_rows()); - - reducer_offset_offset_[0] = 0; - for (auto pid = 1; pid <= num_partitions_; pid++) { - reducer_offset_offset_[pid] = - reducer_offset_offset_[pid - 1] + partition_id_cnt_[pid - 1]; - } - for (auto row = 0; row < rb.num_rows(); row++) { - auto pid = partition_id_[row]; - reducer_offsets_[reducer_offset_offset_[pid]] = row; - _mm_prefetch(reducer_offsets_.data() + reducer_offset_offset_[pid] + 32, _MM_HINT_T0); - reducer_offset_offset_[pid]++; - } - std::transform(reducer_offset_offset_.begin(), std::prev(reducer_offset_offset_.end()), - partition_id_cnt_.begin(), reducer_offset_offset_.begin(), - [](uint16_t x, int16_t y) { return x - y; }); - -#endif - // for the first input record batch, scan binary arrays and large binary - // arrays to get their empirical sizes - - uint32_t size_per_row = 0; - if (!empirical_size_calculated_) { - auto num_rows = rb.num_rows(); - for (int i = 0; i < binary_array_idx_.size(); ++i) { - auto arr = - std::static_pointer_cast(rb.column(binary_array_idx_[i])); - auto length = arr->value_offset(num_rows) - arr->value_offset(0); - binary_array_empirical_size_[i] = length / num_rows; - } - for (int i = 0; i < large_binary_array_idx_.size(); ++i) { - auto arr = std::static_pointer_cast( - rb.column(large_binary_array_idx_[i])); - auto length = arr->value_offset(num_rows) - arr->value_offset(0); - large_binary_array_empirical_size_[i] = length / num_rows; - } - empirical_size_calculated_ = true; - } - - size_per_row = std::accumulate(binary_array_empirical_size_.begin(), - binary_array_empirical_size_.end(), 0); - size_per_row = std::accumulate(large_binary_array_empirical_size_.begin(), - large_binary_array_empirical_size_.end(), size_per_row); - - for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { - auto col_idx = fixed_width_array_idx_[col]; - size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8; - if (rb.column_data(col_idx)->GetNullCount() != 0) { - input_fixed_width_has_null_[col] = true; - } - } - - int64_t prealloc_row_cnt = - options_.offheap_per_task > 0 && size_per_row > 0 - ? options_.offheap_per_task / 4 / size_per_row / num_partitions_ - : options_.buffer_size; - prealloc_row_cnt = std::min(prealloc_row_cnt, (int64_t)options_.buffer_size); - - // prepare partition buffers and spill if necessary - for (auto pid = 0; pid < num_partitions_; ++pid) { - if (partition_id_cnt_[pid] > 0) { - // make sure the size to be allocated is larger than the size to be filled - auto new_size = std::max((uint16_t)prealloc_row_cnt, partition_id_cnt_[pid]); - if (partition_buffer_size_[pid] == 0) { - // allocate buffer if it's not yet allocated - RETURN_NOT_OK(AllocatePartitionBuffers(pid, new_size)); - } else if (partition_buffer_idx_base_[pid] + partition_id_cnt_[pid] > - partition_buffer_size_[pid]) { - // if the size to be filled + allready filled > the buffer size, need to allocate - // new buffer - if (options_.prefer_spill) { - // if prefer_spill is set, spill current record batch, we may reuse the buffers - - if (new_size > partition_buffer_size_[pid]) { - // if the partition size after split is already larger than allocated buffer - // size, need reallocate - RETURN_NOT_OK(CacheRecordBatch(pid, /*reset_buffers = */ true)); - // splill immediately - RETURN_NOT_OK(SpillPartition(pid)); - RETURN_NOT_OK(AllocatePartitionBuffers(pid, new_size)); - } else { - // partition size after split is smaller than buffer size, no need to reset - // buffer, reuse it. - RETURN_NOT_OK(CacheRecordBatch(pid, /*reset_buffers = */ false)); - RETURN_NOT_OK(SpillPartition(pid)); - } - } else { - // if prefer_spill is disabled, cache the record batch - RETURN_NOT_OK(CacheRecordBatch(pid, /*reset_buffers = */ true)); - // allocate partition buffer with retries - RETURN_NOT_OK(AllocateNew(pid, new_size)); - } - } - } - } -// now start to split the record batch -#if defined(COLUMNAR_PLUGIN_USE_AVX512) - RETURN_NOT_OK(SplitFixedWidthValueBufferAVX(rb)); -#else - RETURN_NOT_OK(SplitFixedWidthValueBuffer(rb)); -#endif - RETURN_NOT_OK(SplitFixedWidthValidityBuffer(rb)); - RETURN_NOT_OK(SplitBinaryArray(rb)); - RETURN_NOT_OK(SplitLargeBinaryArray(rb)); - RETURN_NOT_OK(SplitListArray(rb)); - - // update partition buffer base after split - for (auto pid = 0; pid < num_partitions_; ++pid) { - partition_buffer_idx_base_[pid] += partition_id_cnt_[pid]; - } - return arrow::Status::OK(); -} - -arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) { - const auto num_rows = rb.num_rows(); - int64_t row; - std::vector partition_buffer_idx_offset; - - for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { - const auto& dst_addrs = partition_fixed_width_value_addrs_[col]; - std::copy(dst_addrs.begin(), dst_addrs.end(), partition_buffer_idx_offset_.begin()); - auto col_idx = fixed_width_array_idx_[col]; - auto src_addr = const_cast(rb.column_data(col_idx)->buffers[1]->data()); - - switch (arrow::bit_width(column_type_id_[col_idx]->id())) { -#ifdef PROCESSROW -// assume batch size = 32k; reducer# = 4K; row/reducer = 8 -#define PROCESS(_CTYPE) \ - std::transform(partition_buffer_idx_offset_.begin(), \ - partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ - partition_buffer_idx_offset_.begin(), \ - [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); }); \ - for (auto pid = 0; pid < num_partitions_; pid++) { \ - auto dst_pid_base = \ - reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/ \ - auto r = reducer_offset_offset_[pid]; /*8k*/ \ - auto size = reducer_offset_offset_[pid + 1]; \ - for (r; r < size; r++) { \ - auto src_offset = reducer_offsets_[r]; /*16k*/ \ - *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[src_offset]; /*64k*/ \ - _mm_prefetch(&(src_addr)[src_offset * sizeof(_CTYPE) + 64], _MM_HINT_T2); \ - dst_pid_base += 1; \ - } \ - } \ - break; -#else -#define PROCESS(_CTYPE) \ - std::transform(partition_buffer_idx_offset_.begin(), \ - partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ - partition_buffer_idx_offset_.begin(), \ - [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); }); \ - for (row = 0; row < num_rows; ++row) { \ - auto pid = partition_id_[row]; \ - auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); \ - *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[row]; \ - partition_buffer_idx_offset_[pid] += sizeof(_CTYPE); \ - _mm_prefetch(&dst_pid_base[64 / sizeof(_CTYPE)], _MM_HINT_T0); \ - } \ - break; -#endif - case 8: - PROCESS(uint8_t) - case 16: - PROCESS(uint16_t) - case 32: - PROCESS(uint32_t) - case 64: -#ifdef PROCESSAVX - std::transform( - partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), - partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x + y * sizeof(uint64_t); }); - for (auto pid = 0; pid < num_partitions_; pid++) { - auto dst_pid_base = - reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ - auto r = reducer_offset_offset_[pid]; /*8k*/ - auto size = reducer_offset_offset_[pid + 1]; -#if 1 - for (r; r < size && (((uint64_t)dst_pid_base & 0x1f) > 0); r++) { - auto src_offset = reducer_offsets_[r]; /*16k*/ - *dst_pid_base = reinterpret_cast(src_addr)[src_offset]; /*64k*/ - _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2); - dst_pid_base += 1; - } -#if 0 - for (r; r+4(src_addr)[src_offset]; /*64k*/ - _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2); - dst_pid_base += 1; - } - } - break; -#else - PROCESS(uint64_t) -#endif - -#undef PROCESS - case 128: // arrow::Decimal128Type::type_id -#ifdef PROCESSROW - // assume batch size = 32k; reducer# = 4K; row/reducer = 8 - std::transform( - partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), - partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x + y * 16; }); - for (auto pid = 0; pid < num_partitions_; pid++) { - auto dst_pid_base = - reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ - auto r = reducer_offset_offset_[pid]; /*8k*/ - auto size = reducer_offset_offset_[pid + 1]; - for (r; r < size; r++) { - auto src_offset = reducer_offsets_[r]; /*16k*/ - *dst_pid_base = - reinterpret_cast(src_addr)[src_offset << 1]; /*128k*/ - *(dst_pid_base + 1) = - reinterpret_cast(src_addr)[src_offset << 1 | 1]; /*128k*/ - _mm_prefetch(&(src_addr)[src_offset * 16 + 64], _MM_HINT_T2); - dst_pid_base += 2; - } - } - break; -#else - std::transform( - partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), - partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x + y * 16; }); - for (auto row = 0; row < num_rows; ++row) { - auto pid = partition_id_[row]; - reinterpret_cast(partition_buffer_idx_offset_[pid])[0] = - reinterpret_cast(src_addr)[row << 1]; - reinterpret_cast(partition_buffer_idx_offset_[pid])[1] = - reinterpret_cast(src_addr)[row << 1 | 1]; - partition_buffer_idx_offset_[pid] += 16; - _mm_prefetch(&reinterpret_cast(partition_buffer_idx_offset_[pid])[2], - _MM_HINT_T0); - } - break; -#endif - case 1: // arrow::BooleanType::type_id: - partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size()); - std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(), - partition_buffer_idx_offset.begin()); - for (auto row = 0; row < num_rows; ++row) { - auto pid = partition_id_[row]; - uint16_t dst_offset = partition_buffer_idx_offset[pid]; - dst_addrs[pid][dst_offset >> 3] ^= - (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^ - src_addr[row >> 3] >> (row & 7)) - << (dst_offset & 7); - partition_buffer_idx_offset[pid]++; - } - break; - default: - return arrow::Status::Invalid("Column type " + - schema_->field(col_idx)->type()->ToString() + - " is not fixed width"); - } - } - return arrow::Status::OK(); -} - -#if defined(COLUMNAR_PLUGIN_USE_AVX512) -arrow::Status Splitter::SplitFixedWidthValueBufferAVX(const arrow::RecordBatch& rb) { - __m256i inc_one = _mm256_load_si256((__m256i*)(ONES)); - - const auto num_rows = rb.num_rows(); - for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { - std::fill(std::begin(partition_buffer_idx_offset_), - std::end(partition_buffer_idx_offset_), 0); - auto col_idx = fixed_width_array_idx_[col]; - auto src_addr = const_cast(rb.column_data(col_idx)->buffers[1]->data()); - const auto& dst_addrs = partition_fixed_width_value_addrs_[col]; - - switch (column_type_id_[col_idx]) { -#define PROCESS(SHUFFLE_TYPE, CTYPE) \ - case Type::SHUFFLE_TYPE: \ - for (auto row = 0; row < num_rows; ++row) { \ - auto pid = partition_id_[row]; \ - auto dst_offset = \ - partition_buffer_idx_base_[pid] + partition_buffer_idx_offset_[pid]; \ - reinterpret_cast(dst_addrs[pid])[dst_offset] = \ - reinterpret_cast(src_addr)[row]; \ - partition_buffer_idx_offset_[pid]++; \ - _mm_prefetch(&reinterpret_cast(dst_addrs[pid])[dst_offset + 1], \ - _MM_HINT_T0); \ - } \ - break; - PROCESS(SHUFFLE_1BYTE, uint8_t) - PROCESS(SHUFFLE_2BYTE, uint16_t) -#undef PROCESS - case Type::SHUFFLE_4BYTE: { - auto rows = num_rows - num_rows % 8; - auto src_addr_32 = reinterpret_cast(src_addr); - for (auto row = 0; row < rows; row += 8) { - __m256i partid_cnt_8x = CountPartitionIdOccurrence(partition_id_, row); - - // partition id is 32 bit, 8 partition id - __m256i partid_8x = _mm256_loadu_si256((__m256i*)(partition_id_.data() + row)); - - // dst_base and dst_offset are 32 bit - __m256i dst_idx_base_8x = - _mm256_i32gather_epi32(partition_buffer_idx_base_.data(), partid_8x, 4); - __m256i dst_idx_offset_8x = - _mm256_i32gather_epi32(partition_buffer_idx_offset_.data(), partid_8x, 4); - dst_idx_offset_8x = _mm256_add_epi32(dst_idx_offset_8x, partid_cnt_8x); - __m256i dst_idx_8x = _mm256_add_epi32(dst_idx_base_8x, dst_idx_offset_8x); - - // dst base address is 64 bit - __m512i dst_addr_base_8x = - _mm512_i32gather_epi64(partid_8x, dst_addrs.data(), 8); - - // calculate dst address, dst_addr = dst_base_addr + dst_idx*4 - //_mm512_cvtepu32_epi64: zero extend dst_offset 32bit -> 64bit - //_mm512_slli_epi64(_, 2): each 64bit dst_offset << 2 - __m512i dst_addr_offset_8x = - _mm512_slli_epi64(_mm512_cvtepu32_epi64(dst_idx_8x), 2); - __m512i dst_addr_8x = _mm512_add_epi64(dst_addr_base_8x, dst_addr_offset_8x); - - // source value is 32 bit - __m256i src_val_8x = _mm256_loadu_si256((__m256i*)(src_addr_32 + row)); - - // scatter - _mm512_i64scatter_epi32(nullptr, dst_addr_8x, src_val_8x, 1); - - // update partition_buffer_idx_offset_ - partid_cnt_8x = _mm256_add_epi32(partid_cnt_8x, inc_one); - for (int i = 0; i < 8; ++i) { - partition_buffer_idx_offset_[partition_id_[row + i]]++; - } - - PrefetchDstAddr(dst_addr_8x, 4); - } - for (auto row = rows; row < num_rows; ++row) { - auto pid = partition_id_[row]; - reinterpret_cast(dst_addrs[pid])[partition_buffer_idx_base_[pid] + - partition_buffer_idx_offset_[pid]] = - (src_addr_32)[row]; - partition_buffer_idx_offset_[pid]++; - } - } break; - case Type::SHUFFLE_8BYTE: { - auto rows = num_rows - num_rows % 8; - auto src_addr_64 = reinterpret_cast(src_addr); - for (auto row = 0; row < rows; row += 8) { - __m256i partid_cnt_8x = CountPartitionIdOccurrence(partition_id_, row); - - // partition id is 32 bit, 8 partition id - __m256i partid_8x = _mm256_loadu_si256((__m256i*)(partition_id_.data() + row)); - - // dst_base and dst_offset are 32 bit - __m256i dst_idx_base_8x = - _mm256_i32gather_epi32(partition_buffer_idx_base_.data(), partid_8x, 4); - __m256i dst_idx_offset_8x = - _mm256_i32gather_epi32(partition_buffer_idx_offset_.data(), partid_8x, 4); - dst_idx_offset_8x = _mm256_add_epi32(dst_idx_offset_8x, partid_cnt_8x); - __m256i dst_idx_8x = _mm256_add_epi32(dst_idx_base_8x, dst_idx_offset_8x); - - // dst base address is 64 bit - __m512i dst_addr_base_8x = - _mm512_i32gather_epi64(partid_8x, dst_addrs.data(), 8); - - // calculate dst address, dst_addr = dst_base_addr + dst_idx*8 - //_mm512_cvtepu32_epi64: zero extend dst_offset 32bit -> 64bit - //_mm512_slli_epi64(_, 3): each 64bit dst_offset << 3 - __m512i dst_addr_offset_8x = - _mm512_slli_epi64(_mm512_cvtepu32_epi64(dst_idx_8x), 3); - __m512i dst_addr_8x = _mm512_add_epi64(dst_addr_base_8x, dst_addr_offset_8x); - - // source value is 64 bit - __m512i src_val_8x = _mm512_loadu_si512((__m512i*)(src_addr_64 + row)); - - // scatter - _mm512_i64scatter_epi64(nullptr, dst_addr_8x, src_val_8x, 1); - - // update partition_buffer_idx_offset_ - partid_cnt_8x = _mm256_add_epi32(partid_cnt_8x, inc_one); - for (int i = 0; i < 8; ++i) { - partition_buffer_idx_offset_[partition_id_[row + i]]++; - } - - PrefetchDstAddr(dst_addr_8x, 8); - } - // handle the rest - for (auto row = rows; row < num_rows; ++row) { - auto pid = partition_id_[row]; - reinterpret_cast(dst_addrs[pid])[partition_buffer_idx_base_[pid] + - partition_buffer_idx_offset_[pid]] = - (src_addr_64)[row]; - partition_buffer_idx_offset_[pid]++; - } - } break; - case Type::SHUFFLE_DECIMAL128: - for (auto row = 0; row < num_rows; ++row) { - auto pid = partition_id_[row]; - auto dst_offset = - (partition_buffer_idx_base_[pid] + partition_buffer_idx_offset_[pid]) << 1; - reinterpret_cast(dst_addrs[pid])[dst_offset] = - reinterpret_cast(src_addr)[row << 1]; - reinterpret_cast(dst_addrs[pid])[dst_offset | 1] = - reinterpret_cast(src_addr)[row << 1 | 1]; - partition_buffer_idx_offset_[pid]++; - _mm_prefetch(&reinterpret_cast(dst_addrs[pid])[dst_offset + 2], - _MM_HINT_T0); - } - break; - case Type::SHUFFLE_BIT: - for (auto row = 0; row < num_rows; ++row) { - auto pid = partition_id_[row]; - auto dst_offset = - partition_buffer_idx_base_[pid] + partition_buffer_idx_offset_[pid]; - dst_addrs[pid][dst_offset >> 3] ^= - (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^ - src_addr[row >> 3] >> (row & 7)) - << (dst_offset & 7); - partition_buffer_idx_offset_[pid]++; - } - break; - default: - return arrow::Status::Invalid("Column type " + - schema_->field(col_idx)->type()->ToString() + - " is not fixed width"); - } - } - return arrow::Status::OK(); -} -#endif - -arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& rb) { - const auto num_rows = rb.num_rows(); - std::vector partition_buffer_idx_offset; - - for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { - auto col_idx = fixed_width_array_idx_[col]; - auto& dst_addrs = partition_fixed_width_validity_addrs_[col]; - if (rb.column_data(col_idx)->GetNullCount() == 0 && - column_has_null_[col_idx] == true) { - // if the input record batch doesn't have null, set validity to True - // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] - // access - for (auto pid = 0; pid < num_partitions_; ++pid) { - if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) { - arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid], - partition_id_cnt_[pid], true); - } - } - } else if (rb.column_data(col_idx)->GetNullCount() > 0) { - // there is Null count - column_has_null_[col_idx] = true; - for (auto pid = 0; pid < num_partitions_; ++pid) { - if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] == nullptr) { - // init bitmap if it's null, initialize the buffer as true - auto new_size = - std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size); - ARROW_ASSIGN_OR_RAISE( - auto validity_buffer, - arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size), - options_.memory_pool)); - dst_addrs[pid] = const_cast(validity_buffer->data()); - arrow::BitUtil::SetBitsTo(dst_addrs[pid], 0, partition_buffer_idx_base_[pid], - true); - partition_fixed_width_buffers_[col][pid][0] = std::move(validity_buffer); - } - } - - auto src_addr = const_cast(rb.column_data(col_idx)->buffers[0]->data()); - partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size()); - std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(), - partition_buffer_idx_offset.begin()); - for (auto row = 0; row < num_rows; ++row) { - auto pid = partition_id_[row]; - auto dst_offset = partition_buffer_idx_offset[pid]; - dst_addrs[pid][dst_offset >> 3] ^= - (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^ - src_addr[row >> 3] >> (row & 7)) - << (dst_offset & 7); - partition_buffer_idx_offset[pid]++; - } - } - } - return arrow::Status::OK(); -} - -arrow::Status Splitter::SplitBinaryArray(const arrow::RecordBatch& rb) { - for (int i = 0; i < binary_array_idx_.size(); ++i) { - RETURN_NOT_OK(AppendBinary( - std::static_pointer_cast(rb.column(binary_array_idx_[i])), - partition_binary_builders_[i], rb.num_rows())); - } - return arrow::Status::OK(); -} - -arrow::Status Splitter::SplitLargeBinaryArray(const arrow::RecordBatch& rb) { - for (int i = 0; i < large_binary_array_idx_.size(); ++i) { - RETURN_NOT_OK(AppendBinary( - std::static_pointer_cast( - rb.column(large_binary_array_idx_[i])), - partition_large_binary_builders_[i], rb.num_rows())); - } - return arrow::Status::OK(); -} - -#define PROCESS_SUPPORTED_TYPES(PROCESS) \ - PROCESS(arrow::BooleanType) \ - PROCESS(arrow::UInt8Type) \ - PROCESS(arrow::Int8Type) \ - PROCESS(arrow::UInt16Type) \ - PROCESS(arrow::Int16Type) \ - PROCESS(arrow::UInt32Type) \ - PROCESS(arrow::Int32Type) \ - PROCESS(arrow::UInt64Type) \ - PROCESS(arrow::Int64Type) \ - PROCESS(arrow::FloatType) \ - PROCESS(arrow::DoubleType) \ - PROCESS(arrow::Date32Type) \ - PROCESS(arrow::Date64Type) \ - PROCESS(arrow::Decimal128Type) \ - PROCESS(arrow::StringType) \ - PROCESS(arrow::BinaryType) -arrow::Status Splitter::SplitListArray(const arrow::RecordBatch& rb) { - for (int i = 0; i < list_array_idx_.size(); ++i) { - auto src_arr = - std::static_pointer_cast(rb.column(list_array_idx_[i])); - auto status = AppendList(rb.column(list_array_idx_[i]), partition_list_builders_[i], - rb.num_rows()); - if (!status.ok()) return status; - } - return arrow::Status::OK(); -} - -#undef PROCESS_SUPPORTED_TYPES - -template -arrow::Status Splitter::AppendBinary( - const std::shared_ptr& src_arr, - const std::vector>& dst_builders, int64_t num_rows) { - using offset_type = typename T::offset_type; - if (src_arr->null_count() == 0) { - for (auto row = 0; row < num_rows; ++row) { - offset_type length; - auto value = src_arr->GetValue(row, &length); - const auto& builder = dst_builders[partition_id_[row]]; - RETURN_NOT_OK(builder->Reserve(1)); - RETURN_NOT_OK(builder->ReserveData(length)); - builder->UnsafeAppend(value, length); - } - } else { - for (auto row = 0; row < num_rows; ++row) { - if (src_arr->IsValid(row)) { - offset_type length; - auto value = src_arr->GetValue(row, &length); - const auto& builder = dst_builders[partition_id_[row]]; - RETURN_NOT_OK(builder->Reserve(1)); - RETURN_NOT_OK(builder->ReserveData(length)); - builder->UnsafeAppend(value, length); - } else { - dst_builders[partition_id_[row]]->AppendNull(); - } - } - } - return arrow::Status::OK(); -} - -arrow::Status Splitter::AppendList( - const std::shared_ptr& src_arr, - const std::vector>& dst_builders, - int64_t num_rows) { - for (auto row = 0; row < num_rows; ++row) { - RETURN_NOT_OK(dst_builders[partition_id_[row]]->AppendArraySlice( - *(src_arr->data().get()), row, 1)); - } - return arrow::Status::OK(); -} - -std::string Splitter::NextSpilledFileDir() { - auto spilled_file_dir = GetSpilledShuffleFileDir(configured_dirs_[dir_selection_], - sub_dir_selection_[dir_selection_]); - sub_dir_selection_[dir_selection_] = - (sub_dir_selection_[dir_selection_] + 1) % options_.num_sub_dirs; - dir_selection_ = (dir_selection_ + 1) % configured_dirs_.size(); - return spilled_file_dir; -} - -arrow::Result> Splitter::GetSchemaPayload() { - if (schema_payload_ != nullptr) { - return schema_payload_; - } - schema_payload_ = std::make_shared(); - arrow::ipc::DictionaryFieldMapper dict_file_mapper; // unused - RETURN_NOT_OK(arrow::ipc::GetSchemaPayload(*schema_, options_.ipc_write_options, - dict_file_mapper, schema_payload_.get())); - return schema_payload_; -} - -// ---------------------------------------------------------------------- -// RoundRobinSplitter - -arrow::Result> RoundRobinSplitter::Create( - int32_t num_partitions, std::shared_ptr schema, SplitOptions options) { - std::shared_ptr res( - new RoundRobinSplitter(num_partitions, std::move(schema), std::move(options))); - RETURN_NOT_OK(res->Init()); - return res; -} - -arrow::Status RoundRobinSplitter::ComputeAndCountPartitionId( - const arrow::RecordBatch& rb) { - std::fill(std::begin(partition_id_cnt_), std::end(partition_id_cnt_), 0); - partition_id_.resize(rb.num_rows()); - for (auto& pid : partition_id_) { - pid = pid_selection_; - partition_id_cnt_[pid_selection_]++; - pid_selection_ = (pid_selection_ + 1) == num_partitions_ ? 0 : (pid_selection_ + 1); - } - return arrow::Status::OK(); -} - -// ---------------------------------------------------------------------- -// HashSplitter - -arrow::Result> HashSplitter::Create( - int32_t num_partitions, std::shared_ptr schema, - const gandiva::ExpressionVector& expr_vector, SplitOptions options) { - std::shared_ptr res( - new HashSplitter(num_partitions, std::move(schema), std::move(options))); - RETURN_NOT_OK(res->Init()); - RETURN_NOT_OK(res->CreateProjector(expr_vector)); - return res; -} - -arrow::Status HashSplitter::CreateProjector( - const gandiva::ExpressionVector& expr_vector) { - // same seed as spark's - auto hash = gandiva::TreeExprBuilder::MakeLiteral((int32_t)42); - for (const auto& expr : expr_vector) { - switch (expr->root()->return_type()->id()) { - case arrow::NullType::type_id: - break; - case arrow::BooleanType::type_id: - case arrow::Int8Type::type_id: - case arrow::Int16Type::type_id: - case arrow::Int32Type::type_id: - case arrow::FloatType::type_id: - case arrow::Date32Type::type_id: - hash = gandiva::TreeExprBuilder::MakeFunction( - "hash32_spark", {expr->root(), hash}, arrow::int32()); - break; - case arrow::Int64Type::type_id: - case arrow::DoubleType::type_id: - hash = gandiva::TreeExprBuilder::MakeFunction( - "hash64_spark", {expr->root(), hash}, arrow::int32()); - break; - case arrow::StringType::type_id: - hash = gandiva::TreeExprBuilder::MakeFunction( - "hashbuf_spark", {expr->root(), hash}, arrow::int32()); - break; - default: - hash = gandiva::TreeExprBuilder::MakeFunction("hash32", {expr->root(), hash}, - arrow::int32()); - /*return arrow::Status::NotImplemented("HashSplitter::CreateProjector - doesn't support type ", expr->result()->type()->ToString());*/ - } - } - auto hash_expr = - gandiva::TreeExprBuilder::MakeExpression(hash, arrow::field("pid", arrow::int32())); - return gandiva::Projector::Make(schema_, {hash_expr}, &projector_); -} - -arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch& rb) { - auto num_rows = rb.num_rows(); - partition_id_.resize(num_rows); - std::fill(std::begin(partition_id_cnt_), std::end(partition_id_cnt_), 0); - - arrow::ArrayVector outputs; - TIME_NANO_OR_RAISE(total_compute_pid_time_, - projector_->Evaluate(rb, options_.memory_pool, &outputs)); - if (outputs.size() != 1) { - return arrow::Status::Invalid("Projector result should have one field, actual is ", - std::to_string(outputs.size())); - } - auto pid_arr = std::dynamic_pointer_cast(outputs.at(0)); - if (pid_arr == nullptr) { - return arrow::Status::Invalid("failed to cast outputs.at(0)"); - } - for (auto i = 0; i < num_rows; ++i) { - // positive mod - auto pid = pid_arr->Value(i) % num_partitions_; - // force to generate ASM - __asm__( - "lea (%[num_partitions],%[pid],1),%[tmp]\n" - "test %[pid],%[pid]\n" - "cmovs %[tmp],%[pid]\n" - : [pid] "+r"(pid) - : [num_partitions] "r"(num_partitions_), [tmp] "r"(0)); - partition_id_[i] = pid; - partition_id_cnt_[pid]++; - } - return arrow::Status::OK(); -} - -// ---------------------------------------------------------------------- -// FallBackRangeSplitter - -arrow::Result> FallbackRangeSplitter::Create( - int32_t num_partitions, std::shared_ptr schema, SplitOptions options) { - auto res = std::shared_ptr( - new FallbackRangeSplitter(num_partitions, std::move(schema), std::move(options))); - RETURN_NOT_OK(res->Init()); - return res; -} - -arrow::Status FallbackRangeSplitter::Init() { - input_schema_ = std::move(schema_); - ARROW_ASSIGN_OR_RAISE(schema_, input_schema_->RemoveField(0)) - return Splitter::Init(); -} - -arrow::Status FallbackRangeSplitter::Split(const arrow::RecordBatch& rb) { - EVAL_START("split", options_.thread_id) - RETURN_NOT_OK(ComputeAndCountPartitionId(rb)); - ARROW_ASSIGN_OR_RAISE(auto remove_pid, rb.RemoveColumn(0)); - RETURN_NOT_OK(DoSplit(*remove_pid)); - EVAL_END("split", options_.thread_id, options_.task_attempt_id) - return arrow::Status::OK(); -} - -arrow::Status FallbackRangeSplitter::ComputeAndCountPartitionId( - const arrow::RecordBatch& rb) { - if (rb.column(0)->type_id() != arrow::Type::INT32) { - return arrow::Status::Invalid("RecordBatch field 0 should be ", - arrow::int32()->ToString(), ", actual is ", - rb.column(0)->type()->ToString()); - } - - auto pid_arr = reinterpret_cast(rb.column_data(0)->buffers[1]->data()); - auto num_rows = rb.num_rows(); - partition_id_.resize(num_rows); - std::fill(std::begin(partition_id_cnt_), std::end(partition_id_cnt_), 0); - for (auto i = 0; i < num_rows; ++i) { - auto pid = pid_arr[i]; - if (pid >= num_partitions_) { - return arrow::Status::Invalid("Partition id ", std::to_string(pid), - " is equal or greater than ", - std::to_string(num_partitions_)); - } - partition_id_[i] = pid; - partition_id_cnt_[pid]++; - } - return arrow::Status::OK(); -} - -} // namespace shuffle -} // namespace sparkcolumnarplugin diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc deleted file mode 100644 index cc05cd3e1..000000000 --- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc +++ /dev/null @@ -1,1139 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -void print_trace(void) { - char **strings; - size_t i, size; - enum Constexpr { MAX_SIZE = 1024 }; - void *array[MAX_SIZE]; - size = backtrace(array, MAX_SIZE); - strings = backtrace_symbols(array, size); - for (i = 0; i < size; i++) - printf(" %s\n", strings[i]); - puts(""); - free(strings); -} - -#include "shuffle/splitter.h" -#include "tests/test_utils.h" - -namespace sparkcolumnarplugin { -namespace shuffle { - -class MyMemoryPool : public arrow::MemoryPool { - public: - explicit MyMemoryPool(int64_t capacity) : capacity_(capacity) {} - - Status Allocate(int64_t size, uint8_t** out) override { - if (bytes_allocated() + size > capacity_) { - return Status::OutOfMemory("malloc of size ", size, " failed"); - } - RETURN_NOT_OK(pool_->Allocate(size, out)); - stats_.UpdateAllocatedBytes(size); - std::cout << "Allocate: size = " << size << " addr = " << std::hex << (uint64_t)*out << std::dec << std::endl; - //print_trace(); - return arrow::Status::OK(); - } - - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { - if (new_size > capacity_) { - return Status::OutOfMemory("malloc of size ", new_size, " failed"); - } - auto old_ptr = ptr; - RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr)); - stats_.UpdateAllocatedBytes(new_size - old_size); - std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << (uint64_t)*old_ptr << std::dec << " new_size = " << new_size << " addr = " << std::hex << (uint64_t)*ptr << std::dec << std::endl; - //print_trace(); - return arrow::Status::OK(); - } - - void Free(uint8_t* buffer, int64_t size) override { - pool_->Free(buffer, size); - stats_.UpdateAllocatedBytes(-size); - std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer << std::dec << std::endl; - //print_trace(); - } - - int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } - - int64_t max_memory() const override { return pool_->max_memory(); } - - std::string backend_name() const override { return pool_->backend_name(); } - - private: - MemoryPool* pool_ = arrow::default_memory_pool(); - int64_t capacity_; - arrow::internal::MemoryPoolStats stats_; -}; - -class SplitterTest : public ::testing::Test { - protected: - void SetUp() { - auto f_na = field("f_na", arrow::null()); - auto f_int8_a = field("f_int8_a", arrow::int8()); - auto f_int8_b = field("f_int8_b", arrow::int8()); - auto f_int32 = field("f_int32", arrow::int32()); - auto f_uint64 = field("f_uint64", arrow::uint64()); - auto f_double = field("f_double", arrow::float64()); - auto f_bool = field("f_bool", arrow::boolean()); - auto f_string = field("f_string", arrow::utf8()); - auto f_nullable_string = field("f_nullable_string", arrow::utf8()); - auto f_decimal = field("f_decimal128", arrow::decimal(10, 2)); - - ARROW_ASSIGN_OR_THROW(tmp_dir_1_, - std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix))) - ARROW_ASSIGN_OR_THROW(tmp_dir_2_, - std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix))) - auto config_dirs = - tmp_dir_1_->path().ToString() + "," + tmp_dir_2_->path().ToString(); - - setenv("NATIVESQL_SPARK_LOCAL_DIRS", config_dirs.c_str(), 1); - - schema_ = arrow::schema({f_na, f_int8_a, f_int8_b, f_int32, f_uint64, f_double, - f_bool, f_string, f_nullable_string, f_decimal}); - - MakeInputBatch(input_data_1, schema_, &input_batch_1_); - MakeInputBatch(input_data_2, schema_, &input_batch_2_); - - split_options_ = SplitOptions::Defaults(); - } - - void TearDown() override { - if (file_ != nullptr && !file_->closed()) { - file_->Close(); - } - } - - static void CheckFileExsists(const std::string& file_name) { - ASSERT_EQ(*arrow::internal::FileExists( - *arrow::internal::PlatformFilename::FromString(file_name)), - true); - } - - arrow::Result> TakeRows( - const std::shared_ptr& input_batch, - const std::string& json_idx) { - std::shared_ptr take_idx; - ASSERT_NOT_OK( - arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), json_idx, &take_idx)); - - auto cntx = arrow::compute::ExecContext(); - std::shared_ptr res; - ARROW_ASSIGN_OR_RAISE( - arrow::Datum result, - arrow::compute::Take(arrow::Datum(input_batch), arrow::Datum(take_idx), - arrow::compute::TakeOptions{}, &cntx)); - return result.record_batch(); - } - - arrow::Result> - GetRecordBatchStreamReader(const std::string& file_name) { - if (file_ != nullptr && !file_->closed()) { - RETURN_NOT_OK(file_->Close()); - } - ARROW_ASSIGN_OR_RAISE(file_, arrow::io::ReadableFile::Open(file_name)) - ARROW_ASSIGN_OR_RAISE(auto file_reader, - arrow::ipc::RecordBatchStreamReader::Open(file_)) - return file_reader; - } - - static const std::string tmp_dir_prefix; - static const std::vector input_data_1; - static const std::vector input_data_2; - - std::shared_ptr tmp_dir_1_; - std::shared_ptr tmp_dir_2_; - - std::shared_ptr schema_; - std::shared_ptr splitter_; - SplitOptions split_options_; - - std::shared_ptr input_batch_1_; - std::shared_ptr input_batch_2_; - - std::shared_ptr file_; -}; - -const std::string SplitterTest::tmp_dir_prefix = "columnar-shuffle-test"; -const std::vector SplitterTest::input_data_1 = { - "[null, null, null, null, null, null, null, null, null, null]", - "[1, 2, 3, null, 4, null, 5, 6, null, 7]", - "[1, -1, null, null, -2, 2, null, null, 3, -3]", - "[1, 2, 3, 4, null, 5, 6, 7, 8, null]", - "[null, null, null, null, null, null, null, null, null, null]", - R"([-0.1234567, null, 0.1234567, null, -0.142857, null, 0.142857, 0.285714, 0.428617, null])", - "[null, true, false, null, true, true, false, true, null, null]", - R"(["alice0", "bob1", "alice2", "bob3", "Alice4", "Bob5", "AlicE6", "boB7", "ALICE8", "BOB9"])", - R"(["alice", "bob", null, null, "Alice", "Bob", null, "alicE", null, "boB"])", - R"(["-1.01", "2.01", "-3.01", null, "0.11", "3.14", "2.27", null, "-3.14", null])"}; - -const std::vector SplitterTest::input_data_2 = { - "[null, null]", "[null, null]", - "[1, -1]", "[100, null]", - "[1, 1]", R"([0.142857, -0.142857])", - "[true, false]", R"(["bob", "alice"])", - R"([null, null])", R"([null, null])"}; - -TEST_F(SplitterTest, TestSingleSplitter) { - split_options_.buffer_size = 10; - - ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("rr", schema_, 1, split_options_)) - - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - - ASSERT_NOT_OK(splitter_->Stop()); - - // verify data file - CheckFileExsists(splitter_->DataFile()); - - // verify output temporary files - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 1); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify schema - ASSERT_EQ(*file_reader->schema(), *schema_); - - std::vector> batches; - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 3); - - std::vector expected = {input_batch_1_.get(), input_batch_2_.get(), - input_batch_1_.get()}; - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), schema_->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - ASSERT_TRUE(rb->column(j)->Equals(*expected[i]->column(j), - EqualOptions::Defaults().diff_sink(&std::cout))); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinSplitter) { - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", schema_, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *schema_); - - // prepare first block expected result - std::shared_ptr res_batch_0; - std::shared_ptr res_batch_1; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]")) - ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]")) - std::vector expected = {res_batch_0.get(), res_batch_1.get(), - res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 3); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), schema_->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]")) - ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]")) - expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *schema_); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 3); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), schema_->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestSplitterMemoryLeak) { - - std::shared_ptr pool = std::make_shared(9*1024*1024); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - split_options_.memory_pool = pool.get(); - - std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", schema_, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; - ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); - std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; - - std::cout << "split down " << std::endl; - - ASSERT_NOT_OK(splitter_->Stop()); - std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; - - std::cout << "stopped " << std::endl; - - splitter_.reset(); - std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; - std::cout << "splitter_ killed " << std::endl; - - split_options_.memory_pool = arrow::default_memory_pool(); -} - -TEST_F(SplitterTest, TestHashSplitter) { - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - - auto f_0 = TreeExprBuilder::MakeField(schema_->field(1)); - auto f_1 = TreeExprBuilder::MakeField(schema_->field(2)); - auto f_2 = TreeExprBuilder::MakeField(schema_->field(3)); - - auto node_0 = TreeExprBuilder::MakeFunction("add", {f_0, f_1}, int8()); - auto expr_0 = TreeExprBuilder::MakeExpression(node_0, field("res0", int8())); - auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64())); - - ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", schema_, num_partitions, - {expr_0, expr_1}, split_options_)) - - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - - ASSERT_NOT_OK(splitter_->Stop()); - - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - - // verify data file - CheckFileExsists(splitter_->DataFile()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify schema - ASSERT_EQ(*file_reader->schema(), *schema_); - - std::vector> batches; - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - - for (const auto& rb : batches) { - ASSERT_EQ(rb->num_columns(), schema_->num_fields()); - for (auto i = 0; i < rb->num_columns(); ++i) { - ASSERT_EQ(rb->column(i)->length(), rb->num_rows()); - } - } -} - -TEST_F(SplitterTest, TestFallbackRangeSplitter) { - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - - std::shared_ptr pid_arr_0; - ASSERT_NOT_OK(arrow::ipc::internal::json::ArrayFromJSON( - arrow::int32(), "[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]", &pid_arr_0)); - std::shared_ptr pid_arr_1; - ASSERT_NOT_OK( - arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), "[0, 1]", &pid_arr_1)); - - std::shared_ptr schema_w_pid; - std::shared_ptr input_batch_1_w_pid; - std::shared_ptr input_batch_2_w_pid; - ARROW_ASSIGN_OR_THROW(schema_w_pid, - schema_->AddField(0, arrow::field("pid", arrow::int32()))); - ARROW_ASSIGN_OR_THROW(input_batch_1_w_pid, - input_batch_1_->AddColumn(0, "pid", pid_arr_0)); - ARROW_ASSIGN_OR_THROW(input_batch_2_w_pid, - input_batch_2_->AddColumn(0, "pid", pid_arr_1)); - - ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("range", std::move(schema_w_pid), - num_partitions, split_options_)) - - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_2_w_pid)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid)); - - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *schema_); - - // prepare first block expected result - std::shared_ptr res_batch_0; - std::shared_ptr res_batch_1; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]")) - ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]")) - std::vector expected = {res_batch_0.get(), res_batch_1.get(), - res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 3); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), schema_->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]")) - ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]")) - expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *schema_); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 3); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), schema_->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) { - auto pool = std::make_unique(0); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - //split_options_.memory_pool = pool.get(); - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", schema_, num_partitions, split_options_)); - - auto status = splitter_->Split(*input_batch_1_); - // should return OOM status because there's no partition buffer to spill - ASSERT_TRUE(status.IsOutOfMemory()); - ASSERT_NOT_OK(splitter_->Stop()); -} - -TEST_F(SplitterTest, TestSpillLargestPartition) { - std::shared_ptr pool = std::make_shared(9*1024*1024); - // pool = std::make_shared(pool.get()); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - //split_options_.memory_pool = pool.get(); - split_options_.compression_type = arrow::Compression::UNCOMPRESSED; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", schema_, num_partitions, split_options_)); - - for (int i = 0; i < 100; ++i) { - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); - ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); - } - ASSERT_NOT_OK(splitter_->Stop()); -} - -TEST_F(SplitterTest, TestRoundRobinListArraySplitter) { - auto f_arr_str = field("f_arr", arrow::list(arrow::utf8())); - auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean())); - auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32())); - auto f_arr_double = field("f_double", arrow::list(arrow::float64())); - auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2))); - - auto rb_schema = - arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal}); - - const std::vector input_data_arr = { - R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])", - R"([[true, null], [true, true, true], [false], [true], [false], [false]])", - R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])", - R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6,1121], [null]])", - R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - std::shared_ptr res_batch_1; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinNestListArraySplitter) { - auto f_arr_str = field("f_str", arrow::list(arrow::list(arrow::utf8()))); - auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); - - auto rb_schema = arrow::schema({f_arr_str, f_arr_int32}); - - const std::vector input_data_arr = { - R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])", - R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinNestLargeListArraySplitter) { - auto f_arr_str = field("f_str", arrow::large_list(arrow::list(arrow::utf8()))); - auto f_arr_int32 = field("f_int32", arrow::large_list(arrow::list(arrow::int32()))); - - auto rb_schema = arrow::schema({f_arr_str, f_arr_int32}); - - const std::vector input_data_arr = { - R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])", - R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinListStructArraySplitter) { - auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); - auto f_arr_list_struct = - field("f_list_struct", list(struct_({field("a", int32()), field("b", utf8())}))); - - auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_struct}); - - const std::vector input_data_arr = { - R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", - R"([[{"a": 4, "b": null}], [{"a": 42, "b": null}, {"a": null, "b": "foo2"}], [{"a": 43, "b": "foo3"}], [{"a": 44, "b": "foo4"}]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinListMapArraySplitter) { - auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); - auto f_arr_list_map = field("f_list_map", list(map(utf8(), utf8()))); - - auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_map}); - - const std::vector input_data_arr = { - R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", - R"([[[["key1", "val_aa1"]]], [[["key1", "val_bb1"]], [["key2", "val_bb2"]]], [[["key1", "val_cc1"]]], [[["key1", "val_dd1"]]]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinStructArraySplitter) { - auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); - auto f_arr_struct_list = - field("f_struct_list", struct_({field("a", list(int32())), field("b", utf8())})); - - auto rb_schema = arrow::schema({f_arr_int32, f_arr_struct_list}); - - const std::vector input_data_arr = { - R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", - R"([{"a": [1,1,1,1], "b": null}, {"a": null, "b": "foo2"}, {"a": [3,3,3,3], "b": "foo3"}, {"a": [4,4,4,4], "b": "foo4"}])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestRoundRobinMapArraySplitter) { - auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); - auto f_arr_map = field("f_map", map(utf8(), utf8())); - - auto rb_schema = arrow::schema({f_arr_int32, f_arr_map}); - - const std::vector input_data_arr = { - R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", - R"([[["key1", "val_aa1"]], [["key1", "val_bb1"], ["key2", "val_bb2"]], [["key1", "val_cc1"]], [["key1", "val_dd1"]]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -TEST_F(SplitterTest, TestHashListArraySplitterWithMorePartitions) { - int32_t num_partitions = 5; - split_options_.buffer_size = 4; - - auto f_uint64 = field("f_uint64", arrow::uint64()); - auto f_arr_str = field("f_arr", arrow::list(arrow::utf8())); - - auto rb_schema = arrow::schema({f_uint64, f_arr_str}); - - const std::vector input_batch_1_data = { - R"([1, 2])", R"([["alice0", "bob1"], ["alice2"]])"}; - std::shared_ptr input_batch_arr; - MakeInputBatch(input_batch_1_data, rb_schema, &input_batch_arr); - - auto f_2 = TreeExprBuilder::MakeField(f_uint64); - auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64())); - - ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", rb_schema, num_partitions, - {expr_1}, split_options_)); - - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - - ASSERT_NOT_OK(splitter_->Stop()); - - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 5); - - CheckFileExsists(splitter_->DataFile()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - std::vector> batches; - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - - for (const auto& rb : batches) { - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto i = 0; i < rb->num_columns(); ++i) { - ASSERT_EQ(rb->column(i)->length(), rb->num_rows()); - } - } -} - -TEST_F(SplitterTest, TestRoundRobinListArraySplitterwithCompression) { - auto f_arr_str = field("f_arr", arrow::list(arrow::utf8())); - auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean())); - auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32())); - auto f_arr_double = field("f_double", arrow::list(arrow::float64())); - auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2))); - - auto rb_schema = - arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal}); - - const std::vector input_data_arr = { - R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])", - R"([[true, null], [true, true, true], [false], [true], [false], [false]])", - R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])", - R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6,1121], [null]])", - R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"}; - - std::shared_ptr input_batch_arr; - MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); - - int32_t num_partitions = 2; - split_options_.buffer_size = 4; - ARROW_ASSIGN_OR_THROW(splitter_, - Splitter::Make("rr", rb_schema, num_partitions, split_options_)); - auto compression_type = arrow::util::Codec::GetCompressionType("lz4"); - ASSERT_NOT_OK(splitter_->SetCompressType(compression_type.MoveValueUnsafe())); - ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); - ASSERT_NOT_OK(splitter_->Stop()); - - std::shared_ptr file_reader; - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - - // verify partition lengths - const auto& lengths = splitter_->PartitionLengths(); - ASSERT_EQ(lengths.size(), 2); - ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); - - // verify schema - std::vector> batches; - ASSERT_EQ(*file_reader->schema(), *rb_schema); - - // prepare first block expected result - std::shared_ptr res_batch_0; - std::shared_ptr res_batch_1; - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]")) - std::vector expected = {res_batch_0.get()}; - - // verify first block - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } - - // prepare second block expected result - ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]")) - expected = {res_batch_0.get()}; - - // verify second block - batches.clear(); - ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); - ASSERT_EQ(*file_reader->schema(), *rb_schema); - ASSERT_NOT_OK(file_->Advance(lengths[0])); - ASSERT_NOT_OK(file_reader->ReadAll(&batches)); - ASSERT_EQ(batches.size(), 1); - for (auto i = 0; i < batches.size(); ++i) { - const auto& rb = batches[i]; - ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); - for (auto j = 0; j < rb->num_columns(); ++j) { - ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); - } - ASSERT_TRUE(rb->Equals(*expected[i])); - } -} - -} // namespace shuffle -} // namespace sparkcolumnarplugin From 87b29fee0759ca1f893db7e240696a5690ea61bc Mon Sep 17 00:00:00 2001 From: binwei Date: Sun, 1 May 2022 17:11:20 +0800 Subject: [PATCH 06/19] return to original --- native-sql-engine/cpp/CMakeLists.txt | 2 +- .../src/benchmarks/shuffle_split_benchmark.cc | 45 +++++++++ native-sql-engine/cpp/src/shuffle/splitter.cc | 94 ++++++++++++++----- 3 files changed, 114 insertions(+), 27 deletions(-) diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt index fe7e989ee..e7d14e0c8 100644 --- a/native-sql-engine/cpp/CMakeLists.txt +++ b/native-sql-engine/cpp/CMakeLists.txt @@ -4,7 +4,7 @@ project(spark_columnar_plugin) #add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) add_definitions(-DPROCESSROW) -#add_compile_options(-g) +add_compile_options(-g) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(root_directory ${PROJECT_BINARY_DIR}) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index d2bffe36a..ce4e88b62 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -41,6 +41,50 @@ namespace shuffle { const int batch_buffer_size = 32768; const int split_buffer_size = 8192; + +class MyLoggingMemoryPool : public MemoryPool { + public: + explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {} + ~MyLoggingMemoryPool() override = default; + + Status Allocate(int64_t size, uint8_t** out) override { + Status s = pool_->Allocate(size, out); + std::cout << "Allocate: size = " << size << std::endl; + return s; + } + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override + { + Status s = pool_->Reallocate(old_size, new_size, ptr); + std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size + << std::endl; + return s; + } + + void Free(uint8_t* buffer, int64_t size) override{ + pool_->Free(buffer, size); + std::cout << "Free: size = " << size << std::endl; + } + + int64_t bytes_allocated() const override{ + int64_t nb_bytes = pool_->bytes_allocated(); + std::cout << "bytes_allocated: " << nb_bytes << std::endl; + return nb_bytes; + } + + int64_t max_memory() const override{ + int64_t mem = pool_->max_memory(); + std::cout << "max_memory: " << mem << std::endl; + return mem; + } + + std::string backend_name() const override{ + return pool_->backend_name(); + } + + private: + MemoryPool* pool_; +}; + class BenchmarkShuffleSplit { public: BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); } @@ -188,6 +232,7 @@ class BenchmarkShuffleSplit { std::shared_ptr schema; std::vector> expr_vector; parquet::ArrowReaderProperties properties; + }; class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 7839c4ce4..a5e3ca932 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -47,6 +47,11 @@ namespace sparkcolumnarplugin { namespace shuffle { using arrow::internal::checked_cast; +#ifndef SPLIT_BUFFER_SIZE +//by default, allocate 8M block, 2M page size +#define SPLIT_BUFFER_SIZE 8*1024*1024 +#endif + template std::string __m128i_toString(const __m128i var) { std::stringstream sstr; @@ -401,6 +406,37 @@ arrow::Status Splitter::Init() { tiny_bach_write_options_.codec, arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED)); + //Allocate first buffer for split reducer + ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( + SPLIT_BUFFER_SIZE, + options_.memory_pool)); + combine_buffer_->Resize(0, /*shrink_to_fit =*/false); + + return arrow::Status::OK(); +} +arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr& buffer, uint32_t size) +{ + // if size is already larger than buffer pool size, allocate it directly + //make size 64byte aligned + auto reminder = size & 0x3f; + size+=(64-reminder) & ((reminder==0)-1); + + if (size > SPLIT_BUFFER_SIZE ) + { + ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer( + size, options_.memory_pool)); + return arrow::Status::OK(); + }else if (combine_buffer_->capacity() - combine_buffer_->size() < size) + { + //memory pool is not enough + ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( + SPLIT_BUFFER_SIZE, + options_.memory_pool)); + combine_buffer_->Resize(0, /*shrink_to_fit = */ false); + } + buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size); + + combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false); return arrow::Status::OK(); } @@ -454,6 +490,7 @@ arrow::Status Splitter::Stop() { data_file_os_ = fout; } + std::cout << " cache record batch " << std::endl; // stop PartitionWriter and collect metrics for (auto pid = 0; pid < num_partitions_; ++pid) { RETURN_NOT_OK(CacheRecordBatch(pid, true)); @@ -473,11 +510,15 @@ arrow::Status Splitter::Stop() { partition_lengths_[pid] = 0; } } + this->combine_buffer_.reset(); // close data file output Stream RETURN_NOT_OK(data_file_os_->Close()); EVAL_END("write", options_.thread_id, options_.task_attempt_id) + + + return arrow::Status::OK(); } int64_t batch_nbytes(const arrow::RecordBatch& batch) { @@ -492,6 +533,7 @@ int64_t batch_nbytes(const arrow::RecordBatch& batch) { continue; } accumulated += buf->size(); + std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl; } } return accumulated; @@ -576,15 +618,13 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer default: { auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; if (buffers[0] != nullptr) { - buffers[0]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false); + buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1); } if (buffers[1] != nullptr) { if (column_type_id_[i]->id() == arrow::BooleanType::type_id) - buffers[1]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false); + buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1); else - buffers[1]->Resize( - num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3), - /*shrink_to_fit =*/false); + buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); } if (reset_buffers) { @@ -604,7 +644,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer } } } - + std::cout << " cache record " << std::endl; auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays)); int64_t raw_size = batch_nbytes(batch); @@ -642,12 +682,14 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n auto binary_idx = 0; auto large_binary_idx = 0; auto list_idx = 0; + auto total_size = 0; std::vector> new_binary_builders; std::vector> new_large_binary_builders; std::vector> new_list_builders; - std::vector> new_value_buffers; - std::vector> new_validity_buffers; + std::vector> new_value_buffers; + std::vector> new_validity_buffers; + for (auto i = 0; i < num_fields; ++i) { switch (column_type_id_[i]->id()) { case arrow::BinaryType::type_id: @@ -688,30 +730,30 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n case arrow::NullType::type_id: break; default: { - std::shared_ptr value_buffer; + try{ + std::shared_ptr value_buffer; if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { - ARROW_ASSIGN_OR_RAISE(value_buffer, arrow::AllocateResizableBuffer( - arrow::BitUtil::BytesForBits(new_size), - options_.memory_pool)); + auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK( status ); } else { - ARROW_ASSIGN_OR_RAISE( - value_buffer, - arrow::AllocateResizableBuffer( - new_size * (arrow::bit_width(column_type_id_[i]->id()) / 8), - options_.memory_pool)); + auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3)); + ARROW_RETURN_NOT_OK( status ); + } new_value_buffers.push_back(std::move(value_buffer)); if (input_fixed_width_has_null_[fixed_width_idx]) { - std::shared_ptr validity_buffer; - ARROW_ASSIGN_OR_RAISE( - validity_buffer, - arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size), - options_.memory_pool)); + std::shared_ptr validity_buffer; + auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK( status ); new_validity_buffers.push_back(std::move(validity_buffer)); } else { new_validity_buffers.push_back(nullptr); } fixed_width_idx++; + }catch(const std::exception& e) + { + std::cout << "exception captured " << e.what() << std::endl; + } break; } } @@ -746,10 +788,10 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n break; default: partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = - const_cast(new_value_buffers[fixed_width_idx]->data()); + new_value_buffers[fixed_width_idx]->mutable_data(); if (input_fixed_width_has_null_[fixed_width_idx]) { partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = - const_cast(new_validity_buffers[fixed_width_idx]->data()); + new_validity_buffers[fixed_width_idx]->mutable_data(); } else { partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; } @@ -1569,8 +1611,8 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch& "lea (%[num_partitions],%[pid],1),%[tmp]\n" "test %[pid],%[pid]\n" "cmovs %[tmp],%[pid]\n" - : [ pid ] "+r"(pid) - : [ num_partitions ] "r"(num_partitions_), [ tmp ] "r"(0)); + : [pid] "+r"(pid) + : [num_partitions] "r"(num_partitions_), [tmp] "r"(0)); partition_id_[i] = pid; partition_id_cnt_[pid]++; } From 4d0e3cfb15e2adef17ccad357b29a65e0aa5bd33 Mon Sep 17 00:00:00 2001 From: binwei Date: Sun, 1 May 2022 17:49:12 +0800 Subject: [PATCH 07/19] added memory leak check in test --- native-sql-engine/cpp/src/shuffle/splitter.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index a5e3ca932..12d2c87bc 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -490,7 +490,6 @@ arrow::Status Splitter::Stop() { data_file_os_ = fout; } - std::cout << " cache record batch " << std::endl; // stop PartitionWriter and collect metrics for (auto pid = 0; pid < num_partitions_; ++pid) { RETURN_NOT_OK(CacheRecordBatch(pid, true)); @@ -511,6 +510,7 @@ arrow::Status Splitter::Stop() { } } this->combine_buffer_.reset(); + this->schema_payload_.reset(); // close data file output Stream RETURN_NOT_OK(data_file_os_->Close()); @@ -533,7 +533,6 @@ int64_t batch_nbytes(const arrow::RecordBatch& batch) { continue; } accumulated += buf->size(); - std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl; } } return accumulated; @@ -644,7 +643,6 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer } } } - std::cout << " cache record " << std::endl; auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays)); int64_t raw_size = batch_nbytes(batch); From 1db0b7e61bcb2408186d5245fd291713efccde0e Mon Sep 17 00:00:00 2001 From: binwei Date: Mon, 2 May 2022 19:19:40 +0800 Subject: [PATCH 08/19] Done --- native-sql-engine/cpp/CMakeLists.txt | 2 +- .../src/benchmarks/shuffle_split_benchmark.cc | 141 ++++++++++++++---- native-sql-engine/cpp/src/shuffle/splitter.cc | 95 ++++++------ native-sql-engine/cpp/src/shuffle/splitter.h | 8 +- 4 files changed, 161 insertions(+), 85 deletions(-) diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt index e7d14e0c8..fe7e989ee 100644 --- a/native-sql-engine/cpp/CMakeLists.txt +++ b/native-sql-engine/cpp/CMakeLists.txt @@ -4,7 +4,7 @@ project(spark_columnar_plugin) #add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW) add_definitions(-DPROCESSROW) -add_compile_options(-g) +#add_compile_options(-g) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(root_directory ${PROJECT_BINARY_DIR}) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index ce4e88b62..2baf1915e 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -24,12 +24,25 @@ #include //#include #include +#include #include #include #include #include +#include #include +void print_trace(void) { + char** strings; + size_t i, size; + enum Constexpr { MAX_SIZE = 1024 }; + void* array[MAX_SIZE]; + size = backtrace(array, MAX_SIZE); + strings = backtrace_symbols(array, size); + for (i = 0; i < size; i++) printf(" %s\n", strings[i]); + puts(""); + free(strings); +} #include "codegen/code_generator.h" #include "codegen/code_generator_factory.h" @@ -38,51 +51,106 @@ namespace sparkcolumnarplugin { namespace shuffle { +#define ALIGNMENT 2048 * 1024 + const int batch_buffer_size = 32768; const int split_buffer_size = 8192; - -class MyLoggingMemoryPool : public MemoryPool { +class MyMemoryPool : public arrow::MemoryPool { public: - explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {} - ~MyLoggingMemoryPool() override = default; + explicit MyMemoryPool() {} Status Allocate(int64_t size, uint8_t** out) override { - Status s = pool_->Allocate(size, out); - std::cout << "Allocate: size = " << size << std::endl; - return s; + RETURN_NOT_OK(pool_->Allocate(size, out)); + stats_.UpdateAllocatedBytes(size); + // std::cout << "Allocate: size = " << size << " addr = " << std::hex << + // (uint64_t)*out << std::dec << std::endl; print_trace(); + return arrow::Status::OK(); } - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override - { - Status s = pool_->Reallocate(old_size, new_size, ptr); - std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size - << std::endl; - return s; + + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { + auto old_ptr = *ptr; + RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr)); + stats_.UpdateAllocatedBytes(new_size - old_size); + // std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << + // (uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " << + // std::hex << (uint64_t)*ptr << std::dec << std::endl; print_trace(); + return arrow::Status::OK(); } - void Free(uint8_t* buffer, int64_t size) override{ + void Free(uint8_t* buffer, int64_t size) override { pool_->Free(buffer, size); - std::cout << "Free: size = " << size << std::endl; + stats_.UpdateAllocatedBytes(-size); + // std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer + // << std::dec << std::endl; print_trace(); } - int64_t bytes_allocated() const override{ - int64_t nb_bytes = pool_->bytes_allocated(); - std::cout << "bytes_allocated: " << nb_bytes << std::endl; - return nb_bytes; + int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } + + int64_t max_memory() const override { return pool_->max_memory(); } + + std::string backend_name() const override { return pool_->backend_name(); } + + private: + MemoryPool* pool_ = arrow::default_memory_pool(); + arrow::internal::MemoryPoolStats stats_; +}; + +#define ENABLELARGEPAGE + +class LargePageMemoryPool : public MemoryPool { + public: + explicit LargePageMemoryPool() {} + + ~LargePageMemoryPool() override = default; + + Status Allocate(int64_t size, uint8_t** out) override { +#ifdef ENABLELARGEPAGE + if (size < 2 * 1024 * 1024) { + return pool_->Allocate(size, out); + } else { + Status st = pool_->AlignAllocate(size, out, ALIGNMENT); + madvise(*out, size, /*MADV_HUGEPAGE */ 14); + //std::cout << "Allocate: size = " << size << " addr = " \ + // << std::hex << (uint64_t)*out << " end = " << std::hex << (uint64_t)(*out+size) << std::dec << std::endl; + return st; + } +#else + return pool_->Allocate(size, out); +#endif } - int64_t max_memory() const override{ - int64_t mem = pool_->max_memory(); - std::cout << "max_memory: " << mem << std::endl; - return mem; + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { + return pool_->Reallocate(old_size, new_size, ptr); +#ifdef ENABLELARGEPAGE + if (new_size < 2 * 1024 * 1024) { + return pool_->Reallocate(old_size, new_size, ptr); + } else { + Status st = pool_->AlignReallocate(old_size, new_size, ptr, ALIGNMENT); + // madvise(*ptr, new_size, /*MADV_HUGEPAGE */ 14); + return st; + } +#else + return pool_->Reallocate(old_size, new_size, ptr); +#endif } - std::string backend_name() const override{ - return pool_->backend_name(); + void Free(uint8_t* buffer, int64_t size) override { + if (size < 2 * 1024 * 1024) { + pool_->Free(buffer, size); + } else { + pool_->Free(buffer, size, ALIGNMENT); + } } + int64_t bytes_allocated() const override { return pool_->bytes_allocated(); } + + int64_t max_memory() const override { return pool_->max_memory(); } + + std::string backend_name() const override { return "LargePageMemoryPool"; } + private: - MemoryPool* pool_; + MemoryPool* pool_ = arrow::default_memory_pool(); }; class BenchmarkShuffleSplit { @@ -133,6 +201,8 @@ class BenchmarkShuffleSplit { SetCPU(state.thread_index()); arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1); + std::shared_ptr pool = std::make_shared(); + const int num_partitions = state.range(0); auto options = SplitOptions::Defaults(); @@ -142,6 +212,7 @@ class BenchmarkShuffleSplit { options.offheap_per_task = 128 * 1024 * 1024 * 1024L; options.prefer_spill = true; options.write_schema = false; + options.memory_pool = pool.get(); std::shared_ptr splitter; int64_t elapse_read = 0; @@ -210,6 +281,9 @@ class BenchmarkShuffleSplit { splitter->TotalWriteTime(); state.counters["split_time"] = benchmark::Counter( split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + splitter.reset(); + std::cout << " split reset memory allocated = " + << options.memory_pool->bytes_allocated() << std::endl; } protected: @@ -232,7 +306,6 @@ class BenchmarkShuffleSplit { std::shared_ptr schema; std::vector> expr_vector; parquet::ArrowReaderProperties properties; - }; class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { @@ -296,14 +369,18 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl; for (auto _ : state) { - for_each( - batches.begin(), batches.end(), - [&splitter, &split_time](std::shared_ptr& record_batch) { - TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); - }); + for_each(batches.begin(), batches.end(), + [&splitter, &split_time, + &options](std::shared_ptr& record_batch) { + TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch)); + }); + // std::cout << " split done memory allocated = " << + // options.memory_pool->bytes_allocated() << std::endl; } TIME_NANO_OR_THROW(split_time, splitter->Stop()); + std::cout << " split stop memory allocated = " + << options.memory_pool->bytes_allocated() << std::endl; } }; diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 12d2c87bc..2d031ebc2 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -48,8 +48,8 @@ namespace shuffle { using arrow::internal::checked_cast; #ifndef SPLIT_BUFFER_SIZE -//by default, allocate 8M block, 2M page size -#define SPLIT_BUFFER_SIZE 8*1024*1024 +// by default, allocate 8M block, 2M page size +#define SPLIT_BUFFER_SIZE 8 * 1024 * 1024 #endif template @@ -406,36 +406,31 @@ arrow::Status Splitter::Init() { tiny_bach_write_options_.codec, arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED)); - //Allocate first buffer for split reducer + // Allocate first buffer for split reducer ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( - SPLIT_BUFFER_SIZE, - options_.memory_pool)); + SPLIT_BUFFER_SIZE, options_.memory_pool)); combine_buffer_->Resize(0, /*shrink_to_fit =*/false); return arrow::Status::OK(); } -arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr& buffer, uint32_t size) -{ +arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr& buffer, + uint32_t size) { // if size is already larger than buffer pool size, allocate it directly - //make size 64byte aligned + // make size 64byte aligned auto reminder = size & 0x3f; - size+=(64-reminder) & ((reminder==0)-1); - - if (size > SPLIT_BUFFER_SIZE ) - { - ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer( - size, options_.memory_pool)); + size += (64 - reminder) & ((reminder == 0) - 1); + if (size > SPLIT_BUFFER_SIZE) { + ARROW_ASSIGN_OR_RAISE(buffer, + arrow::AllocateResizableBuffer(size, options_.memory_pool)); return arrow::Status::OK(); - }else if (combine_buffer_->capacity() - combine_buffer_->size() < size) - { - //memory pool is not enough + } else if (combine_buffer_->capacity() - combine_buffer_->size() < size) { + // memory pool is not enough ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( - SPLIT_BUFFER_SIZE, - options_.memory_pool)); + SPLIT_BUFFER_SIZE, options_.memory_pool)); combine_buffer_->Resize(0, /*shrink_to_fit = */ false); } - buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size); - + buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(), size); + combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false); return arrow::Status::OK(); } @@ -517,8 +512,6 @@ arrow::Status Splitter::Stop() { EVAL_END("write", options_.thread_id, options_.task_attempt_id) - - return arrow::Status::OK(); } int64_t batch_nbytes(const arrow::RecordBatch& batch) { @@ -617,13 +610,15 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer default: { auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; if (buffers[0] != nullptr) { - buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1); + buffers[0] = arrow::SliceBuffer(buffers[0], 0, (num_rows >> 3) + 1); } if (buffers[1] != nullptr) { if (column_type_id_[i]->id() == arrow::BooleanType::type_id) - buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1); + buffers[1] = arrow::SliceBuffer(buffers[1], 0, (num_rows >> 3) + 1); else - buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); + buffers[1] = arrow::SliceBuffer( + buffers[1], 0, + num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); } if (reset_buffers) { @@ -728,30 +723,32 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n case arrow::NullType::type_id: break; default: { - try{ - std::shared_ptr value_buffer; - if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { - auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size)); - ARROW_RETURN_NOT_OK( status ); - } else { - auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3)); - ARROW_RETURN_NOT_OK( status ); - - } - new_value_buffers.push_back(std::move(value_buffer)); - if (input_fixed_width_has_null_[fixed_width_idx]) { - std::shared_ptr validity_buffer; - auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); - ARROW_RETURN_NOT_OK( status ); - new_validity_buffers.push_back(std::move(validity_buffer)); - } else { - new_validity_buffers.push_back(nullptr); - } - fixed_width_idx++; - }catch(const std::exception& e) - { - std::cout << "exception captured " << e.what() << std::endl; + try { + std::shared_ptr value_buffer; + if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { + auto status = AllocateBufferFromPool(value_buffer, + arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK(status); + } else { + auto status = AllocateBufferFromPool( + value_buffer, + new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); + ARROW_RETURN_NOT_OK(status); } + new_value_buffers.push_back(std::move(value_buffer)); + if (input_fixed_width_has_null_[fixed_width_idx]) { + std::shared_ptr validity_buffer; + auto status = AllocateBufferFromPool(validity_buffer, + arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK(status); + new_validity_buffers.push_back(std::move(validity_buffer)); + } else { + new_validity_buffers.push_back(nullptr); + } + fixed_width_idx++; + } catch (const std::exception& e) { + std::cout << "exception captured " << e.what() << std::endl; + } break; } } diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index 1c1c8e2da..d50519a53 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -138,7 +138,8 @@ class Splitter { arrow::Status SplitListArray(const arrow::RecordBatch& rb); - arrow::Status AllocateBufferFromPool(std::shared_ptr& buffer, uint32_t size); + arrow::Status AllocateBufferFromPool(std::shared_ptr& buffer, + uint32_t size); template ::ArrayType, typename BuilderType = typename arrow::TypeTraits::BuilderType> @@ -201,8 +202,9 @@ class Splitter { std::vector>> partition_list_builders_; // col partid - //slice the buffer for each reducer's column, in this way we can combine into large page - std::shared_ptr combine_buffer_; + // slice the buffer for each reducer's column, in this way we can combine into large + // page + std::shared_ptr combine_buffer_; // partid std::vector>> From dc4b579bd05edd14d7aca18c91c1fe2efcae8508 Mon Sep 17 00:00:00 2001 From: binwei Date: Mon, 2 May 2022 19:27:28 +0800 Subject: [PATCH 09/19] disable alignment allocation in benchmark since arrow doesn't support it --- .../cpp/src/benchmarks/shuffle_split_benchmark.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index 2baf1915e..1f1d1b6e5 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -96,7 +96,7 @@ class MyMemoryPool : public arrow::MemoryPool { arrow::internal::MemoryPoolStats stats_; }; -#define ENABLELARGEPAGE +//#define ENABLELARGEPAGE class LargePageMemoryPool : public MemoryPool { public: @@ -136,11 +136,15 @@ class LargePageMemoryPool : public MemoryPool { } void Free(uint8_t* buffer, int64_t size) override { +#ifdef ENABLELARGEPAGE if (size < 2 * 1024 * 1024) { pool_->Free(buffer, size); } else { pool_->Free(buffer, size, ALIGNMENT); } +#else + pool_->Free(buffer, size); +#endif } int64_t bytes_allocated() const override { return pool_->bytes_allocated(); } From 173c86c5a5961b8a6efec6d2437aa136d0cf2759 Mon Sep 17 00:00:00 2001 From: binwei Date: Wed, 4 May 2022 00:24:58 +0800 Subject: [PATCH 10/19] optimized validity buffer assign. initialize the validity buffer as true once allocated. skip the initialize during split fix validity buffer bug --- .../src/benchmarks/shuffle_split_benchmark.cc | 18 ++--- native-sql-engine/cpp/src/shuffle/splitter.cc | 65 +++++++++++++------ native-sql-engine/cpp/src/shuffle/splitter.h | 5 +- 3 files changed, 56 insertions(+), 32 deletions(-) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index 1f1d1b6e5..6f9a7f19e 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -96,7 +96,7 @@ class MyMemoryPool : public arrow::MemoryPool { arrow::internal::MemoryPoolStats stats_; }; -//#define ENABLELARGEPAGE +#define ENABLELARGEPAGE class LargePageMemoryPool : public MemoryPool { public: @@ -286,8 +286,6 @@ class BenchmarkShuffleSplit { state.counters["split_time"] = benchmark::Counter( split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); splitter.reset(); - std::cout << " split reset memory allocated = " - << options.memory_pool->bytes_allocated() << std::endl; } protected: @@ -323,17 +321,18 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { const int num_partitions, SplitOptions options, benchmark::State& state) { std::vector local_column_indices; local_column_indices.push_back(0); +/* local_column_indices.push_back(0); local_column_indices.push_back(1); local_column_indices.push_back(2); local_column_indices.push_back(4); local_column_indices.push_back(5); local_column_indices.push_back(6); - local_column_indices.push_back(7); + local_column_indices.push_back(7);*/ std::shared_ptr local_schema; local_schema = std::make_shared(*schema.get()); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15)); +/* ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15)); ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14)); ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13)); ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12)); @@ -342,7 +341,7 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9)); ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8)); ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3)); - +*/ if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl; ARROW_ASSIGN_OR_THROW(splitter, @@ -383,8 +382,6 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { } TIME_NANO_OR_THROW(split_time, splitter->Stop()); - std::cout << " split stop memory allocated = " - << options.memory_pool->bytes_allocated() << std::endl; } }; @@ -500,7 +497,7 @@ int main(int argc, char** argv) { ->MeasureProcessCPUTime() ->Unit(benchmark::kSecond); - /* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark +/* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark bck(datafile); benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) @@ -523,8 +520,7 @@ int main(int argc, char** argv) { ->Threads(16) ->Threads(24) ->Unit(benchmark::kSecond); - */ - +*/ benchmark::Initialize(&argc, argv); benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 2d031ebc2..c96ad12f8 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -350,7 +350,6 @@ arrow::Status Splitter::Init() { auto num_fixed_width = fixed_width_array_idx_.size(); partition_fixed_width_validity_addrs_.resize(num_fixed_width); - column_has_null_.resize(num_fixed_width, false); partition_fixed_width_value_addrs_.resize(num_fixed_width); partition_fixed_width_buffers_.resize(num_fixed_width); binary_array_empirical_size_.resize(binary_array_idx_.size()); @@ -507,6 +506,8 @@ arrow::Status Splitter::Stop() { this->combine_buffer_.reset(); this->schema_payload_.reset(); + std::cout << "src null count " << src_null_cnt << " dst null cnt = " << dst_null_cnt << std::endl; + // close data file output Stream RETURN_NOT_OK(data_file_os_->Close()); @@ -608,7 +609,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer break; } default: { - auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; + auto buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; if (buffers[0] != nullptr) { buffers[0] = arrow::SliceBuffer(buffers[0], 0, (num_rows >> 3) + 1); } @@ -624,14 +625,21 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer if (reset_buffers) { arrays[i] = arrow::MakeArray( arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, - {std::move(buffers[0]), std::move(buffers[1])})); - buffers = {nullptr, nullptr}; + {buffers[0],buffers[1]})); + if(buffers[0]!=nullptr) + { + dst_null_cnt+=arrays[i]->null_count(); + } partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr; } else { arrays[i] = arrow::MakeArray(arrow::ArrayData::Make( schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]})); + if(buffers[0]!=nullptr) + { + dst_null_cnt+=arrays[i]->null_count(); + } } fixed_width_idx++; break; @@ -659,7 +667,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer arrow::ipc::GetRecordBatchPayload(*batch, tiny_bach_write_options_, payload.get())); #endif - + partition_cached_recordbatch_size_[partition_id] += payload->body_length; partition_cached_recordbatch_[partition_id].push_back(std::move(payload)); partition_buffer_idx_base_[partition_id] = 0; @@ -741,6 +749,8 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); ARROW_RETURN_NOT_OK(status); + //initialize all true once allocated + memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity()); new_validity_buffers.push_back(std::move(validity_buffer)); } else { new_validity_buffers.push_back(nullptr); @@ -850,6 +860,18 @@ arrow::Status Splitter::SpillPartition(int32_t partition_id) { std::make_shared(this, partition_id); } TIME_NANO_OR_RAISE(total_spill_time_, partition_writer_[partition_id]->Spill()); + + //reset validity buffer after spill + std::for_each(partition_fixed_width_buffers_.begin(), + partition_fixed_width_buffers_.end(),[partition_id](std::vector& bufs){ + if (bufs[partition_id][0]!=nullptr) + { + //initialize all true once allocated + auto addr = bufs[partition_id][0]->mutable_data(); + memset(addr,0xff,bufs[partition_id][0]->capacity()); + } + }); + return arrow::Status::OK(); } @@ -877,6 +899,7 @@ arrow::Result Splitter::SpillLargestPartition(int64_t* size) { } arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { + #ifdef PROCESSROW reducer_offsets_.resize(rb.num_rows()); @@ -926,9 +949,11 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { auto col_idx = fixed_width_array_idx_[col]; size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8; - if (rb.column_data(col_idx)->GetNullCount() != 0) { + //check input_fixed_width_has_null_[col] is cheaper than GetNullCount() + if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) { input_fixed_width_has_null_[col] = true; } + src_null_cnt+=rb.column_data(col_idx)->GetNullCount(); } int64_t prealloc_row_cnt = @@ -989,6 +1014,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { for (auto pid = 0; pid < num_partitions_; ++pid) { partition_buffer_idx_base_[pid] += partition_id_cnt_[pid]; } + return arrow::Status::OK(); } @@ -1349,20 +1375,8 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { auto col_idx = fixed_width_array_idx_[col]; auto& dst_addrs = partition_fixed_width_validity_addrs_[col]; - if (rb.column_data(col_idx)->GetNullCount() == 0 && - column_has_null_[col_idx] == true) { - // if the input record batch doesn't have null, set validity to True - // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] - // access - for (auto pid = 0; pid < num_partitions_; ++pid) { - if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) { - arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid], - partition_id_cnt_[pid], true); - } - } - } else if (rb.column_data(col_idx)->GetNullCount() > 0) { + if (rb.column_data(col_idx)->GetNullCount() > 0) { // there is Null count - column_has_null_[col_idx] = true; for (auto pid = 0; pid < num_partitions_; ++pid) { if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] == nullptr) { // init bitmap if it's null, initialize the buffer as true @@ -1383,6 +1397,8 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size()); std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(), partition_buffer_idx_offset.begin()); + std::vector nullcnt; + nullcnt.resize(num_partitions_,0); for (auto row = 0; row < num_rows; ++row) { auto pid = partition_id_[row]; auto dst_offset = partition_buffer_idx_offset[pid]; @@ -1392,6 +1408,17 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& << (dst_offset & 7); partition_buffer_idx_offset[pid]++; } + // the last row may update the following bits to 0, reinitialize it as 1 + for(auto pid=0;pid 0 && dst_addrs[pid] != nullptr) { + auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid]; + + arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, lastoffset+8-(lastoffset&0x7), + true); + } + } + } } return arrow::Status::OK(); diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index d50519a53..fc2f6c37a 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -186,8 +186,7 @@ class Splitter { std::vector> partition_writer_; // col partid std::vector> partition_fixed_width_validity_addrs_; - // cache if the column has null so far for any reducer. To bypass the reducer check - std::vector column_has_null_; + // col partid std::vector> partition_fixed_width_value_addrs_; // col partid @@ -255,6 +254,8 @@ class Splitter { int64_t total_compress_time_ = 0; int64_t total_compute_pid_time_ = 0; int64_t peak_memory_allocated_ = 0; + int64_t src_null_cnt = 0; + int64_t dst_null_cnt = 0; std::vector partition_lengths_; std::vector raw_partition_lengths_; From 104ca15fd71f7c2817bcd14bb630e1079f839271 Mon Sep 17 00:00:00 2001 From: binwei Date: Thu, 5 May 2022 10:23:25 +0800 Subject: [PATCH 11/19] fix out of memory test --- native-sql-engine/cpp/src/shuffle/splitter.cc | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index c96ad12f8..59d111803 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -407,7 +407,7 @@ arrow::Status Splitter::Init() { // Allocate first buffer for split reducer ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( - SPLIT_BUFFER_SIZE, options_.memory_pool)); + 0, options_.memory_pool)); combine_buffer_->Resize(0, /*shrink_to_fit =*/false); return arrow::Status::OK(); @@ -505,8 +505,7 @@ arrow::Status Splitter::Stop() { } this->combine_buffer_.reset(); this->schema_payload_.reset(); - - std::cout << "src null count " << src_null_cnt << " dst null cnt = " << dst_null_cnt << std::endl; + partition_fixed_width_buffers_.clear(); // close data file output Stream RETURN_NOT_OK(data_file_os_->Close()); @@ -731,34 +730,30 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n case arrow::NullType::type_id: break; default: { - try { - std::shared_ptr value_buffer; - if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { - auto status = AllocateBufferFromPool(value_buffer, - arrow::BitUtil::BytesForBits(new_size)); - ARROW_RETURN_NOT_OK(status); - } else { - auto status = AllocateBufferFromPool( - value_buffer, - new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); - ARROW_RETURN_NOT_OK(status); - } - new_value_buffers.push_back(std::move(value_buffer)); - if (input_fixed_width_has_null_[fixed_width_idx]) { - std::shared_ptr validity_buffer; - auto status = AllocateBufferFromPool(validity_buffer, - arrow::BitUtil::BytesForBits(new_size)); - ARROW_RETURN_NOT_OK(status); - //initialize all true once allocated - memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity()); - new_validity_buffers.push_back(std::move(validity_buffer)); - } else { - new_validity_buffers.push_back(nullptr); - } - fixed_width_idx++; - } catch (const std::exception& e) { - std::cout << "exception captured " << e.what() << std::endl; + std::shared_ptr value_buffer; + if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { + auto status = AllocateBufferFromPool(value_buffer, + arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK(status); + } else { + auto status = AllocateBufferFromPool( + value_buffer, + new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); + ARROW_RETURN_NOT_OK(status); } + new_value_buffers.push_back(std::move(value_buffer)); + if (input_fixed_width_has_null_[fixed_width_idx]) { + std::shared_ptr validity_buffer; + auto status = AllocateBufferFromPool(validity_buffer, + arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK(status); + //initialize all true once allocated + memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity()); + new_validity_buffers.push_back(std::move(validity_buffer)); + } else { + new_validity_buffers.push_back(nullptr); + } + fixed_width_idx++; break; } } @@ -821,7 +816,7 @@ arrow::Status Splitter::AllocateNew(int32_t partition_id, int32_t new_size) { << std::to_string(partition_id) << std::endl; int64_t spilled_size; ARROW_ASSIGN_OR_RAISE(auto partition_to_spill, SpillLargestPartition(&spilled_size)); - if (partition_to_spill == -1) { + if (partition_to_spill == -1) { std::cout << "Failed to allocate new buffer for partition " << std::to_string(partition_id) << ". No partition buffer to spill." << std::endl; @@ -1014,7 +1009,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { for (auto pid = 0; pid < num_partitions_; ++pid) { partition_buffer_idx_base_[pid] += partition_id_cnt_[pid]; } - + return arrow::Status::OK(); } From 830cd70b3fe12269721735f855cfd10e71f845de Mon Sep 17 00:00:00 2001 From: binwei Date: Fri, 6 May 2022 15:07:23 +0800 Subject: [PATCH 12/19] fix setbitsto bug remove nullcnt --- native-sql-engine/cpp/src/shuffle/splitter.cc | 11 +---------- native-sql-engine/cpp/src/shuffle/splitter.h | 2 -- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 59d111803..af8ca584e 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -625,20 +625,12 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer arrays[i] = arrow::MakeArray( arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, {buffers[0],buffers[1]})); - if(buffers[0]!=nullptr) - { - dst_null_cnt+=arrays[i]->null_count(); - } partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr; } else { arrays[i] = arrow::MakeArray(arrow::ArrayData::Make( schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]})); - if(buffers[0]!=nullptr) - { - dst_null_cnt+=arrays[i]->null_count(); - } } fixed_width_idx++; break; @@ -948,7 +940,6 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) { input_fixed_width_has_null_[col] = true; } - src_null_cnt+=rb.column_data(col_idx)->GetNullCount(); } int64_t prealloc_row_cnt = @@ -1409,7 +1400,7 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) { auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid]; - arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, lastoffset+8-(lastoffset&0x7), + arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, 8-(lastoffset&0x7), true); } } diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index fc2f6c37a..cc7440926 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -254,8 +254,6 @@ class Splitter { int64_t total_compress_time_ = 0; int64_t total_compute_pid_time_ = 0; int64_t peak_memory_allocated_ = 0; - int64_t src_null_cnt = 0; - int64_t dst_null_cnt = 0; std::vector partition_lengths_; std::vector raw_partition_lengths_; From 63f77ed9d3a129de4860a81ab22ccbaf20954fc8 Mon Sep 17 00:00:00 2001 From: binwei Date: Fri, 6 May 2022 16:01:33 +0800 Subject: [PATCH 13/19] add shuffle test --- .../cpp/src/tests/shuffle_split_test.cc | 1134 +++++++++++++++++ 1 file changed, 1134 insertions(+) create mode 100644 native-sql-engine/cpp/src/tests/shuffle_split_test.cc diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc new file mode 100644 index 000000000..d5d8de0bf --- /dev/null +++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc @@ -0,0 +1,1134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +void print_trace(void) { + char** strings; + size_t i, size; + enum Constexpr { MAX_SIZE = 1024 }; + void* array[MAX_SIZE]; + size = backtrace(array, MAX_SIZE); + strings = backtrace_symbols(array, size); + for (i = 0; i < size; i++) printf(" %s\n", strings[i]); + puts(""); + free(strings); +} + +#include "shuffle/splitter.h" +#include "tests/test_utils.h" + +namespace sparkcolumnarplugin { +namespace shuffle { + +class MyMemoryPool : public arrow::MemoryPool { + public: + explicit MyMemoryPool(int64_t capacity) : capacity_(capacity) {} + + Status Allocate(int64_t size, uint8_t** out) override { + if (bytes_allocated() + size > capacity_) { + return Status::OutOfMemory("malloc of size ", size, " failed"); + } + RETURN_NOT_OK(pool_->Allocate(size, out)); + stats_.UpdateAllocatedBytes(size); + //std::cout << "Allocate: size = " << size << " addr = " << std::hex << + //(uint64_t)*out << std::dec << std::endl; + // print_trace(); + return arrow::Status::OK(); + } + + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { + if (new_size > capacity_) { + return Status::OutOfMemory("malloc of size ", new_size, " failed"); + } + auto old_ptr = *ptr; + RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr)); + stats_.UpdateAllocatedBytes(new_size - old_size); + //std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << + //(uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " << + //std::hex << (uint64_t)*ptr << std::dec << std::endl; + //print_trace(); + return arrow::Status::OK(); + } + + void Free(uint8_t* buffer, int64_t size) override { + pool_->Free(buffer, size); + stats_.UpdateAllocatedBytes(-size); + //std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer + //<< std::dec << std::endl; + //print_trace(); + } + + int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } + + int64_t max_memory() const override { return pool_->max_memory(); } + + std::string backend_name() const override { return pool_->backend_name(); } + + private: + MemoryPool* pool_ = arrow::default_memory_pool(); + int64_t capacity_; + arrow::internal::MemoryPoolStats stats_; +}; + +class SplitterTest : public ::testing::Test { + protected: + void SetUp() { + auto f_na = field("f_na", arrow::null()); + auto f_int8_a = field("f_int8_a", arrow::int8()); + auto f_int8_b = field("f_int8_b", arrow::int8()); + auto f_int32 = field("f_int32", arrow::int32()); + auto f_uint64 = field("f_uint64", arrow::uint64()); + auto f_double = field("f_double", arrow::float64()); + auto f_bool = field("f_bool", arrow::boolean()); + auto f_string = field("f_string", arrow::utf8()); + auto f_nullable_string = field("f_nullable_string", arrow::utf8()); + auto f_decimal = field("f_decimal128", arrow::decimal(10, 2)); + + ARROW_ASSIGN_OR_THROW(tmp_dir_1_, + std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix))) + ARROW_ASSIGN_OR_THROW(tmp_dir_2_, + std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix))) + auto config_dirs = + tmp_dir_1_->path().ToString() + "," + tmp_dir_2_->path().ToString(); + + setenv("NATIVESQL_SPARK_LOCAL_DIRS", config_dirs.c_str(), 1); + + schema_ = arrow::schema({f_na, f_int8_a, f_int8_b, f_int32, f_uint64, f_double, + f_bool, f_string, f_nullable_string, f_decimal}); + + MakeInputBatch(input_data_1, schema_, &input_batch_1_); + MakeInputBatch(input_data_2, schema_, &input_batch_2_); + + split_options_ = SplitOptions::Defaults(); + } + + void TearDown() override { + if (file_ != nullptr && !file_->closed()) { + file_->Close(); + } + } + + static void CheckFileExsists(const std::string& file_name) { + ASSERT_EQ(*arrow::internal::FileExists( + *arrow::internal::PlatformFilename::FromString(file_name)), + true); + } + + arrow::Result> TakeRows( + const std::shared_ptr& input_batch, + const std::string& json_idx) { + std::shared_ptr take_idx; + ASSERT_NOT_OK( + arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), json_idx, &take_idx)); + + auto cntx = arrow::compute::ExecContext(); + std::shared_ptr res; + ARROW_ASSIGN_OR_RAISE( + arrow::Datum result, + arrow::compute::Take(arrow::Datum(input_batch), arrow::Datum(take_idx), + arrow::compute::TakeOptions{}, &cntx)); + return result.record_batch(); + } + + arrow::Result> + GetRecordBatchStreamReader(const std::string& file_name) { + if (file_ != nullptr && !file_->closed()) { + RETURN_NOT_OK(file_->Close()); + } + ARROW_ASSIGN_OR_RAISE(file_, arrow::io::ReadableFile::Open(file_name)) + ARROW_ASSIGN_OR_RAISE(auto file_reader, + arrow::ipc::RecordBatchStreamReader::Open(file_)) + return file_reader; + } + + static const std::string tmp_dir_prefix; + static const std::vector input_data_1; + static const std::vector input_data_2; + + std::shared_ptr tmp_dir_1_; + std::shared_ptr tmp_dir_2_; + + std::shared_ptr schema_; + std::shared_ptr splitter_; + SplitOptions split_options_; + + std::shared_ptr input_batch_1_; + std::shared_ptr input_batch_2_; + + std::shared_ptr file_; +}; + +const std::string SplitterTest::tmp_dir_prefix = "columnar-shuffle-test"; +const std::vector SplitterTest::input_data_1 = { + "[null, null, null, null, null, null, null, null, null, null]", + "[1, 2, 3, null, 4, null, 5, 6, null, 7]", + "[1, -1, null, null, -2, 2, null, null, 3, -3]", + "[1, 2, 3, 4, null, 5, 6, 7, 8, null]", + "[null, null, null, null, null, null, null, null, null, null]", + R"([-0.1234567, null, 0.1234567, null, -0.142857, null, 0.142857, 0.285714, 0.428617, null])", + "[null, true, false, null, true, true, false, true, null, null]", + R"(["alice0", "bob1", "alice2", "bob3", "Alice4", "Bob5", "AlicE6", "boB7", "ALICE8", "BOB9"])", + R"(["alice", "bob", null, null, "Alice", "Bob", null, "alicE", null, "boB"])", + R"(["-1.01", "2.01", "-3.01", null, "0.11", "3.14", "2.27", null, "-3.14", null])"}; + +const std::vector SplitterTest::input_data_2 = { + "[null, null]", "[null, null]", + "[1, -1]", "[100, null]", + "[1, 1]", R"([0.142857, -0.142857])", + "[true, false]", R"(["bob", "alice"])", + R"([null, null])", R"([null, null])"}; + +TEST_F(SplitterTest, TestSingleSplitter) { + split_options_.buffer_size = 10; + + ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("rr", schema_, 1, split_options_)) + + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + + ASSERT_NOT_OK(splitter_->Stop()); + + // verify data file + CheckFileExsists(splitter_->DataFile()); + + // verify output temporary files + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 1); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify schema + ASSERT_EQ(*file_reader->schema(), *schema_); + + std::vector> batches; + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 3); + + std::vector expected = {input_batch_1_.get(), input_batch_2_.get(), + input_batch_1_.get()}; + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), schema_->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + ASSERT_TRUE(rb->column(j)->Equals(*expected[i]->column(j), + EqualOptions::Defaults().diff_sink(&std::cout))); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinSplitter) { + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", schema_, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *schema_); + + // prepare first block expected result + std::shared_ptr res_batch_0; + std::shared_ptr res_batch_1; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]")) + ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]")) + std::vector expected = {res_batch_0.get(), res_batch_1.get(), + res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 3); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), schema_->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]")) + ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]")) + expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *schema_); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 3); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), schema_->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestSplitterMemoryLeak) { + std::shared_ptr pool = + std::make_shared(9 * 1024 * 1024); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + split_options_.memory_pool = pool.get(); + split_options_.write_schema = false; + + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", schema_, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + + ASSERT_NOT_OK(splitter_->Stop()); + + ASSERT_TRUE(pool->bytes_allocated() == 0); + splitter_.reset(); + ASSERT_TRUE(pool->bytes_allocated() == 0); + + split_options_.memory_pool = arrow::default_memory_pool(); +} + +TEST_F(SplitterTest, TestHashSplitter) { + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + + auto f_0 = TreeExprBuilder::MakeField(schema_->field(1)); + auto f_1 = TreeExprBuilder::MakeField(schema_->field(2)); + auto f_2 = TreeExprBuilder::MakeField(schema_->field(3)); + + auto node_0 = TreeExprBuilder::MakeFunction("add", {f_0, f_1}, int8()); + auto expr_0 = TreeExprBuilder::MakeExpression(node_0, field("res0", int8())); + auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64())); + + ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", schema_, num_partitions, + {expr_0, expr_1}, split_options_)) + + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + + ASSERT_NOT_OK(splitter_->Stop()); + + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + + // verify data file + CheckFileExsists(splitter_->DataFile()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify schema + ASSERT_EQ(*file_reader->schema(), *schema_); + + std::vector> batches; + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + + for (const auto& rb : batches) { + ASSERT_EQ(rb->num_columns(), schema_->num_fields()); + for (auto i = 0; i < rb->num_columns(); ++i) { + ASSERT_EQ(rb->column(i)->length(), rb->num_rows()); + } + } +} + +TEST_F(SplitterTest, TestFallbackRangeSplitter) { + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + + std::shared_ptr pid_arr_0; + ASSERT_NOT_OK(arrow::ipc::internal::json::ArrayFromJSON( + arrow::int32(), "[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]", &pid_arr_0)); + std::shared_ptr pid_arr_1; + ASSERT_NOT_OK( + arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), "[0, 1]", &pid_arr_1)); + + std::shared_ptr schema_w_pid; + std::shared_ptr input_batch_1_w_pid; + std::shared_ptr input_batch_2_w_pid; + ARROW_ASSIGN_OR_THROW(schema_w_pid, + schema_->AddField(0, arrow::field("pid", arrow::int32()))); + ARROW_ASSIGN_OR_THROW(input_batch_1_w_pid, + input_batch_1_->AddColumn(0, "pid", pid_arr_0)); + ARROW_ASSIGN_OR_THROW(input_batch_2_w_pid, + input_batch_2_->AddColumn(0, "pid", pid_arr_1)); + + ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("range", std::move(schema_w_pid), + num_partitions, split_options_)) + + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_w_pid)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid)); + + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *schema_); + + // prepare first block expected result + std::shared_ptr res_batch_0; + std::shared_ptr res_batch_1; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]")) + ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]")) + std::vector expected = {res_batch_0.get(), res_batch_1.get(), + res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 3); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), schema_->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]")) + ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]")) + expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *schema_); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 3); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), schema_->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) { + auto pool = std::make_unique(0); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + split_options_.memory_pool = pool.get(); + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", schema_, num_partitions, split_options_)); + + auto status = splitter_->Split(*input_batch_1_); + // should return OOM status because there's no partition buffer to spill + ASSERT_TRUE(status.IsOutOfMemory()); + ASSERT_NOT_OK(splitter_->Stop()); +} + +TEST_F(SplitterTest, TestSpillLargestPartition) { + std::shared_ptr pool = + std::make_shared(9 * 1024 * 1024); + // pool = std::make_shared(pool.get()); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + // split_options_.memory_pool = pool.get(); + split_options_.compression_type = arrow::Compression::UNCOMPRESSED; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", schema_, num_partitions, split_options_)); + + for (int i = 0; i < 100; ++i) { + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_2_)); + ASSERT_NOT_OK(splitter_->Split(*input_batch_1_)); + } + ASSERT_NOT_OK(splitter_->Stop()); +} + +TEST_F(SplitterTest, TestRoundRobinListArraySplitter) { + auto f_arr_str = field("f_arr", arrow::list(arrow::utf8())); + auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean())); + auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32())); + auto f_arr_double = field("f_double", arrow::list(arrow::float64())); + auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2))); + + auto rb_schema = + arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal}); + + const std::vector input_data_arr = { + R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])", + R"([[true, null], [true, true, true], [false], [true], [false], [false]])", + R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])", + R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6,1121], [null]])", + R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + std::shared_ptr res_batch_1; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinNestListArraySplitter) { + auto f_arr_str = field("f_str", arrow::list(arrow::list(arrow::utf8()))); + auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); + + auto rb_schema = arrow::schema({f_arr_str, f_arr_int32}); + + const std::vector input_data_arr = { + R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])", + R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinNestLargeListArraySplitter) { + auto f_arr_str = field("f_str", arrow::large_list(arrow::list(arrow::utf8()))); + auto f_arr_int32 = field("f_int32", arrow::large_list(arrow::list(arrow::int32()))); + + auto rb_schema = arrow::schema({f_arr_str, f_arr_int32}); + + const std::vector input_data_arr = { + R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])", + R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinListStructArraySplitter) { + auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); + auto f_arr_list_struct = + field("f_list_struct", list(struct_({field("a", int32()), field("b", utf8())}))); + + auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_struct}); + + const std::vector input_data_arr = { + R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", + R"([[{"a": 4, "b": null}], [{"a": 42, "b": null}, {"a": null, "b": "foo2"}], [{"a": 43, "b": "foo3"}], [{"a": 44, "b": "foo4"}]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinListMapArraySplitter) { + auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); + auto f_arr_list_map = field("f_list_map", list(map(utf8(), utf8()))); + + auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_map}); + + const std::vector input_data_arr = { + R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", + R"([[[["key1", "val_aa1"]]], [[["key1", "val_bb1"]], [["key2", "val_bb2"]]], [[["key1", "val_cc1"]]], [[["key1", "val_dd1"]]]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinStructArraySplitter) { + auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); + auto f_arr_struct_list = + field("f_struct_list", struct_({field("a", list(int32())), field("b", utf8())})); + + auto rb_schema = arrow::schema({f_arr_int32, f_arr_struct_list}); + + const std::vector input_data_arr = { + R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", + R"([{"a": [1,1,1,1], "b": null}, {"a": null, "b": "foo2"}, {"a": [3,3,3,3], "b": "foo3"}, {"a": [4,4,4,4], "b": "foo4"}])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestRoundRobinMapArraySplitter) { + auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32()))); + auto f_arr_map = field("f_map", map(utf8(), utf8())); + + auto rb_schema = arrow::schema({f_arr_int32, f_arr_map}); + + const std::vector input_data_arr = { + R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])", + R"([[["key1", "val_aa1"]], [["key1", "val_bb1"], ["key2", "val_bb2"]], [["key1", "val_cc1"]], [["key1", "val_dd1"]]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +TEST_F(SplitterTest, TestHashListArraySplitterWithMorePartitions) { + int32_t num_partitions = 5; + split_options_.buffer_size = 4; + + auto f_uint64 = field("f_uint64", arrow::uint64()); + auto f_arr_str = field("f_arr", arrow::list(arrow::utf8())); + + auto rb_schema = arrow::schema({f_uint64, f_arr_str}); + + const std::vector input_batch_1_data = { + R"([1, 2])", R"([["alice0", "bob1"], ["alice2"]])"}; + std::shared_ptr input_batch_arr; + MakeInputBatch(input_batch_1_data, rb_schema, &input_batch_arr); + + auto f_2 = TreeExprBuilder::MakeField(f_uint64); + auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64())); + + ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", rb_schema, num_partitions, + {expr_1}, split_options_)); + + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + + ASSERT_NOT_OK(splitter_->Stop()); + + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 5); + + CheckFileExsists(splitter_->DataFile()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + std::vector> batches; + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + + for (const auto& rb : batches) { + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto i = 0; i < rb->num_columns(); ++i) { + ASSERT_EQ(rb->column(i)->length(), rb->num_rows()); + } + } +} + +TEST_F(SplitterTest, TestRoundRobinListArraySplitterwithCompression) { + auto f_arr_str = field("f_arr", arrow::list(arrow::utf8())); + auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean())); + auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32())); + auto f_arr_double = field("f_double", arrow::list(arrow::float64())); + auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2))); + + auto rb_schema = + arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal}); + + const std::vector input_data_arr = { + R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])", + R"([[true, null], [true, true, true], [false], [true], [false], [false]])", + R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])", + R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6,1121], [null]])", + R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"}; + + std::shared_ptr input_batch_arr; + MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr); + + int32_t num_partitions = 2; + split_options_.buffer_size = 4; + ARROW_ASSIGN_OR_THROW(splitter_, + Splitter::Make("rr", rb_schema, num_partitions, split_options_)); + auto compression_type = arrow::util::Codec::GetCompressionType("lz4"); + ASSERT_NOT_OK(splitter_->SetCompressType(compression_type.MoveValueUnsafe())); + ASSERT_NOT_OK(splitter_->Split(*input_batch_arr)); + ASSERT_NOT_OK(splitter_->Stop()); + + std::shared_ptr file_reader; + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + + // verify partition lengths + const auto& lengths = splitter_->PartitionLengths(); + ASSERT_EQ(lengths.size(), 2); + ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]); + + // verify schema + std::vector> batches; + ASSERT_EQ(*file_reader->schema(), *rb_schema); + + // prepare first block expected result + std::shared_ptr res_batch_0; + std::shared_ptr res_batch_1; + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]")) + std::vector expected = {res_batch_0.get()}; + + // verify first block + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } + + // prepare second block expected result + ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]")) + expected = {res_batch_0.get()}; + + // verify second block + batches.clear(); + ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile())); + ASSERT_EQ(*file_reader->schema(), *rb_schema); + ASSERT_NOT_OK(file_->Advance(lengths[0])); + ASSERT_NOT_OK(file_reader->ReadAll(&batches)); + ASSERT_EQ(batches.size(), 1); + for (auto i = 0; i < batches.size(); ++i) { + const auto& rb = batches[i]; + ASSERT_EQ(rb->num_columns(), rb_schema->num_fields()); + for (auto j = 0; j < rb->num_columns(); ++j) { + ASSERT_EQ(rb->column(j)->length(), rb->num_rows()); + } + ASSERT_TRUE(rb->Equals(*expected[i])); + } +} + +} // namespace shuffle +} // namespace sparkcolumnarplugin From 2bca314012bc3e2be93ee70483af9e45eded1d28 Mon Sep 17 00:00:00 2001 From: binwei Date: Fri, 6 May 2022 17:19:14 +0800 Subject: [PATCH 14/19] remove unused variables --- native-sql-engine/cpp/src/shuffle/splitter.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index af8ca584e..bb9b9909b 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -1383,8 +1383,6 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size()); std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(), partition_buffer_idx_offset.begin()); - std::vector nullcnt; - nullcnt.resize(num_partitions_,0); for (auto row = 0; row < num_rows; ++row) { auto pid = partition_id_[row]; auto dst_offset = partition_buffer_idx_offset[pid]; From 8d5a41efcebd2b9dccae8a1c1eb704fadb702057 Mon Sep 17 00:00:00 2001 From: binwei Date: Sat, 7 May 2022 17:16:44 +0800 Subject: [PATCH 15/19] allocate validity buffer from pool --- native-sql-engine/cpp/src/shuffle/splitter.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index bb9b9909b..8e4069509 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -621,16 +621,14 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); } + if () + arrays[i] = arrow::MakeArray( + arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, + {buffers[0],buffers[1]})); if (reset_buffers) { - arrays[i] = arrow::MakeArray( - arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, - {buffers[0],buffers[1]})); partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr; - } else { - arrays[i] = arrow::MakeArray(arrow::ArrayData::Make( - schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]})); } fixed_width_idx++; break; @@ -937,6 +935,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { auto col_idx = fixed_width_array_idx_[col]; size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8; //check input_fixed_width_has_null_[col] is cheaper than GetNullCount() + // once input_fixed_width_has_null_ is set to true, we didn't reset it after spill if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) { input_fixed_width_has_null_[col] = true; } @@ -1368,10 +1367,11 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& // init bitmap if it's null, initialize the buffer as true auto new_size = std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size); - ARROW_ASSIGN_OR_RAISE( - auto validity_buffer, - arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size), - options_.memory_pool)); + + std::shared_ptr validity_buffer; + auto status = AllocateBufferFromPool(validity_buffer, + arrow::BitUtil::BytesForBits(new_size)); + ARROW_RETURN_NOT_OK(status); dst_addrs[pid] = const_cast(validity_buffer->data()); arrow::BitUtil::SetBitsTo(dst_addrs[pid], 0, partition_buffer_idx_base_[pid], true); From f2e2fb3f70c84ab0239eb8d9a340b3d58d812b9a Mon Sep 17 00:00:00 2001 From: binwei Date: Sun, 8 May 2022 21:18:21 +0800 Subject: [PATCH 16/19] fix bug set validity buffer after allocation fix bug during of last bits after process valitity buffer --- native-sql-engine/cpp/src/shuffle/splitter.cc | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 8e4069509..a91ff96df 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -49,7 +49,7 @@ using arrow::internal::checked_cast; #ifndef SPLIT_BUFFER_SIZE // by default, allocate 8M block, 2M page size -#define SPLIT_BUFFER_SIZE 8 * 1024 * 1024 +#define SPLIT_BUFFER_SIZE 8*1024*1024 #endif template @@ -610,18 +610,17 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer default: { auto buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; if (buffers[0] != nullptr) { - buffers[0] = arrow::SliceBuffer(buffers[0], 0, (num_rows >> 3) + 1); + buffers[0] = arrow::SliceBuffer(buffers[0], 0, arrow::BitUtil::BytesForBits(num_rows)); } if (buffers[1] != nullptr) { if (column_type_id_[i]->id() == arrow::BooleanType::type_id) - buffers[1] = arrow::SliceBuffer(buffers[1], 0, (num_rows >> 3) + 1); + buffers[1] = arrow::SliceBuffer(buffers[1], 0, arrow::BitUtil::BytesForBits(num_rows)); else buffers[1] = arrow::SliceBuffer( buffers[1], 0, num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); } - if () arrays[i] = arrow::MakeArray( arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, {buffers[0],buffers[1]})); @@ -1367,18 +1366,15 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& // init bitmap if it's null, initialize the buffer as true auto new_size = std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size); - std::shared_ptr validity_buffer; auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); ARROW_RETURN_NOT_OK(status); dst_addrs[pid] = const_cast(validity_buffer->data()); - arrow::BitUtil::SetBitsTo(dst_addrs[pid], 0, partition_buffer_idx_base_[pid], - true); + memset(validity_buffer->mutable_data(),0xff,validity_buffer->capacity()); partition_fixed_width_buffers_[col][pid][0] = std::move(validity_buffer); } } - auto src_addr = const_cast(rb.column_data(col_idx)->buffers[0]->data()); partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size()); std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(), @@ -1397,9 +1393,12 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& { if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) { auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid]; - - arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, 8-(lastoffset&0x7), - true); + uint8_t dst = dst_addrs[pid][lastoffset>>3]; + uint8_t msk = 0x1 << (lastoffset & 0x7); + msk=~(msk-1); + msk &= ((lastoffset & 7) == 0)-1; + dst |= msk; + dst_addrs[pid][lastoffset>>3]=dst; } } From 4f8dee59f909fd0d312a90fa6786f78da46d83d3 Mon Sep 17 00:00:00 2001 From: binwei Date: Mon, 9 May 2022 10:32:07 +0800 Subject: [PATCH 17/19] Add arrow check for batch size and part number use uint32 as row number size --- native-sql-engine/cpp/src/shuffle/splitter.cc | 31 ++++++++++++------- native-sql-engine/cpp/src/shuffle/splitter.h | 20 ++++++------ 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index a91ff96df..d3caabb03 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -303,6 +303,12 @@ arrow::Result> Splitter::Make( } arrow::Status Splitter::Init() { + + // partition number should be less than 64k + ARROW_CHECK_LE(num_partitions_,64*1024); + // split record batch size should be less than 32k + ARROW_CHECK_LE(options_.buffer_size,32*1024); + const auto& fields = schema_->fields(); ARROW_ASSIGN_OR_RAISE(column_type_id_, ToSplitterTypeId(schema_->fields())); @@ -884,6 +890,9 @@ arrow::Result Splitter::SpillLargestPartition(int64_t* size) { arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { + //buffer is allocated less than 64K + //ARROW_CHECK_LE(rb.num_rows(),64*1024); + #ifdef PROCESSROW reducer_offsets_.resize(rb.num_rows()); @@ -901,7 +910,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { } std::transform(reducer_offset_offset_.begin(), std::prev(reducer_offset_offset_.end()), partition_id_cnt_.begin(), reducer_offset_offset_.begin(), - [](uint16_t x, int16_t y) { return x - y; }); + [](row_offset_type x, row_offset_type y) { return x - y; }); #endif // for the first input record batch, scan binary arrays and large binary @@ -950,7 +959,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { for (auto pid = 0; pid < num_partitions_; ++pid) { if (partition_id_cnt_[pid] > 0) { // make sure the size to be allocated is larger than the size to be filled - auto new_size = std::max((uint16_t)prealloc_row_cnt, partition_id_cnt_[pid]); + auto new_size = std::max((row_offset_type)prealloc_row_cnt, partition_id_cnt_[pid]); if (partition_buffer_size_[pid] == 0) { // allocate buffer if it's not yet allocated RETURN_NOT_OK(AllocatePartitionBuffers(pid, new_size)); @@ -1005,7 +1014,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) { const auto num_rows = rb.num_rows(); int64_t row; - std::vector partition_buffer_idx_offset; + std::vector partition_buffer_idx_offset; for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { const auto& dst_addrs = partition_fixed_width_value_addrs_[col]; @@ -1020,7 +1029,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform(partition_buffer_idx_offset_.begin(), \ partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ partition_buffer_idx_offset_.begin(), \ - [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); }); \ + [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); }); \ for (auto pid = 0; pid < num_partitions_; pid++) { \ auto dst_pid_base = \ reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/ \ @@ -1039,7 +1048,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform(partition_buffer_idx_offset_.begin(), \ partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ partition_buffer_idx_offset_.begin(), \ - [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); }); \ + [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); }); \ for (row = 0; row < num_rows; ++row) { \ auto pid = partition_id_[row]; \ auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); \ @@ -1060,7 +1069,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform( partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x + y * sizeof(uint64_t); }); + [](uint8_t* x, row_offset_type y) { return x + y * sizeof(uint64_t); }); for (auto pid = 0; pid < num_partitions_; pid++) { auto dst_pid_base = reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ @@ -1129,7 +1138,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform( partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x + y * 16; }); + [](uint8_t* x, row_offset_type y) { return x + y * 16; }); for (auto pid = 0; pid < num_partitions_; pid++) { auto dst_pid_base = reinterpret_cast(partition_buffer_idx_offset_[pid]); /*32k*/ @@ -1150,7 +1159,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform( partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(), - [](uint8_t* x, int16_t y) { return x + y * 16; }); + [](uint8_t* x, row_offset_type y) { return x + y * 16; }); for (auto row = 0; row < num_rows; ++row) { auto pid = partition_id_[row]; reinterpret_cast(partition_buffer_idx_offset_[pid])[0] = @@ -1169,7 +1178,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) partition_buffer_idx_offset.begin()); for (auto row = 0; row < num_rows; ++row) { auto pid = partition_id_[row]; - uint16_t dst_offset = partition_buffer_idx_offset[pid]; + row_offset_type dst_offset = partition_buffer_idx_offset[pid]; dst_addrs[pid][dst_offset >> 3] ^= (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^ src_addr[row >> 3] >> (row & 7)) @@ -1354,7 +1363,7 @@ arrow::Status Splitter::SplitFixedWidthValueBufferAVX(const arrow::RecordBatch& arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& rb) { const auto num_rows = rb.num_rows(); - std::vector partition_buffer_idx_offset; + std::vector partition_buffer_idx_offset; for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { auto col_idx = fixed_width_array_idx_[col]; @@ -1365,7 +1374,7 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] == nullptr) { // init bitmap if it's null, initialize the buffer as true auto new_size = - std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size); + std::max(partition_id_cnt_[pid], (row_offset_type)options_.buffer_size); std::shared_ptr validity_buffer; auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size)); diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index cc7440926..ace9e5661 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -48,6 +48,8 @@ class Splitter { virtual const std::shared_ptr& input_schema() const { return schema_; } + typedef uint32_t row_offset_type; + /** * Split input record batch into partition buffers according to the computed * partition id. The largest partition buffer will be spilled if memory @@ -177,8 +179,8 @@ class Splitter { // partid std::vector partition_buffer_size_; - // partid - std::vector partition_buffer_idx_base_; + // partid, value is reducer batch's offset, output rb rownum < 64k + std::vector partition_buffer_idx_base_; // partid // temp array to hold the destination pointer std::vector partition_buffer_idx_offset_; @@ -231,14 +233,14 @@ class Splitter { std::vector input_fixed_width_has_null_; // updated for each input record batch - // col + // col; value is partition number, part_num < 64k std::vector partition_id_; - // [num_rows] - std::vector reducer_offsets_; - // [num_partitions] - std::vector reducer_offset_offset_; - // col - std::vector partition_id_cnt_; + // [num_rows] ; value is offset in input record batch; input rb rownum < 64k + std::vector reducer_offsets_; + // [num_partitions]; value is offset of row in record batch; input rb rownum < 64k + std::vector reducer_offset_offset_; + // col ; value is reducer's row number for each input record batch; output rb rownum < 64k + std::vector partition_id_cnt_; int32_t num_partitions_; std::shared_ptr schema_; From 791c0cc32d37ec009c71c83476b6822587fe3fbb Mon Sep 17 00:00:00 2001 From: binwei Date: Mon, 9 May 2022 11:23:42 +0800 Subject: [PATCH 18/19] format code --- native-sql-engine/cpp/src/shuffle/splitter.cc | 87 +++++++++---------- native-sql-engine/cpp/src/shuffle/splitter.h | 3 +- 2 files changed, 44 insertions(+), 46 deletions(-) diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index d3caabb03..3ee7ff014 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -49,7 +49,7 @@ using arrow::internal::checked_cast; #ifndef SPLIT_BUFFER_SIZE // by default, allocate 8M block, 2M page size -#define SPLIT_BUFFER_SIZE 8*1024*1024 +#define SPLIT_BUFFER_SIZE 8 * 1024 * 1024 #endif template @@ -303,11 +303,10 @@ arrow::Result> Splitter::Make( } arrow::Status Splitter::Init() { - // partition number should be less than 64k - ARROW_CHECK_LE(num_partitions_,64*1024); + ARROW_CHECK_LE(num_partitions_, 64 * 1024); // split record batch size should be less than 32k - ARROW_CHECK_LE(options_.buffer_size,32*1024); + ARROW_CHECK_LE(options_.buffer_size, 32 * 1024); const auto& fields = schema_->fields(); ARROW_ASSIGN_OR_RAISE(column_type_id_, ToSplitterTypeId(schema_->fields())); @@ -412,8 +411,8 @@ arrow::Status Splitter::Init() { arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED)); // Allocate first buffer for split reducer - ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer( - 0, options_.memory_pool)); + ARROW_ASSIGN_OR_RAISE(combine_buffer_, + arrow::AllocateResizableBuffer(0, options_.memory_pool)); combine_buffer_->Resize(0, /*shrink_to_fit =*/false); return arrow::Status::OK(); @@ -616,20 +615,21 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer default: { auto buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id]; if (buffers[0] != nullptr) { - buffers[0] = arrow::SliceBuffer(buffers[0], 0, arrow::BitUtil::BytesForBits(num_rows)); + buffers[0] = + arrow::SliceBuffer(buffers[0], 0, arrow::BitUtil::BytesForBits(num_rows)); } if (buffers[1] != nullptr) { if (column_type_id_[i]->id() == arrow::BooleanType::type_id) - buffers[1] = arrow::SliceBuffer(buffers[1], 0, arrow::BitUtil::BytesForBits(num_rows)); + buffers[1] = arrow::SliceBuffer(buffers[1], 0, + arrow::BitUtil::BytesForBits(num_rows)); else buffers[1] = arrow::SliceBuffer( buffers[1], 0, num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); } - arrays[i] = arrow::MakeArray( - arrow::ArrayData::Make(schema_->field(i)->type(), num_rows, - {buffers[0],buffers[1]})); + arrays[i] = arrow::MakeArray(arrow::ArrayData::Make( + schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]})); if (reset_buffers) { partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr; @@ -661,7 +661,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer arrow::ipc::GetRecordBatchPayload(*batch, tiny_bach_write_options_, payload.get())); #endif - + partition_cached_recordbatch_size_[partition_id] += payload->body_length; partition_cached_recordbatch_[partition_id].push_back(std::move(payload)); partition_buffer_idx_base_[partition_id] = 0; @@ -728,21 +728,20 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n std::shared_ptr value_buffer; if (column_type_id_[i]->id() == arrow::BooleanType::type_id) { auto status = AllocateBufferFromPool(value_buffer, - arrow::BitUtil::BytesForBits(new_size)); + arrow::BitUtil::BytesForBits(new_size)); ARROW_RETURN_NOT_OK(status); } else { auto status = AllocateBufferFromPool( - value_buffer, - new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); + value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3)); ARROW_RETURN_NOT_OK(status); } new_value_buffers.push_back(std::move(value_buffer)); if (input_fixed_width_has_null_[fixed_width_idx]) { std::shared_ptr validity_buffer; auto status = AllocateBufferFromPool(validity_buffer, - arrow::BitUtil::BytesForBits(new_size)); + arrow::BitUtil::BytesForBits(new_size)); ARROW_RETURN_NOT_OK(status); - //initialize all true once allocated + // initialize all true once allocated memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity()); new_validity_buffers.push_back(std::move(validity_buffer)); } else { @@ -811,7 +810,7 @@ arrow::Status Splitter::AllocateNew(int32_t partition_id, int32_t new_size) { << std::to_string(partition_id) << std::endl; int64_t spilled_size; ARROW_ASSIGN_OR_RAISE(auto partition_to_spill, SpillLargestPartition(&spilled_size)); - if (partition_to_spill == -1) { + if (partition_to_spill == -1) { std::cout << "Failed to allocate new buffer for partition " << std::to_string(partition_id) << ". No partition buffer to spill." << std::endl; @@ -851,16 +850,16 @@ arrow::Status Splitter::SpillPartition(int32_t partition_id) { } TIME_NANO_OR_RAISE(total_spill_time_, partition_writer_[partition_id]->Spill()); - //reset validity buffer after spill + // reset validity buffer after spill std::for_each(partition_fixed_width_buffers_.begin(), - partition_fixed_width_buffers_.end(),[partition_id](std::vector& bufs){ - if (bufs[partition_id][0]!=nullptr) - { - //initialize all true once allocated - auto addr = bufs[partition_id][0]->mutable_data(); - memset(addr,0xff,bufs[partition_id][0]->capacity()); - } - }); + partition_fixed_width_buffers_.end(), + [partition_id](std::vector& bufs) { + if (bufs[partition_id][0] != nullptr) { + // initialize all true once allocated + auto addr = bufs[partition_id][0]->mutable_data(); + memset(addr, 0xff, bufs[partition_id][0]->capacity()); + } + }); return arrow::Status::OK(); } @@ -889,9 +888,8 @@ arrow::Result Splitter::SpillLargestPartition(int64_t* size) { } arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { - - //buffer is allocated less than 64K - //ARROW_CHECK_LE(rb.num_rows(),64*1024); + // buffer is allocated less than 64K + // ARROW_CHECK_LE(rb.num_rows(),64*1024); #ifdef PROCESSROW @@ -942,9 +940,10 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) { for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) { auto col_idx = fixed_width_array_idx_[col]; size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8; - //check input_fixed_width_has_null_[col] is cheaper than GetNullCount() - // once input_fixed_width_has_null_ is set to true, we didn't reset it after spill - if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) { + // check input_fixed_width_has_null_[col] is cheaper than GetNullCount() + // once input_fixed_width_has_null_ is set to true, we didn't reset it after spill + if (input_fixed_width_has_null_[col] == false && + rb.column_data(col_idx)->GetNullCount() != 0) { input_fixed_width_has_null_[col] = true; } } @@ -1029,7 +1028,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform(partition_buffer_idx_offset_.begin(), \ partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ partition_buffer_idx_offset_.begin(), \ - [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); }); \ + [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); }); \ for (auto pid = 0; pid < num_partitions_; pid++) { \ auto dst_pid_base = \ reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/ \ @@ -1048,7 +1047,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) std::transform(partition_buffer_idx_offset_.begin(), \ partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \ partition_buffer_idx_offset_.begin(), \ - [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); }); \ + [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); }); \ for (row = 0; row < num_rows; ++row) { \ auto pid = partition_id_[row]; \ auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); \ @@ -1377,10 +1376,10 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& std::max(partition_id_cnt_[pid], (row_offset_type)options_.buffer_size); std::shared_ptr validity_buffer; auto status = AllocateBufferFromPool(validity_buffer, - arrow::BitUtil::BytesForBits(new_size)); + arrow::BitUtil::BytesForBits(new_size)); ARROW_RETURN_NOT_OK(status); dst_addrs[pid] = const_cast(validity_buffer->data()); - memset(validity_buffer->mutable_data(),0xff,validity_buffer->capacity()); + memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity()); partition_fixed_width_buffers_[col][pid][0] = std::move(validity_buffer); } } @@ -1398,19 +1397,17 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& partition_buffer_idx_offset[pid]++; } // the last row may update the following bits to 0, reinitialize it as 1 - for(auto pid=0;pid 0 && dst_addrs[pid] != nullptr) { - auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid]; - uint8_t dst = dst_addrs[pid][lastoffset>>3]; + auto lastoffset = partition_buffer_idx_base_[pid] + partition_id_cnt_[pid]; + uint8_t dst = dst_addrs[pid][lastoffset >> 3]; uint8_t msk = 0x1 << (lastoffset & 0x7); - msk=~(msk-1); - msk &= ((lastoffset & 7) == 0)-1; + msk = ~(msk - 1); + msk &= ((lastoffset & 7) == 0) - 1; dst |= msk; - dst_addrs[pid][lastoffset>>3]=dst; + dst_addrs[pid][lastoffset >> 3] = dst; } } - } } return arrow::Status::OK(); diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h index ace9e5661..ab71446f9 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.h +++ b/native-sql-engine/cpp/src/shuffle/splitter.h @@ -239,7 +239,8 @@ class Splitter { std::vector reducer_offsets_; // [num_partitions]; value is offset of row in record batch; input rb rownum < 64k std::vector reducer_offset_offset_; - // col ; value is reducer's row number for each input record batch; output rb rownum < 64k + // col ; value is reducer's row number for each input record batch; output rb rownum < + // 64k std::vector partition_id_cnt_; int32_t num_partitions_; From 1d8b0d65ff18b60ded9f4348585b6b68364a5922 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Mon, 9 May 2022 13:09:28 +0800 Subject: [PATCH 19/19] fix format Signed-off-by: Yuan Zhou --- .../src/benchmarks/shuffle_split_benchmark.cc | 84 +++++++++---------- native-sql-engine/cpp/src/shuffle/splitter.cc | 4 +- .../cpp/src/tests/shuffle_split_test.cc | 18 ++-- .../src/third_party/parallel_hashmap/btree.h | 21 ++--- .../src/third_party/parallel_hashmap/phmap.h | 12 +-- .../third_party/parallel_hashmap/phmap_base.h | 36 ++++---- 6 files changed, 88 insertions(+), 87 deletions(-) diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc index 6f9a7f19e..106d7dba8 100644 --- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc +++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc @@ -143,7 +143,7 @@ class LargePageMemoryPool : public MemoryPool { pool_->Free(buffer, size, ALIGNMENT); } #else - pool_->Free(buffer, size); + pool_->Free(buffer, size); #endif } @@ -321,27 +321,27 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit { const int num_partitions, SplitOptions options, benchmark::State& state) { std::vector local_column_indices; local_column_indices.push_back(0); -/* local_column_indices.push_back(0); - local_column_indices.push_back(1); - local_column_indices.push_back(2); - local_column_indices.push_back(4); - local_column_indices.push_back(5); - local_column_indices.push_back(6); - local_column_indices.push_back(7);*/ + /* local_column_indices.push_back(0); + local_column_indices.push_back(1); + local_column_indices.push_back(2); + local_column_indices.push_back(4); + local_column_indices.push_back(5); + local_column_indices.push_back(6); + local_column_indices.push_back(7);*/ std::shared_ptr local_schema; local_schema = std::make_shared(*schema.get()); -/* ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8)); - ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3)); -*/ + /* ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8)); + ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3)); + */ if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl; ARROW_ASSIGN_OR_THROW(splitter, @@ -497,30 +497,30 @@ int main(int argc, char** argv) { ->MeasureProcessCPUTime() ->Unit(benchmark::kSecond); -/* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark - bck(datafile); - - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) - ->Iterations(1) - ->Args({96*2, arrow::Compression::FASTPFOR}) - ->Args({96*4, arrow::Compression::FASTPFOR}) - ->Args({96*8, arrow::Compression::FASTPFOR}) - ->Args({96*16, arrow::Compression::FASTPFOR}) - ->Args({96*32, arrow::Compression::FASTPFOR}) - ->Threads(24) - ->Unit(benchmark::kSecond); - - benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) - ->Iterations(1) - ->Args({4096, arrow::Compression::FASTPFOR}) - ->Threads(1) - ->Threads(2) - ->Threads(4) - ->Threads(8) - ->Threads(16) - ->Threads(24) - ->Unit(benchmark::kSecond); -*/ + /* sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark + bck(datafile); + + benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) + ->Iterations(1) + ->Args({96*2, arrow::Compression::FASTPFOR}) + ->Args({96*4, arrow::Compression::FASTPFOR}) + ->Args({96*8, arrow::Compression::FASTPFOR}) + ->Args({96*16, arrow::Compression::FASTPFOR}) + ->Args({96*32, arrow::Compression::FASTPFOR}) + ->Threads(24) + ->Unit(benchmark::kSecond); + + benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck) + ->Iterations(1) + ->Args({4096, arrow::Compression::FASTPFOR}) + ->Threads(1) + ->Threads(2) + ->Threads(4) + ->Threads(8) + ->Threads(16) + ->Threads(24) + ->Unit(benchmark::kSecond); + */ benchmark::Initialize(&argc, argv); benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc index 3ee7ff014..5213d607a 100644 --- a/native-sql-engine/cpp/src/shuffle/splitter.cc +++ b/native-sql-engine/cpp/src/shuffle/splitter.cc @@ -1622,8 +1622,8 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch& "lea (%[num_partitions],%[pid],1),%[tmp]\n" "test %[pid],%[pid]\n" "cmovs %[tmp],%[pid]\n" - : [pid] "+r"(pid) - : [num_partitions] "r"(num_partitions_), [tmp] "r"(0)); + : [ pid ] "+r"(pid) + : [ num_partitions ] "r"(num_partitions_), [ tmp ] "r"(0)); partition_id_[i] = pid; partition_id_cnt_[pid]++; } diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc index d5d8de0bf..fa03be61d 100644 --- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc +++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc @@ -54,8 +54,8 @@ class MyMemoryPool : public arrow::MemoryPool { } RETURN_NOT_OK(pool_->Allocate(size, out)); stats_.UpdateAllocatedBytes(size); - //std::cout << "Allocate: size = " << size << " addr = " << std::hex << - //(uint64_t)*out << std::dec << std::endl; + // std::cout << "Allocate: size = " << size << " addr = " << std::hex << + //(uint64_t)*out << std::dec << std::endl; // print_trace(); return arrow::Status::OK(); } @@ -67,19 +67,19 @@ class MyMemoryPool : public arrow::MemoryPool { auto old_ptr = *ptr; RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr)); stats_.UpdateAllocatedBytes(new_size - old_size); - //std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << - //(uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " << - //std::hex << (uint64_t)*ptr << std::dec << std::endl; - //print_trace(); + // std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << + //(uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " << + // std::hex << (uint64_t)*ptr << std::dec << std::endl; + // print_trace(); return arrow::Status::OK(); } void Free(uint8_t* buffer, int64_t size) override { pool_->Free(buffer, size); stats_.UpdateAllocatedBytes(-size); - //std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer - //<< std::dec << std::endl; - //print_trace(); + // std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer + //<< std::dec << std::endl; + // print_trace(); } int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h index 24c2d145b..b9b0d94da 100644 --- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h +++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h @@ -661,9 +661,9 @@ constexpr bool do_less_than_comparison(const Compare& compare, const K& x, const // SFINAE prevents implicit conversions to int (such as from bool). template ::value, int> = 0> constexpr phmap::weak_ordering compare_result_as_ordering(const Int c) { - return c < 0 ? phmap::weak_ordering::less - : c == 0 ? phmap::weak_ordering::equivalent - : phmap::weak_ordering::greater; + return c < 0 + ? phmap::weak_ordering::less + : c == 0 ? phmap::weak_ordering::equivalent : phmap::weak_ordering::greater; } constexpr phmap::weak_ordering compare_result_as_ordering(const phmap::weak_ordering c) { return c; @@ -685,9 +685,9 @@ template < int> = 0> constexpr phmap::weak_ordering do_three_way_comparison(const Compare& compare, const K& x, const LK& y) { - return compare(x, y) ? phmap::weak_ordering::less - : compare(y, x) ? phmap::weak_ordering::greater - : phmap::weak_ordering::equivalent; + return compare(x, y) ? phmap::weak_ordering::less + : compare(y, x) ? phmap::weak_ordering::greater + : phmap::weak_ordering::equivalent; } } // namespace compare_internal @@ -1063,10 +1063,11 @@ class btree_node { // Compute how many values we can fit onto a leaf node taking into account // padding. constexpr static size_type NodeTargetValues(const int begin, const int end) { - return begin == end ? begin - : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize - ? NodeTargetValues(begin, (begin + end) / 2) - : NodeTargetValues((begin + end) / 2 + 1, end); + return begin == end + ? begin + : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize + ? NodeTargetValues(begin, (begin + end) / 2) + : NodeTargetValues((begin + end) / 2 + 1, end); } enum { diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h index 05d227a43..4628cca30 100644 --- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h +++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h @@ -2156,13 +2156,13 @@ class raw_hash_map : public raw_hash_set { // incomplete types as values, as in unordered_map. // MappedReference<> may be a non-reference type. template - using MappedReference = decltype(P::value( - std::addressof(std::declval()))); + using MappedReference = decltype( + P::value(std::addressof(std::declval()))); // MappedConstReference<> may be a non-reference type. template - using MappedConstReference = decltype(P::value( - std::addressof(std::declval()))); + using MappedConstReference = decltype( + P::value(std::addressof(std::declval()))); using KeyArgImpl = KeyArg::value && IsTransparent::value>; @@ -3409,8 +3409,8 @@ class parallel_hash_map // incomplete types as values, as in unordered_map. // MappedReference<> may be a non-reference type. template - using MappedReference = decltype(P::value( - std::addressof(std::declval()))); + using MappedReference = decltype( + P::value(std::addressof(std::declval()))); // MappedConstReference<> may be a non-reference type. template diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h index 0f4e6375d..3b3b6b120 100644 --- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h +++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h @@ -826,8 +826,8 @@ struct Invoker { // The result type of Invoke. template -using InvokeT = decltype(Invoker::type::Invoke(std::declval(), - std::declval()...)); +using InvokeT = decltype( + Invoker::type::Invoke(std::declval(), std::declval()...)); // Invoke(f, args...) is an implementation of INVOKE(f, args...) from section // [func.require] of the C++ standard. @@ -1002,10 +1002,9 @@ constexpr T&& forward( namespace utility_internal { // Helper method for expanding tuple into a called method. template -auto apply_helper(Functor&& functor, Tuple&& t, index_sequence) - -> decltype(phmap::base_internal::Invoke( - phmap::forward(functor), - std::get(phmap::forward(t))...)) { +auto apply_helper(Functor&& functor, Tuple&& t, index_sequence) -> decltype( + phmap::base_internal::Invoke(phmap::forward(functor), + std::get(phmap::forward(t))...)) { return phmap::base_internal::Invoke(phmap::forward(functor), std::get(phmap::forward(t))...); } @@ -1888,18 +1887,19 @@ class optional_assign_base { template constexpr copy_traits get_ctor_copy_traits() { - return std::is_copy_constructible::value ? copy_traits::copyable - : std::is_move_constructible::value ? copy_traits::movable - : copy_traits::non_movable; + return std::is_copy_constructible::value + ? copy_traits::copyable + : std::is_move_constructible::value ? copy_traits::movable + : copy_traits::non_movable; } template constexpr copy_traits get_assign_copy_traits() { return phmap::is_copy_assignable::value && std::is_copy_constructible::value ? copy_traits::copyable - : phmap::is_move_assignable::value && std::is_move_constructible::value - ? copy_traits::movable - : copy_traits::non_movable; + : phmap::is_move_assignable::value && std::is_move_constructible::value + ? copy_traits::movable + : copy_traits::non_movable; } // Whether T is constructible or convertible from optional. @@ -2421,9 +2421,9 @@ constexpr optional make_optional(std::initializer_list il, Args&&... args) template constexpr auto operator==(const optional& x, const optional& y) -> decltype(optional_internal::convertible_to_bool(*x == *y)) { - return static_cast(x) != static_cast(y) ? false - : static_cast(x) == false ? true - : static_cast(*x == *y); + return static_cast(x) != static_cast(y) + ? false + : static_cast(x) == false ? true : static_cast(*x == *y); } // Returns: If bool(x) != bool(y), true; otherwise, if bool(x) == false, false; @@ -2431,9 +2431,9 @@ constexpr auto operator==(const optional& x, const optional& y) template constexpr auto operator!=(const optional& x, const optional& y) -> decltype(optional_internal::convertible_to_bool(*x != *y)) { - return static_cast(x) != static_cast(y) ? true - : static_cast(x) == false ? false - : static_cast(*x != *y); + return static_cast(x) != static_cast(y) + ? true + : static_cast(x) == false ? false : static_cast(*x != *y); } // Returns: If !y, false; otherwise, if !x, true; otherwise *x < *y. template