From 3a5944057886828a767a47a056c926058e9c7d7c Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sat, 30 Apr 2022 16:32:25 +0800
Subject: [PATCH 01/19] Merge master and branch shuffle_opt_fillbyreducer to
 submit a PR upstream; implement fill by reducer

---
 native-sql-engine/cpp/CMakeLists.txt          |   3 +
 .../src/benchmarks/shuffle_split_benchmark.cc | 499 ++++++++++--------
 native-sql-engine/cpp/src/shuffle/splitter.cc | 167 +++++-
 native-sql-engine/cpp/src/shuffle/splitter.h  |   4 +
 4 files changed, 436 insertions(+), 237 deletions(-)
diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt
index a1301fd1d..fe7e989ee 100644
--- a/native-sql-engine/cpp/CMakeLists.txt
+++ b/native-sql-engine/cpp/CMakeLists.txt
@@ -1,6 +1,9 @@
 cmake_minimum_required(VERSION 3.16)
 project(spark_columnar_plugin)
 
+#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
+add_definitions(-DPROCESSROW)
+
 #add_compile_options(-g)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index cd81ef877..ec1416641 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -41,24 +41,12 @@ namespace shuffle {
 const int batch_buffer_size = 32768;
 const int split_buffer_size = 8192;
 
-class BenchmarkShuffleSplit : public ::benchmark::Fixture {
+class BenchmarkShuffleSplit {
  public:
-  BenchmarkShuffleSplit() {
-    file_name =
-        "/mnt/DP_disk1/lineitem/"
-        "part-00025-356249a2-c285-42b9-8a18-5b10be61e0c4-c000.snappy.parquet";
-
+  // Calls GetRecordBatchReader for `file_name` and also keeps the path in the file_name member.
+  BenchmarkShuffleSplit(std::string file_name) : file_name(file_name) {
     GetRecordBatchReader(file_name);
-    std::cout << schema->ToString() << std::endl;
-    const auto& fields = schema->fields();
-    for (const auto& field : fields) {
-      if (field->name() == "l_orderkey") {
-        auto node = gandiva::TreeExprBuilder::MakeField(field);
-        expr_vector.push_back(gandiva::TreeExprBuilder::MakeExpression(
-            std::move(node), arrow::field("res_" + field->name(), field->type())));
-      }
-    }
   }
   void GetRecordBatchReader(const std::string& input_file) {
     std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
     std::shared_ptr<RecordBatchReader> record_batch_reader;
@@ -89,11 +77,97 @@ class BenchmarkShuffleSplit : public ::benchmark::Fixture {
     for (int i = 0; i < num_columns; ++i) {
       column_indices.push_back(i);
     }
+    const auto& fields = schema->fields();
+    for (const auto& field : fields) {
+      if (field->name() == "l_orderkey") {
+        auto node = gandiva::TreeExprBuilder::MakeField(field);
+        expr_vector.push_back(gandiva::TreeExprBuilder::MakeExpression(
+            std::move(node), arrow::field("res_" + field->name(), field->type())));
+      }
+    }
   }
 
-  void SetUp(const ::benchmark::State& state) {}
+  // Benchmark body run by Google Benchmark. state.range(0) is the number of partitions and
+  // state.range(1) the compression type; the split itself is delegated to Do_Split, after
+  // which the splitter's data file is deleted and the statistics are published as counters.
+  void operator()(benchmark::State& state) {
+    SetCPU(state.thread_index());
+    arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1);
+
+    const int num_partitions = state.range(0);
+
+    auto options = SplitOptions::Defaults();
+    options.compression_type = compression_type;
+    options.buffer_size = split_buffer_size;
+    options.buffered_write = true;
+    options.offheap_per_task = 128 * 1024 * 1024 * 1024L;
+    options.prefer_spill = true;
+    options.write_schema = false;
+
+    std::shared_ptr<Splitter> splitter;
+    int64_t elapse_read = 0;
+    int64_t num_batches = 0;
+    int64_t num_rows = 0;
+    int64_t split_time = 0;
+
+    Do_Split(splitter, elapse_read, num_batches, num_rows, split_time,
+        num_partitions, options, state);
+
+    auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
+    fs->DeleteFile(splitter->DataFile());
+
+    state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes()));
+
+    state.counters["rowgroups"] =
+        benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1000);
+    state.counters["columns"] =
+        benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1000);
+    state.counters["batches"] = benchmark::Counter(
+        num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["num_rows"] = benchmark::Counter(
+        num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["num_partitions"] = benchmark::Counter(
+        num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["batch_buffer_size"] =
+        benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1024);
+    state.counters["split_buffer_size"] =
+        benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1024);
+
+    state.counters["bytes_spilled"] =
+        benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1024);
+    state.counters["bytes_written"] =
+        benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1024);
+    state.counters["bytes_raw"] =
+        benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1024);
+
+    state.counters["parquet_parse"] = benchmark::Counter(
+        elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["compute_pid_time"] =
+        benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1000);
+    state.counters["write_time"] =
+        benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1000);
+    state.counters["spill_time"] =
+        benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1000);
+    state.counters["compress_time"] =
+        benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads,
+                          benchmark::Counter::OneK::kIs1000);
+    // Report only the split time not already reported as spill, pid, compress or write time.
+    split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() -
+                splitter->TotalCompressTime() - splitter->TotalWriteTime();
+    state.counters["split_time"] = benchmark::Counter(
+        split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
 
-  void TearDown(const ::benchmark::State& state) {}
+  }
 
  protected:
   long SetCPU(uint32_t cpuindex) {
@@ -102,9 +176,9 @@ class BenchmarkShuffleSplit : public ::benchmark::Fixture {
     CPU_SET(cpuindex, &cs);
     return sched_setaffinity(0, sizeof(cs), &cs);
   }
-  virtual void Do_Split(const std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
+  virtual void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
                         int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                        benchmark::State& state) {}
+                        const int num_partitions, SplitOptions options, benchmark::State& state) {}
 
  protected:
   std::string file_name;
@@ -116,232 +190,124 @@ class BenchmarkShuffleSplit : public ::benchmark::Fixture {
   parquet::ArrowReaderProperties properties;
 };
 
-BENCHMARK_DEFINE_F(BenchmarkShuffleSplit, CacheScan)(benchmark::State& state) {
-  SetCPU(state.thread_index());
 
-  arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1);
+class BenchmarkShuffleSplit_CacheScan_Benchmark: public BenchmarkShuffleSplit{
+public:
+BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){}
 
-  const int num_partitions = state.range(0);
+protected:
+  // Cached variant: reads the selected columns of the whole file into memory first, then
+  // splits the cached batches inside the benchmark loop and finally calls Stop().
+  void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
+                        int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
+                        const int num_partitions, SplitOptions options, benchmark::State& state) override {
+    std::vector<int> local_column_indices;
+    local_column_indices.push_back(0);
+    local_column_indices.push_back(1);
+    local_column_indices.push_back(2);
+    local_column_indices.push_back(4);
+    local_column_indices.push_back(5);
+    local_column_indices.push_back(6);
+    local_column_indices.push_back(7);
+    // Drop fields 3 and 8..15 from a copy of the schema (matches the columns above for a
+    // 16-column file); removing from the highest index down keeps the lower indices valid.
+    std::shared_ptr<arrow::Schema> local_schema;
+    local_schema = std::make_shared<arrow::Schema>(*schema.get());
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8));
+    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3));
+
+    if(state.thread_index() == 0)
+      std::cout << local_schema->ToString() << std::endl;
 
-  auto options = SplitOptions::Defaults();
-  options.compression_type = compression_type;
-  options.buffer_size = split_buffer_size;
-  options.buffered_write = true;
-  options.offheap_per_task = 128 * 1024 * 1024 * 1024L;
-  options.prefer_spill = true;
-  options.write_schema = false;
+    ARROW_ASSIGN_OR_THROW(
+        splitter, Splitter::Make("rr", local_schema, num_partitions, options));
+    // Read and keep every batch up front so the benchmark loop below only calls Split().
+    std::shared_ptr<arrow::RecordBatch> record_batch;
 
-  std::shared_ptr<Splitter> splitter;
+    std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
+    std::shared_ptr<RecordBatchReader> record_batch_reader;
+    ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
+        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties,
+        &parquet_reader));
 
-  if (!expr_vector.empty()) {
-    ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions,
-                                                   expr_vector, std::move(options)));
-  } else {
-    ARROW_ASSIGN_OR_THROW(
-        splitter, Splitter::Make("rr", schema, num_partitions, std::move(options)));
-  }
+    std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+    ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, local_column_indices,
+                                                      &record_batch_reader));
+    do {
+      TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
 
-  std::shared_ptr<arrow::RecordBatch> record_batch;
-  int64_t elapse_read = 0;
-  int64_t num_batches = 0;
-  int64_t num_rows = 0;
-  int64_t split_time = 0;
-
-  std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
-  std::shared_ptr<RecordBatchReader> record_batch_reader;
-  ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-      arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties,
-      &parquet_reader));
-
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices,
-                                                     &record_batch_reader));
-  do {
-    TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
-
-    if (record_batch) {
-      batches.push_back(record_batch);
-      num_batches += 1;
-      num_rows += record_batch->num_rows();
+      if (record_batch) {
+        batches.push_back(record_batch);
+        num_batches += 1;
+        num_rows += record_batch->num_rows();
+      }
+    } while (record_batch);
+    std::cout << "parquet parse done elapsed time " << elapse_read/1000000 << " ms " << std::endl;
+    std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl;
+    for (auto _ : state) {
+      for_each(batches.begin(), batches.end(),
+              [&splitter, &split_time](std::shared_ptr<arrow::RecordBatch>& record_batch) {
+                TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
+              });
     }
-  } while (record_batch);
 
-  for (auto _ : state) {
-    for_each(batches.begin(), batches.end(),
-             [&splitter, &split_time](std::shared_ptr<arrow::RecordBatch>& record_batch) {
-               TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
-             });
+    TIME_NANO_OR_THROW(split_time, splitter->Stop());
   }
 
-  TIME_NANO_OR_THROW(split_time, splitter->Stop());
-
-  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-  fs->DeleteFile(splitter->DataFile());
-
-  state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes()));
-
-  state.counters["rowgroups"] =
-      benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["columns"] =
-      benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["batches"] = benchmark::Counter(
-      num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["num_rows"] = benchmark::Counter(
-      num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["num_partitions"] = benchmark::Counter(
-      num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["batch_buffer_size"] =
-      benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["split_buffer_size"] =
-      benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-
-  state.counters["bytes_spilled"] =
-      benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["bytes_written"] =
-      benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["bytes_raw"] =
-      benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["bytes_spilled"] =
-      benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-
-  state.counters["parquet_parse"] = benchmark::Counter(
-      elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["compute_pid_time"] =
-      benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["write_time"] =
-      benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["spill_time"] =
-      benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["compress_time"] =
-      benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-
-  split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() -
-               splitter->TotalCompressTime() - splitter->TotalWriteTime();
-  state.counters["split_time"] = benchmark::Counter(
-      split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-}
-
-BENCHMARK_DEFINE_F(BenchmarkShuffleSplit, IterateScan)(benchmark::State& state) {
-  SetCPU(state.thread_index());
-
-  arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1);
-
-  const int num_partitions = state.range(0);
-
-  auto options = SplitOptions::Defaults();
-  options.compression_type = compression_type;
-  options.buffer_size = split_buffer_size;
-  options.buffered_write = true;
-  options.offheap_per_task = 128 * 1024 * 1024 * 1024L;
-  options.prefer_spill = true;
-  options.write_schema = false;
-
-  std::shared_ptr<Splitter> splitter;
-
-  if (!expr_vector.empty()) {
-    ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions,
-                                                   expr_vector, std::move(options)));
-  } else {
-    ARROW_ASSIGN_OR_THROW(
-        splitter, Splitter::Make("rr", schema, num_partitions, std::move(options)));
-  }
 
-  int64_t elapse_read = 0;
-  int64_t num_batches = 0;
-  int64_t num_rows = 0;
-  int64_t split_time = 0;
+};
 
-  std::shared_ptr<arrow::RecordBatch> record_batch;
 
-  std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
-  std::shared_ptr<RecordBatchReader> record_batch_reader;
-  ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-      arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties,
-      &parquet_reader));
+class BenchmarkShuffleSplit_IterateScan_Benchmark: public BenchmarkShuffleSplit{
+public:
+BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){}
 
-  for (auto _ : state) {
-    std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-    ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices,
-                                                       &record_batch_reader));
-    TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
-    while (record_batch) {
-      num_batches += 1;
-      num_rows += record_batch->num_rows();
-      TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
+protected:
+  // Streaming variant: every benchmark iteration creates a new batch reader and passes each
+  // record batch to Split() right after reading it; Stop() is called once after the loop.
+  void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
+                        int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
+                        const int num_partitions, SplitOptions options, benchmark::State& state) override {
+    if(state.thread_index() == 0)
+      std::cout << schema->ToString() << std::endl;
+    // Use the "hash" splitter when key expressions were collected, otherwise "rr".
+    if (!expr_vector.empty()) {
+      ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions,
+                                                    expr_vector, std::move(options)));
+    } else {
+      ARROW_ASSIGN_OR_THROW(
+          splitter, Splitter::Make("rr", schema, num_partitions, std::move(options)));
+    }
+
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
+    std::shared_ptr<RecordBatchReader> record_batch_reader;
+    ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
+        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties,
+        &parquet_reader));
+
+    // ReadNext() calls are wrapped with elapse_read, Split()/Stop() calls with split_time.
+    for (auto _ : state) {
+      ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices,
+                                                        &record_batch_reader));
       TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
+      while (record_batch) {
+        num_batches += 1;
+        num_rows += record_batch->num_rows();
+        TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
+        TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
+      }
     }
+    TIME_NANO_OR_THROW(split_time, splitter->Stop());
   }
-  TIME_NANO_OR_THROW(split_time, splitter->Stop());
-
-  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-  fs->DeleteFile(splitter->DataFile());
-
-  state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes()));
-
-  state.counters["rowgroups"] =
-      benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["columns"] =
-      benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["batches"] = benchmark::Counter(
-      num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["num_rows"] = benchmark::Counter(
-      num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["num_partitions"] = benchmark::Counter(
-      num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["batch_buffer_size"] =
-      benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["split_buffer_size"] =
-      benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-
-  state.counters["bytes_spilled"] =
-      benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["bytes_written"] =
-      benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["bytes_raw"] =
-      benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-  state.counters["bytes_spilled"] =
-      benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1024);
-
-  state.counters["parquet_parse"] = benchmark::Counter(
-      elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  state.counters["compute_pid_time"] =
-      benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["write_time"] =
-      benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["spill_time"] =
-      benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-  state.counters["compress_time"] =
-      benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads,
-                         benchmark::Counter::OneK::kIs1000);
-
-  split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() -
-               splitter->TotalCompressTime() - splitter->TotalWriteTime();
-  state.counters["split_time"] = benchmark::Counter(
-      split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-}
+};
 
 /*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan)->Iterations(1)
       ->Args({96*2, arrow::Compression::FASTPFOR})
@@ -370,14 +336,79 @@ BENCHMARK_DEFINE_F(BenchmarkShuffleSplit, IterateScan)(benchmark::State& state)
       ->Threads(16)
       ->Threads(24)
       ->Unit(benchmark::kSecond);*/
-BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, IterateScan)
-    ->Iterations(1)
-    ->Args({96 * 16, arrow::Compression::FASTPFOR})
-    ->Threads(24)
+/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan)
+    ->Iterations(1000000)
+    ->Args({512, arrow::Compression::FASTPFOR})
+    ->Threads(1)
     ->ReportAggregatesOnly(false)
     ->MeasureProcessCPUTime()
-    ->Unit(benchmark::kSecond);
+    ->Unit(benchmark::kSecond);*/
 }  // namespace shuffle
 }  // namespace sparkcolumnarplugin
 
-BENCHMARK_MAIN();
+// Benchmark driver: reads --iterations, --partitions, --threads and --file from
+// the command line, registers the CacheScan benchmark and runs it.
+int main(int argc, char** argv) {
+  uint32_t iterations = 1;
+  uint32_t partitions = 512;
+  uint32_t threads = 1;
+  std::string datafile;
+
+  // Every option takes a value, so only inspect argv[i] when argv[i + 1] exists;
+  // a trailing flag without a value is ignored instead of dereferencing argv[argc].
+  for (int i = 1; i + 1 < argc; i++) {
+    if (strcmp(argv[i], "--iterations") == 0) {
+      iterations = atol(argv[i + 1]);
+    } else if (strcmp(argv[i], "--partitions") == 0) {
+      partitions = atol(argv[i + 1]);
+    } else if (strcmp(argv[i], "--threads") == 0) {
+      threads = atol(argv[i + 1]);
+    } else if (strcmp(argv[i], "--file") == 0) {
+      datafile = argv[i + 1];
+    }
+  }
+  std::cout << "iterations = " << iterations << std::endl;
+  std::cout << "partitions = " << partitions << std::endl;
+  std::cout << "threads = " << threads << std::endl;
+  std::cout << "datafile = " << datafile << std::endl;
+
+  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_CacheScan_Benchmark bck(datafile);
+
+  benchmark::RegisterBenchmark("BenchmarkShuffleSplit::CacheScan", bck)
+    ->Iterations(iterations)
+    ->Args({partitions, arrow::Compression::FASTPFOR})
+    ->Threads(threads)
+    ->ReportAggregatesOnly(false)
+    ->MeasureProcessCPUTime()
+    ->Unit(benchmark::kSecond);
+
+/*  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark bck(datafile);
+
+  benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
+    ->Iterations(1)
+      ->Args({96*2, arrow::Compression::FASTPFOR})
+      ->Args({96*4, arrow::Compression::FASTPFOR})
+      ->Args({96*8, arrow::Compression::FASTPFOR})
+      ->Args({96*16, arrow::Compression::FASTPFOR})
+      ->Args({96*32, arrow::Compression::FASTPFOR})
+      ->Threads(24)
+      ->Unit(benchmark::kSecond);
+
+  benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
+    ->Iterations(1)
+      ->Args({4096, arrow::Compression::FASTPFOR})
+      ->Threads(1)
+      ->Threads(2)
+      ->Threads(4)
+      ->Threads(8)
+      ->Threads(16)
+      ->Threads(24)
+      ->Unit(benchmark::kSecond);
+*/
+
+  // Hand the command line to Google Benchmark, then run what was registered above.
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
\ No newline at end of file
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index e739bd04f..798668dde 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -28,20 +28,45 @@
 
 #include <memory>
 #include <utility>
+#include <string>
+#include <cstring>
+#include <sstream>
 
 #include "shuffle/utils.h"
 #include "utils/macros.h"
+#include <immintrin.h>
 
-#if defined(COLUMNAR_PLUGIN_USE_AVX512)
+/*#if defined(COLUMNAR_PLUGIN_USE_AVX512)
 #include <immintrin.h>
 #else
 #include <xmmintrin.h>
 #endif
+*/
 
 namespace sparkcolumnarplugin {
 namespace shuffle {
 using arrow::internal::checked_cast;
 
+
+
+
+// Debug helper: formats the 128-bit value `var` as its 16 / sizeof(T) lanes of type T, each in
+// hex followed by a space. One-byte lanes are cast to int so they print as numbers, not chars.
+template <typename T>
+std::string __m128i_toString(const __m128i var) {
+  constexpr std::size_t lane_count = sizeof(__m128i) / sizeof(T);
+  T lanes[lane_count];
+  std::memcpy(lanes, &var, sizeof(lanes));  // copy the raw bytes out of the vector value
+  std::stringstream text;
+  for (std::size_t lane = 0; lane < lane_count; ++lane) {
+    text << std::hex;
+    if (sizeof(T) == 1) text << (int)lanes[lane];
+    else text << lanes[lane];
+    text << " " << std::dec;
+  }
+  return text.str();
+}
+
 SplitOptions SplitOptions::Defaults() { return SplitOptions(); }
 #if defined(COLUMNAR_PLUGIN_USE_AVX512)
 inline __m256i CountPartitionIdOccurrence(const std::vector<int32_t>& partition_id,
@@ -293,6 +318,7 @@ arrow::Status Splitter::Init() {
   partition_cached_recordbatch_size_.resize(num_partitions_);
   partition_lengths_.resize(num_partitions_);
   raw_partition_lengths_.resize(num_partitions_);
+  reducer_offset_offset_.resize(num_partitions_ + 1);
 
   for (int i = 0; i < column_type_id_.size(); ++i) {
     switch (column_type_id_[i]->id()) {
@@ -815,6 +841,26 @@ arrow::Result<int32_t> Splitter::SpillLargestPartition(int64_t* size) {
 }
 
 arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
+#ifdef PROCESSROW
+
+  reducer_offsets_.resize(rb.num_rows());
+
+  reducer_offset_offset_[0] = 0;
+  for (auto pid = 1; pid <= num_partitions_; pid++) {
+    reducer_offset_offset_[pid] =
+        reducer_offset_offset_[pid - 1] + partition_id_cnt_[pid - 1];
+  }
+  for (auto row = 0; row < rb.num_rows(); row++) {
+    auto pid = partition_id_[row];
+    reducer_offsets_[reducer_offset_offset_[pid]] = row;
+    _mm_prefetch(reducer_offsets_.data() + reducer_offset_offset_[pid] + 32, _MM_HINT_T0);
+    reducer_offset_offset_[pid]++;
+  }
+  std::transform(reducer_offset_offset_.begin(), std::prev(reducer_offset_offset_.end()),
+                 partition_id_cnt_.begin(), reducer_offset_offset_.begin(),
+                 [](uint16_t x, int16_t y) { return x - y; });
+
+#endif
   // for the first input record batch, scan binary arrays and large binary
   // arrays to get their empirical sizes
 
@@ -922,6 +968,27 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
     auto src_addr = const_cast<uint8_t*>(rb.column_data(col_idx)->buffers[1]->data());
 
     switch (arrow::bit_width(column_type_id_[col_idx]->id())) {
+#ifdef PROCESSROW
+// Gathers rows reducer by reducer. Assumes batch size = 32k; reducer# = 4K; row/reducer = 8
+#define PROCESS(_CTYPE)                                                                  \
+  std::transform(partition_buffer_idx_offset_.begin(),                                   \
+                 partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
+                 partition_buffer_idx_offset_.begin(),                                   \
+                 [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); });          \
+  for (auto pid = 0; pid < num_partitions_; pid++) { /* one pass per partition */        \
+    auto dst_pid_base =                                                                  \
+        reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/            \
+    auto r = reducer_offset_offset_[pid];                             /*8k*/             \
+    auto size = reducer_offset_offset_[pid + 1]; /* exclusive end of rows for pid */     \
+    for (r; r < size; r++) {                                                             \
+      auto src_offset = reducer_offsets_[r];                           /*16k*/           \
+      *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[src_offset]; /*64k*/           \
+      _mm_prefetch(&(src_addr)[src_offset * sizeof(_CTYPE) + 64], _MM_HINT_T2);          \
+      dst_pid_base += 1; /* next slot in the pid buffer */                               \
+    }                                                                                    \
+  }                                                                                      \
+  break;
+#else
 #define PROCESS(_CTYPE)                                                                  \
   std::transform(partition_buffer_idx_offset_.begin(),                                   \
                  partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
@@ -932,9 +999,10 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
     auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]);    \
     *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[row];                            \
     partition_buffer_idx_offset_[pid] += sizeof(_CTYPE);                                 \
-    _mm_prefetch(&dst_pid_base[1], _MM_HINT_T0);                                         \
+    _mm_prefetch(&dst_pid_base[64 / sizeof(_CTYPE)], _MM_HINT_T0);                       \
   }                                                                                      \
   break;
+#endif
       case 8:
         PROCESS(uint8_t)
       case 16:
@@ -942,9 +1010,93 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
       case 32:
         PROCESS(uint32_t)
       case 64:
+#ifdef PROCESSAVX
+      std::transform(partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), 
+          partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),            
+          [](uint8_t* x, int16_t y) { return x+y*sizeof(uint64_t); });                           
+        for (auto pid = 0; pid < num_partitions_; pid++)                                       
+        {                                                                                      
+          auto dst_pid_base = reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/  
+          auto r = reducer_offset_offset_[pid];                                        /*8k*/  
+          auto size = reducer_offset_offset_[pid+1];                                           
+#if 1
+          for (r; r<size && (((uint64_t)dst_pid_base & 0x1f) > 0); r++)                          
+          {                                                                                    
+            auto src_offset = reducer_offsets_[r];                                 /*16k*/     
+            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset];       /*64k*/     
+            _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            dst_pid_base+=1;                                                                   
+          }                                                                                    
+#if 0
+          for (r; r+4<size; r+=4)                              
+          {                                                                                    
+            auto src_offset = reducer_offsets_[r];                                 /*16k*/ 
+            __m128i src_ld = _mm_loadl_epi64((__m128i*)(&reducer_offsets_[r]));    
+            __m128i src_offset_4x = _mm_cvtepu16_epi32(src_ld);
+            
+            __m256i src_4x = _mm256_i32gather_epi64((const long long int*)src_addr,src_offset_4x,8);
+            //_mm256_store_si256((__m256i*)dst_pid_base,src_4x); 
+            _mm_stream_si128((__m128i*)dst_pid_base,src_2x);
+                                                         
+            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r]*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+1]*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+2]*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+3]*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            dst_pid_base+=4;                                                                   
+          }    
+#endif
+          for (r; r+2<size; r+=2)                              
+          {                                                                                    
+            __m128i src_offset_2x = _mm_cvtsi32_si128(*((int32_t*)(reducer_offsets_.data()+r)));
+            src_offset_2x = _mm_shufflelo_epi16(src_offset_2x,0x98);
+
+            __m128i src_2x = _mm_i32gather_epi64((const long long int*)src_addr,src_offset_2x,8);
+            _mm_store_si128((__m128i*)dst_pid_base,src_2x); 
+            //_mm_stream_si128((__m128i*)dst_pid_base,src_2x); 
+                                                         
+            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r]*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+1]*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            dst_pid_base+=2;                                                                   
+          }    
+#endif                                                                                
+          for (r; r<size; r++)                                                                 
+          {                                                                                    
+            auto src_offset = reducer_offsets_[r];                                 /*16k*/     
+            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset];       /*64k*/     
+            _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2);              
+            dst_pid_base+=1;                                                                   
+          }                                                                                    
+        }                                                                                      
+        break;
+#else
         PROCESS(uint64_t)
+#endif
+
 #undef PROCESS
       case 128:  // arrow::Decimal128Type::type_id
+#ifdef PROCESSROW
+        // assume batch size = 32k; reducer# = 4K; row/reducer = 8
+        std::transform(
+            partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
+            partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
+            [](uint8_t* x, int16_t y) { return x + y * 16; });
+        for (auto pid = 0; pid < num_partitions_; pid++) {
+          auto dst_pid_base =
+              reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/
+          auto r = reducer_offset_offset_[pid];                               /*8k*/
+          auto size = reducer_offset_offset_[pid + 1];
+          for (r; r < size; r++) {
+            auto src_offset = reducer_offsets_[r]; /*16k*/
+            *dst_pid_base =
+                reinterpret_cast<uint64_t*>(src_addr)[src_offset << 1]; /*128k*/
+            *(dst_pid_base + 1) =
+                reinterpret_cast<uint64_t*>(src_addr)[src_offset << 1 | 1]; /*128k*/
+            _mm_prefetch(&(src_addr)[src_offset * 16 + 64], _MM_HINT_T2);
+            dst_pid_base += 2;
+          }
+        }
+        break;
+#else
         std::transform(
             partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
             partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
@@ -960,6 +1112,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
                        _MM_HINT_T0);
         }
         break;
+#endif
       case 1:  // arrow::BooleanType::type_id:
         partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size());
         std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(),
@@ -1159,6 +1312,7 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
     if (rb.column_data(col_idx)->GetNullCount() == 0 &&
         column_has_null_[col_idx] == true) {
       // if the input record batch doesn't have null, set validity to True
+      // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] access
       for (auto pid = 0; pid < num_partitions_; ++pid) {
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
           arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid],
@@ -1406,7 +1560,14 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch&
   for (auto i = 0; i < num_rows; ++i) {
     // positive mod
     auto pid = pid_arr->Value(i) % num_partitions_;
-    if (pid < 0) pid = (pid + num_partitions_) % num_partitions_;
+    // Branch-free "if (pid < 0) pid += num_partitions_". tmp is written by lea, so it
+    // must be an early-clobber output (not an input); test also clobbers the flags.
+    decltype(pid) tmp;
+    __asm__("lea (%[num_partitions],%[pid],1),%[tmp]\n"
+            "test %[pid],%[pid]\n"
+            "cmovs %[tmp],%[pid]\n"
+            : [pid] "+r"(pid), [tmp] "=&r"(tmp)
+            : [num_partitions] "r"(num_partitions_) : "cc");
     partition_id_[i] = pid;
     partition_id_cnt_[pid]++;
   }
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index 0dfac2f8c..2fb4bb3d4 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -226,6 +226,10 @@ class Splitter {
   // updated for each input record batch
   // col
   std::vector<uint16_t> partition_id_;
+  // [num_rows]
+  std::vector<uint16_t> reducer_offsets_;
+  // [num_partitions + 1]; [pid] and [pid + 1] bound pid's range in reducer_offsets_
+  std::vector<uint16_t> reducer_offset_offset_;
   // col
   std::vector<uint16_t> partition_id_cnt_;
 

From 94b733dd463118886e98cfd287a50c14d309f55f Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sat, 30 Apr 2022 16:42:21 +0800
Subject: [PATCH 02/19] format code

---
 .../src/benchmarks/shuffle_split_benchmark.cc | 217 +++++++++---------
 native-sql-engine/cpp/src/shuffle/splitter.cc | 129 ++++++-----
 .../src/third_party/parallel_hashmap/btree.h  |  21 +-
 .../src/third_party/parallel_hashmap/phmap.h  |  12 +-
 .../third_party/parallel_hashmap/phmap_base.h |  36 +--
 5 files changed, 204 insertions(+), 211 deletions(-)

diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index ec1416641..d2bffe36a 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -43,10 +43,8 @@ const int split_buffer_size = 8192;
 
 class BenchmarkShuffleSplit {
  public:
-  BenchmarkShuffleSplit(std::string file_name) {
-    GetRecordBatchReader(file_name);
-  }
-  
+  BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); }
+
   void GetRecordBatchReader(const std::string& input_file) {
     std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
     std::shared_ptr<RecordBatchReader> record_batch_reader;
@@ -107,8 +105,8 @@ class BenchmarkShuffleSplit {
     int64_t num_rows = 0;
     int64_t split_time = 0;
 
-    Do_Split(splitter, elapse_read, num_batches, num_rows, split_time,
-        num_partitions, options, state);
+    Do_Split(splitter, elapse_read, num_batches, num_rows, split_time, num_partitions,
+             options, state);
 
     auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
     fs->DeleteFile(splitter->DataFile());
@@ -117,56 +115,57 @@ class BenchmarkShuffleSplit {
 
     state.counters["rowgroups"] =
         benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1000);
+                           benchmark::Counter::OneK::kIs1000);
     state.counters["columns"] =
         benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1000);
+                           benchmark::Counter::OneK::kIs1000);
     state.counters["batches"] = benchmark::Counter(
         num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
     state.counters["num_rows"] = benchmark::Counter(
         num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-    state.counters["num_partitions"] = benchmark::Counter(
-        num_partitions, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["num_partitions"] =
+        benchmark::Counter(num_partitions, benchmark::Counter::kAvgThreads,
+                           benchmark::Counter::OneK::kIs1000);
     state.counters["batch_buffer_size"] =
         benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1024);
+                           benchmark::Counter::OneK::kIs1024);
     state.counters["split_buffer_size"] =
         benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1024);
+                           benchmark::Counter::OneK::kIs1024);
 
     state.counters["bytes_spilled"] =
         benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1024);
+                           benchmark::Counter::OneK::kIs1024);
     state.counters["bytes_written"] =
         benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1024);
+                           benchmark::Counter::OneK::kIs1024);
     state.counters["bytes_raw"] =
         benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1024);
+                           benchmark::Counter::OneK::kIs1024);
     state.counters["bytes_spilled"] =
         benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1024);
+                           benchmark::Counter::OneK::kIs1024);
 
     state.counters["parquet_parse"] = benchmark::Counter(
         elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-    state.counters["compute_pid_time"] =
-        benchmark::Counter(splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1000);
+    state.counters["compute_pid_time"] = benchmark::Counter(
+        splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads,
+        benchmark::Counter::OneK::kIs1000);
     state.counters["write_time"] =
         benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1000);
+                           benchmark::Counter::OneK::kIs1000);
     state.counters["spill_time"] =
         benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1000);
+                           benchmark::Counter::OneK::kIs1000);
     state.counters["compress_time"] =
         benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads,
-                          benchmark::Counter::OneK::kIs1000);
+                           benchmark::Counter::OneK::kIs1000);
 
-    split_time = split_time - splitter->TotalSpillTime() - splitter->TotalComputePidTime() -
-                splitter->TotalCompressTime() - splitter->TotalWriteTime();
+    split_time = split_time - splitter->TotalSpillTime() -
+                 splitter->TotalComputePidTime() - splitter->TotalCompressTime() -
+                 splitter->TotalWriteTime();
     state.counters["split_time"] = benchmark::Counter(
-        split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);    
-
+        split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
   }
 
  protected:
@@ -178,7 +177,8 @@ class BenchmarkShuffleSplit {
   }
   virtual void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
                         int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                        const int num_partitions, SplitOptions options, benchmark::State& state) {}
+                        const int num_partitions, SplitOptions options,
+                        benchmark::State& state) {}
 
  protected:
   std::string file_name;
@@ -190,16 +190,15 @@ class BenchmarkShuffleSplit {
   parquet::ArrowReaderProperties properties;
 };
 
+class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
+ public:
+  BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename)
+      : BenchmarkShuffleSplit(filename) {}
 
-class BenchmarkShuffleSplit_CacheScan_Benchmark: public BenchmarkShuffleSplit{
-public:
-BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){}
-
-protected:
+ protected:
   void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
-                        int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                        const int num_partitions, SplitOptions options, benchmark::State& state) {
-
+                int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
+                const int num_partitions, SplitOptions options, benchmark::State& state) {
     std::vector<int> local_column_indices;
     local_column_indices.push_back(0);
     local_column_indices.push_back(1);
@@ -208,7 +207,7 @@ BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffle
     local_column_indices.push_back(5);
     local_column_indices.push_back(6);
     local_column_indices.push_back(7);
-    
+
     std::shared_ptr<arrow::Schema> local_schema;
     local_schema = std::make_shared<arrow::Schema>(*schema.get());
 
@@ -222,23 +221,22 @@ BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffle
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8));
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3));
 
-    if(state.thread_index() == 0)
-      std::cout << local_schema->ToString() << std::endl;
+    if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl;
+
+    ARROW_ASSIGN_OR_THROW(splitter,
+                          Splitter::Make("rr", local_schema, num_partitions, options));
 
-    ARROW_ASSIGN_OR_THROW(
-        splitter, Splitter::Make("rr", local_schema, num_partitions, options));
-    
     std::shared_ptr<arrow::RecordBatch> record_batch;
 
     std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
     std::shared_ptr<RecordBatchReader> record_batch_reader;
     ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties,
-        &parquet_reader));
+        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
+        properties, &parquet_reader));
 
     std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-    ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, local_column_indices,
-                                                      &record_batch_reader));
+    ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
+        row_group_indices, local_column_indices, &record_batch_reader));
     do {
       TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
 
@@ -248,38 +246,36 @@ BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename):BenchmarkShuffle
         num_rows += record_batch->num_rows();
       }
     } while (record_batch);
-    std::cout << "parquet parse done elapsed time " << elapse_read/1000000 << " ms " << std::endl;
+    std::cout << "parquet parse done elapsed time " << elapse_read / 1000000 << " ms "
+              << std::endl;
     std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl;
 
     for (auto _ : state) {
-      for_each(batches.begin(), batches.end(),
-              [&splitter, &split_time](std::shared_ptr<arrow::RecordBatch>& record_batch) {
-                TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
-              });
+      for_each(
+          batches.begin(), batches.end(),
+          [&splitter, &split_time](std::shared_ptr<arrow::RecordBatch>& record_batch) {
+            TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
+          });
     }
 
     TIME_NANO_OR_THROW(split_time, splitter->Stop());
   }
-
-
 };
 
+class BenchmarkShuffleSplit_IterateScan_Benchmark : public BenchmarkShuffleSplit {
+ public:
+  BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename)
+      : BenchmarkShuffleSplit(filename) {}
 
-class BenchmarkShuffleSplit_IterateScan_Benchmark: public BenchmarkShuffleSplit{
-public:
-BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuffleSplit(filename){}
-
-protected:
+ protected:
   void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
-                        int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                        const int num_partitions, SplitOptions options, benchmark::State& state) {
-
-    if(state.thread_index() == 0)
-      std::cout << schema->ToString() << std::endl;
+                int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
+                const int num_partitions, SplitOptions options, benchmark::State& state) {
+    if (state.thread_index() == 0) std::cout << schema->ToString() << std::endl;
 
     if (!expr_vector.empty()) {
       ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions,
-                                                    expr_vector, std::move(options)));
+                                                     expr_vector, std::move(options)));
     } else {
       ARROW_ASSIGN_OR_THROW(
           splitter, Splitter::Make("rr", schema, num_partitions, std::move(options)));
@@ -290,13 +286,13 @@ BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuff
     std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
     std::shared_ptr<RecordBatchReader> record_batch_reader;
     ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), properties,
-        &parquet_reader));
+        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
+        properties, &parquet_reader));
 
     for (auto _ : state) {
       std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-      ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(row_group_indices, column_indices,
-                                                        &record_batch_reader));
+      ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
+          row_group_indices, column_indices, &record_batch_reader));
       TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
       while (record_batch) {
         num_batches += 1;
@@ -347,26 +343,20 @@ BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename):BenchmarkShuff
 }  // namespace sparkcolumnarplugin
 
 int main(int argc, char** argv) {
-
-  uint32_t iterations=1;
-  uint32_t partitions=512;
-  uint32_t threads=1;
+  uint32_t iterations = 1;
+  uint32_t partitions = 512;
+  uint32_t threads = 1;
   std::string datafile;
 
-  for (int i=0;i<argc;i++)
-  {
-    if(strcmp(argv[i],"--iterations")==0)
-    {
-      iterations=atol(argv[i+1]);
-    }else if (strcmp(argv[i],"--partitions")==0)
-    {
-      partitions=atol(argv[i+1]);
-    }else if (strcmp(argv[i],"--threads")==0)
-    {
-      threads=atol(argv[i+1]);
-    }else if (strcmp(argv[i],"--file")==0)
-    {
-      datafile=argv[i+1];
+  for (int i = 1; i + 1 < argc; i++) {  // every flag reads argv[i + 1] as its value
+    if (strcmp(argv[i], "--iterations") == 0) {
+      iterations = atol(argv[i + 1]);
+    } else if (strcmp(argv[i], "--partitions") == 0) {
+      partitions = atol(argv[i + 1]);
+    } else if (strcmp(argv[i], "--threads") == 0) {
+      threads = atol(argv[i + 1]);
+    } else if (strcmp(argv[i], "--file") == 0) {
+      datafile = argv[i + 1];
     }
   }
   std::cout << "iterations = " << iterations << std::endl;
@@ -377,36 +367,37 @@ int main(int argc, char** argv) {
   sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_CacheScan_Benchmark bck(datafile);
 
   benchmark::RegisterBenchmark("BenchmarkShuffleSplit::CacheScan", bck)
-    ->Iterations(iterations)
-    ->Args({partitions, arrow::Compression::FASTPFOR})
-    ->Threads(threads)
-    ->ReportAggregatesOnly(false)
-    ->MeasureProcessCPUTime()
-    ->Unit(benchmark::kSecond);
-
-/*  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark bck(datafile);
-
-  benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
-    ->Iterations(1)
-      ->Args({96*2, arrow::Compression::FASTPFOR})
-      ->Args({96*4, arrow::Compression::FASTPFOR})
-      ->Args({96*8, arrow::Compression::FASTPFOR})
-      ->Args({96*16, arrow::Compression::FASTPFOR})
-      ->Args({96*32, arrow::Compression::FASTPFOR})
-      ->Threads(24)
+      ->Iterations(iterations)
+      ->Args({partitions, arrow::Compression::FASTPFOR})
+      ->Threads(threads)
+      ->ReportAggregatesOnly(false)
+      ->MeasureProcessCPUTime()
       ->Unit(benchmark::kSecond);
 
-  benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
-    ->Iterations(1)
-      ->Args({4096, arrow::Compression::FASTPFOR})
-      ->Threads(1)
-      ->Threads(2)
-      ->Threads(4)
-      ->Threads(8)
-      ->Threads(16)
-      ->Threads(24)
-      ->Unit(benchmark::kSecond);
-*/
+  /*  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark
+    bck(datafile);
+
+    benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
+      ->Iterations(1)
+        ->Args({96*2, arrow::Compression::FASTPFOR})
+        ->Args({96*4, arrow::Compression::FASTPFOR})
+        ->Args({96*8, arrow::Compression::FASTPFOR})
+        ->Args({96*16, arrow::Compression::FASTPFOR})
+        ->Args({96*32, arrow::Compression::FASTPFOR})
+        ->Threads(24)
+        ->Unit(benchmark::kSecond);
+
+    benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
+      ->Iterations(1)
+        ->Args({4096, arrow::Compression::FASTPFOR})
+        ->Threads(1)
+        ->Threads(2)
+        ->Threads(4)
+        ->Threads(8)
+        ->Threads(16)
+        ->Threads(24)
+        ->Unit(benchmark::kSecond);
+  */
 
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 798668dde..812dc4516 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -25,16 +25,16 @@
 #include <gandiva/node.h>
 #include <gandiva/projector.h>
 #include <gandiva/tree_expr_builder.h>
+#include <immintrin.h>
 
-#include <memory>
-#include <utility>
-#include <string>
 #include <cstring>
+#include <memory>
 #include <sstream>
+#include <string>
+#include <utility>
 
 #include "shuffle/utils.h"
 #include "utils/macros.h"
-#include <immintrin.h>
 
 /*#if defined(COLUMNAR_PLUGIN_USE_AVX512)
 #include <immintrin.h>
@@ -47,24 +47,23 @@ namespace sparkcolumnarplugin {
 namespace shuffle {
 using arrow::internal::checked_cast;
 
-
-
-
 template <typename T>
 std::string __m128i_toString(const __m128i var) {
-    std::stringstream sstr;
-    T values[16/sizeof(T)];
-    std::memcpy(values,&var,sizeof(values)); //See discussion below
-    if (sizeof(T) == 1) {
-        for (unsigned int i = 0; i < sizeof(__m128i); i++) { //C++11: Range for also possible
-            sstr << std::hex << (int) values[i] << " " << std::dec;
-        }
-    } else {
-        for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T); i++) { //C++11: Range for also possible
-            sstr << std::hex << values[i] << " " << std::dec;
-        }
+  std::stringstream sstr;
+  T values[16 / sizeof(T)];
+  std::memcpy(values, &var, sizeof(values));  // See discussion below
+  if (sizeof(T) == 1) {
+    for (unsigned int i = 0; i < sizeof(__m128i); i++) {  // C++11: Range for also
+                                                          // possible
+      sstr << std::hex << (int)values[i] << " " << std::dec;
+    }
+  } else {
+    for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T);
+         i++) {  // C++11: Range for also possible
+      sstr << std::hex << values[i] << " " << std::dec;
     }
-    return sstr.str();
+  }
+  return sstr.str();
 }
 
 SplitOptions SplitOptions::Defaults() { return SplitOptions(); }
@@ -1011,22 +1010,22 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
         PROCESS(uint32_t)
       case 64:
 #ifdef PROCESSAVX
-      std::transform(partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(), 
-          partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),            
-          [](uint8_t* x, int16_t y) { return x+y*sizeof(uint64_t); });                           
-        for (auto pid = 0; pid < num_partitions_; pid++)                                       
-        {                                                                                      
-          auto dst_pid_base = reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/  
-          auto r = reducer_offset_offset_[pid];                                        /*8k*/  
-          auto size = reducer_offset_offset_[pid+1];                                           
+        std::transform(
+            partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
+            partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
+            [](uint8_t* x, int16_t y) { return x + y * sizeof(uint64_t); });
+        for (auto pid = 0; pid < num_partitions_; pid++) {
+          auto dst_pid_base =
+              reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/
+          auto r = reducer_offset_offset_[pid];                               /*8k*/
+          auto size = reducer_offset_offset_[pid + 1];
 #if 1
-          for (r; r<size && (((uint64_t)dst_pid_base & 0x1f) > 0); r++)                          
-          {                                                                                    
-            auto src_offset = reducer_offsets_[r];                                 /*16k*/     
-            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset];       /*64k*/     
-            _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            dst_pid_base+=1;                                                                   
-          }                                                                                    
+          for (r; r < size && (((uint64_t)dst_pid_base & 0x1f) > 0); r++) {
+            auto src_offset = reducer_offsets_[r];                             /*16k*/
+            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset]; /*64k*/
+            _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2);
+            dst_pid_base += 1;
+          }
 #if 0
           for (r; r+4<size; r+=4)                              
           {                                                                                    
@@ -1043,30 +1042,34 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
             _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+2]*sizeof(uint64_t)+64], _MM_HINT_T2);              
             _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+3]*sizeof(uint64_t)+64], _MM_HINT_T2);              
             dst_pid_base+=4;                                                                   
-          }    
+          }
 #endif
-          for (r; r+2<size; r+=2)                              
-          {                                                                                    
-            __m128i src_offset_2x = _mm_cvtsi32_si128(*((int32_t*)(reducer_offsets_.data()+r)));
-            src_offset_2x = _mm_shufflelo_epi16(src_offset_2x,0x98);
-
-            __m128i src_2x = _mm_i32gather_epi64((const long long int*)src_addr,src_offset_2x,8);
-            _mm_store_si128((__m128i*)dst_pid_base,src_2x); 
-            //_mm_stream_si128((__m128i*)dst_pid_base,src_2x); 
-                                                         
-            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r]*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+1]*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            dst_pid_base+=2;                                                                   
-          }    
-#endif                                                                                
-          for (r; r<size; r++)                                                                 
-          {                                                                                    
-            auto src_offset = reducer_offsets_[r];                                 /*16k*/     
-            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset];       /*64k*/     
-            _mm_prefetch(&(src_addr)[src_offset*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            dst_pid_base+=1;                                                                   
-          }                                                                                    
-        }                                                                                      
+          for (r; r + 2 < size; r += 2) {
+            __m128i src_offset_2x =
+                _mm_cvtsi32_si128(*((int32_t*)(reducer_offsets_.data() + r)));
+            src_offset_2x = _mm_shufflelo_epi16(src_offset_2x, 0x98);
+
+            __m128i src_2x =
+                _mm_i32gather_epi64((const long long int*)src_addr, src_offset_2x, 8);
+            _mm_store_si128((__m128i*)dst_pid_base, src_2x);
+            //_mm_stream_si128((__m128i*)dst_pid_base,src_2x);
+
+            _mm_prefetch(
+                &(src_addr)[(uint32_t)reducer_offsets_[r] * sizeof(uint64_t) + 64],
+                _MM_HINT_T2);
+            _mm_prefetch(
+                &(src_addr)[(uint32_t)reducer_offsets_[r + 1] * sizeof(uint64_t) + 64],
+                _MM_HINT_T2);
+            dst_pid_base += 2;
+          }
+#endif
+          for (r; r < size; r++) {
+            auto src_offset = reducer_offsets_[r];                             /*16k*/
+            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset]; /*64k*/
+            _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2);
+            dst_pid_base += 1;
+          }
+        }
         break;
 #else
         PROCESS(uint64_t)
@@ -1075,7 +1078,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
 #undef PROCESS
       case 128:  // arrow::Decimal128Type::type_id
 #ifdef PROCESSROW
-        // assume batch size = 32k; reducer# = 4K; row/reducer = 8
+                 // assume batch size = 32k; reducer# = 4K; row/reducer = 8
         std::transform(
             partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
             partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
@@ -1312,7 +1315,8 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
     if (rb.column_data(col_idx)->GetNullCount() == 0 &&
         column_has_null_[col_idx] == true) {
       // if the input record batch doesn't have null, set validity to True
-      // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid] access
+      // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid]
+      // access
       for (auto pid = 0; pid < num_partitions_; ++pid) {
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
           arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid],
@@ -1560,14 +1564,13 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch&
   for (auto i = 0; i < num_rows; ++i) {
     // positive mod
     auto pid = pid_arr->Value(i) % num_partitions_;
-    //force to generate ASM
-    __asm__ (
+    // force to generate ASM
+    __asm__(
         "lea (%[num_partitions],%[pid],1),%[tmp]\n"
         "test %[pid],%[pid]\n"
         "cmovs %[tmp],%[pid]\n"
         : [pid] "+r"(pid)
-        : [num_partitions]"r"(num_partitions_),[tmp]"r"(0)
-    );
+        : [num_partitions] "r"(num_partitions_), [tmp] "r"(0));
     partition_id_[i] = pid;
     partition_id_cnt_[pid]++;
   }
diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h
index b9b0d94da..24c2d145b 100644
--- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h
+++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h
@@ -661,9 +661,9 @@ constexpr bool do_less_than_comparison(const Compare& compare, const K& x, const
 // SFINAE prevents implicit conversions to int (such as from bool).
 template <typename Int, phmap::enable_if_t<std::is_same<int, Int>::value, int> = 0>
 constexpr phmap::weak_ordering compare_result_as_ordering(const Int c) {
-  return c < 0
-             ? phmap::weak_ordering::less
-             : c == 0 ? phmap::weak_ordering::equivalent : phmap::weak_ordering::greater;
+  return c < 0    ? phmap::weak_ordering::less
+         : c == 0 ? phmap::weak_ordering::equivalent
+                  : phmap::weak_ordering::greater;
 }
 constexpr phmap::weak_ordering compare_result_as_ordering(const phmap::weak_ordering c) {
   return c;
@@ -685,9 +685,9 @@ template <
         int> = 0>
 constexpr phmap::weak_ordering do_three_way_comparison(const Compare& compare, const K& x,
                                                        const LK& y) {
-  return compare(x, y) ? phmap::weak_ordering::less
-                       : compare(y, x) ? phmap::weak_ordering::greater
-                                       : phmap::weak_ordering::equivalent;
+  return compare(x, y)   ? phmap::weak_ordering::less
+         : compare(y, x) ? phmap::weak_ordering::greater
+                         : phmap::weak_ordering::equivalent;
 }
 
 }  // namespace compare_internal
@@ -1063,11 +1063,10 @@ class btree_node {
   // Compute how many values we can fit onto a leaf node taking into account
   // padding.
   constexpr static size_type NodeTargetValues(const int begin, const int end) {
-    return begin == end
-               ? begin
-               : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize
-                     ? NodeTargetValues(begin, (begin + end) / 2)
-                     : NodeTargetValues((begin + end) / 2 + 1, end);
+    return begin == end ? begin
+           : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize
+               ? NodeTargetValues(begin, (begin + end) / 2)
+               : NodeTargetValues((begin + end) / 2 + 1, end);
   }
 
   enum {
diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h
index 4628cca30..05d227a43 100644
--- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h
+++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h
@@ -2156,13 +2156,13 @@ class raw_hash_map : public raw_hash_set<Policy, Hash, Eq, Alloc> {
   // incomplete types as values, as in unordered_map<K, IncompleteType>.
   // MappedReference<> may be a non-reference type.
   template <class P>
-  using MappedReference = decltype(
-      P::value(std::addressof(std::declval<typename raw_hash_map::reference>())));
+  using MappedReference = decltype(P::value(
+      std::addressof(std::declval<typename raw_hash_map::reference>())));
 
   // MappedConstReference<> may be a non-reference type.
   template <class P>
-  using MappedConstReference = decltype(
-      P::value(std::addressof(std::declval<typename raw_hash_map::const_reference>())));
+  using MappedConstReference = decltype(P::value(
+      std::addressof(std::declval<typename raw_hash_map::const_reference>())));
 
   using KeyArgImpl = KeyArg<IsTransparent<Eq>::value && IsTransparent<Hash>::value>;
 
@@ -3409,8 +3409,8 @@ class parallel_hash_map
   // incomplete types as values, as in unordered_map<K, IncompleteType>.
   // MappedReference<> may be a non-reference type.
   template <class P>
-  using MappedReference = decltype(
-      P::value(std::addressof(std::declval<typename parallel_hash_map::reference>())));
+  using MappedReference = decltype(P::value(
+      std::addressof(std::declval<typename parallel_hash_map::reference>())));
 
   // MappedConstReference<> may be a non-reference type.
   template <class P>
diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h
index 3b3b6b120..0f4e6375d 100644
--- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h
+++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h
@@ -826,8 +826,8 @@ struct Invoker {
 
 // The result type of Invoke<F, Args...>.
 template <typename F, typename... Args>
-using InvokeT = decltype(
-    Invoker<F, Args...>::type::Invoke(std::declval<F>(), std::declval<Args>()...));
+using InvokeT = decltype(Invoker<F, Args...>::type::Invoke(std::declval<F>(),
+                                                           std::declval<Args>()...));
 
 // Invoke(f, args...) is an implementation of INVOKE(f, args...) from section
 // [func.require] of the C++ standard.
@@ -1002,9 +1002,10 @@ constexpr T&& forward(
 namespace utility_internal {
 // Helper method for expanding tuple into a called method.
 template <typename Functor, typename Tuple, std::size_t... Indexes>
-auto apply_helper(Functor&& functor, Tuple&& t, index_sequence<Indexes...>) -> decltype(
-    phmap::base_internal::Invoke(phmap::forward<Functor>(functor),
-                                 std::get<Indexes>(phmap::forward<Tuple>(t))...)) {
+auto apply_helper(Functor&& functor, Tuple&& t, index_sequence<Indexes...>)
+    -> decltype(phmap::base_internal::Invoke(
+        phmap::forward<Functor>(functor),
+        std::get<Indexes>(phmap::forward<Tuple>(t))...)) {
   return phmap::base_internal::Invoke(phmap::forward<Functor>(functor),
                                       std::get<Indexes>(phmap::forward<Tuple>(t))...);
 }
@@ -1887,19 +1888,18 @@ class optional_assign_base<copy_traits::non_movable> {
 
 template <typename T>
 constexpr copy_traits get_ctor_copy_traits() {
-  return std::is_copy_constructible<T>::value
-             ? copy_traits::copyable
-             : std::is_move_constructible<T>::value ? copy_traits::movable
-                                                    : copy_traits::non_movable;
+  return std::is_copy_constructible<T>::value   ? copy_traits::copyable
+         : std::is_move_constructible<T>::value ? copy_traits::movable
+                                                : copy_traits::non_movable;
 }
 
 template <typename T>
 constexpr copy_traits get_assign_copy_traits() {
   return phmap::is_copy_assignable<T>::value && std::is_copy_constructible<T>::value
              ? copy_traits::copyable
-             : phmap::is_move_assignable<T>::value && std::is_move_constructible<T>::value
-                   ? copy_traits::movable
-                   : copy_traits::non_movable;
+         : phmap::is_move_assignable<T>::value && std::is_move_constructible<T>::value
+             ? copy_traits::movable
+             : copy_traits::non_movable;
 }
 
 // Whether T is constructible or convertible from optional<U>.
@@ -2421,9 +2421,9 @@ constexpr optional<T> make_optional(std::initializer_list<U> il, Args&&... args)
 template <typename T, typename U>
 constexpr auto operator==(const optional<T>& x, const optional<U>& y)
     -> decltype(optional_internal::convertible_to_bool(*x == *y)) {
-  return static_cast<bool>(x) != static_cast<bool>(y)
-             ? false
-             : static_cast<bool>(x) == false ? true : static_cast<bool>(*x == *y);
+  return static_cast<bool>(x) != static_cast<bool>(y) ? false
+         : static_cast<bool>(x) == false              ? true
+                                                      : static_cast<bool>(*x == *y);
 }
 
 // Returns: If bool(x) != bool(y), true; otherwise, if bool(x) == false, false;
@@ -2431,9 +2431,9 @@ constexpr auto operator==(const optional<T>& x, const optional<U>& y)
 template <typename T, typename U>
 constexpr auto operator!=(const optional<T>& x, const optional<U>& y)
     -> decltype(optional_internal::convertible_to_bool(*x != *y)) {
-  return static_cast<bool>(x) != static_cast<bool>(y)
-             ? true
-             : static_cast<bool>(x) == false ? false : static_cast<bool>(*x != *y);
+  return static_cast<bool>(x) != static_cast<bool>(y) ? true
+         : static_cast<bool>(x) == false              ? false
+                                                      : static_cast<bool>(*x != *y);
 }
 // Returns: If !y, false; otherwise, if !x, true; otherwise *x < *y.
 template <typename T, typename U>

From d7ce830e78b985607f9d2dd611e476dfdf3f9a50 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sat, 30 Apr 2022 22:42:25 +0800
Subject: [PATCH 03/19] Allocate large block of memory then slice to each
 buffer

---
 native-sql-engine/cpp/CMakeLists.txt          |  6 +-
 native-sql-engine/cpp/src/shuffle/splitter.cc | 81 +++++++++++++------
 native-sql-engine/cpp/src/shuffle/splitter.h  |  8 +-
 .../cpp/src/tests/shuffle_split_test.cc       |  2 +-
 4 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt
index fe7e989ee..48a923614 100644
--- a/native-sql-engine/cpp/CMakeLists.txt
+++ b/native-sql-engine/cpp/CMakeLists.txt
@@ -1,10 +1,10 @@
 cmake_minimum_required(VERSION 3.16)
 project(spark_columnar_plugin)
 
-#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
-add_definitions(-DPROCESSROW)
+add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
+#add_definitions(-DPROCESSROW)
 
-#add_compile_options(-g)
+add_compile_options(-g)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(root_directory ${PROJECT_BINARY_DIR})
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 812dc4516..62e31df65 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -47,6 +47,11 @@ namespace sparkcolumnarplugin {
 namespace shuffle {
 using arrow::internal::checked_cast;
 
+#ifndef SPLIT_BUFFER_SIZE
+// Default block size: 8M (chosen with a 2M page size in mind). Parenthesized so it expands safely.
+#define SPLIT_BUFFER_SIZE (8 * 1024 * 1024)
+#endif
+
 template <typename T>
 std::string __m128i_toString(const __m128i var) {
   std::stringstream sstr;
@@ -401,6 +406,36 @@ arrow::Status Splitter::Init() {
       tiny_bach_write_options_.codec,
       arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED));
 
+  //Allocate first buffer for split reducer
+  ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
+                                        SPLIT_BUFFER_SIZE,
+                                        options_.memory_pool));
+  combine_buffer_->Resize(0, /*shrink_to_fit =*/false);
+
+  return arrow::Status::OK();
+}
+arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer, uint32_t size)
+{
+  // Hands out a buffer of `size` bytes, sliced from combine_buffer_ unless the request is too large.
+  // First round size up to the next multiple of 64 bytes.
+  auto reminder = size & 0x3f;
+  size+=(64-reminder) & ((reminder==0)-1);  // mask is 0 when already aligned, all ones otherwise
+  // A rounded size above SPLIT_BUFFER_SIZE is allocated directly from the memory pool.
+  if (size > SPLIT_BUFFER_SIZE )
+  {
+    ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer(
+                                        size, options_.memory_pool));
+    return arrow::Status::OK();
+  }else if (combine_buffer_->capacity() - combine_buffer_->size() < size)
+  {
+    // The current block has too little unused capacity left: replace it with a new one.
+    ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
+                                        SPLIT_BUFFER_SIZE,
+                                        options_.memory_pool));
+  }
+  buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size);
+  
+  combine_buffer_->Resize(combine_buffer_->size() + size);
   return arrow::Status::OK();
 }
 
@@ -576,15 +611,13 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         default: {
           auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
           if (buffers[0] != nullptr) {
-            buffers[0]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false);
+            buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1);
           }
           if (buffers[1] != nullptr) {
             if (column_type_id_[i]->id() == arrow::BooleanType::type_id)
-              buffers[1]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false);
+              buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1);
             else
-              buffers[1]->Resize(
-                  num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3),
-                  /*shrink_to_fit =*/false);
+              buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           }
 
           if (reset_buffers) {
@@ -642,12 +675,14 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
   auto binary_idx = 0;
   auto large_binary_idx = 0;
   auto list_idx = 0;
+  auto total_size = 0;
 
   std::vector<std::shared_ptr<arrow::BinaryBuilder>> new_binary_builders;
   std::vector<std::shared_ptr<arrow::LargeBinaryBuilder>> new_large_binary_builders;
   std::vector<std::shared_ptr<arrow::ArrayBuilder>> new_list_builders;
-  std::vector<std::shared_ptr<arrow::ResizableBuffer>> new_value_buffers;
-  std::vector<std::shared_ptr<arrow::ResizableBuffer>> new_validity_buffers;
+  std::vector<std::shared_ptr<arrow::Buffer>> new_value_buffers;
+  std::vector<std::shared_ptr<arrow::Buffer>> new_validity_buffers;
+
   for (auto i = 0; i < num_fields; ++i) {
     switch (column_type_id_[i]->id()) {
       case arrow::BinaryType::type_id:
@@ -688,30 +723,30 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
       case arrow::NullType::type_id:
         break;
       default: {
-        std::shared_ptr<arrow::ResizableBuffer> value_buffer;
+          try{
+        std::shared_ptr<arrow::Buffer> value_buffer;
         if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
-          ARROW_ASSIGN_OR_RAISE(value_buffer, arrow::AllocateResizableBuffer(
-                                                  arrow::BitUtil::BytesForBits(new_size),
-                                                  options_.memory_pool));
+          auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK( status );
         } else {
-          ARROW_ASSIGN_OR_RAISE(
-              value_buffer,
-              arrow::AllocateResizableBuffer(
-                  new_size * (arrow::bit_width(column_type_id_[i]->id()) / 8),
-                  options_.memory_pool));
+            auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3));
+            ARROW_RETURN_NOT_OK( status );
+
         }
         new_value_buffers.push_back(std::move(value_buffer));
         if (input_fixed_width_has_null_[fixed_width_idx]) {
-          std::shared_ptr<arrow::ResizableBuffer> validity_buffer;
-          ARROW_ASSIGN_OR_RAISE(
-              validity_buffer,
-              arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size),
-                                             options_.memory_pool));
+          std::shared_ptr<arrow::Buffer> validity_buffer;
+          auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK( status );
           new_validity_buffers.push_back(std::move(validity_buffer));
         } else {
           new_validity_buffers.push_back(nullptr);
         }
         fixed_width_idx++;
+          }catch(const std::exception& e)
+          {
+            std::cout << "exception captured " << e.what() << std::endl;
+          }
         break;
       }
     }
@@ -746,10 +781,10 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
         break;
       default:
         partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] =
-            const_cast<uint8_t*>(new_value_buffers[fixed_width_idx]->data());
+            new_value_buffers[fixed_width_idx]->mutable_data();
         if (input_fixed_width_has_null_[fixed_width_idx]) {
           partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
-              const_cast<uint8_t*>(new_validity_buffers[fixed_width_idx]->data());
+              new_validity_buffers[fixed_width_idx]->mutable_data();
         } else {
           partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr;
         }
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index 2fb4bb3d4..1c1c8e2da 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -138,6 +138,8 @@ class Splitter {
 
   arrow::Status SplitListArray(const arrow::RecordBatch& rb);
 
+  arrow::Status AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer, uint32_t size);
+
   template <typename T, typename ArrayType = typename arrow::TypeTraits<T>::ArrayType,
             typename BuilderType = typename arrow::TypeTraits<T>::BuilderType>
   arrow::Status AppendBinary(
@@ -188,7 +190,7 @@ class Splitter {
   // col partid
   std::vector<std::vector<uint8_t*>> partition_fixed_width_value_addrs_;
   // col partid
-  std::vector<std::vector<std::vector<std::shared_ptr<arrow::ResizableBuffer>>>>
+  std::vector<std::vector<std::vector<std::shared_ptr<arrow::Buffer>>>>
       partition_fixed_width_buffers_;
   // col partid
   std::vector<std::vector<std::shared_ptr<arrow::BinaryBuilder>>>
@@ -198,6 +200,10 @@ class Splitter {
       partition_large_binary_builders_;
   std::vector<std::vector<std::shared_ptr<arrow::ArrayBuilder>>> partition_list_builders_;
   // col partid
+
+  // Backing block that each reducer's column buffers are sliced from, so they combine into large pages
+  std::shared_ptr<arrow::ResizableBuffer> combine_buffer_; 
+
   // partid
   std::vector<std::vector<std::shared_ptr<arrow::ipc::IpcPayload>>>
       partition_cached_recordbatch_;
diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
index 1f12742cd..715364a6d 100644
--- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
+++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
@@ -431,7 +431,7 @@ TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) {
 }
 
 TEST_F(SplitterTest, TestSpillLargestPartition) {
-  std::shared_ptr<arrow::MemoryPool> pool = std::make_shared<MyMemoryPool>(4000000);
+  std::shared_ptr<arrow::MemoryPool> pool = std::make_shared<MyMemoryPool>(9*1024*1024);
   //  pool = std::make_shared<arrow::LoggingMemoryPool>(pool.get());
 
   int32_t num_partitions = 2;

From 7bdec939608ae5c6870bbba1672affc4dd2711d4 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sun, 1 May 2022 17:07:13 +0800
Subject: [PATCH 04/19] wip, rebase to master

---
 native-sql-engine/cpp/CMakeLists.txt          |  4 +-
 .../src/benchmarks/shuffle_split_benchmark.cc | 45 ++++++++++++++
 native-sql-engine/cpp/src/shuffle/splitter.cc | 11 +++-
 .../cpp/src/tests/shuffle_split_test.cc       | 58 ++++++++++++++++++-
 4 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt
index 48a923614..e7d14e0c8 100644
--- a/native-sql-engine/cpp/CMakeLists.txt
+++ b/native-sql-engine/cpp/CMakeLists.txt
@@ -1,8 +1,8 @@
 cmake_minimum_required(VERSION 3.16)
 project(spark_columnar_plugin)
 
-add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
-#add_definitions(-DPROCESSROW)
+#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
+add_definitions(-DPROCESSROW)
 
 add_compile_options(-g)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index d2bffe36a..ce4e88b62 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -41,6 +41,50 @@ namespace shuffle {
 const int batch_buffer_size = 32768;
 const int split_buffer_size = 8192;
 
+// Debug MemoryPool wrapper: forwards every call to the wrapped pool and logs it to std::cout.
+class MyLoggingMemoryPool : public MemoryPool {
+ public:
+  explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {}
+  ~MyLoggingMemoryPool() override = default;
+  // Allocate/Reallocate forward to the wrapped pool, then log the sizes whatever the returned Status.
+  Status Allocate(int64_t size, uint8_t** out) override {
+    Status s = pool_->Allocate(size, out);
+    std::cout << "Allocate: size = " << size << std::endl;
+    return s;    
+  }
+  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override
+  {
+    Status s = pool_->Reallocate(old_size, new_size, ptr);
+    std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size
+            << std::endl;
+    return s;
+  }
+  // Forwards the release to the wrapped pool, then logs the freed size.
+  void Free(uint8_t* buffer, int64_t size) override{
+    pool_->Free(buffer, size);
+    std::cout << "Free: size = " << size << std::endl;
+  }
+  // Logging getter: every query of the wrapped pool's allocated byte count is printed.
+  int64_t bytes_allocated() const override{
+    int64_t nb_bytes = pool_->bytes_allocated();
+    std::cout << "bytes_allocated: " << nb_bytes << std::endl;
+    return nb_bytes;
+  }
+  // Logging getter: every query of the wrapped pool's max_memory() is printed.
+  int64_t max_memory() const override{
+    int64_t mem = pool_->max_memory();
+    std::cout << "max_memory: " << mem << std::endl;
+    return mem;
+  }
+  // Not logged: returns the wrapped pool's backend name unchanged.
+  std::string backend_name() const override{
+    return pool_->backend_name(); 
+  }
+  // Wrapped pool, held as a raw pointer; the defaulted destructor does not delete it.
+ private:
+  MemoryPool* pool_;
+};
+
 class BenchmarkShuffleSplit {
  public:
   BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); }
@@ -188,6 +232,7 @@ class BenchmarkShuffleSplit {
   std::shared_ptr<arrow::Schema> schema;
   std::vector<std::shared_ptr<::gandiva::Expression>> expr_vector;
   parquet::ArrowReaderProperties properties;
+
 };
 
 class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 62e31df65..a5e3ca932 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -432,10 +432,11 @@ arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& b
     ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
                                         SPLIT_BUFFER_SIZE,
                                         options_.memory_pool));
+    combine_buffer_->Resize(0, /*shrink_to_fit = */ false);
   }
   buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size);
   
-  combine_buffer_->Resize(combine_buffer_->size() + size);
+  combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false);
   return arrow::Status::OK();
 }
 
@@ -489,6 +490,7 @@ arrow::Status Splitter::Stop() {
     data_file_os_ = fout;
   }
 
+  std::cout << " cache record batch " << std::endl;
   // stop PartitionWriter and collect metrics
   for (auto pid = 0; pid < num_partitions_; ++pid) {
     RETURN_NOT_OK(CacheRecordBatch(pid, true));
@@ -508,11 +510,15 @@ arrow::Status Splitter::Stop() {
       partition_lengths_[pid] = 0;
     }
   }
+  this->combine_buffer_.reset();
 
   // close data file output Stream
   RETURN_NOT_OK(data_file_os_->Close());
 
   EVAL_END("write", options_.thread_id, options_.task_attempt_id)
+
+  
+
   return arrow::Status::OK();
 }
 int64_t batch_nbytes(const arrow::RecordBatch& batch) {
@@ -527,6 +533,7 @@ int64_t batch_nbytes(const arrow::RecordBatch& batch) {
         continue;
       }
       accumulated += buf->size();
+      std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl;
     }
   }
   return accumulated;
@@ -637,7 +644,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         }
       }
     }
-
+    std::cout << " cache record " << std::endl;
     auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays));
     int64_t raw_size = batch_nbytes(batch);
 
diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
index 715364a6d..cc05cd3e1 100644
--- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
+++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
@@ -26,6 +26,20 @@
 
 #include <iostream>
 
+#include <execinfo.h>
+// Debug helper: prints the current call stack to stdout, one symbolized frame per line.
+void print_trace(void) {
+    enum Constexpr { MAX_SIZE = 1024 };
+    void *array[MAX_SIZE];
+    const int size = backtrace(array, MAX_SIZE);
+    // backtrace_symbols() returns NULL on failure; in that case no frame lines are printed.
+    char **strings = backtrace_symbols(array, size);
+    for (int i = 0; strings != NULL && i < size; i++)
+        printf("    %s\n", strings[i]);
+    puts("");
+    free(strings);
+}
+
 #include "shuffle/splitter.h"
 #include "tests/test_utils.h"
 
@@ -42,6 +56,8 @@ class MyMemoryPool : public arrow::MemoryPool {
     }
     RETURN_NOT_OK(pool_->Allocate(size, out));
     stats_.UpdateAllocatedBytes(size);
+    std::cout << "Allocate: size = " << size << " addr = " << std::hex << (uint64_t)*out << std::dec << std::endl;
+    //print_trace();
     return arrow::Status::OK();
   }
 
@@ -49,14 +65,19 @@ class MyMemoryPool : public arrow::MemoryPool {
     if (new_size > capacity_) {
       return Status::OutOfMemory("malloc of size ", new_size, " failed");
     }
+    auto old_ptr = ptr;
     RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr));
     stats_.UpdateAllocatedBytes(new_size - old_size);
+    std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << (uint64_t)*old_ptr << std::dec << " new_size = " << new_size << " addr = " << std::hex << (uint64_t)*ptr << std::dec << std::endl;
+    //print_trace();
     return arrow::Status::OK();
   }
 
   void Free(uint8_t* buffer, int64_t size) override {
     pool_->Free(buffer, size);
     stats_.UpdateAllocatedBytes(-size);
+    std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer << std::dec << std::endl;
+    //print_trace();
   }
 
   int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
@@ -287,6 +308,39 @@ TEST_F(SplitterTest, TestRoundRobinSplitter) {
   }
 }
 
+TEST_F(SplitterTest, TestSplitterMemoryLeak) {
+  
+  std::shared_ptr<arrow::MemoryPool> pool = std::make_shared<MyMemoryPool>(9*1024*1024);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  split_options_.memory_pool = pool.get();
+
+  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", schema_, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
+  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
+
+  std::cout << "split down " << std::endl;
+
+  ASSERT_NOT_OK(splitter_->Stop());
+  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
+
+  std::cout << "stopped " << std::endl;
+
+  splitter_.reset();
+  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
+  std::cout << "splitter_ killed " << std::endl;
+
+  split_options_.memory_pool = arrow::default_memory_pool();
+}
+
 TEST_F(SplitterTest, TestHashSplitter) {
   int32_t num_partitions = 2;
   split_options_.buffer_size = 4;
@@ -420,7 +474,7 @@ TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) {
 
   int32_t num_partitions = 2;
   split_options_.buffer_size = 4;
-  split_options_.memory_pool = pool.get();
+  //split_options_.memory_pool = pool.get();
   ARROW_ASSIGN_OR_THROW(splitter_,
                         Splitter::Make("rr", schema_, num_partitions, split_options_));
 
@@ -436,7 +490,7 @@ TEST_F(SplitterTest, TestSpillLargestPartition) {
 
   int32_t num_partitions = 2;
   split_options_.buffer_size = 4;
-  split_options_.memory_pool = pool.get();
+  //split_options_.memory_pool = pool.get();
   split_options_.compression_type = arrow::Compression::UNCOMPRESSED;
   ARROW_ASSIGN_OR_THROW(splitter_,
                         Splitter::Make("rr", schema_, num_partitions, split_options_));

From 6b9881e0bbf5ed64d52c700a5bd83383f5a822f4 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sun, 1 May 2022 17:09:09 +0800
Subject: [PATCH 05/19] to rebase to master

---
 native-sql-engine/cpp/CMakeLists.txt          |   11 -
 .../src/benchmarks/shuffle_split_benchmark.cc |  450 -----
 native-sql-engine/cpp/src/shuffle/splitter.cc | 1674 -----------------
 .../cpp/src/tests/shuffle_split_test.cc       | 1139 -----------
 4 files changed, 3274 deletions(-)
 delete mode 100644 native-sql-engine/cpp/CMakeLists.txt
 delete mode 100644 native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
 delete mode 100644 native-sql-engine/cpp/src/shuffle/splitter.cc
 delete mode 100644 native-sql-engine/cpp/src/tests/shuffle_split_test.cc

diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt
deleted file mode 100644
index e7d14e0c8..000000000
--- a/native-sql-engine/cpp/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-cmake_minimum_required(VERSION 3.16)
-project(spark_columnar_plugin)
-
-#add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
-add_definitions(-DPROCESSROW)
-
-add_compile_options(-g)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-set(root_directory ${PROJECT_BINARY_DIR})
-add_subdirectory(src)
diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
deleted file mode 100644
index ce4e88b62..000000000
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <arrow/filesystem/filesystem.h>
-#include <arrow/io/interfaces.h>
-#include <arrow/memory_pool.h>
-#include <arrow/record_batch.h>
-//#include <arrow/testing/gtest_util.h>
-#include <arrow/type.h>
-#include <arrow/util/io_util.h>
-//#include <gtest/gtest.h>
-#include <benchmark/benchmark.h>
-#include <parquet/arrow/reader.h>
-#include <parquet/file_reader.h>
-#include <sched.h>
-#include <shuffle/splitter.h>
-
-#include <chrono>
-
-#include "codegen/code_generator.h"
-#include "codegen/code_generator_factory.h"
-#include "tests/test_utils.h"
-
-namespace sparkcolumnarplugin {
-namespace shuffle {
-
-const int batch_buffer_size = 32768;
-const int split_buffer_size = 8192;
-
-
-class MyLoggingMemoryPool : public MemoryPool {
- public:
-  explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {}
-  ~MyLoggingMemoryPool() override = default;
-
-  Status Allocate(int64_t size, uint8_t** out) override {
-    Status s = pool_->Allocate(size, out);
-    std::cout << "Allocate: size = " << size << std::endl;
-    return s;    
-  }
-  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override
-  {
-    Status s = pool_->Reallocate(old_size, new_size, ptr);
-    std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size
-            << std::endl;
-    return s;
-  }
-
-  void Free(uint8_t* buffer, int64_t size) override{
-    pool_->Free(buffer, size);
-    std::cout << "Free: size = " << size << std::endl;
-  }
-
-  int64_t bytes_allocated() const override{
-    int64_t nb_bytes = pool_->bytes_allocated();
-    std::cout << "bytes_allocated: " << nb_bytes << std::endl;
-    return nb_bytes;
-  }
-
-  int64_t max_memory() const override{
-    int64_t mem = pool_->max_memory();
-    std::cout << "max_memory: " << mem << std::endl;
-    return mem;
-  }
-
-  std::string backend_name() const override{
-    return pool_->backend_name(); 
-  }
-
- private:
-  MemoryPool* pool_;
-};
-
-class BenchmarkShuffleSplit {
- public:
-  BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); }
-
-  void GetRecordBatchReader(const std::string& input_file) {
-    std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
-    std::shared_ptr<RecordBatchReader> record_batch_reader;
-
-    std::shared_ptr<arrow::fs::FileSystem> fs;
-    std::string file_name;
-    ARROW_ASSIGN_OR_THROW(fs, arrow::fs::FileSystemFromUriOrPath(input_file, &file_name))
-
-    ARROW_ASSIGN_OR_THROW(file, fs->OpenInputFile(file_name));
-
-    properties.set_batch_size(batch_buffer_size);
-    properties.set_pre_buffer(false);
-    properties.set_use_threads(false);
-
-    ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
-        properties, &parquet_reader));
-
-    ASSERT_NOT_OK(parquet_reader->GetSchema(&schema));
-
-    auto num_rowgroups = parquet_reader->num_row_groups();
-
-    for (int i = 0; i < num_rowgroups; ++i) {
-      row_group_indices.push_back(i);
-    }
-
-    auto num_columns = schema->num_fields();
-    for (int i = 0; i < num_columns; ++i) {
-      column_indices.push_back(i);
-    }
-    const auto& fields = schema->fields();
-    for (const auto& field : fields) {
-      if (field->name() == "l_orderkey") {
-        auto node = gandiva::TreeExprBuilder::MakeField(field);
-        expr_vector.push_back(gandiva::TreeExprBuilder::MakeExpression(
-            std::move(node), arrow::field("res_" + field->name(), field->type())));
-      }
-    }
-  }
-
-  void operator()(benchmark::State& state) {
-    SetCPU(state.thread_index());
-    arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1);
-
-    const int num_partitions = state.range(0);
-
-    auto options = SplitOptions::Defaults();
-    options.compression_type = compression_type;
-    options.buffer_size = split_buffer_size;
-    options.buffered_write = true;
-    options.offheap_per_task = 128 * 1024 * 1024 * 1024L;
-    options.prefer_spill = true;
-    options.write_schema = false;
-
-    std::shared_ptr<Splitter> splitter;
-    int64_t elapse_read = 0;
-    int64_t num_batches = 0;
-    int64_t num_rows = 0;
-    int64_t split_time = 0;
-
-    Do_Split(splitter, elapse_read, num_batches, num_rows, split_time, num_partitions,
-             options, state);
-
-    auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-    fs->DeleteFile(splitter->DataFile());
-
-    state.SetBytesProcessed(int64_t(splitter->RawPartitionBytes()));
-
-    state.counters["rowgroups"] =
-        benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1000);
-    state.counters["columns"] =
-        benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1000);
-    state.counters["batches"] = benchmark::Counter(
-        num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-    state.counters["num_rows"] = benchmark::Counter(
-        num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-    state.counters["num_partitions"] =
-        benchmark::Counter(num_partitions, benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1000);
-    state.counters["batch_buffer_size"] =
-        benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1024);
-    state.counters["split_buffer_size"] =
-        benchmark::Counter(split_buffer_size, benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1024);
-
-    state.counters["bytes_spilled"] =
-        benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1024);
-    state.counters["bytes_written"] =
-        benchmark::Counter(splitter->TotalBytesWritten(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1024);
-    state.counters["bytes_raw"] =
-        benchmark::Counter(splitter->RawPartitionBytes(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1024);
-    state.counters["bytes_spilled"] =
-        benchmark::Counter(splitter->TotalBytesSpilled(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1024);
-
-    state.counters["parquet_parse"] = benchmark::Counter(
-        elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-    state.counters["compute_pid_time"] = benchmark::Counter(
-        splitter->TotalComputePidTime(), benchmark::Counter::kAvgThreads,
-        benchmark::Counter::OneK::kIs1000);
-    state.counters["write_time"] =
-        benchmark::Counter(splitter->TotalWriteTime(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1000);
-    state.counters["spill_time"] =
-        benchmark::Counter(splitter->TotalSpillTime(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1000);
-    state.counters["compress_time"] =
-        benchmark::Counter(splitter->TotalCompressTime(), benchmark::Counter::kAvgThreads,
-                           benchmark::Counter::OneK::kIs1000);
-
-    split_time = split_time - splitter->TotalSpillTime() -
-                 splitter->TotalComputePidTime() - splitter->TotalCompressTime() -
-                 splitter->TotalWriteTime();
-    state.counters["split_time"] = benchmark::Counter(
-        split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
-  }
-
- protected:
-  long SetCPU(uint32_t cpuindex) {
-    cpu_set_t cs;
-    CPU_ZERO(&cs);
-    CPU_SET(cpuindex, &cs);
-    return sched_setaffinity(0, sizeof(cs), &cs);
-  }
-  virtual void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
-                        int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                        const int num_partitions, SplitOptions options,
-                        benchmark::State& state) {}
-
- protected:
-  std::string file_name;
-  std::shared_ptr<arrow::io::RandomAccessFile> file;
-  std::vector<int> row_group_indices;
-  std::vector<int> column_indices;
-  std::shared_ptr<arrow::Schema> schema;
-  std::vector<std::shared_ptr<::gandiva::Expression>> expr_vector;
-  parquet::ArrowReaderProperties properties;
-
-};
-
-class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
- public:
-  BenchmarkShuffleSplit_CacheScan_Benchmark(std::string filename)
-      : BenchmarkShuffleSplit(filename) {}
-
- protected:
-  void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
-                int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                const int num_partitions, SplitOptions options, benchmark::State& state) {
-    std::vector<int> local_column_indices;
-    local_column_indices.push_back(0);
-    local_column_indices.push_back(1);
-    local_column_indices.push_back(2);
-    local_column_indices.push_back(4);
-    local_column_indices.push_back(5);
-    local_column_indices.push_back(6);
-    local_column_indices.push_back(7);
-
-    std::shared_ptr<arrow::Schema> local_schema;
-    local_schema = std::make_shared<arrow::Schema>(*schema.get());
-
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3));
-
-    if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl;
-
-    ARROW_ASSIGN_OR_THROW(splitter,
-                          Splitter::Make("rr", local_schema, num_partitions, options));
-
-    std::shared_ptr<arrow::RecordBatch> record_batch;
-
-    std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
-    std::shared_ptr<RecordBatchReader> record_batch_reader;
-    ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
-        properties, &parquet_reader));
-
-    std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-    ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
-        row_group_indices, local_column_indices, &record_batch_reader));
-    do {
-      TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
-
-      if (record_batch) {
-        batches.push_back(record_batch);
-        num_batches += 1;
-        num_rows += record_batch->num_rows();
-      }
-    } while (record_batch);
-    std::cout << "parquet parse done elapsed time " << elapse_read / 1000000 << " ms "
-              << std::endl;
-    std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl;
-
-    for (auto _ : state) {
-      for_each(
-          batches.begin(), batches.end(),
-          [&splitter, &split_time](std::shared_ptr<arrow::RecordBatch>& record_batch) {
-            TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
-          });
-    }
-
-    TIME_NANO_OR_THROW(split_time, splitter->Stop());
-  }
-};
-
-class BenchmarkShuffleSplit_IterateScan_Benchmark : public BenchmarkShuffleSplit {
- public:
-  BenchmarkShuffleSplit_IterateScan_Benchmark(std::string filename)
-      : BenchmarkShuffleSplit(filename) {}
-
- protected:
-  void Do_Split(std::shared_ptr<Splitter>& splitter, int64_t& elapse_read,
-                int64_t& num_batches, int64_t& num_rows, int64_t& split_time,
-                const int num_partitions, SplitOptions options, benchmark::State& state) {
-    if (state.thread_index() == 0) std::cout << schema->ToString() << std::endl;
-
-    if (!expr_vector.empty()) {
-      ARROW_ASSIGN_OR_THROW(splitter, Splitter::Make("hash", schema, num_partitions,
-                                                     expr_vector, std::move(options)));
-    } else {
-      ARROW_ASSIGN_OR_THROW(
-          splitter, Splitter::Make("rr", schema, num_partitions, std::move(options)));
-    }
-
-    std::shared_ptr<arrow::RecordBatch> record_batch;
-
-    std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
-    std::shared_ptr<RecordBatchReader> record_batch_reader;
-    ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
-        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
-        properties, &parquet_reader));
-
-    for (auto _ : state) {
-      std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-      ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
-          row_group_indices, column_indices, &record_batch_reader));
-      TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
-      while (record_batch) {
-        num_batches += 1;
-        num_rows += record_batch->num_rows();
-        TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
-        TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
-      }
-    }
-    TIME_NANO_OR_THROW(split_time, splitter->Stop());
-  }
-};
-
-/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan)->Iterations(1)
-      ->Args({96*2, arrow::Compression::FASTPFOR})
-      ->Args({96*4, arrow::Compression::FASTPFOR})
-      ->Args({96*8, arrow::Compression::FASTPFOR})
-      ->Args({96*16, arrow::Compression::FASTPFOR})
-      ->Args({96*32, arrow::Compression::FASTPFOR})
-      ->Threads(1)
-      ->Threads(2)
-      ->Threads(4)
-      ->Threads(8)
-      ->Threads(16)
-      ->Threads(24)
-      ->Unit(benchmark::kSecond);
-*/
-/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, IterateScan)->Iterations(1)
-      ->Args({96*2, arrow::Compression::FASTPFOR})
-      ->Args({96*4, arrow::Compression::FASTPFOR})
-      ->Args({96*8, arrow::Compression::FASTPFOR})
-      ->Args({96*16, arrow::Compression::FASTPFOR})
-      ->Args({96*32, arrow::Compression::FASTPFOR})
-      ->Threads(1)
-      ->Threads(2)
-      ->Threads(4)
-      ->Threads(8)
-      ->Threads(16)
-      ->Threads(24)
-      ->Unit(benchmark::kSecond);*/
-/*BENCHMARK_REGISTER_F(BenchmarkShuffleSplit, CacheScan)
-    ->Iterations(1000000)
-    ->Args({512, arrow::Compression::FASTPFOR})
-    ->Threads(1)
-    ->ReportAggregatesOnly(false)
-    ->MeasureProcessCPUTime()
-    ->Unit(benchmark::kSecond);*/
-}  // namespace shuffle
-}  // namespace sparkcolumnarplugin
-
-int main(int argc, char** argv) {
-  uint32_t iterations = 1;
-  uint32_t partitions = 512;
-  uint32_t threads = 1;
-  std::string datafile;
-
-  for (int i = 0; i < argc; i++) {
-    if (strcmp(argv[i], "--iterations") == 0) {
-      iterations = atol(argv[i + 1]);
-    } else if (strcmp(argv[i], "--partitions") == 0) {
-      partitions = atol(argv[i + 1]);
-    } else if (strcmp(argv[i], "--threads") == 0) {
-      threads = atol(argv[i + 1]);
-    } else if (strcmp(argv[i], "--file") == 0) {
-      datafile = argv[i + 1];
-    }
-  }
-  std::cout << "iterations = " << iterations << std::endl;
-  std::cout << "partitions = " << partitions << std::endl;
-  std::cout << "threads = " << threads << std::endl;
-  std::cout << "datafile = " << datafile << std::endl;
-
-  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_CacheScan_Benchmark bck(datafile);
-
-  benchmark::RegisterBenchmark("BenchmarkShuffleSplit::CacheScan", bck)
-      ->Iterations(iterations)
-      ->Args({partitions, arrow::Compression::FASTPFOR})
-      ->Threads(threads)
-      ->ReportAggregatesOnly(false)
-      ->MeasureProcessCPUTime()
-      ->Unit(benchmark::kSecond);
-
-  /*  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark
-    bck(datafile);
-
-    benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
-      ->Iterations(1)
-        ->Args({96*2, arrow::Compression::FASTPFOR})
-        ->Args({96*4, arrow::Compression::FASTPFOR})
-        ->Args({96*8, arrow::Compression::FASTPFOR})
-        ->Args({96*16, arrow::Compression::FASTPFOR})
-        ->Args({96*32, arrow::Compression::FASTPFOR})
-        ->Threads(24)
-        ->Unit(benchmark::kSecond);
-
-    benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
-      ->Iterations(1)
-        ->Args({4096, arrow::Compression::FASTPFOR})
-        ->Threads(1)
-        ->Threads(2)
-        ->Threads(4)
-        ->Threads(8)
-        ->Threads(16)
-        ->Threads(24)
-        ->Unit(benchmark::kSecond);
-  */
-
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-  benchmark::Shutdown();
-}
\ No newline at end of file
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
deleted file mode 100644
index a5e3ca932..000000000
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ /dev/null
@@ -1,1674 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "shuffle/splitter.h"
-
-#include <arrow/ipc/writer.h>
-#include <arrow/memory_pool.h>
-#include <arrow/type.h>
-#include <arrow/util/bit_util.h>
-#include <arrow/util/checked_cast.h>
-#include <gandiva/node.h>
-#include <gandiva/projector.h>
-#include <gandiva/tree_expr_builder.h>
-#include <immintrin.h>
-
-#include <cstring>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <utility>
-
-#include "shuffle/utils.h"
-#include "utils/macros.h"
-
-/*#if defined(COLUMNAR_PLUGIN_USE_AVX512)
-#include <immintrin.h>
-#else
-#include <xmmintrin.h>
-#endif
-*/
-
-namespace sparkcolumnarplugin {
-namespace shuffle {
-using arrow::internal::checked_cast;
-
-#ifndef SPLIT_BUFFER_SIZE
-//by default, allocate 8M block, 2M page size
-#define SPLIT_BUFFER_SIZE 8*1024*1024
-#endif
-
-template <typename T>
-std::string __m128i_toString(const __m128i var) {
-  std::stringstream sstr;
-  T values[16 / sizeof(T)];
-  std::memcpy(values, &var, sizeof(values));  // See discussion below
-  if (sizeof(T) == 1) {
-    for (unsigned int i = 0; i < sizeof(__m128i); i++) {  // C++11: Range for also
-                                                          // possible
-      sstr << std::hex << (int)values[i] << " " << std::dec;
-    }
-  } else {
-    for (unsigned int i = 0; i < sizeof(__m128i) / sizeof(T);
-         i++) {  // C++11: Range for also possible
-      sstr << std::hex << values[i] << " " << std::dec;
-    }
-  }
-  return sstr.str();
-}
-
-SplitOptions SplitOptions::Defaults() { return SplitOptions(); }
-#if defined(COLUMNAR_PLUGIN_USE_AVX512)
-inline __m256i CountPartitionIdOccurrence(const std::vector<int32_t>& partition_id,
-                                          int32_t row) {
-  __m128i partid_cnt_low;
-  __m128i partid_cnt_high;
-  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  partid_cnt_low = _mm_xor_si128(partid_cnt_low, partid_cnt_low);
-
-  tmp1 = (partition_id[row + 1] ^ partition_id[row]) == 0;
-  partid_cnt_low = _mm_insert_epi32(partid_cnt_low, tmp1, 1);
-
-  tmp2 = (partition_id[row + 2] ^ partition_id[row]) == 0;
-  tmp2 += (partition_id[row + 2] ^ partition_id[row + 1]) == 0;
-  partid_cnt_low = _mm_insert_epi32(partid_cnt_low, tmp2, 2);
-
-  tmp3 = (partition_id[row + 3] ^ partition_id[row]) == 0;
-  tmp3 += (partition_id[row + 3] ^ partition_id[row + 1]) == 0;
-  tmp3 += (partition_id[row + 3] ^ partition_id[row + 2]) == 0;
-  partid_cnt_low = _mm_insert_epi32(partid_cnt_low, tmp3, 3);
-
-  tmp4 = (partition_id[row + 4] ^ partition_id[row]) == 0;
-  tmp4 += (partition_id[row + 4] ^ partition_id[row + 1]) == 0;
-  tmp4 += (partition_id[row + 4] ^ partition_id[row + 2]) == 0;
-  tmp4 += (partition_id[row + 4] ^ partition_id[row + 3]) == 0;
-  partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp4, 0);
-
-  tmp5 = (partition_id[row + 5] ^ partition_id[row]) == 0;
-  tmp5 += (partition_id[row + 5] ^ partition_id[row + 1]) == 0;
-  tmp5 += (partition_id[row + 5] ^ partition_id[row + 2]) == 0;
-  tmp5 += (partition_id[row + 5] ^ partition_id[row + 3]) == 0;
-  tmp5 += (partition_id[row + 5] ^ partition_id[row + 4]) == 0;
-  partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp5, 1);
-
-  tmp6 = (partition_id[row + 6] ^ partition_id[row]) == 0;
-  tmp6 += (partition_id[row + 6] ^ partition_id[row + 1]) == 0;
-  tmp6 += (partition_id[row + 6] ^ partition_id[row + 2]) == 0;
-  tmp6 += (partition_id[row + 6] ^ partition_id[row + 3]) == 0;
-  tmp6 += (partition_id[row + 6] ^ partition_id[row + 4]) == 0;
-  tmp6 += (partition_id[row + 6] ^ partition_id[row + 5]) == 0;
-  partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp6, 2);
-
-  tmp7 = (partition_id[row + 7] ^ partition_id[row]) == 0;
-  tmp7 += (partition_id[row + 7] ^ partition_id[row + 1]) == 0;
-  tmp7 += (partition_id[row + 7] ^ partition_id[row + 2]) == 0;
-  tmp7 += (partition_id[row + 7] ^ partition_id[row + 3]) == 0;
-  tmp7 += (partition_id[row + 7] ^ partition_id[row + 4]) == 0;
-  tmp7 += (partition_id[row + 7] ^ partition_id[row + 5]) == 0;
-  tmp7 += (partition_id[row + 7] ^ partition_id[row + 6]) == 0;
-  partid_cnt_high = _mm_insert_epi32(partid_cnt_high, tmp7, 3);
-
-  __m256i partid_cnt_8x = _mm256_castsi128_si256(partid_cnt_low);
-  partid_cnt_8x = _mm256_inserti128_si256(partid_cnt_8x, partid_cnt_high, 1);
-  return partid_cnt_8x;
-}
-
-inline void PrefetchDstAddr(__m512i dst_addr_8x, int32_t scale) {
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 0), 0) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 0), 1) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 1), 0) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 1), 1) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 2), 0) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 2), 1) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 3), 0) + scale),
-      _MM_HINT_T0);
-  _mm_prefetch(
-      (void*)(_mm_extract_epi64(_mm512_extracti64x2_epi64(dst_addr_8x, 3), 1) + scale),
-      _MM_HINT_T0);
-}
-#endif
-
-class Splitter::PartitionWriter {
- public:
-  explicit PartitionWriter(Splitter* splitter, int32_t partition_id)
-      : splitter_(splitter), partition_id_(partition_id) {}
-
-  arrow::Status Spill() {
-#ifndef SKIPWRITE
-    RETURN_NOT_OK(EnsureOpened());
-#endif
-    RETURN_NOT_OK(WriteRecordBatchPayload(spilled_file_os_.get(), partition_id_));
-    ClearCache();
-    return arrow::Status::OK();
-  }
-
-  arrow::Status WriteCachedRecordBatchAndClose() {
-    const auto& data_file_os = splitter_->data_file_os_;
-    ARROW_ASSIGN_OR_RAISE(auto before_write, data_file_os->Tell());
-
-    if (splitter_->options_.write_schema) {
-      RETURN_NOT_OK(WriteSchemaPayload(data_file_os.get()));
-    }
-
-    if (spilled_file_opened_) {
-      RETURN_NOT_OK(spilled_file_os_->Close());
-      RETURN_NOT_OK(MergeSpilled());
-    } else {
-      if (splitter_->partition_cached_recordbatch_size_[partition_id_] == 0) {
-        return arrow::Status::Invalid("Partition writer got empty partition");
-      }
-    }
-
-    RETURN_NOT_OK(WriteRecordBatchPayload(data_file_os.get(), partition_id_));
-    RETURN_NOT_OK(WriteEOS(data_file_os.get()));
-    ClearCache();
-
-    ARROW_ASSIGN_OR_RAISE(auto after_write, data_file_os->Tell());
-    partition_length = after_write - before_write;
-
-    return arrow::Status::OK();
-  }
-
-  // metrics
-  int64_t bytes_spilled = 0;
-  int64_t partition_length = 0;
-  int64_t compress_time = 0;
-
- private:
-  arrow::Status EnsureOpened() {
-    if (!spilled_file_opened_) {
-      ARROW_ASSIGN_OR_RAISE(spilled_file_,
-                            CreateTempShuffleFile(splitter_->NextSpilledFileDir()));
-      ARROW_ASSIGN_OR_RAISE(spilled_file_os_,
-                            arrow::io::FileOutputStream::Open(spilled_file_, true));
-      spilled_file_opened_ = true;
-    }
-    return arrow::Status::OK();
-  }
-
-  arrow::Status MergeSpilled() {
-    ARROW_ASSIGN_OR_RAISE(
-        auto spilled_file_is_,
-        arrow::io::MemoryMappedFile::Open(spilled_file_, arrow::io::FileMode::READ));
-    // copy spilled data blocks
-    ARROW_ASSIGN_OR_RAISE(auto nbytes, spilled_file_is_->GetSize());
-    ARROW_ASSIGN_OR_RAISE(auto buffer, spilled_file_is_->Read(nbytes));
-    RETURN_NOT_OK(splitter_->data_file_os_->Write(buffer));
-
-    // close spilled file streams and delete the file
-    RETURN_NOT_OK(spilled_file_is_->Close());
-    auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-    RETURN_NOT_OK(fs->DeleteFile(spilled_file_));
-    bytes_spilled += nbytes;
-    return arrow::Status::OK();
-  }
-
-  arrow::Status WriteSchemaPayload(arrow::io::OutputStream* os) {
-    ARROW_ASSIGN_OR_RAISE(auto payload, splitter_->GetSchemaPayload());
-    int32_t metadata_length = 0;  // unused
-    RETURN_NOT_OK(arrow::ipc::WriteIpcPayload(
-        *payload, splitter_->options_.ipc_write_options, os, &metadata_length));
-    return arrow::Status::OK();
-  }
-
-  arrow::Status WriteRecordBatchPayload(arrow::io::OutputStream* os,
-                                        int32_t partition_id) {
-    int32_t metadata_length = 0;  // unused
-#ifndef SKIPWRITE
-    for (auto& payload : splitter_->partition_cached_recordbatch_[partition_id_]) {
-      RETURN_NOT_OK(arrow::ipc::WriteIpcPayload(
-          *payload, splitter_->options_.ipc_write_options, os, &metadata_length));
-      payload = nullptr;
-    }
-#endif
-    return arrow::Status::OK();
-  }
-
-  arrow::Status WriteEOS(arrow::io::OutputStream* os) {
-    // write EOS
-    constexpr int32_t kZeroLength = 0;
-    RETURN_NOT_OK(os->Write(&kIpcContinuationToken, sizeof(int32_t)));
-    RETURN_NOT_OK(os->Write(&kZeroLength, sizeof(int32_t)));
-    return arrow::Status::OK();
-  }
-
-  void ClearCache() {
-    splitter_->partition_cached_recordbatch_[partition_id_].clear();
-    splitter_->partition_cached_recordbatch_size_[partition_id_] = 0;
-  }
-
-  Splitter* splitter_;
-  int32_t partition_id_;
-  std::string spilled_file_;
-  std::shared_ptr<arrow::io::FileOutputStream> spilled_file_os_;
-
-  bool spilled_file_opened_ = false;
-};
-
-// ----------------------------------------------------------------------
-// Splitter
-
-arrow::Result<std::shared_ptr<Splitter>> Splitter::Make(
-    const std::string& short_name, std::shared_ptr<arrow::Schema> schema,
-    int num_partitions, const gandiva::ExpressionVector& expr_vector,
-    SplitOptions options) {
-  if (short_name == "hash") {
-    return HashSplitter::Create(num_partitions, std::move(schema), expr_vector,
-                                std::move(options));
-  } else if (short_name == "rr") {
-    return RoundRobinSplitter::Create(num_partitions, std::move(schema),
-                                      std::move(options));
-  } else if (short_name == "range") {
-    return FallbackRangeSplitter::Create(num_partitions, std::move(schema),
-                                         std::move(options));
-  } else if (short_name == "single") {
-    return RoundRobinSplitter::Create(1, std::move(schema), std::move(options));
-  }
-  return arrow::Status::NotImplemented("Partitioning " + short_name +
-                                       " not supported yet.");
-}
-
-arrow::Result<std::shared_ptr<Splitter>> Splitter::Make(
-    const std::string& short_name, std::shared_ptr<arrow::Schema> schema,
-    int num_partitions, SplitOptions options) {
-  return Make(short_name, std::move(schema), num_partitions, {}, std::move(options));
-}
-
-arrow::Status Splitter::Init() {
-  const auto& fields = schema_->fields();
-  ARROW_ASSIGN_OR_RAISE(column_type_id_, ToSplitterTypeId(schema_->fields()));
-
-  partition_writer_.resize(num_partitions_);
-
-  // pre-computed row count for each partition after the record batch split
-  partition_id_cnt_.resize(num_partitions_);
-  // pre-allocated buffer size for each partition, unit is row count
-  partition_buffer_size_.resize(num_partitions_);
-
-  // start index for each partition when new record batch starts to split
-  partition_buffer_idx_base_.resize(num_partitions_);
-  // the offset of each partition during record batch split
-  partition_buffer_idx_offset_.resize(num_partitions_);
-
-  partition_cached_recordbatch_.resize(num_partitions_);
-  partition_cached_recordbatch_size_.resize(num_partitions_);
-  partition_lengths_.resize(num_partitions_);
-  raw_partition_lengths_.resize(num_partitions_);
-  reducer_offset_offset_.resize(num_partitions_ + 1);
-
-  for (int i = 0; i < column_type_id_.size(); ++i) {
-    switch (column_type_id_[i]->id()) {
-      case arrow::BinaryType::type_id:
-      case arrow::StringType::type_id:
-        binary_array_idx_.push_back(i);
-        break;
-      case arrow::LargeBinaryType::type_id:
-      case arrow::LargeStringType::type_id:
-        large_binary_array_idx_.push_back(i);
-        break;
-      case arrow::StructType::type_id:
-      case arrow::MapType::type_id:
-      case arrow::LargeListType::type_id:
-      case arrow::ListType::type_id:
-        list_array_idx_.push_back(i);
-        break;
-      case arrow::NullType::type_id:
-        break;
-      default:
-        fixed_width_array_idx_.push_back(i);
-        break;
-    }
-  }
-
-  auto num_fixed_width = fixed_width_array_idx_.size();
-  partition_fixed_width_validity_addrs_.resize(num_fixed_width);
-  column_has_null_.resize(num_fixed_width, false);
-  partition_fixed_width_value_addrs_.resize(num_fixed_width);
-  partition_fixed_width_buffers_.resize(num_fixed_width);
-  binary_array_empirical_size_.resize(binary_array_idx_.size());
-  large_binary_array_empirical_size_.resize(large_binary_array_idx_.size());
-  input_fixed_width_has_null_.resize(num_fixed_width, false);
-  for (auto i = 0; i < num_fixed_width; ++i) {
-    partition_fixed_width_validity_addrs_[i].resize(num_partitions_, nullptr);
-    partition_fixed_width_value_addrs_[i].resize(num_partitions_, nullptr);
-    partition_fixed_width_buffers_[i].resize(num_partitions_);
-  }
-  partition_binary_builders_.resize(binary_array_idx_.size());
-  for (auto i = 0; i < binary_array_idx_.size(); ++i) {
-    partition_binary_builders_[i].resize(num_partitions_);
-  }
-  partition_large_binary_builders_.resize(large_binary_array_idx_.size());
-  for (auto i = 0; i < large_binary_array_idx_.size(); ++i) {
-    partition_large_binary_builders_[i].resize(num_partitions_);
-  }
-  partition_list_builders_.resize(list_array_idx_.size());
-  for (auto i = 0; i < list_array_idx_.size(); ++i) {
-    partition_list_builders_[i].resize(num_partitions_);
-  }
-
-  ARROW_ASSIGN_OR_RAISE(configured_dirs_, GetConfiguredLocalDirs());
-  sub_dir_selection_.assign(configured_dirs_.size(), 0);
-
-  // Both data_file and shuffle_index_file should be set through jni.
-  // For test purpose, Create a temporary subdirectory in the system temporary
-  // dir with prefix "columnar-shuffle"
-  if (options_.data_file.length() == 0) {
-    ARROW_ASSIGN_OR_RAISE(options_.data_file, CreateTempShuffleFile(configured_dirs_[0]));
-  }
-
-  auto& ipc_write_options = options_.ipc_write_options;
-  ipc_write_options.memory_pool = options_.memory_pool;
-  ipc_write_options.use_threads = false;
-
-  if (options_.compression_type == arrow::Compression::FASTPFOR) {
-    ARROW_ASSIGN_OR_RAISE(ipc_write_options.codec,
-                          arrow::util::Codec::CreateInt32(arrow::Compression::FASTPFOR));
-
-  } else if (options_.compression_type == arrow::Compression::LZ4_FRAME) {
-    ARROW_ASSIGN_OR_RAISE(ipc_write_options.codec,
-                          arrow::util::Codec::Create(arrow::Compression::LZ4_FRAME));
-  } else {
-    ARROW_ASSIGN_OR_RAISE(ipc_write_options.codec, arrow::util::Codec::CreateInt32(
-                                                       arrow::Compression::UNCOMPRESSED));
-  }
-
-  // initialize tiny batch write options
-  tiny_bach_write_options_ = ipc_write_options;
-  ARROW_ASSIGN_OR_RAISE(
-      tiny_bach_write_options_.codec,
-      arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED));
-
-  //Allocate first buffer for split reducer
-  ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
-                                        SPLIT_BUFFER_SIZE,
-                                        options_.memory_pool));
-  combine_buffer_->Resize(0, /*shrink_to_fit =*/false);
-
-  return arrow::Status::OK();
-}
-arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer, uint32_t size)
-{
-  // if size is already larger than buffer pool size, allocate it directly
-  //make size 64byte aligned
-  auto reminder = size & 0x3f;
-  size+=(64-reminder) & ((reminder==0)-1);
-
-  if (size > SPLIT_BUFFER_SIZE )
-  {
-    ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer(
-                                        size, options_.memory_pool));
-    return arrow::Status::OK();
-  }else if (combine_buffer_->capacity() - combine_buffer_->size() < size)
-  {
-    //memory pool is not enough
-    ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
-                                        SPLIT_BUFFER_SIZE,
-                                        options_.memory_pool));
-    combine_buffer_->Resize(0, /*shrink_to_fit = */ false);
-  }
-  buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size);
-  
-  combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false);
-  return arrow::Status::OK();
-}
-
-int64_t Splitter::CompressedSize(const arrow::RecordBatch& rb) {
-  auto payload = std::make_shared<arrow::ipc::IpcPayload>();
-  arrow::Status result;
-  result =
-      arrow::ipc::GetRecordBatchPayload(rb, options_.ipc_write_options, payload.get());
-  if (result.ok()) {
-    return payload->body_length;
-  } else {
-    result.UnknownError("Failed to get the compressed size.");
-    return -1;
-  }
-}
-
-arrow::Status Splitter::SetCompressType(arrow::Compression::type compressed_type) {
-  if (compressed_type == arrow::Compression::FASTPFOR) {
-    ARROW_ASSIGN_OR_RAISE(options_.ipc_write_options.codec,
-                          arrow::util::Codec::CreateInt32(arrow::Compression::FASTPFOR));
-
-  } else if (compressed_type == arrow::Compression::LZ4_FRAME) {
-    ARROW_ASSIGN_OR_RAISE(options_.ipc_write_options.codec,
-                          arrow::util::Codec::Create(arrow::Compression::LZ4_FRAME));
-  } else {
-    ARROW_ASSIGN_OR_RAISE(
-        options_.ipc_write_options.codec,
-        arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED));
-  }
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::Split(const arrow::RecordBatch& rb) {
-  EVAL_START("split", options_.thread_id)
-  RETURN_NOT_OK(ComputeAndCountPartitionId(rb));
-  RETURN_NOT_OK(DoSplit(rb));
-  EVAL_END("split", options_.thread_id, options_.task_attempt_id)
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::Stop() {
-  EVAL_START("write", options_.thread_id)
-  // open data file output stream
-  std::shared_ptr<arrow::io::FileOutputStream> fout;
-  ARROW_ASSIGN_OR_RAISE(fout,
-                        arrow::io::FileOutputStream::Open(options_.data_file, true));
-  if (options_.buffered_write) {
-    ARROW_ASSIGN_OR_RAISE(data_file_os_, arrow::io::UnlockedBufferedOutputStream::Create(
-                                             16384, options_.memory_pool, fout));
-  } else {
-    data_file_os_ = fout;
-  }
-
-  std::cout << " cache record batch " << std::endl;
-  // stop PartitionWriter and collect metrics
-  for (auto pid = 0; pid < num_partitions_; ++pid) {
-    RETURN_NOT_OK(CacheRecordBatch(pid, true));
-    if (partition_cached_recordbatch_size_[pid] > 0) {
-      if (partition_writer_[pid] == nullptr) {
-        partition_writer_[pid] = std::make_shared<PartitionWriter>(this, pid);
-      }
-    }
-    if (partition_writer_[pid] != nullptr) {
-      const auto& writer = partition_writer_[pid];
-      TIME_NANO_OR_RAISE(total_write_time_, writer->WriteCachedRecordBatchAndClose());
-      partition_lengths_[pid] = writer->partition_length;
-      total_bytes_written_ += writer->partition_length;
-      total_bytes_spilled_ += writer->bytes_spilled;
-      total_compress_time_ += writer->compress_time;
-    } else {
-      partition_lengths_[pid] = 0;
-    }
-  }
-  this->combine_buffer_.reset();
-
-  // close data file output Stream
-  RETURN_NOT_OK(data_file_os_->Close());
-
-  EVAL_END("write", options_.thread_id, options_.task_attempt_id)
-
-  
-
-  return arrow::Status::OK();
-}
-int64_t batch_nbytes(const arrow::RecordBatch& batch) {
-  int64_t accumulated = 0L;
-
-  for (const auto& array : batch.columns()) {
-    if (array == nullptr || array->data() == nullptr) {
-      continue;
-    }
-    for (const auto& buf : array->data()->buffers) {
-      if (buf == nullptr) {
-        continue;
-      }
-      accumulated += buf->size();
-      std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl;
-    }
-  }
-  return accumulated;
-}
-
-int64_t batch_nbytes(std::shared_ptr<arrow::RecordBatch> batch) {
-  if (batch == nullptr) {
-    return 0;
-  }
-  return batch_nbytes(*batch);
-}
-
-arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffers) {
-  static int printed = 0;
-
-  if (partition_buffer_idx_base_[partition_id] > 0) {
-    // already filled
-    auto fixed_width_idx = 0;
-    auto binary_idx = 0;
-    auto large_binary_idx = 0;
-    auto list_idx = 0;
-    auto num_fields = schema_->num_fields();
-    auto num_rows = partition_buffer_idx_base_[partition_id];
-    auto buffer_sizes = 0;
-    std::vector<std::shared_ptr<arrow::Array>> arrays(num_fields);
-    for (int i = 0; i < num_fields; ++i) {
-      switch (column_type_id_[i]->id()) {
-        case arrow::BinaryType::type_id:
-        case arrow::StringType::type_id: {
-          auto& builder = partition_binary_builders_[binary_idx][partition_id];
-          if (reset_buffers) {
-            RETURN_NOT_OK(builder->Finish(&arrays[i]));
-            builder->Reset();
-          } else {
-            auto data_size = builder->value_data_length();
-            RETURN_NOT_OK(builder->Finish(&arrays[i]));
-            builder->Reset();
-            RETURN_NOT_OK(builder->Reserve(num_rows));
-            RETURN_NOT_OK(builder->ReserveData(data_size));
-          }
-          binary_idx++;
-          break;
-        }
-        case arrow::LargeBinaryType::type_id:
-        case arrow::LargeStringType::type_id: {
-          auto& builder =
-              partition_large_binary_builders_[large_binary_idx][partition_id];
-          if (reset_buffers) {
-            RETURN_NOT_OK(builder->Finish(&arrays[i]));
-            builder->Reset();
-          } else {
-            auto data_size = builder->value_data_length();
-            RETURN_NOT_OK(builder->Finish(&arrays[i]));
-            builder->Reset();
-            RETURN_NOT_OK(builder->Reserve(num_rows));
-            RETURN_NOT_OK(builder->ReserveData(data_size));
-          }
-          large_binary_idx++;
-          break;
-        }
-        case arrow::StructType::type_id:
-        case arrow::MapType::type_id:
-        case arrow::LargeListType::type_id:
-        case arrow::ListType::type_id: {
-          auto& builder = partition_list_builders_[list_idx][partition_id];
-          if (reset_buffers) {
-            RETURN_NOT_OK(builder->Finish(&arrays[i]));
-            builder->Reset();
-          } else {
-            RETURN_NOT_OK(builder->Finish(&arrays[i]));
-            builder->Reset();
-            RETURN_NOT_OK(builder->Reserve(num_rows));
-          }
-          list_idx++;
-          break;
-        }
-        case arrow::NullType::type_id: {
-          arrays[i] = arrow::MakeArray(arrow::ArrayData::Make(
-              arrow::null(), num_rows, {nullptr, nullptr}, num_rows));
-          break;
-        }
-        default: {
-          auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
-          if (buffers[0] != nullptr) {
-            buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1);
-          }
-          if (buffers[1] != nullptr) {
-            if (column_type_id_[i]->id() == arrow::BooleanType::type_id)
-              buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1);
-            else
-              buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
-          }
-
-          if (reset_buffers) {
-            arrays[i] = arrow::MakeArray(
-                arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
-                                       {std::move(buffers[0]), std::move(buffers[1])}));
-            buffers = {nullptr, nullptr};
-            partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
-                nullptr;
-            partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr;
-          } else {
-            arrays[i] = arrow::MakeArray(arrow::ArrayData::Make(
-                schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]}));
-          }
-          fixed_width_idx++;
-          break;
-        }
-      }
-    }
-    std::cout << " cache record " << std::endl;
-    auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays));
-    int64_t raw_size = batch_nbytes(batch);
-
-    raw_partition_lengths_[partition_id] += raw_size;
-    auto payload = std::make_shared<arrow::ipc::IpcPayload>();
-#ifndef SKIPCOMPRESS
-    if (num_rows <= options_.batch_compress_threshold) {
-      TIME_NANO_OR_RAISE(total_compress_time_,
-                         arrow::ipc::GetRecordBatchPayload(
-                             *batch, tiny_bach_write_options_, payload.get()));
-    } else {
-      TIME_NANO_OR_RAISE(total_compress_time_,
-                         arrow::ipc::GetRecordBatchPayload(
-                             *batch, options_.ipc_write_options, payload.get()));
-    }
-#else
-    // for test reason
-    TIME_NANO_OR_RAISE(total_compress_time_,
-                       arrow::ipc::GetRecordBatchPayload(*batch, tiny_bach_write_options_,
-                                                         payload.get()));
-#endif
-
-    partition_cached_recordbatch_size_[partition_id] += payload->body_length;
-    partition_cached_recordbatch_[partition_id].push_back(std::move(payload));
-    partition_buffer_idx_base_[partition_id] = 0;
-  }
-
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t new_size) {
-  // try to allocate new
-  auto num_fields = schema_->num_fields();
-  auto fixed_width_idx = 0;
-  auto binary_idx = 0;
-  auto large_binary_idx = 0;
-  auto list_idx = 0;
-  auto total_size = 0;
-
-  std::vector<std::shared_ptr<arrow::BinaryBuilder>> new_binary_builders;
-  std::vector<std::shared_ptr<arrow::LargeBinaryBuilder>> new_large_binary_builders;
-  std::vector<std::shared_ptr<arrow::ArrayBuilder>> new_list_builders;
-  std::vector<std::shared_ptr<arrow::Buffer>> new_value_buffers;
-  std::vector<std::shared_ptr<arrow::Buffer>> new_validity_buffers;
-
-  for (auto i = 0; i < num_fields; ++i) {
-    switch (column_type_id_[i]->id()) {
-      case arrow::BinaryType::type_id:
-      case arrow::StringType::type_id: {
-        auto builder = std::make_shared<arrow::BinaryBuilder>(options_.memory_pool);
-        assert(builder != nullptr);
-        RETURN_NOT_OK(builder->Reserve(new_size));
-        RETURN_NOT_OK(builder->ReserveData(
-            binary_array_empirical_size_[binary_idx] * new_size + 1024));
-        new_binary_builders.push_back(std::move(builder));
-        binary_idx++;
-        break;
-      }
-      case arrow::LargeBinaryType::type_id:
-      case arrow::LargeStringType::type_id: {
-        auto builder = std::make_shared<arrow::LargeBinaryBuilder>(options_.memory_pool);
-        assert(builder != nullptr);
-        RETURN_NOT_OK(builder->Reserve(new_size));
-        RETURN_NOT_OK(builder->ReserveData(
-            large_binary_array_empirical_size_[large_binary_idx] * new_size + 1024));
-        new_large_binary_builders.push_back(std::move(builder));
-        large_binary_idx++;
-        break;
-      }
-      case arrow::StructType::type_id:
-      case arrow::MapType::type_id:
-      case arrow::LargeListType::type_id:
-      case arrow::ListType::type_id: {
-        std::unique_ptr<arrow::ArrayBuilder> array_builder;
-        RETURN_NOT_OK(
-            MakeBuilder(options_.memory_pool, column_type_id_[i], &array_builder));
-        assert(array_builder != nullptr);
-        RETURN_NOT_OK(array_builder->Reserve(new_size));
-        new_list_builders.push_back(std::move(array_builder));
-        list_idx++;
-        break;
-      }
-      case arrow::NullType::type_id:
-        break;
-      default: {
-          try{
-        std::shared_ptr<arrow::Buffer> value_buffer;
-        if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
-          auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size));
-          ARROW_RETURN_NOT_OK( status );
-        } else {
-            auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3));
-            ARROW_RETURN_NOT_OK( status );
-
-        }
-        new_value_buffers.push_back(std::move(value_buffer));
-        if (input_fixed_width_has_null_[fixed_width_idx]) {
-          std::shared_ptr<arrow::Buffer> validity_buffer;
-          auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size));
-          ARROW_RETURN_NOT_OK( status );
-          new_validity_buffers.push_back(std::move(validity_buffer));
-        } else {
-          new_validity_buffers.push_back(nullptr);
-        }
-        fixed_width_idx++;
-          }catch(const std::exception& e)
-          {
-            std::cout << "exception captured " << e.what() << std::endl;
-          }
-        break;
-      }
-    }
-  }
-
-  // point to newly allocated buffers
-  fixed_width_idx = binary_idx = large_binary_idx = 0;
-  list_idx = 0;
-  for (auto i = 0; i < num_fields; ++i) {
-    switch (column_type_id_[i]->id()) {
-      case arrow::BinaryType::type_id:
-      case arrow::StringType::type_id:
-        partition_binary_builders_[binary_idx][partition_id] =
-            std::move(new_binary_builders[binary_idx]);
-        binary_idx++;
-        break;
-      case arrow::LargeBinaryType::type_id:
-      case arrow::LargeStringType::type_id:
-        partition_large_binary_builders_[large_binary_idx][partition_id] =
-            std::move(new_large_binary_builders[large_binary_idx]);
-        large_binary_idx++;
-        break;
-      case arrow::StructType::type_id:
-      case arrow::MapType::type_id:
-      case arrow::LargeListType::type_id:
-      case arrow::ListType::type_id:
-        partition_list_builders_[list_idx][partition_id] =
-            std::move(new_list_builders[list_idx]);
-        list_idx++;
-        break;
-      case arrow::NullType::type_id:
-        break;
-      default:
-        partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] =
-            new_value_buffers[fixed_width_idx]->mutable_data();
-        if (input_fixed_width_has_null_[fixed_width_idx]) {
-          partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
-              new_validity_buffers[fixed_width_idx]->mutable_data();
-        } else {
-          partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr;
-        }
-        partition_fixed_width_buffers_[fixed_width_idx][partition_id] = {
-            std::move(new_validity_buffers[fixed_width_idx]),
-            std::move(new_value_buffers[fixed_width_idx])};
-        fixed_width_idx++;
-        break;
-    }
-  }
-  partition_buffer_size_[partition_id] = new_size;
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::AllocateNew(int32_t partition_id, int32_t new_size) {
-  auto status = AllocatePartitionBuffers(partition_id, new_size);
-  int32_t retry = 0;
-  while (status.IsOutOfMemory() && retry < 3) {
-    // retry allocate
-    std::cout << status.ToString() << std::endl
-              << std::to_string(++retry) << " retry to allocate new buffer for partition "
-              << std::to_string(partition_id) << std::endl;
-    int64_t spilled_size;
-    ARROW_ASSIGN_OR_RAISE(auto partition_to_spill, SpillLargestPartition(&spilled_size));
-    if (partition_to_spill == -1) {
-      std::cout << "Failed to allocate new buffer for partition "
-                << std::to_string(partition_id) << ". No partition buffer to spill."
-                << std::endl;
-      return status;
-    }
-    status = AllocatePartitionBuffers(partition_id, new_size);
-  }
-  if (status.IsOutOfMemory()) {
-    std::cout << "Failed to allocate new buffer for partition "
-              << std::to_string(partition_id) << ". Out of memory." << std::endl;
-  }
-  return status;
-}
-
-// call from memory management
-arrow::Status Splitter::SpillFixedSize(int64_t size, int64_t* actual) {
-  int64_t current_spilled = 0L;
-  int32_t try_count = 0;
-  while (current_spilled < size && try_count < 5) {
-    try_count++;
-    int64_t single_call_spilled;
-    ARROW_ASSIGN_OR_RAISE(int32_t spilled_partition_id,
-                          SpillLargestPartition(&single_call_spilled))
-    if (spilled_partition_id == -1) {
-      break;
-    }
-    current_spilled += single_call_spilled;
-  }
-  *actual = current_spilled;
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::SpillPartition(int32_t partition_id) {
-  if (partition_writer_[partition_id] == nullptr) {
-    partition_writer_[partition_id] =
-        std::make_shared<PartitionWriter>(this, partition_id);
-  }
-  TIME_NANO_OR_RAISE(total_spill_time_, partition_writer_[partition_id]->Spill());
-  return arrow::Status::OK();
-}
-
-arrow::Result<int32_t> Splitter::SpillLargestPartition(int64_t* size) {
-  // spill the largest partition
-  auto max_size = 0;
-  int32_t partition_to_spill = -1;
-  for (auto i = 0; i < num_partitions_; ++i) {
-    if (partition_cached_recordbatch_size_[i] > max_size) {
-      max_size = partition_cached_recordbatch_size_[i];
-      partition_to_spill = i;
-    }
-  }
-  if (partition_to_spill != -1) {
-    RETURN_NOT_OK(SpillPartition(partition_to_spill));
-#ifdef DEBUG
-    std::cout << "Spilled partition " << std::to_string(partition_to_spill) << ", "
-              << std::to_string(max_size) << " bytes released" << std::endl;
-#endif
-    *size = max_size;
-  } else {
-    *size = 0;
-  }
-  return partition_to_spill;
-}
-
-arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
-#ifdef PROCESSROW
-
-  reducer_offsets_.resize(rb.num_rows());
-
-  reducer_offset_offset_[0] = 0;
-  for (auto pid = 1; pid <= num_partitions_; pid++) {
-    reducer_offset_offset_[pid] =
-        reducer_offset_offset_[pid - 1] + partition_id_cnt_[pid - 1];
-  }
-  for (auto row = 0; row < rb.num_rows(); row++) {
-    auto pid = partition_id_[row];
-    reducer_offsets_[reducer_offset_offset_[pid]] = row;
-    _mm_prefetch(reducer_offsets_.data() + reducer_offset_offset_[pid] + 32, _MM_HINT_T0);
-    reducer_offset_offset_[pid]++;
-  }
-  std::transform(reducer_offset_offset_.begin(), std::prev(reducer_offset_offset_.end()),
-                 partition_id_cnt_.begin(), reducer_offset_offset_.begin(),
-                 [](uint16_t x, int16_t y) { return x - y; });
-
-#endif
-  // for the first input record batch, scan binary arrays and large binary
-  // arrays to get their empirical sizes
-
-  uint32_t size_per_row = 0;
-  if (!empirical_size_calculated_) {
-    auto num_rows = rb.num_rows();
-    for (int i = 0; i < binary_array_idx_.size(); ++i) {
-      auto arr =
-          std::static_pointer_cast<arrow::BinaryArray>(rb.column(binary_array_idx_[i]));
-      auto length = arr->value_offset(num_rows) - arr->value_offset(0);
-      binary_array_empirical_size_[i] = length / num_rows;
-    }
-    for (int i = 0; i < large_binary_array_idx_.size(); ++i) {
-      auto arr = std::static_pointer_cast<arrow::LargeBinaryArray>(
-          rb.column(large_binary_array_idx_[i]));
-      auto length = arr->value_offset(num_rows) - arr->value_offset(0);
-      large_binary_array_empirical_size_[i] = length / num_rows;
-    }
-    empirical_size_calculated_ = true;
-  }
-
-  size_per_row = std::accumulate(binary_array_empirical_size_.begin(),
-                                 binary_array_empirical_size_.end(), 0);
-  size_per_row = std::accumulate(large_binary_array_empirical_size_.begin(),
-                                 large_binary_array_empirical_size_.end(), size_per_row);
-
-  for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
-    auto col_idx = fixed_width_array_idx_[col];
-    size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8;
-    if (rb.column_data(col_idx)->GetNullCount() != 0) {
-      input_fixed_width_has_null_[col] = true;
-    }
-  }
-
-  int64_t prealloc_row_cnt =
-      options_.offheap_per_task > 0 && size_per_row > 0
-          ? options_.offheap_per_task / 4 / size_per_row / num_partitions_
-          : options_.buffer_size;
-  prealloc_row_cnt = std::min(prealloc_row_cnt, (int64_t)options_.buffer_size);
-
-  // prepare partition buffers and spill if necessary
-  for (auto pid = 0; pid < num_partitions_; ++pid) {
-    if (partition_id_cnt_[pid] > 0) {
-      // make sure the size to be allocated is larger than the size to be filled
-      auto new_size = std::max((uint16_t)prealloc_row_cnt, partition_id_cnt_[pid]);
-      if (partition_buffer_size_[pid] == 0) {
-        // allocate buffer if it's not yet allocated
-        RETURN_NOT_OK(AllocatePartitionBuffers(pid, new_size));
-      } else if (partition_buffer_idx_base_[pid] + partition_id_cnt_[pid] >
-                 partition_buffer_size_[pid]) {
-        // if the size to be filled + allready filled > the buffer size, need to allocate
-        // new buffer
-        if (options_.prefer_spill) {
-          // if prefer_spill is set, spill current record batch, we may reuse the buffers
-
-          if (new_size > partition_buffer_size_[pid]) {
-            // if the partition size after split is already larger than allocated buffer
-            // size, need reallocate
-            RETURN_NOT_OK(CacheRecordBatch(pid, /*reset_buffers = */ true));
-            // splill immediately
-            RETURN_NOT_OK(SpillPartition(pid));
-            RETURN_NOT_OK(AllocatePartitionBuffers(pid, new_size));
-          } else {
-            // partition size after split is smaller than buffer size, no need to reset
-            // buffer, reuse it.
-            RETURN_NOT_OK(CacheRecordBatch(pid, /*reset_buffers = */ false));
-            RETURN_NOT_OK(SpillPartition(pid));
-          }
-        } else {
-          // if prefer_spill is disabled, cache the record batch
-          RETURN_NOT_OK(CacheRecordBatch(pid, /*reset_buffers = */ true));
-          // allocate partition buffer with retries
-          RETURN_NOT_OK(AllocateNew(pid, new_size));
-        }
-      }
-    }
-  }
-// now start to split the record batch
-#if defined(COLUMNAR_PLUGIN_USE_AVX512)
-  RETURN_NOT_OK(SplitFixedWidthValueBufferAVX(rb));
-#else
-  RETURN_NOT_OK(SplitFixedWidthValueBuffer(rb));
-#endif
-  RETURN_NOT_OK(SplitFixedWidthValidityBuffer(rb));
-  RETURN_NOT_OK(SplitBinaryArray(rb));
-  RETURN_NOT_OK(SplitLargeBinaryArray(rb));
-  RETURN_NOT_OK(SplitListArray(rb));
-
-  // update partition buffer base after split
-  for (auto pid = 0; pid < num_partitions_; ++pid) {
-    partition_buffer_idx_base_[pid] += partition_id_cnt_[pid];
-  }
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) {
-  const auto num_rows = rb.num_rows();
-  int64_t row;
-  std::vector<int16_t> partition_buffer_idx_offset;
-
-  for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
-    const auto& dst_addrs = partition_fixed_width_value_addrs_[col];
-    std::copy(dst_addrs.begin(), dst_addrs.end(), partition_buffer_idx_offset_.begin());
-    auto col_idx = fixed_width_array_idx_[col];
-    auto src_addr = const_cast<uint8_t*>(rb.column_data(col_idx)->buffers[1]->data());
-
-    switch (arrow::bit_width(column_type_id_[col_idx]->id())) {
-#ifdef PROCESSROW
-// assume batch size = 32k; reducer# = 4K; row/reducer = 8
-#define PROCESS(_CTYPE)                                                                  \
-  std::transform(partition_buffer_idx_offset_.begin(),                                   \
-                 partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
-                 partition_buffer_idx_offset_.begin(),                                   \
-                 [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); });          \
-  for (auto pid = 0; pid < num_partitions_; pid++) {                                     \
-    auto dst_pid_base =                                                                  \
-        reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/            \
-    auto r = reducer_offset_offset_[pid];                             /*8k*/             \
-    auto size = reducer_offset_offset_[pid + 1];                                         \
-    for (r; r < size; r++) {                                                             \
-      auto src_offset = reducer_offsets_[r];                           /*16k*/           \
-      *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[src_offset]; /*64k*/           \
-      _mm_prefetch(&(src_addr)[src_offset * sizeof(_CTYPE) + 64], _MM_HINT_T2);          \
-      dst_pid_base += 1;                                                                 \
-    }                                                                                    \
-  }                                                                                      \
-  break;
-#else
-#define PROCESS(_CTYPE)                                                                  \
-  std::transform(partition_buffer_idx_offset_.begin(),                                   \
-                 partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
-                 partition_buffer_idx_offset_.begin(),                                   \
-                 [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); });          \
-  for (row = 0; row < num_rows; ++row) {                                                 \
-    auto pid = partition_id_[row];                                                       \
-    auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]);    \
-    *dst_pid_base = reinterpret_cast<_CTYPE*>(src_addr)[row];                            \
-    partition_buffer_idx_offset_[pid] += sizeof(_CTYPE);                                 \
-    _mm_prefetch(&dst_pid_base[64 / sizeof(_CTYPE)], _MM_HINT_T0);                       \
-  }                                                                                      \
-  break;
-#endif
-      case 8:
-        PROCESS(uint8_t)
-      case 16:
-        PROCESS(uint16_t)
-      case 32:
-        PROCESS(uint32_t)
-      case 64:
-#ifdef PROCESSAVX
-        std::transform(
-            partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
-            partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
-            [](uint8_t* x, int16_t y) { return x + y * sizeof(uint64_t); });
-        for (auto pid = 0; pid < num_partitions_; pid++) {
-          auto dst_pid_base =
-              reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/
-          auto r = reducer_offset_offset_[pid];                               /*8k*/
-          auto size = reducer_offset_offset_[pid + 1];
-#if 1
-          for (r; r < size && (((uint64_t)dst_pid_base & 0x1f) > 0); r++) {
-            auto src_offset = reducer_offsets_[r];                             /*16k*/
-            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset]; /*64k*/
-            _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2);
-            dst_pid_base += 1;
-          }
-#if 0
-          for (r; r+4<size; r+=4)                              
-          {                                                                                    
-            auto src_offset = reducer_offsets_[r];                                 /*16k*/ 
-            __m128i src_ld = _mm_loadl_epi64((__m128i*)(&reducer_offsets_[r]));    
-            __m128i src_offset_4x = _mm_cvtepu16_epi32(src_ld);
-            
-            __m256i src_4x = _mm256_i32gather_epi64((const long long int*)src_addr,src_offset_4x,8);
-            //_mm256_store_si256((__m256i*)dst_pid_base,src_4x); 
-            _mm_stream_si128((__m128i*)dst_pid_base,src_2x);
-                                                         
-            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r]*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+1]*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+2]*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            _mm_prefetch(&(src_addr)[(uint32_t)reducer_offsets_[r+3]*sizeof(uint64_t)+64], _MM_HINT_T2);              
-            dst_pid_base+=4;                                                                   
-          }
-#endif
-          for (r; r + 2 < size; r += 2) {
-            __m128i src_offset_2x =
-                _mm_cvtsi32_si128(*((int32_t*)(reducer_offsets_.data() + r)));
-            src_offset_2x = _mm_shufflelo_epi16(src_offset_2x, 0x98);
-
-            __m128i src_2x =
-                _mm_i32gather_epi64((const long long int*)src_addr, src_offset_2x, 8);
-            _mm_store_si128((__m128i*)dst_pid_base, src_2x);
-            //_mm_stream_si128((__m128i*)dst_pid_base,src_2x);
-
-            _mm_prefetch(
-                &(src_addr)[(uint32_t)reducer_offsets_[r] * sizeof(uint64_t) + 64],
-                _MM_HINT_T2);
-            _mm_prefetch(
-                &(src_addr)[(uint32_t)reducer_offsets_[r + 1] * sizeof(uint64_t) + 64],
-                _MM_HINT_T2);
-            dst_pid_base += 2;
-          }
-#endif
-          for (r; r < size; r++) {
-            auto src_offset = reducer_offsets_[r];                             /*16k*/
-            *dst_pid_base = reinterpret_cast<uint64_t*>(src_addr)[src_offset]; /*64k*/
-            _mm_prefetch(&(src_addr)[src_offset * sizeof(uint64_t) + 64], _MM_HINT_T2);
-            dst_pid_base += 1;
-          }
-        }
-        break;
-#else
-        PROCESS(uint64_t)
-#endif
-
-#undef PROCESS
-      case 128:  // arrow::Decimal128Type::type_id
-#ifdef PROCESSROW
-                 // assume batch size = 32k; reducer# = 4K; row/reducer = 8
-        std::transform(
-            partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
-            partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
-            [](uint8_t* x, int16_t y) { return x + y * 16; });
-        for (auto pid = 0; pid < num_partitions_; pid++) {
-          auto dst_pid_base =
-              reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/
-          auto r = reducer_offset_offset_[pid];                               /*8k*/
-          auto size = reducer_offset_offset_[pid + 1];
-          for (r; r < size; r++) {
-            auto src_offset = reducer_offsets_[r]; /*16k*/
-            *dst_pid_base =
-                reinterpret_cast<uint64_t*>(src_addr)[src_offset << 1]; /*128k*/
-            *(dst_pid_base + 1) =
-                reinterpret_cast<uint64_t*>(src_addr)[src_offset << 1 | 1]; /*128k*/
-            _mm_prefetch(&(src_addr)[src_offset * 16 + 64], _MM_HINT_T2);
-            dst_pid_base += 2;
-          }
-        }
-        break;
-#else
-        std::transform(
-            partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
-            partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
-            [](uint8_t* x, int16_t y) { return x + y * 16; });
-        for (auto row = 0; row < num_rows; ++row) {
-          auto pid = partition_id_[row];
-          reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid])[0] =
-              reinterpret_cast<uint64_t*>(src_addr)[row << 1];
-          reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid])[1] =
-              reinterpret_cast<uint64_t*>(src_addr)[row << 1 | 1];
-          partition_buffer_idx_offset_[pid] += 16;
-          _mm_prefetch(&reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid])[2],
-                       _MM_HINT_T0);
-        }
-        break;
-#endif
-      case 1:  // arrow::BooleanType::type_id:
-        partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size());
-        std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(),
-                  partition_buffer_idx_offset.begin());
-        for (auto row = 0; row < num_rows; ++row) {
-          auto pid = partition_id_[row];
-          uint16_t dst_offset = partition_buffer_idx_offset[pid];
-          dst_addrs[pid][dst_offset >> 3] ^=
-              (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^
-               src_addr[row >> 3] >> (row & 7))
-              << (dst_offset & 7);
-          partition_buffer_idx_offset[pid]++;
-        }
-        break;
-      default:
-        return arrow::Status::Invalid("Column type " +
-                                      schema_->field(col_idx)->type()->ToString() +
-                                      " is not fixed width");
-    }
-  }
-  return arrow::Status::OK();
-}
-
-#if defined(COLUMNAR_PLUGIN_USE_AVX512)
-arrow::Status Splitter::SplitFixedWidthValueBufferAVX(const arrow::RecordBatch& rb) {
-  __m256i inc_one = _mm256_load_si256((__m256i*)(ONES));
-
-  const auto num_rows = rb.num_rows();
-  for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
-    std::fill(std::begin(partition_buffer_idx_offset_),
-              std::end(partition_buffer_idx_offset_), 0);
-    auto col_idx = fixed_width_array_idx_[col];
-    auto src_addr = const_cast<uint8_t*>(rb.column_data(col_idx)->buffers[1]->data());
-    const auto& dst_addrs = partition_fixed_width_value_addrs_[col];
-
-    switch (column_type_id_[col_idx]) {
-#define PROCESS(SHUFFLE_TYPE, CTYPE)                                           \
-  case Type::SHUFFLE_TYPE:                                                     \
-    for (auto row = 0; row < num_rows; ++row) {                                \
-      auto pid = partition_id_[row];                                           \
-      auto dst_offset =                                                        \
-          partition_buffer_idx_base_[pid] + partition_buffer_idx_offset_[pid]; \
-      reinterpret_cast<CTYPE*>(dst_addrs[pid])[dst_offset] =                   \
-          reinterpret_cast<CTYPE*>(src_addr)[row];                             \
-      partition_buffer_idx_offset_[pid]++;                                     \
-      _mm_prefetch(&reinterpret_cast<CTYPE*>(dst_addrs[pid])[dst_offset + 1],  \
-                   _MM_HINT_T0);                                               \
-    }                                                                          \
-    break;
-      PROCESS(SHUFFLE_1BYTE, uint8_t)
-      PROCESS(SHUFFLE_2BYTE, uint16_t)
-#undef PROCESS
-      case Type::SHUFFLE_4BYTE: {
-        auto rows = num_rows - num_rows % 8;
-        auto src_addr_32 = reinterpret_cast<uint32_t*>(src_addr);
-        for (auto row = 0; row < rows; row += 8) {
-          __m256i partid_cnt_8x = CountPartitionIdOccurrence(partition_id_, row);
-
-          // partition id is 32 bit, 8 partition id
-          __m256i partid_8x = _mm256_loadu_si256((__m256i*)(partition_id_.data() + row));
-
-          // dst_base and dst_offset are 32 bit
-          __m256i dst_idx_base_8x =
-              _mm256_i32gather_epi32(partition_buffer_idx_base_.data(), partid_8x, 4);
-          __m256i dst_idx_offset_8x =
-              _mm256_i32gather_epi32(partition_buffer_idx_offset_.data(), partid_8x, 4);
-          dst_idx_offset_8x = _mm256_add_epi32(dst_idx_offset_8x, partid_cnt_8x);
-          __m256i dst_idx_8x = _mm256_add_epi32(dst_idx_base_8x, dst_idx_offset_8x);
-
-          // dst base address is 64 bit
-          __m512i dst_addr_base_8x =
-              _mm512_i32gather_epi64(partid_8x, dst_addrs.data(), 8);
-
-          // calculate dst address, dst_addr = dst_base_addr + dst_idx*4
-          //_mm512_cvtepu32_epi64: zero extend dst_offset 32bit -> 64bit
-          //_mm512_slli_epi64(_, 2): each 64bit dst_offset << 2
-          __m512i dst_addr_offset_8x =
-              _mm512_slli_epi64(_mm512_cvtepu32_epi64(dst_idx_8x), 2);
-          __m512i dst_addr_8x = _mm512_add_epi64(dst_addr_base_8x, dst_addr_offset_8x);
-
-          // source value is 32 bit
-          __m256i src_val_8x = _mm256_loadu_si256((__m256i*)(src_addr_32 + row));
-
-          // scatter
-          _mm512_i64scatter_epi32(nullptr, dst_addr_8x, src_val_8x, 1);
-
-          // update partition_buffer_idx_offset_
-          partid_cnt_8x = _mm256_add_epi32(partid_cnt_8x, inc_one);
-          for (int i = 0; i < 8; ++i) {
-            partition_buffer_idx_offset_[partition_id_[row + i]]++;
-          }
-
-          PrefetchDstAddr(dst_addr_8x, 4);
-        }
-        for (auto row = rows; row < num_rows; ++row) {
-          auto pid = partition_id_[row];
-          reinterpret_cast<uint32_t*>(dst_addrs[pid])[partition_buffer_idx_base_[pid] +
-                                                      partition_buffer_idx_offset_[pid]] =
-              (src_addr_32)[row];
-          partition_buffer_idx_offset_[pid]++;
-        }
-      } break;
-      case Type::SHUFFLE_8BYTE: {
-        auto rows = num_rows - num_rows % 8;
-        auto src_addr_64 = reinterpret_cast<uint64_t*>(src_addr);
-        for (auto row = 0; row < rows; row += 8) {
-          __m256i partid_cnt_8x = CountPartitionIdOccurrence(partition_id_, row);
-
-          // partition id is 32 bit, 8 partition id
-          __m256i partid_8x = _mm256_loadu_si256((__m256i*)(partition_id_.data() + row));
-
-          // dst_base and dst_offset are 32 bit
-          __m256i dst_idx_base_8x =
-              _mm256_i32gather_epi32(partition_buffer_idx_base_.data(), partid_8x, 4);
-          __m256i dst_idx_offset_8x =
-              _mm256_i32gather_epi32(partition_buffer_idx_offset_.data(), partid_8x, 4);
-          dst_idx_offset_8x = _mm256_add_epi32(dst_idx_offset_8x, partid_cnt_8x);
-          __m256i dst_idx_8x = _mm256_add_epi32(dst_idx_base_8x, dst_idx_offset_8x);
-
-          // dst base address is 64 bit
-          __m512i dst_addr_base_8x =
-              _mm512_i32gather_epi64(partid_8x, dst_addrs.data(), 8);
-
-          // calculate dst address, dst_addr = dst_base_addr + dst_idx*8
-          //_mm512_cvtepu32_epi64: zero extend dst_offset 32bit -> 64bit
-          //_mm512_slli_epi64(_, 3): each 64bit dst_offset << 3
-          __m512i dst_addr_offset_8x =
-              _mm512_slli_epi64(_mm512_cvtepu32_epi64(dst_idx_8x), 3);
-          __m512i dst_addr_8x = _mm512_add_epi64(dst_addr_base_8x, dst_addr_offset_8x);
-
-          // source value is 64 bit
-          __m512i src_val_8x = _mm512_loadu_si512((__m512i*)(src_addr_64 + row));
-
-          // scatter
-          _mm512_i64scatter_epi64(nullptr, dst_addr_8x, src_val_8x, 1);
-
-          // update partition_buffer_idx_offset_
-          partid_cnt_8x = _mm256_add_epi32(partid_cnt_8x, inc_one);
-          for (int i = 0; i < 8; ++i) {
-            partition_buffer_idx_offset_[partition_id_[row + i]]++;
-          }
-
-          PrefetchDstAddr(dst_addr_8x, 8);
-        }
-        // handle the rest
-        for (auto row = rows; row < num_rows; ++row) {
-          auto pid = partition_id_[row];
-          reinterpret_cast<uint64_t*>(dst_addrs[pid])[partition_buffer_idx_base_[pid] +
-                                                      partition_buffer_idx_offset_[pid]] =
-              (src_addr_64)[row];
-          partition_buffer_idx_offset_[pid]++;
-        }
-      } break;
-      case Type::SHUFFLE_DECIMAL128:
-        for (auto row = 0; row < num_rows; ++row) {
-          auto pid = partition_id_[row];
-          auto dst_offset =
-              (partition_buffer_idx_base_[pid] + partition_buffer_idx_offset_[pid]) << 1;
-          reinterpret_cast<uint64_t*>(dst_addrs[pid])[dst_offset] =
-              reinterpret_cast<uint64_t*>(src_addr)[row << 1];
-          reinterpret_cast<uint64_t*>(dst_addrs[pid])[dst_offset | 1] =
-              reinterpret_cast<uint64_t*>(src_addr)[row << 1 | 1];
-          partition_buffer_idx_offset_[pid]++;
-          _mm_prefetch(&reinterpret_cast<uint64_t*>(dst_addrs[pid])[dst_offset + 2],
-                       _MM_HINT_T0);
-        }
-        break;
-      case Type::SHUFFLE_BIT:
-        for (auto row = 0; row < num_rows; ++row) {
-          auto pid = partition_id_[row];
-          auto dst_offset =
-              partition_buffer_idx_base_[pid] + partition_buffer_idx_offset_[pid];
-          dst_addrs[pid][dst_offset >> 3] ^=
-              (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^
-               src_addr[row >> 3] >> (row & 7))
-              << (dst_offset & 7);
-          partition_buffer_idx_offset_[pid]++;
-        }
-        break;
-      default:
-        return arrow::Status::Invalid("Column type " +
-                                      schema_->field(col_idx)->type()->ToString() +
-                                      " is not fixed width");
-    }
-  }
-  return arrow::Status::OK();
-}
-#endif
-
-arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& rb) {
-  const auto num_rows = rb.num_rows();
-  std::vector<int16_t> partition_buffer_idx_offset;
-
-  for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
-    auto col_idx = fixed_width_array_idx_[col];
-    auto& dst_addrs = partition_fixed_width_validity_addrs_[col];
-    if (rb.column_data(col_idx)->GetNullCount() == 0 &&
-        column_has_null_[col_idx] == true) {
-      // if the input record batch doesn't have null, set validity to True
-      // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid]
-      // access
-      for (auto pid = 0; pid < num_partitions_; ++pid) {
-        if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
-          arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid],
-                                    partition_id_cnt_[pid], true);
-        }
-      }
-    } else if (rb.column_data(col_idx)->GetNullCount() > 0) {
-      // there is Null count
-      column_has_null_[col_idx] = true;
-      for (auto pid = 0; pid < num_partitions_; ++pid) {
-        if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] == nullptr) {
-          // init bitmap if it's null, initialize the buffer as true
-          auto new_size =
-              std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size);
-          ARROW_ASSIGN_OR_RAISE(
-              auto validity_buffer,
-              arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size),
-                                             options_.memory_pool));
-          dst_addrs[pid] = const_cast<uint8_t*>(validity_buffer->data());
-          arrow::BitUtil::SetBitsTo(dst_addrs[pid], 0, partition_buffer_idx_base_[pid],
-                                    true);
-          partition_fixed_width_buffers_[col][pid][0] = std::move(validity_buffer);
-        }
-      }
-
-      auto src_addr = const_cast<uint8_t*>(rb.column_data(col_idx)->buffers[0]->data());
-      partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size());
-      std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(),
-                partition_buffer_idx_offset.begin());
-      for (auto row = 0; row < num_rows; ++row) {
-        auto pid = partition_id_[row];
-        auto dst_offset = partition_buffer_idx_offset[pid];
-        dst_addrs[pid][dst_offset >> 3] ^=
-            (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^
-             src_addr[row >> 3] >> (row & 7))
-            << (dst_offset & 7);
-        partition_buffer_idx_offset[pid]++;
-      }
-    }
-  }
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::SplitBinaryArray(const arrow::RecordBatch& rb) {
-  for (int i = 0; i < binary_array_idx_.size(); ++i) {
-    RETURN_NOT_OK(AppendBinary<arrow::BinaryType>(
-        std::static_pointer_cast<arrow::BinaryArray>(rb.column(binary_array_idx_[i])),
-        partition_binary_builders_[i], rb.num_rows()));
-  }
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::SplitLargeBinaryArray(const arrow::RecordBatch& rb) {
-  for (int i = 0; i < large_binary_array_idx_.size(); ++i) {
-    RETURN_NOT_OK(AppendBinary<arrow::LargeBinaryType>(
-        std::static_pointer_cast<arrow::LargeBinaryArray>(
-            rb.column(large_binary_array_idx_[i])),
-        partition_large_binary_builders_[i], rb.num_rows()));
-  }
-  return arrow::Status::OK();
-}
-
-#define PROCESS_SUPPORTED_TYPES(PROCESS) \
-  PROCESS(arrow::BooleanType)            \
-  PROCESS(arrow::UInt8Type)              \
-  PROCESS(arrow::Int8Type)               \
-  PROCESS(arrow::UInt16Type)             \
-  PROCESS(arrow::Int16Type)              \
-  PROCESS(arrow::UInt32Type)             \
-  PROCESS(arrow::Int32Type)              \
-  PROCESS(arrow::UInt64Type)             \
-  PROCESS(arrow::Int64Type)              \
-  PROCESS(arrow::FloatType)              \
-  PROCESS(arrow::DoubleType)             \
-  PROCESS(arrow::Date32Type)             \
-  PROCESS(arrow::Date64Type)             \
-  PROCESS(arrow::Decimal128Type)         \
-  PROCESS(arrow::StringType)             \
-  PROCESS(arrow::BinaryType)
-arrow::Status Splitter::SplitListArray(const arrow::RecordBatch& rb) {
-  for (int i = 0; i < list_array_idx_.size(); ++i) {
-    auto src_arr =
-        std::static_pointer_cast<arrow::ListArray>(rb.column(list_array_idx_[i]));
-    auto status = AppendList(rb.column(list_array_idx_[i]), partition_list_builders_[i],
-                             rb.num_rows());
-    if (!status.ok()) return status;
-  }
-  return arrow::Status::OK();
-}
-
-#undef PROCESS_SUPPORTED_TYPES
-
-template <typename T, typename ArrayType, typename BuilderType>
-arrow::Status Splitter::AppendBinary(
-    const std::shared_ptr<ArrayType>& src_arr,
-    const std::vector<std::shared_ptr<BuilderType>>& dst_builders, int64_t num_rows) {
-  using offset_type = typename T::offset_type;
-  if (src_arr->null_count() == 0) {
-    for (auto row = 0; row < num_rows; ++row) {
-      offset_type length;
-      auto value = src_arr->GetValue(row, &length);
-      const auto& builder = dst_builders[partition_id_[row]];
-      RETURN_NOT_OK(builder->Reserve(1));
-      RETURN_NOT_OK(builder->ReserveData(length));
-      builder->UnsafeAppend(value, length);
-    }
-  } else {
-    for (auto row = 0; row < num_rows; ++row) {
-      if (src_arr->IsValid(row)) {
-        offset_type length;
-        auto value = src_arr->GetValue(row, &length);
-        const auto& builder = dst_builders[partition_id_[row]];
-        RETURN_NOT_OK(builder->Reserve(1));
-        RETURN_NOT_OK(builder->ReserveData(length));
-        builder->UnsafeAppend(value, length);
-      } else {
-        dst_builders[partition_id_[row]]->AppendNull();
-      }
-    }
-  }
-  return arrow::Status::OK();
-}
-
-arrow::Status Splitter::AppendList(
-    const std::shared_ptr<arrow::Array>& src_arr,
-    const std::vector<std::shared_ptr<arrow::ArrayBuilder>>& dst_builders,
-    int64_t num_rows) {
-  for (auto row = 0; row < num_rows; ++row) {
-    RETURN_NOT_OK(dst_builders[partition_id_[row]]->AppendArraySlice(
-        *(src_arr->data().get()), row, 1));
-  }
-  return arrow::Status::OK();
-}
-
-std::string Splitter::NextSpilledFileDir() {
-  auto spilled_file_dir = GetSpilledShuffleFileDir(configured_dirs_[dir_selection_],
-                                                   sub_dir_selection_[dir_selection_]);
-  sub_dir_selection_[dir_selection_] =
-      (sub_dir_selection_[dir_selection_] + 1) % options_.num_sub_dirs;
-  dir_selection_ = (dir_selection_ + 1) % configured_dirs_.size();
-  return spilled_file_dir;
-}
-
-arrow::Result<std::shared_ptr<arrow::ipc::IpcPayload>> Splitter::GetSchemaPayload() {
-  if (schema_payload_ != nullptr) {
-    return schema_payload_;
-  }
-  schema_payload_ = std::make_shared<arrow::ipc::IpcPayload>();
-  arrow::ipc::DictionaryFieldMapper dict_file_mapper;  // unused
-  RETURN_NOT_OK(arrow::ipc::GetSchemaPayload(*schema_, options_.ipc_write_options,
-                                             dict_file_mapper, schema_payload_.get()));
-  return schema_payload_;
-}
-
-// ----------------------------------------------------------------------
-// RoundRobinSplitter
-
-arrow::Result<std::shared_ptr<RoundRobinSplitter>> RoundRobinSplitter::Create(
-    int32_t num_partitions, std::shared_ptr<arrow::Schema> schema, SplitOptions options) {
-  std::shared_ptr<RoundRobinSplitter> res(
-      new RoundRobinSplitter(num_partitions, std::move(schema), std::move(options)));
-  RETURN_NOT_OK(res->Init());
-  return res;
-}
-
-arrow::Status RoundRobinSplitter::ComputeAndCountPartitionId(
-    const arrow::RecordBatch& rb) {
-  std::fill(std::begin(partition_id_cnt_), std::end(partition_id_cnt_), 0);
-  partition_id_.resize(rb.num_rows());
-  for (auto& pid : partition_id_) {
-    pid = pid_selection_;
-    partition_id_cnt_[pid_selection_]++;
-    pid_selection_ = (pid_selection_ + 1) == num_partitions_ ? 0 : (pid_selection_ + 1);
-  }
-  return arrow::Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// HashSplitter
-
-arrow::Result<std::shared_ptr<HashSplitter>> HashSplitter::Create(
-    int32_t num_partitions, std::shared_ptr<arrow::Schema> schema,
-    const gandiva::ExpressionVector& expr_vector, SplitOptions options) {
-  std::shared_ptr<HashSplitter> res(
-      new HashSplitter(num_partitions, std::move(schema), std::move(options)));
-  RETURN_NOT_OK(res->Init());
-  RETURN_NOT_OK(res->CreateProjector(expr_vector));
-  return res;
-}
-
-arrow::Status HashSplitter::CreateProjector(
-    const gandiva::ExpressionVector& expr_vector) {
-  // same seed as spark's
-  auto hash = gandiva::TreeExprBuilder::MakeLiteral((int32_t)42);
-  for (const auto& expr : expr_vector) {
-    switch (expr->root()->return_type()->id()) {
-      case arrow::NullType::type_id:
-        break;
-      case arrow::BooleanType::type_id:
-      case arrow::Int8Type::type_id:
-      case arrow::Int16Type::type_id:
-      case arrow::Int32Type::type_id:
-      case arrow::FloatType::type_id:
-      case arrow::Date32Type::type_id:
-        hash = gandiva::TreeExprBuilder::MakeFunction(
-            "hash32_spark", {expr->root(), hash}, arrow::int32());
-        break;
-      case arrow::Int64Type::type_id:
-      case arrow::DoubleType::type_id:
-        hash = gandiva::TreeExprBuilder::MakeFunction(
-            "hash64_spark", {expr->root(), hash}, arrow::int32());
-        break;
-      case arrow::StringType::type_id:
-        hash = gandiva::TreeExprBuilder::MakeFunction(
-            "hashbuf_spark", {expr->root(), hash}, arrow::int32());
-        break;
-      default:
-        hash = gandiva::TreeExprBuilder::MakeFunction("hash32", {expr->root(), hash},
-                                                      arrow::int32());
-        /*return arrow::Status::NotImplemented("HashSplitter::CreateProjector
-           doesn't support type ", expr->result()->type()->ToString());*/
-    }
-  }
-  auto hash_expr =
-      gandiva::TreeExprBuilder::MakeExpression(hash, arrow::field("pid", arrow::int32()));
-  return gandiva::Projector::Make(schema_, {hash_expr}, &projector_);
-}
-
-arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch& rb) {
-  auto num_rows = rb.num_rows();
-  partition_id_.resize(num_rows);
-  std::fill(std::begin(partition_id_cnt_), std::end(partition_id_cnt_), 0);
-
-  arrow::ArrayVector outputs;
-  TIME_NANO_OR_RAISE(total_compute_pid_time_,
-                     projector_->Evaluate(rb, options_.memory_pool, &outputs));
-  if (outputs.size() != 1) {
-    return arrow::Status::Invalid("Projector result should have one field, actual is ",
-                                  std::to_string(outputs.size()));
-  }
-  auto pid_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
-  if (pid_arr == nullptr) {
-    return arrow::Status::Invalid("failed to cast outputs.at(0)");
-  }
-  for (auto i = 0; i < num_rows; ++i) {
-    // positive mod
-    auto pid = pid_arr->Value(i) % num_partitions_;
-    // force to generate ASM
-    __asm__(
-        "lea (%[num_partitions],%[pid],1),%[tmp]\n"
-        "test %[pid],%[pid]\n"
-        "cmovs %[tmp],%[pid]\n"
-        : [pid] "+r"(pid)
-        : [num_partitions] "r"(num_partitions_), [tmp] "r"(0));
-    partition_id_[i] = pid;
-    partition_id_cnt_[pid]++;
-  }
-  return arrow::Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// FallBackRangeSplitter
-
-arrow::Result<std::shared_ptr<FallbackRangeSplitter>> FallbackRangeSplitter::Create(
-    int32_t num_partitions, std::shared_ptr<arrow::Schema> schema, SplitOptions options) {
-  auto res = std::shared_ptr<FallbackRangeSplitter>(
-      new FallbackRangeSplitter(num_partitions, std::move(schema), std::move(options)));
-  RETURN_NOT_OK(res->Init());
-  return res;
-}
-
-arrow::Status FallbackRangeSplitter::Init() {
-  input_schema_ = std::move(schema_);
-  ARROW_ASSIGN_OR_RAISE(schema_, input_schema_->RemoveField(0))
-  return Splitter::Init();
-}
-
-arrow::Status FallbackRangeSplitter::Split(const arrow::RecordBatch& rb) {
-  EVAL_START("split", options_.thread_id)
-  RETURN_NOT_OK(ComputeAndCountPartitionId(rb));
-  ARROW_ASSIGN_OR_RAISE(auto remove_pid, rb.RemoveColumn(0));
-  RETURN_NOT_OK(DoSplit(*remove_pid));
-  EVAL_END("split", options_.thread_id, options_.task_attempt_id)
-  return arrow::Status::OK();
-}
-
-arrow::Status FallbackRangeSplitter::ComputeAndCountPartitionId(
-    const arrow::RecordBatch& rb) {
-  if (rb.column(0)->type_id() != arrow::Type::INT32) {
-    return arrow::Status::Invalid("RecordBatch field 0 should be ",
-                                  arrow::int32()->ToString(), ", actual is ",
-                                  rb.column(0)->type()->ToString());
-  }
-
-  auto pid_arr = reinterpret_cast<const int32_t*>(rb.column_data(0)->buffers[1]->data());
-  auto num_rows = rb.num_rows();
-  partition_id_.resize(num_rows);
-  std::fill(std::begin(partition_id_cnt_), std::end(partition_id_cnt_), 0);
-  for (auto i = 0; i < num_rows; ++i) {
-    auto pid = pid_arr[i];
-    if (pid >= num_partitions_) {
-      return arrow::Status::Invalid("Partition id ", std::to_string(pid),
-                                    " is equal or greater than ",
-                                    std::to_string(num_partitions_));
-    }
-    partition_id_[i] = pid;
-    partition_id_cnt_[pid]++;
-  }
-  return arrow::Status::OK();
-}
-
-}  // namespace shuffle
-}  // namespace sparkcolumnarplugin
diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
deleted file mode 100644
index cc05cd3e1..000000000
--- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
+++ /dev/null
@@ -1,1139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <arrow/compute/api.h>
-#include <arrow/datum.h>
-#include <arrow/io/api.h>
-#include <arrow/ipc/reader.h>
-#include <arrow/pretty_print.h>
-#include <arrow/record_batch.h>
-#include <arrow/util/io_util.h>
-#include <gtest/gtest.h>
-
-#include <iostream>
-
-#include <execinfo.h>
-void print_trace(void) {
-    char **strings;
-    size_t i, size;
-    enum Constexpr { MAX_SIZE = 1024 };
-    void *array[MAX_SIZE];
-    size = backtrace(array, MAX_SIZE);
-    strings = backtrace_symbols(array, size);
-    for (i = 0; i < size; i++)
-        printf("    %s\n", strings[i]);
-    puts("");
-    free(strings);
-}
-
-#include "shuffle/splitter.h"
-#include "tests/test_utils.h"
-
-namespace sparkcolumnarplugin {
-namespace shuffle {
-
-class MyMemoryPool : public arrow::MemoryPool {
- public:
-  explicit MyMemoryPool(int64_t capacity) : capacity_(capacity) {}
-
-  Status Allocate(int64_t size, uint8_t** out) override {
-    if (bytes_allocated() + size > capacity_) {
-      return Status::OutOfMemory("malloc of size ", size, " failed");
-    }
-    RETURN_NOT_OK(pool_->Allocate(size, out));
-    stats_.UpdateAllocatedBytes(size);
-    std::cout << "Allocate: size = " << size << " addr = " << std::hex << (uint64_t)*out << std::dec << std::endl;
-    //print_trace();
-    return arrow::Status::OK();
-  }
-
-  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
-    if (new_size > capacity_) {
-      return Status::OutOfMemory("malloc of size ", new_size, " failed");
-    }
-    auto old_ptr = ptr;
-    RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr));
-    stats_.UpdateAllocatedBytes(new_size - old_size);
-    std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex << (uint64_t)*old_ptr << std::dec << " new_size = " << new_size << " addr = " << std::hex << (uint64_t)*ptr << std::dec << std::endl;
-    //print_trace();
-    return arrow::Status::OK();
-  }
-
-  void Free(uint8_t* buffer, int64_t size) override {
-    pool_->Free(buffer, size);
-    stats_.UpdateAllocatedBytes(-size);
-    std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer << std::dec << std::endl;
-    //print_trace();
-  }
-
-  int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
-
-  int64_t max_memory() const override { return pool_->max_memory(); }
-
-  std::string backend_name() const override { return pool_->backend_name(); }
-
- private:
-  MemoryPool* pool_ = arrow::default_memory_pool();
-  int64_t capacity_;
-  arrow::internal::MemoryPoolStats stats_;
-};
-
-class SplitterTest : public ::testing::Test {
- protected:
-  void SetUp() {
-    auto f_na = field("f_na", arrow::null());
-    auto f_int8_a = field("f_int8_a", arrow::int8());
-    auto f_int8_b = field("f_int8_b", arrow::int8());
-    auto f_int32 = field("f_int32", arrow::int32());
-    auto f_uint64 = field("f_uint64", arrow::uint64());
-    auto f_double = field("f_double", arrow::float64());
-    auto f_bool = field("f_bool", arrow::boolean());
-    auto f_string = field("f_string", arrow::utf8());
-    auto f_nullable_string = field("f_nullable_string", arrow::utf8());
-    auto f_decimal = field("f_decimal128", arrow::decimal(10, 2));
-
-    ARROW_ASSIGN_OR_THROW(tmp_dir_1_,
-                          std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix)))
-    ARROW_ASSIGN_OR_THROW(tmp_dir_2_,
-                          std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix)))
-    auto config_dirs =
-        tmp_dir_1_->path().ToString() + "," + tmp_dir_2_->path().ToString();
-
-    setenv("NATIVESQL_SPARK_LOCAL_DIRS", config_dirs.c_str(), 1);
-
-    schema_ = arrow::schema({f_na, f_int8_a, f_int8_b, f_int32, f_uint64, f_double,
-                             f_bool, f_string, f_nullable_string, f_decimal});
-
-    MakeInputBatch(input_data_1, schema_, &input_batch_1_);
-    MakeInputBatch(input_data_2, schema_, &input_batch_2_);
-
-    split_options_ = SplitOptions::Defaults();
-  }
-
-  void TearDown() override {
-    if (file_ != nullptr && !file_->closed()) {
-      file_->Close();
-    }
-  }
-
-  static void CheckFileExsists(const std::string& file_name) {
-    ASSERT_EQ(*arrow::internal::FileExists(
-                  *arrow::internal::PlatformFilename::FromString(file_name)),
-              true);
-  }
-
-  arrow::Result<std::shared_ptr<arrow::RecordBatch>> TakeRows(
-      const std::shared_ptr<arrow::RecordBatch>& input_batch,
-      const std::string& json_idx) {
-    std::shared_ptr<arrow::Array> take_idx;
-    ASSERT_NOT_OK(
-        arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), json_idx, &take_idx));
-
-    auto cntx = arrow::compute::ExecContext();
-    std::shared_ptr<arrow::RecordBatch> res;
-    ARROW_ASSIGN_OR_RAISE(
-        arrow::Datum result,
-        arrow::compute::Take(arrow::Datum(input_batch), arrow::Datum(take_idx),
-                             arrow::compute::TakeOptions{}, &cntx));
-    return result.record_batch();
-  }
-
-  arrow::Result<std::shared_ptr<arrow::ipc::RecordBatchReader>>
-  GetRecordBatchStreamReader(const std::string& file_name) {
-    if (file_ != nullptr && !file_->closed()) {
-      RETURN_NOT_OK(file_->Close());
-    }
-    ARROW_ASSIGN_OR_RAISE(file_, arrow::io::ReadableFile::Open(file_name))
-    ARROW_ASSIGN_OR_RAISE(auto file_reader,
-                          arrow::ipc::RecordBatchStreamReader::Open(file_))
-    return file_reader;
-  }
-
-  static const std::string tmp_dir_prefix;
-  static const std::vector<std::string> input_data_1;
-  static const std::vector<std::string> input_data_2;
-
-  std::shared_ptr<arrow::internal::TemporaryDir> tmp_dir_1_;
-  std::shared_ptr<arrow::internal::TemporaryDir> tmp_dir_2_;
-
-  std::shared_ptr<arrow::Schema> schema_;
-  std::shared_ptr<Splitter> splitter_;
-  SplitOptions split_options_;
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_1_;
-  std::shared_ptr<arrow::RecordBatch> input_batch_2_;
-
-  std::shared_ptr<arrow::io::ReadableFile> file_;
-};
-
-const std::string SplitterTest::tmp_dir_prefix = "columnar-shuffle-test";
-const std::vector<std::string> SplitterTest::input_data_1 = {
-    "[null, null, null, null, null, null, null, null, null, null]",
-    "[1, 2, 3, null, 4, null, 5, 6, null, 7]",
-    "[1, -1, null, null, -2, 2, null, null, 3, -3]",
-    "[1, 2, 3, 4, null, 5, 6, 7, 8, null]",
-    "[null, null, null, null, null, null, null, null, null, null]",
-    R"([-0.1234567, null, 0.1234567, null, -0.142857, null, 0.142857, 0.285714, 0.428617, null])",
-    "[null, true, false, null, true, true, false, true, null, null]",
-    R"(["alice0", "bob1", "alice2", "bob3", "Alice4", "Bob5", "AlicE6", "boB7", "ALICE8", "BOB9"])",
-    R"(["alice", "bob", null, null, "Alice", "Bob", null, "alicE", null, "boB"])",
-    R"(["-1.01", "2.01", "-3.01", null, "0.11", "3.14", "2.27", null, "-3.14", null])"};
-
-const std::vector<std::string> SplitterTest::input_data_2 = {
-    "[null, null]",    "[null, null]",
-    "[1, -1]",         "[100, null]",
-    "[1, 1]",          R"([0.142857, -0.142857])",
-    "[true, false]",   R"(["bob", "alice"])",
-    R"([null, null])", R"([null, null])"};
-
-TEST_F(SplitterTest, TestSingleSplitter) {
-  split_options_.buffer_size = 10;
-
-  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("rr", schema_, 1, split_options_))
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  // verify data file
-  CheckFileExsists(splitter_->DataFile());
-
-  // verify output temporary files
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 1);
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify schema
-  ASSERT_EQ(*file_reader->schema(), *schema_);
-
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 3);
-
-  std::vector<arrow::RecordBatch*> expected = {input_batch_1_.get(), input_batch_2_.get(),
-                                               input_batch_1_.get()};
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-      ASSERT_TRUE(rb->column(j)->Equals(*expected[i]->column(j),
-                                        EqualOptions::Defaults().diff_sink(&std::cout)));
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinSplitter) {
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", schema_, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *schema_);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  std::shared_ptr<arrow::RecordBatch> res_batch_1;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]"))
-  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get(), res_batch_1.get(),
-                                               res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 3);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]"))
-  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]"))
-  expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *schema_);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 3);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestSplitterMemoryLeak) {
-  
-  std::shared_ptr<arrow::MemoryPool> pool = std::make_shared<MyMemoryPool>(9*1024*1024);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  split_options_.memory_pool = pool.get();
-
-  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", schema_, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
-  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
-
-  std::cout << "split down " << std::endl;
-
-  ASSERT_NOT_OK(splitter_->Stop());
-  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
-
-  std::cout << "stopped " << std::endl;
-
-  splitter_.reset();
-  std::cout << "bytes allocated " << pool->bytes_allocated() << std::endl; 
-  std::cout << "splitter_ killed " << std::endl;
-
-  split_options_.memory_pool = arrow::default_memory_pool();
-}
-
-TEST_F(SplitterTest, TestHashSplitter) {
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-
-  auto f_0 = TreeExprBuilder::MakeField(schema_->field(1));
-  auto f_1 = TreeExprBuilder::MakeField(schema_->field(2));
-  auto f_2 = TreeExprBuilder::MakeField(schema_->field(3));
-
-  auto node_0 = TreeExprBuilder::MakeFunction("add", {f_0, f_1}, int8());
-  auto expr_0 = TreeExprBuilder::MakeExpression(node_0, field("res0", int8()));
-  auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64()));
-
-  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", schema_, num_partitions,
-                                                  {expr_0, expr_1}, split_options_))
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-
-  // verify data file
-  CheckFileExsists(splitter_->DataFile());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify schema
-  ASSERT_EQ(*file_reader->schema(), *schema_);
-
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-
-  for (const auto& rb : batches) {
-    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
-    for (auto i = 0; i < rb->num_columns(); ++i) {
-      ASSERT_EQ(rb->column(i)->length(), rb->num_rows());
-    }
-  }
-}
-
-TEST_F(SplitterTest, TestFallbackRangeSplitter) {
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-
-  std::shared_ptr<arrow::Array> pid_arr_0;
-  ASSERT_NOT_OK(arrow::ipc::internal::json::ArrayFromJSON(
-      arrow::int32(), "[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]", &pid_arr_0));
-  std::shared_ptr<arrow::Array> pid_arr_1;
-  ASSERT_NOT_OK(
-      arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), "[0, 1]", &pid_arr_1));
-
-  std::shared_ptr<arrow::Schema> schema_w_pid;
-  std::shared_ptr<arrow::RecordBatch> input_batch_1_w_pid;
-  std::shared_ptr<arrow::RecordBatch> input_batch_2_w_pid;
-  ARROW_ASSIGN_OR_THROW(schema_w_pid,
-                        schema_->AddField(0, arrow::field("pid", arrow::int32())));
-  ARROW_ASSIGN_OR_THROW(input_batch_1_w_pid,
-                        input_batch_1_->AddColumn(0, "pid", pid_arr_0));
-  ARROW_ASSIGN_OR_THROW(input_batch_2_w_pid,
-                        input_batch_2_->AddColumn(0, "pid", pid_arr_1));
-
-  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("range", std::move(schema_w_pid),
-                                                  num_partitions, split_options_))
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_w_pid));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid));
-
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *schema_);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  std::shared_ptr<arrow::RecordBatch> res_batch_1;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]"))
-  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get(), res_batch_1.get(),
-                                               res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 3);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]"))
-  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]"))
-  expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *schema_);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 3);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) {
-  auto pool = std::make_unique<MyMemoryPool>(0);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  //split_options_.memory_pool = pool.get();
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", schema_, num_partitions, split_options_));
-
-  auto status = splitter_->Split(*input_batch_1_);
-  // should return OOM status because there's no partition buffer to spill
-  ASSERT_TRUE(status.IsOutOfMemory());
-  ASSERT_NOT_OK(splitter_->Stop());
-}
-
-TEST_F(SplitterTest, TestSpillLargestPartition) {
-  std::shared_ptr<arrow::MemoryPool> pool = std::make_shared<MyMemoryPool>(9*1024*1024);
-  //  pool = std::make_shared<arrow::LoggingMemoryPool>(pool.get());
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  //split_options_.memory_pool = pool.get();
-  split_options_.compression_type = arrow::Compression::UNCOMPRESSED;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", schema_, num_partitions, split_options_));
-
-  for (int i = 0; i < 100; ++i) {
-    ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-    ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
-    ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
-  }
-  ASSERT_NOT_OK(splitter_->Stop());
-}
-
-TEST_F(SplitterTest, TestRoundRobinListArraySplitter) {
-  auto f_arr_str = field("f_arr", arrow::list(arrow::utf8()));
-  auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean()));
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32()));
-  auto f_arr_double = field("f_double", arrow::list(arrow::float64()));
-  auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2)));
-
-  auto rb_schema =
-      arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])",
-      R"([[true, null], [true, true, true], [false], [true], [false], [false]])",
-      R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])",
-      R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6,1121], [null]])",
-      R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  std::shared_ptr<arrow::RecordBatch> res_batch_1;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinNestListArraySplitter) {
-  auto f_arr_str = field("f_str", arrow::list(arrow::list(arrow::utf8())));
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
-
-  auto rb_schema = arrow::schema({f_arr_str, f_arr_int32});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])",
-      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinNestLargeListArraySplitter) {
-  auto f_arr_str = field("f_str", arrow::large_list(arrow::list(arrow::utf8())));
-  auto f_arr_int32 = field("f_int32", arrow::large_list(arrow::list(arrow::int32())));
-
-  auto rb_schema = arrow::schema({f_arr_str, f_arr_int32});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])",
-      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinListStructArraySplitter) {
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
-  auto f_arr_list_struct =
-      field("f_list_struct", list(struct_({field("a", int32()), field("b", utf8())})));
-
-  auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_struct});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
-      R"([[{"a": 4, "b": null}], [{"a": 42, "b": null}, {"a": null, "b": "foo2"}], [{"a": 43, "b": "foo3"}], [{"a": 44, "b": "foo4"}]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinListMapArraySplitter) {
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
-  auto f_arr_list_map = field("f_list_map", list(map(utf8(), utf8())));
-
-  auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_map});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
-      R"([[[["key1", "val_aa1"]]], [[["key1", "val_bb1"]], [["key2", "val_bb2"]]], [[["key1", "val_cc1"]]], [[["key1", "val_dd1"]]]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinStructArraySplitter) {
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
-  auto f_arr_struct_list =
-      field("f_struct_list", struct_({field("a", list(int32())), field("b", utf8())}));
-
-  auto rb_schema = arrow::schema({f_arr_int32, f_arr_struct_list});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
-      R"([{"a": [1,1,1,1], "b": null}, {"a": null, "b": "foo2"}, {"a": [3,3,3,3], "b": "foo3"}, {"a": [4,4,4,4], "b": "foo4"}])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinMapArraySplitter) {
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
-  auto f_arr_map = field("f_map", map(utf8(), utf8()));
-
-  auto rb_schema = arrow::schema({f_arr_int32, f_arr_map});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
-      R"([[["key1", "val_aa1"]], [["key1", "val_bb1"], ["key2", "val_bb2"]], [["key1", "val_cc1"]], [["key1", "val_dd1"]]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-TEST_F(SplitterTest, TestHashListArraySplitterWithMorePartitions) {
-  int32_t num_partitions = 5;
-  split_options_.buffer_size = 4;
-
-  auto f_uint64 = field("f_uint64", arrow::uint64());
-  auto f_arr_str = field("f_arr", arrow::list(arrow::utf8()));
-
-  auto rb_schema = arrow::schema({f_uint64, f_arr_str});
-
-  const std::vector<std::string> input_batch_1_data = {
-      R"([1, 2])", R"([["alice0", "bob1"], ["alice2"]])"};
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_batch_1_data, rb_schema, &input_batch_arr);
-
-  auto f_2 = TreeExprBuilder::MakeField(f_uint64);
-  auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64()));
-
-  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", rb_schema, num_partitions,
-                                                  {expr_1}, split_options_));
-
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 5);
-
-  CheckFileExsists(splitter_->DataFile());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-
-  for (const auto& rb : batches) {
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto i = 0; i < rb->num_columns(); ++i) {
-      ASSERT_EQ(rb->column(i)->length(), rb->num_rows());
-    }
-  }
-}
-
-TEST_F(SplitterTest, TestRoundRobinListArraySplitterwithCompression) {
-  auto f_arr_str = field("f_arr", arrow::list(arrow::utf8()));
-  auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean()));
-  auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32()));
-  auto f_arr_double = field("f_double", arrow::list(arrow::float64()));
-  auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2)));
-
-  auto rb_schema =
-      arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal});
-
-  const std::vector<std::string> input_data_arr = {
-      R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])",
-      R"([[true, null], [true, true, true], [false], [true], [false], [false]])",
-      R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])",
-      R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6,1121], [null]])",
-      R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"};
-
-  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
-  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
-
-  int32_t num_partitions = 2;
-  split_options_.buffer_size = 4;
-  ARROW_ASSIGN_OR_THROW(splitter_,
-                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
-  auto compression_type = arrow::util::Codec::GetCompressionType("lz4");
-  ASSERT_NOT_OK(splitter_->SetCompressType(compression_type.MoveValueUnsafe()));
-  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
-  ASSERT_NOT_OK(splitter_->Stop());
-
-  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-
-  // verify partition lengths
-  const auto& lengths = splitter_->PartitionLengths();
-  ASSERT_EQ(lengths.size(), 2);
-  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
-
-  // verify schema
-  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-
-  // prepare first block expected result
-  std::shared_ptr<arrow::RecordBatch> res_batch_0;
-  std::shared_ptr<arrow::RecordBatch> res_batch_1;
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]"))
-  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
-
-  // verify first block
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-
-  // prepare second block expected result
-  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]"))
-  expected = {res_batch_0.get()};
-
-  // verify second block
-  batches.clear();
-  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
-  ASSERT_EQ(*file_reader->schema(), *rb_schema);
-  ASSERT_NOT_OK(file_->Advance(lengths[0]));
-  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
-  ASSERT_EQ(batches.size(), 1);
-  for (auto i = 0; i < batches.size(); ++i) {
-    const auto& rb = batches[i];
-    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
-    for (auto j = 0; j < rb->num_columns(); ++j) {
-      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
-    }
-    ASSERT_TRUE(rb->Equals(*expected[i]));
-  }
-}
-
-}  // namespace shuffle
-}  // namespace sparkcolumnarplugin

From 87b29fee0759ca1f893db7e240696a5690ea61bc Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sun, 1 May 2022 17:11:20 +0800
Subject: [PATCH 06/19] Allocate fixed-width split buffers from a pooled 8M buffer

---
 native-sql-engine/cpp/CMakeLists.txt          |  2 +-
 .../src/benchmarks/shuffle_split_benchmark.cc | 45 +++++++++
 native-sql-engine/cpp/src/shuffle/splitter.cc | 94 ++++++++++++++-----
 3 files changed, 114 insertions(+), 27 deletions(-)

diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt
index fe7e989ee..e7d14e0c8 100644
--- a/native-sql-engine/cpp/CMakeLists.txt
+++ b/native-sql-engine/cpp/CMakeLists.txt
@@ -4,7 +4,7 @@ project(spark_columnar_plugin)
 #add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
 add_definitions(-DPROCESSROW)
 
-#add_compile_options(-g)
+add_compile_options(-g)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(root_directory ${PROJECT_BINARY_DIR})
diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index d2bffe36a..ce4e88b62 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -41,6 +41,50 @@ namespace shuffle {
 const int batch_buffer_size = 32768;
 const int split_buffer_size = 8192;
 
+
+class MyLoggingMemoryPool : public MemoryPool {
+ public:
+  explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {}
+  ~MyLoggingMemoryPool() override = default;
+
+  Status Allocate(int64_t size, uint8_t** out) override {
+    Status s = pool_->Allocate(size, out);
+    std::cout << "Allocate: size = " << size << std::endl;
+    return s;    
+  }
+  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override
+  {
+    Status s = pool_->Reallocate(old_size, new_size, ptr);
+    std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size
+            << std::endl;
+    return s;
+  }
+
+  void Free(uint8_t* buffer, int64_t size) override{
+    pool_->Free(buffer, size);
+    std::cout << "Free: size = " << size << std::endl;
+  }
+
+  int64_t bytes_allocated() const override{
+    int64_t nb_bytes = pool_->bytes_allocated();
+    std::cout << "bytes_allocated: " << nb_bytes << std::endl;
+    return nb_bytes;
+  }
+
+  int64_t max_memory() const override{
+    int64_t mem = pool_->max_memory();
+    std::cout << "max_memory: " << mem << std::endl;
+    return mem;
+  }
+
+  std::string backend_name() const override{
+    return pool_->backend_name(); 
+  }
+
+ private:
+  MemoryPool* pool_;
+};
+
 class BenchmarkShuffleSplit {
  public:
   BenchmarkShuffleSplit(std::string file_name) { GetRecordBatchReader(file_name); }
@@ -188,6 +232,7 @@ class BenchmarkShuffleSplit {
   std::shared_ptr<arrow::Schema> schema;
   std::vector<std::shared_ptr<::gandiva::Expression>> expr_vector;
   parquet::ArrowReaderProperties properties;
+
 };
 
 class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 7839c4ce4..a5e3ca932 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -47,6 +47,11 @@ namespace sparkcolumnarplugin {
 namespace shuffle {
 using arrow::internal::checked_cast;
 
+#ifndef SPLIT_BUFFER_SIZE
+//by default, allocate 8M block, 2M page size
+#define SPLIT_BUFFER_SIZE 8*1024*1024
+#endif
+
 template <typename T>
 std::string __m128i_toString(const __m128i var) {
   std::stringstream sstr;
@@ -401,6 +406,37 @@ arrow::Status Splitter::Init() {
       tiny_bach_write_options_.codec,
       arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED));
 
+  //Allocate first buffer for split reducer
+  ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
+                                        SPLIT_BUFFER_SIZE,
+                                        options_.memory_pool));
+  combine_buffer_->Resize(0, /*shrink_to_fit =*/false);
+
+  return arrow::Status::OK();
+}
+arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer, uint32_t size)
+{
+  // if size is already larger than buffer pool size, allocate it directly
+  //make size 64byte aligned
+  auto reminder = size & 0x3f;
+  size+=(64-reminder) & ((reminder==0)-1);
+
+  if (size > SPLIT_BUFFER_SIZE )
+  {
+    ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer(
+                                        size, options_.memory_pool));
+    return arrow::Status::OK();
+  }else if (combine_buffer_->capacity() - combine_buffer_->size() < size)
+  {
+    //memory pool is not enough
+    ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
+                                        SPLIT_BUFFER_SIZE,
+                                        options_.memory_pool));
+    combine_buffer_->Resize(0, /*shrink_to_fit = */ false);
+  }
+  buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size);
+  
+  combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false);
   return arrow::Status::OK();
 }
 
@@ -454,6 +490,7 @@ arrow::Status Splitter::Stop() {
     data_file_os_ = fout;
   }
 
+  std::cout << " cache record batch " << std::endl;
   // stop PartitionWriter and collect metrics
   for (auto pid = 0; pid < num_partitions_; ++pid) {
     RETURN_NOT_OK(CacheRecordBatch(pid, true));
@@ -473,11 +510,15 @@ arrow::Status Splitter::Stop() {
       partition_lengths_[pid] = 0;
     }
   }
+  this->combine_buffer_.reset();
 
   // close data file output Stream
   RETURN_NOT_OK(data_file_os_->Close());
 
   EVAL_END("write", options_.thread_id, options_.task_attempt_id)
+
+  
+
   return arrow::Status::OK();
 }
 int64_t batch_nbytes(const arrow::RecordBatch& batch) {
@@ -492,6 +533,7 @@ int64_t batch_nbytes(const arrow::RecordBatch& batch) {
         continue;
       }
       accumulated += buf->size();
+      std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl;
     }
   }
   return accumulated;
@@ -576,15 +618,13 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         default: {
           auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
           if (buffers[0] != nullptr) {
-            buffers[0]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false);
+            buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1);
           }
           if (buffers[1] != nullptr) {
             if (column_type_id_[i]->id() == arrow::BooleanType::type_id)
-              buffers[1]->Resize((num_rows >> 3) + 1, /*shrink_to_fit =*/false);
+              buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1);
             else
-              buffers[1]->Resize(
-                  num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3),
-                  /*shrink_to_fit =*/false);
+              buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           }
 
           if (reset_buffers) {
@@ -604,7 +644,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         }
       }
     }
-
+    std::cout << " cache record " << std::endl;
     auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays));
     int64_t raw_size = batch_nbytes(batch);
 
@@ -642,12 +682,14 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
   auto binary_idx = 0;
   auto large_binary_idx = 0;
   auto list_idx = 0;
+  auto total_size = 0;
 
   std::vector<std::shared_ptr<arrow::BinaryBuilder>> new_binary_builders;
   std::vector<std::shared_ptr<arrow::LargeBinaryBuilder>> new_large_binary_builders;
   std::vector<std::shared_ptr<arrow::ArrayBuilder>> new_list_builders;
-  std::vector<std::shared_ptr<arrow::ResizableBuffer>> new_value_buffers;
-  std::vector<std::shared_ptr<arrow::ResizableBuffer>> new_validity_buffers;
+  std::vector<std::shared_ptr<arrow::Buffer>> new_value_buffers;
+  std::vector<std::shared_ptr<arrow::Buffer>> new_validity_buffers;
+
   for (auto i = 0; i < num_fields; ++i) {
     switch (column_type_id_[i]->id()) {
       case arrow::BinaryType::type_id:
@@ -688,30 +730,30 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
       case arrow::NullType::type_id:
         break;
       default: {
-        std::shared_ptr<arrow::ResizableBuffer> value_buffer;
+          try{
+        std::shared_ptr<arrow::Buffer> value_buffer;
         if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
-          ARROW_ASSIGN_OR_RAISE(value_buffer, arrow::AllocateResizableBuffer(
-                                                  arrow::BitUtil::BytesForBits(new_size),
-                                                  options_.memory_pool));
+          auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK( status );
         } else {
-          ARROW_ASSIGN_OR_RAISE(
-              value_buffer,
-              arrow::AllocateResizableBuffer(
-                  new_size * (arrow::bit_width(column_type_id_[i]->id()) / 8),
-                  options_.memory_pool));
+            auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3));
+            ARROW_RETURN_NOT_OK( status );
+
         }
         new_value_buffers.push_back(std::move(value_buffer));
         if (input_fixed_width_has_null_[fixed_width_idx]) {
-          std::shared_ptr<arrow::ResizableBuffer> validity_buffer;
-          ARROW_ASSIGN_OR_RAISE(
-              validity_buffer,
-              arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size),
-                                             options_.memory_pool));
+          std::shared_ptr<arrow::Buffer> validity_buffer;
+          auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK( status );
           new_validity_buffers.push_back(std::move(validity_buffer));
         } else {
           new_validity_buffers.push_back(nullptr);
         }
         fixed_width_idx++;
+          }catch(const std::exception& e)
+          {
+            std::cout << "exception captured " << e.what() << std::endl;
+          }
         break;
       }
     }
@@ -746,10 +788,10 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
         break;
       default:
         partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] =
-            const_cast<uint8_t*>(new_value_buffers[fixed_width_idx]->data());
+            new_value_buffers[fixed_width_idx]->mutable_data();
         if (input_fixed_width_has_null_[fixed_width_idx]) {
           partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
-              const_cast<uint8_t*>(new_validity_buffers[fixed_width_idx]->data());
+              new_validity_buffers[fixed_width_idx]->mutable_data();
         } else {
           partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] = nullptr;
         }
@@ -1569,8 +1611,8 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch&
         "lea (%[num_partitions],%[pid],1),%[tmp]\n"
         "test %[pid],%[pid]\n"
         "cmovs %[tmp],%[pid]\n"
-        : [ pid ] "+r"(pid)
-        : [ num_partitions ] "r"(num_partitions_), [ tmp ] "r"(0));
+        : [pid] "+r"(pid)
+        : [num_partitions] "r"(num_partitions_), [tmp] "r"(0));
     partition_id_[i] = pid;
     partition_id_cnt_[pid]++;
   }

From 4d0e3cfb15e2adef17ccad357b29a65e0aa5bd33 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sun, 1 May 2022 17:49:12 +0800
Subject: [PATCH 07/19] Release schema_payload_ in Stop(); remove debug prints

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index a5e3ca932..12d2c87bc 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -490,7 +490,6 @@ arrow::Status Splitter::Stop() {
     data_file_os_ = fout;
   }
 
-  std::cout << " cache record batch " << std::endl;
   // stop PartitionWriter and collect metrics
   for (auto pid = 0; pid < num_partitions_; ++pid) {
     RETURN_NOT_OK(CacheRecordBatch(pid, true));
@@ -511,6 +510,7 @@ arrow::Status Splitter::Stop() {
     }
   }
   this->combine_buffer_.reset();
+  this->schema_payload_.reset();
 
   // close data file output Stream
   RETURN_NOT_OK(data_file_os_->Close());
@@ -533,7 +533,6 @@ int64_t batch_nbytes(const arrow::RecordBatch& batch) {
         continue;
       }
       accumulated += buf->size();
-      std::cout << " buffer addr = 0x" << std::hex << buf->address() << std::dec << std::endl;
     }
   }
   return accumulated;
@@ -644,7 +643,6 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         }
       }
     }
-    std::cout << " cache record " << std::endl;
     auto batch = arrow::RecordBatch::Make(schema_, num_rows, std::move(arrays));
     int64_t raw_size = batch_nbytes(batch);
 

From 1db0b7e61bcb2408186d5245fd291713efccde0e Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Mon, 2 May 2022 19:19:40 +0800
Subject: [PATCH 08/19] Done

---
 native-sql-engine/cpp/CMakeLists.txt          |   2 +-
 .../src/benchmarks/shuffle_split_benchmark.cc | 141 ++++++++++++++----
 native-sql-engine/cpp/src/shuffle/splitter.cc |  95 ++++++------
 native-sql-engine/cpp/src/shuffle/splitter.h  |   8 +-
 4 files changed, 161 insertions(+), 85 deletions(-)

diff --git a/native-sql-engine/cpp/CMakeLists.txt b/native-sql-engine/cpp/CMakeLists.txt
index e7d14e0c8..fe7e989ee 100644
--- a/native-sql-engine/cpp/CMakeLists.txt
+++ b/native-sql-engine/cpp/CMakeLists.txt
@@ -4,7 +4,7 @@ project(spark_columnar_plugin)
 #add_definitions(-DSKIPWRITE -DSKIPCOMPRESS -DPROCESSROW)
 add_definitions(-DPROCESSROW)
 
-add_compile_options(-g)
+#add_compile_options(-g)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(root_directory ${PROJECT_BINARY_DIR})
diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index ce4e88b62..2baf1915e 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -24,12 +24,25 @@
 #include <arrow/util/io_util.h>
 //#include <gtest/gtest.h>
 #include <benchmark/benchmark.h>
+#include <execinfo.h>
 #include <parquet/arrow/reader.h>
 #include <parquet/file_reader.h>
 #include <sched.h>
 #include <shuffle/splitter.h>
+#include <sys/mman.h>
 
 #include <chrono>
+void print_trace(void) {
+  char** strings;
+  size_t i, size;
+  enum Constexpr { MAX_SIZE = 1024 };
+  void* array[MAX_SIZE];
+  size = backtrace(array, MAX_SIZE);
+  strings = backtrace_symbols(array, size);
+  for (i = 0; i < size; i++) printf("    %s\n", strings[i]);
+  puts("");
+  free(strings);
+}
 
 #include "codegen/code_generator.h"
 #include "codegen/code_generator_factory.h"
@@ -38,51 +51,106 @@
 namespace sparkcolumnarplugin {
 namespace shuffle {
 
+#define ALIGNMENT 2048 * 1024
+
 const int batch_buffer_size = 32768;
 const int split_buffer_size = 8192;
 
-
-class MyLoggingMemoryPool : public MemoryPool {
+class MyMemoryPool : public arrow::MemoryPool {
  public:
-  explicit MyLoggingMemoryPool(MemoryPool* pool): pool_(pool) {}
-  ~MyLoggingMemoryPool() override = default;
+  explicit MyMemoryPool() {}
 
   Status Allocate(int64_t size, uint8_t** out) override {
-    Status s = pool_->Allocate(size, out);
-    std::cout << "Allocate: size = " << size << std::endl;
-    return s;    
+    RETURN_NOT_OK(pool_->Allocate(size, out));
+    stats_.UpdateAllocatedBytes(size);
+    // std::cout << "Allocate: size = " << size << " addr = " << std::hex <<
+    // (uint64_t)*out << std::dec << std::endl; print_trace();
+    return arrow::Status::OK();
   }
-  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override
-  {
-    Status s = pool_->Reallocate(old_size, new_size, ptr);
-    std::cout << "Reallocate: old_size = " << old_size << " - new_size = " << new_size
-            << std::endl;
-    return s;
+
+  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
+    auto old_ptr = *ptr;
+    RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr));
+    stats_.UpdateAllocatedBytes(new_size - old_size);
+    // std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex <<
+    // (uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " <<
+    // std::hex << (uint64_t)*ptr << std::dec << std::endl; print_trace();
+    return arrow::Status::OK();
   }
 
-  void Free(uint8_t* buffer, int64_t size) override{
+  void Free(uint8_t* buffer, int64_t size) override {
     pool_->Free(buffer, size);
-    std::cout << "Free: size = " << size << std::endl;
+    stats_.UpdateAllocatedBytes(-size);
+    // std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer
+    // << std::dec << std::endl; print_trace();
   }
 
-  int64_t bytes_allocated() const override{
-    int64_t nb_bytes = pool_->bytes_allocated();
-    std::cout << "bytes_allocated: " << nb_bytes << std::endl;
-    return nb_bytes;
+  int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
+
+  int64_t max_memory() const override { return pool_->max_memory(); }
+
+  std::string backend_name() const override { return pool_->backend_name(); }
+
+ private:
+  MemoryPool* pool_ = arrow::default_memory_pool();
+  arrow::internal::MemoryPoolStats stats_;
+};
+
+#define ENABLELARGEPAGE
+
+class LargePageMemoryPool : public MemoryPool {
+ public:
+  explicit LargePageMemoryPool() {}
+
+  ~LargePageMemoryPool() override = default;
+
+  Status Allocate(int64_t size, uint8_t** out) override {
+#ifdef ENABLELARGEPAGE
+    if (size < 2 * 1024 * 1024) {
+      return pool_->Allocate(size, out);
+    } else {
+      Status st = pool_->AlignAllocate(size, out, ALIGNMENT);
+      madvise(*out, size, /*MADV_HUGEPAGE */ 14);
+      //std::cout << "Allocate: size = " << size << " addr = "  \
+      //    << std::hex << (uint64_t)*out  << " end = " << std::hex << (uint64_t)(*out+size) << std::dec << std::endl;
+      return st;
+    }
+#else
+    return pool_->Allocate(size, out);
+#endif
   }
 
-  int64_t max_memory() const override{
-    int64_t mem = pool_->max_memory();
-    std::cout << "max_memory: " << mem << std::endl;
-    return mem;
+  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
+    return pool_->Reallocate(old_size, new_size, ptr);
+#ifdef ENABLELARGEPAGE
+    if (new_size < 2 * 1024 * 1024) {
+      return pool_->Reallocate(old_size, new_size, ptr);
+    } else {
+      Status st = pool_->AlignReallocate(old_size, new_size, ptr, ALIGNMENT);
+      // madvise(*ptr, new_size, /*MADV_HUGEPAGE */ 14);
+      return st;
+    }
+#else
+    return pool_->Reallocate(old_size, new_size, ptr);
+#endif
   }
 
-  std::string backend_name() const override{
-    return pool_->backend_name(); 
+  void Free(uint8_t* buffer, int64_t size) override {
+    if (size < 2 * 1024 * 1024) {
+      pool_->Free(buffer, size);
+    } else {
+      pool_->Free(buffer, size, ALIGNMENT);
+    }
   }
 
+  int64_t bytes_allocated() const override { return pool_->bytes_allocated(); }
+
+  int64_t max_memory() const override { return pool_->max_memory(); }
+
+  std::string backend_name() const override { return "LargePageMemoryPool"; }
+
  private:
-  MemoryPool* pool_;
+  MemoryPool* pool_ = arrow::default_memory_pool();
 };
 
 class BenchmarkShuffleSplit {
@@ -133,6 +201,8 @@ class BenchmarkShuffleSplit {
     SetCPU(state.thread_index());
     arrow::Compression::type compression_type = (arrow::Compression::type)state.range(1);
 
+    std::shared_ptr<arrow::MemoryPool> pool = std::make_shared<LargePageMemoryPool>();
+
     const int num_partitions = state.range(0);
 
     auto options = SplitOptions::Defaults();
@@ -142,6 +212,7 @@ class BenchmarkShuffleSplit {
     options.offheap_per_task = 128 * 1024 * 1024 * 1024L;
     options.prefer_spill = true;
     options.write_schema = false;
+    options.memory_pool = pool.get();
 
     std::shared_ptr<Splitter> splitter;
     int64_t elapse_read = 0;
@@ -210,6 +281,9 @@ class BenchmarkShuffleSplit {
                  splitter->TotalWriteTime();
     state.counters["split_time"] = benchmark::Counter(
         split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    splitter.reset();
+    std::cout << " split reset memory allocated = "
+              << options.memory_pool->bytes_allocated() << std::endl;
   }
 
  protected:
@@ -232,7 +306,6 @@ class BenchmarkShuffleSplit {
   std::shared_ptr<arrow::Schema> schema;
   std::vector<std::shared_ptr<::gandiva::Expression>> expr_vector;
   parquet::ArrowReaderProperties properties;
-
 };
 
 class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
@@ -296,14 +369,18 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
     std::cout << "batches = " << num_batches << " rows = " << num_rows << std::endl;
 
     for (auto _ : state) {
-      for_each(
-          batches.begin(), batches.end(),
-          [&splitter, &split_time](std::shared_ptr<arrow::RecordBatch>& record_batch) {
-            TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
-          });
+      for_each(batches.begin(), batches.end(),
+               [&splitter, &split_time,
+                &options](std::shared_ptr<arrow::RecordBatch>& record_batch) {
+                 TIME_NANO_OR_THROW(split_time, splitter->Split(*record_batch));
+               });
+      // std::cout << " split done memory allocated = " <<
+      // options.memory_pool->bytes_allocated() << std::endl;
     }
 
     TIME_NANO_OR_THROW(split_time, splitter->Stop());
+    std::cout << " split stop memory allocated = "
+              << options.memory_pool->bytes_allocated() << std::endl;
   }
 };
 
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 12d2c87bc..2d031ebc2 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -48,8 +48,8 @@ namespace shuffle {
 using arrow::internal::checked_cast;
 
 #ifndef SPLIT_BUFFER_SIZE
-//by default, allocate 8M block, 2M page size
-#define SPLIT_BUFFER_SIZE 8*1024*1024
+// by default, allocate 8M block, 2M page size
+#define SPLIT_BUFFER_SIZE 8 * 1024 * 1024
 #endif
 
 template <typename T>
@@ -406,36 +406,31 @@ arrow::Status Splitter::Init() {
       tiny_bach_write_options_.codec,
       arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED));
 
-  //Allocate first buffer for split reducer
+  // Allocate first buffer for split reducer
   ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
-                                        SPLIT_BUFFER_SIZE,
-                                        options_.memory_pool));
+                                             SPLIT_BUFFER_SIZE, options_.memory_pool));
   combine_buffer_->Resize(0, /*shrink_to_fit =*/false);
 
   return arrow::Status::OK();
 }
-arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer, uint32_t size)
-{
+arrow::Status Splitter::AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer,
+                                               uint32_t size) {
   // if size is already larger than buffer pool size, allocate it directly
-  //make size 64byte aligned
+  // make size 64byte aligned
   auto reminder = size & 0x3f;
-  size+=(64-reminder) & ((reminder==0)-1);
-
-  if (size > SPLIT_BUFFER_SIZE )
-  {
-    ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateResizableBuffer(
-                                        size, options_.memory_pool));
+  size += (64 - reminder) & ((reminder == 0) - 1);
+  if (size > SPLIT_BUFFER_SIZE) {
+    ARROW_ASSIGN_OR_RAISE(buffer,
+                          arrow::AllocateResizableBuffer(size, options_.memory_pool));
     return arrow::Status::OK();
-  }else if (combine_buffer_->capacity() - combine_buffer_->size() < size)
-  {
-    //memory pool is not enough
+  } else if (combine_buffer_->capacity() - combine_buffer_->size() < size) {
+    // memory pool is not enough
     ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
-                                        SPLIT_BUFFER_SIZE,
-                                        options_.memory_pool));
+                                               SPLIT_BUFFER_SIZE, options_.memory_pool));
     combine_buffer_->Resize(0, /*shrink_to_fit = */ false);
   }
-  buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(),size);
-  
+  buffer = arrow::SliceMutableBuffer(combine_buffer_, combine_buffer_->size(), size);
+
   combine_buffer_->Resize(combine_buffer_->size() + size, /*shrink_to_fit = */ false);
   return arrow::Status::OK();
 }
@@ -517,8 +512,6 @@ arrow::Status Splitter::Stop() {
 
   EVAL_END("write", options_.thread_id, options_.task_attempt_id)
 
-  
-
   return arrow::Status::OK();
 }
 int64_t batch_nbytes(const arrow::RecordBatch& batch) {
@@ -617,13 +610,15 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         default: {
           auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
           if (buffers[0] != nullptr) {
-            buffers[0]=arrow::SliceBuffer(buffers[0],0,(num_rows >> 3) + 1);
+            buffers[0] = arrow::SliceBuffer(buffers[0], 0, (num_rows >> 3) + 1);
           }
           if (buffers[1] != nullptr) {
             if (column_type_id_[i]->id() == arrow::BooleanType::type_id)
-              buffers[1]=arrow::SliceBuffer(buffers[1],0,(num_rows >> 3) + 1);
+              buffers[1] = arrow::SliceBuffer(buffers[1], 0, (num_rows >> 3) + 1);
             else
-              buffers[1]=arrow::SliceBuffer(buffers[1],0,num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
+              buffers[1] = arrow::SliceBuffer(
+                  buffers[1], 0,
+                  num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           }
 
           if (reset_buffers) {
@@ -728,30 +723,32 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
       case arrow::NullType::type_id:
         break;
       default: {
-          try{
-        std::shared_ptr<arrow::Buffer> value_buffer;
-        if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
-          auto status = AllocateBufferFromPool(value_buffer, arrow::BitUtil::BytesForBits(new_size));
-          ARROW_RETURN_NOT_OK( status );
-        } else {
-            auto status = AllocateBufferFromPool(value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >>3));
-            ARROW_RETURN_NOT_OK( status );
-
-        }
-        new_value_buffers.push_back(std::move(value_buffer));
-        if (input_fixed_width_has_null_[fixed_width_idx]) {
-          std::shared_ptr<arrow::Buffer> validity_buffer;
-          auto status = AllocateBufferFromPool(validity_buffer, arrow::BitUtil::BytesForBits(new_size));
-          ARROW_RETURN_NOT_OK( status );
-          new_validity_buffers.push_back(std::move(validity_buffer));
-        } else {
-          new_validity_buffers.push_back(nullptr);
-        }
-        fixed_width_idx++;
-          }catch(const std::exception& e)
-          {
-            std::cout << "exception captured " << e.what() << std::endl;
+        try {
+          std::shared_ptr<arrow::Buffer> value_buffer;
+          if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
+            auto status = AllocateBufferFromPool(value_buffer,
+                                                 arrow::BitUtil::BytesForBits(new_size));
+            ARROW_RETURN_NOT_OK(status);
+          } else {
+            auto status = AllocateBufferFromPool(
+                value_buffer,
+                new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
+            ARROW_RETURN_NOT_OK(status);
           }
+          new_value_buffers.push_back(std::move(value_buffer));
+          if (input_fixed_width_has_null_[fixed_width_idx]) {
+            std::shared_ptr<arrow::Buffer> validity_buffer;
+            auto status = AllocateBufferFromPool(validity_buffer,
+                                                 arrow::BitUtil::BytesForBits(new_size));
+            ARROW_RETURN_NOT_OK(status);
+            new_validity_buffers.push_back(std::move(validity_buffer));
+          } else {
+            new_validity_buffers.push_back(nullptr);
+          }
+          fixed_width_idx++;
+        } catch (const std::exception& e) {
+          std::cout << "exception captured " << e.what() << std::endl;
+        }
         break;
       }
     }
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index 1c1c8e2da..d50519a53 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -138,7 +138,8 @@ class Splitter {
 
   arrow::Status SplitListArray(const arrow::RecordBatch& rb);
 
-  arrow::Status AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer, uint32_t size);
+  arrow::Status AllocateBufferFromPool(std::shared_ptr<arrow::Buffer>& buffer,
+                                       uint32_t size);
 
   template <typename T, typename ArrayType = typename arrow::TypeTraits<T>::ArrayType,
             typename BuilderType = typename arrow::TypeTraits<T>::BuilderType>
@@ -201,8 +202,9 @@ class Splitter {
   std::vector<std::vector<std::shared_ptr<arrow::ArrayBuilder>>> partition_list_builders_;
   // col partid
 
-  //slice the buffer for each reducer's column, in this way we can combine into large page
-  std::shared_ptr<arrow::ResizableBuffer> combine_buffer_; 
+  // slice the buffer for each reducer's column, in this way we can combine into large
+  // page
+  std::shared_ptr<arrow::ResizableBuffer> combine_buffer_;
 
   // partid
   std::vector<std::vector<std::shared_ptr<arrow::ipc::IpcPayload>>>

From dc4b579bd05edd14d7aca18c91c1fe2efcae8508 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Mon, 2 May 2022 19:27:28 +0800
Subject: [PATCH 09/19] disable alignment allocation in benchmark since arrow
 doesn't support it

---
 .../cpp/src/benchmarks/shuffle_split_benchmark.cc           | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index 2baf1915e..1f1d1b6e5 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -96,7 +96,7 @@ class MyMemoryPool : public arrow::MemoryPool {
   arrow::internal::MemoryPoolStats stats_;
 };
 
-#define ENABLELARGEPAGE
+//#define ENABLELARGEPAGE
 
 class LargePageMemoryPool : public MemoryPool {
  public:
@@ -136,11 +136,15 @@ class LargePageMemoryPool : public MemoryPool {
   }
 
   void Free(uint8_t* buffer, int64_t size) override {
+#ifdef ENABLELARGEPAGE
     if (size < 2 * 1024 * 1024) {
       pool_->Free(buffer, size);
     } else {
       pool_->Free(buffer, size, ALIGNMENT);
     }
+#else
+      pool_->Free(buffer, size);
+#endif
   }
 
   int64_t bytes_allocated() const override { return pool_->bytes_allocated(); }

From 173c86c5a5961b8a6efec6d2437aa136d0cf2759 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Wed, 4 May 2022 00:24:58 +0800
Subject: [PATCH 10/19] Optimize validity buffer assignment: initialize the
 validity buffer to all-true once allocated, skip the initialization during
 split, and fix a validity buffer bug

---
 .../src/benchmarks/shuffle_split_benchmark.cc | 18 ++---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 65 +++++++++++++------
 native-sql-engine/cpp/src/shuffle/splitter.h  |  5 +-
 3 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index 1f1d1b6e5..6f9a7f19e 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -96,7 +96,7 @@ class MyMemoryPool : public arrow::MemoryPool {
   arrow::internal::MemoryPoolStats stats_;
 };
 
-//#define ENABLELARGEPAGE
+#define ENABLELARGEPAGE
 
 class LargePageMemoryPool : public MemoryPool {
  public:
@@ -286,8 +286,6 @@ class BenchmarkShuffleSplit {
     state.counters["split_time"] = benchmark::Counter(
         split_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
     splitter.reset();
-    std::cout << " split reset memory allocated = "
-              << options.memory_pool->bytes_allocated() << std::endl;
   }
 
  protected:
@@ -323,17 +321,18 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
                 const int num_partitions, SplitOptions options, benchmark::State& state) {
     std::vector<int> local_column_indices;
     local_column_indices.push_back(0);
+/*    local_column_indices.push_back(0);
     local_column_indices.push_back(1);
     local_column_indices.push_back(2);
     local_column_indices.push_back(4);
     local_column_indices.push_back(5);
     local_column_indices.push_back(6);
-    local_column_indices.push_back(7);
+    local_column_indices.push_back(7);*/
 
     std::shared_ptr<arrow::Schema> local_schema;
     local_schema = std::make_shared<arrow::Schema>(*schema.get());
 
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15));
+/*    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15));
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14));
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13));
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12));
@@ -342,7 +341,7 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9));
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8));
     ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3));
-
+*/
     if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl;
 
     ARROW_ASSIGN_OR_THROW(splitter,
@@ -383,8 +382,6 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
     }
 
     TIME_NANO_OR_THROW(split_time, splitter->Stop());
-    std::cout << " split stop memory allocated = "
-              << options.memory_pool->bytes_allocated() << std::endl;
   }
 };
 
@@ -500,7 +497,7 @@ int main(int argc, char** argv) {
       ->MeasureProcessCPUTime()
       ->Unit(benchmark::kSecond);
 
-  /*  sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark
+/*    sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark
     bck(datafile);
 
     benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
@@ -523,8 +520,7 @@ int main(int argc, char** argv) {
         ->Threads(16)
         ->Threads(24)
         ->Unit(benchmark::kSecond);
-  */
-
+*/
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 2d031ebc2..c96ad12f8 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -350,7 +350,6 @@ arrow::Status Splitter::Init() {
 
   auto num_fixed_width = fixed_width_array_idx_.size();
   partition_fixed_width_validity_addrs_.resize(num_fixed_width);
-  column_has_null_.resize(num_fixed_width, false);
   partition_fixed_width_value_addrs_.resize(num_fixed_width);
   partition_fixed_width_buffers_.resize(num_fixed_width);
   binary_array_empirical_size_.resize(binary_array_idx_.size());
@@ -507,6 +506,8 @@ arrow::Status Splitter::Stop() {
   this->combine_buffer_.reset();
   this->schema_payload_.reset();
 
+  std::cout << "src null count " << src_null_cnt << " dst null cnt = " << dst_null_cnt << std::endl;
+
   // close data file output Stream
   RETURN_NOT_OK(data_file_os_->Close());
 
@@ -608,7 +609,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
           break;
         }
         default: {
-          auto& buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
+          auto buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
           if (buffers[0] != nullptr) {
             buffers[0] = arrow::SliceBuffer(buffers[0], 0, (num_rows >> 3) + 1);
           }
@@ -624,14 +625,21 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
           if (reset_buffers) {
             arrays[i] = arrow::MakeArray(
                 arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
-                                       {std::move(buffers[0]), std::move(buffers[1])}));
-            buffers = {nullptr, nullptr};
+                                       {buffers[0],buffers[1]}));
+            if(buffers[0]!=nullptr)
+            {
+              dst_null_cnt+=arrays[i]->null_count();
+            }
             partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
                 nullptr;
             partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr;
           } else {
             arrays[i] = arrow::MakeArray(arrow::ArrayData::Make(
                 schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]}));
+            if(buffers[0]!=nullptr)
+            {
+              dst_null_cnt+=arrays[i]->null_count();
+            }
           }
           fixed_width_idx++;
           break;
@@ -659,7 +667,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
                        arrow::ipc::GetRecordBatchPayload(*batch, tiny_bach_write_options_,
                                                          payload.get()));
 #endif
-
+    
     partition_cached_recordbatch_size_[partition_id] += payload->body_length;
     partition_cached_recordbatch_[partition_id].push_back(std::move(payload));
     partition_buffer_idx_base_[partition_id] = 0;
@@ -741,6 +749,8 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
             auto status = AllocateBufferFromPool(validity_buffer,
                                                  arrow::BitUtil::BytesForBits(new_size));
             ARROW_RETURN_NOT_OK(status);
+            //initialize all true once allocated
+            memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity());
             new_validity_buffers.push_back(std::move(validity_buffer));
           } else {
             new_validity_buffers.push_back(nullptr);
@@ -850,6 +860,18 @@ arrow::Status Splitter::SpillPartition(int32_t partition_id) {
         std::make_shared<PartitionWriter>(this, partition_id);
   }
   TIME_NANO_OR_RAISE(total_spill_time_, partition_writer_[partition_id]->Spill());
+
+  //reset validity buffer after spill
+  std::for_each(partition_fixed_width_buffers_.begin(),
+      partition_fixed_width_buffers_.end(),[partition_id](std::vector<arrow::BufferVector>& bufs){
+        if (bufs[partition_id][0]!=nullptr)
+        {
+          //initialize all true once allocated
+          auto addr = bufs[partition_id][0]->mutable_data();
+          memset(addr,0xff,bufs[partition_id][0]->capacity());
+        }
+      });
+
   return arrow::Status::OK();
 }
 
@@ -877,6 +899,7 @@ arrow::Result<int32_t> Splitter::SpillLargestPartition(int64_t* size) {
 }
 
 arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
+
 #ifdef PROCESSROW
 
   reducer_offsets_.resize(rb.num_rows());
@@ -926,9 +949,11 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
   for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
     auto col_idx = fixed_width_array_idx_[col];
     size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8;
-    if (rb.column_data(col_idx)->GetNullCount() != 0) {
+    //check input_fixed_width_has_null_[col] is cheaper than GetNullCount()
+    if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) {
       input_fixed_width_has_null_[col] = true;
     }
+    src_null_cnt+=rb.column_data(col_idx)->GetNullCount();
   }
 
   int64_t prealloc_row_cnt =
@@ -989,6 +1014,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
   for (auto pid = 0; pid < num_partitions_; ++pid) {
     partition_buffer_idx_base_[pid] += partition_id_cnt_[pid];
   }
+  
   return arrow::Status::OK();
 }
 
@@ -1349,20 +1375,8 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
   for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
     auto col_idx = fixed_width_array_idx_[col];
     auto& dst_addrs = partition_fixed_width_validity_addrs_[col];
-    if (rb.column_data(col_idx)->GetNullCount() == 0 &&
-        column_has_null_[col_idx] == true) {
-      // if the input record batch doesn't have null, set validity to True
-      // column_has_null_ is used to skip the partition_id_cnt_[pid] and dst_addrs[pid]
-      // access
-      for (auto pid = 0; pid < num_partitions_; ++pid) {
-        if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
-          arrow::BitUtil::SetBitsTo(dst_addrs[pid], partition_buffer_idx_base_[pid],
-                                    partition_id_cnt_[pid], true);
-        }
-      }
-    } else if (rb.column_data(col_idx)->GetNullCount() > 0) {
+    if (rb.column_data(col_idx)->GetNullCount() > 0) {
       // there is Null count
-      column_has_null_[col_idx] = true;
       for (auto pid = 0; pid < num_partitions_; ++pid) {
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] == nullptr) {
           // init bitmap if it's null, initialize the buffer as true
@@ -1383,6 +1397,8 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
       partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size());
       std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(),
                 partition_buffer_idx_offset.begin());
+      std::vector<uint16_t> nullcnt;
+      nullcnt.resize(num_partitions_,0);
       for (auto row = 0; row < num_rows; ++row) {
         auto pid = partition_id_[row];
         auto dst_offset = partition_buffer_idx_offset[pid];
@@ -1392,6 +1408,17 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
             << (dst_offset & 7);
         partition_buffer_idx_offset[pid]++;
       }
+      // the last row may update the following bits to 0, reinitialize it as 1
+      for(auto pid=0;pid<num_partitions_;pid++)
+      {
+        if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
+          auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid];
+
+          arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, lastoffset+8-(lastoffset&0x7),
+                                    true);
+        }
+      }
+
     }
   }
   return arrow::Status::OK();
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index d50519a53..fc2f6c37a 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -186,8 +186,7 @@ class Splitter {
   std::vector<std::shared_ptr<PartitionWriter>> partition_writer_;
   // col partid
   std::vector<std::vector<uint8_t*>> partition_fixed_width_validity_addrs_;
-  // cache if the column has null so far for any reducer. To bypass the reducer check
-  std::vector<bool> column_has_null_;
+
   // col partid
   std::vector<std::vector<uint8_t*>> partition_fixed_width_value_addrs_;
   // col partid
@@ -255,6 +254,8 @@ class Splitter {
   int64_t total_compress_time_ = 0;
   int64_t total_compute_pid_time_ = 0;
   int64_t peak_memory_allocated_ = 0;
+  int64_t src_null_cnt = 0;
+  int64_t dst_null_cnt = 0;
 
   std::vector<int64_t> partition_lengths_;
   std::vector<int64_t> raw_partition_lengths_;

From 104ca15fd71f7c2817bcd14bb630e1079f839271 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Thu, 5 May 2022 10:23:25 +0800
Subject: [PATCH 11/19] fix out of memory test

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index c96ad12f8..59d111803 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -407,7 +407,7 @@ arrow::Status Splitter::Init() {
 
   // Allocate first buffer for split reducer
   ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
-                                             SPLIT_BUFFER_SIZE, options_.memory_pool));
+                                             0, options_.memory_pool));
   combine_buffer_->Resize(0, /*shrink_to_fit =*/false);
 
   return arrow::Status::OK();
@@ -505,8 +505,7 @@ arrow::Status Splitter::Stop() {
   }
   this->combine_buffer_.reset();
   this->schema_payload_.reset();
-
-  std::cout << "src null count " << src_null_cnt << " dst null cnt = " << dst_null_cnt << std::endl;
+  partition_fixed_width_buffers_.clear();
 
   // close data file output Stream
   RETURN_NOT_OK(data_file_os_->Close());
@@ -731,34 +730,30 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
       case arrow::NullType::type_id:
         break;
       default: {
-        try {
-          std::shared_ptr<arrow::Buffer> value_buffer;
-          if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
-            auto status = AllocateBufferFromPool(value_buffer,
-                                                 arrow::BitUtil::BytesForBits(new_size));
-            ARROW_RETURN_NOT_OK(status);
-          } else {
-            auto status = AllocateBufferFromPool(
-                value_buffer,
-                new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
-            ARROW_RETURN_NOT_OK(status);
-          }
-          new_value_buffers.push_back(std::move(value_buffer));
-          if (input_fixed_width_has_null_[fixed_width_idx]) {
-            std::shared_ptr<arrow::Buffer> validity_buffer;
-            auto status = AllocateBufferFromPool(validity_buffer,
-                                                 arrow::BitUtil::BytesForBits(new_size));
-            ARROW_RETURN_NOT_OK(status);
-            //initialize all true once allocated
-            memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity());
-            new_validity_buffers.push_back(std::move(validity_buffer));
-          } else {
-            new_validity_buffers.push_back(nullptr);
-          }
-          fixed_width_idx++;
-        } catch (const std::exception& e) {
-          std::cout << "exception captured " << e.what() << std::endl;
+        std::shared_ptr<arrow::Buffer> value_buffer;
+        if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
+          auto status = AllocateBufferFromPool(value_buffer,
+                                                arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK(status);
+        } else {
+          auto status = AllocateBufferFromPool(
+              value_buffer,
+              new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
+          ARROW_RETURN_NOT_OK(status);
         }
+        new_value_buffers.push_back(std::move(value_buffer));
+        if (input_fixed_width_has_null_[fixed_width_idx]) {
+          std::shared_ptr<arrow::Buffer> validity_buffer;
+          auto status = AllocateBufferFromPool(validity_buffer,
+                                                arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK(status);
+          //initialize all true once allocated
+          memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity());
+          new_validity_buffers.push_back(std::move(validity_buffer));
+        } else {
+          new_validity_buffers.push_back(nullptr);
+        }
+        fixed_width_idx++;
         break;
       }
     }
@@ -821,7 +816,7 @@ arrow::Status Splitter::AllocateNew(int32_t partition_id, int32_t new_size) {
               << std::to_string(partition_id) << std::endl;
     int64_t spilled_size;
     ARROW_ASSIGN_OR_RAISE(auto partition_to_spill, SpillLargestPartition(&spilled_size));
-    if (partition_to_spill == -1) {
+    if (partition_to_spill == -1) { 
       std::cout << "Failed to allocate new buffer for partition "
                 << std::to_string(partition_id) << ". No partition buffer to spill."
                 << std::endl;
@@ -1014,7 +1009,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
   for (auto pid = 0; pid < num_partitions_; ++pid) {
     partition_buffer_idx_base_[pid] += partition_id_cnt_[pid];
   }
-  
+
   return arrow::Status::OK();
 }
 

From 830cd70b3fe12269721735f855cfd10e71f845de Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Fri, 6 May 2022 15:07:23 +0800
Subject: [PATCH 12/19] fix setbitsto bug remove nullcnt

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 11 +----------
 native-sql-engine/cpp/src/shuffle/splitter.h  |  2 --
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 59d111803..af8ca584e 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -625,20 +625,12 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
             arrays[i] = arrow::MakeArray(
                 arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
                                        {buffers[0],buffers[1]}));
-            if(buffers[0]!=nullptr)
-            {
-              dst_null_cnt+=arrays[i]->null_count();
-            }
             partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
                 nullptr;
             partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr;
           } else {
             arrays[i] = arrow::MakeArray(arrow::ArrayData::Make(
                 schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]}));
-            if(buffers[0]!=nullptr)
-            {
-              dst_null_cnt+=arrays[i]->null_count();
-            }
           }
           fixed_width_idx++;
           break;
@@ -948,7 +940,6 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
     if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) {
       input_fixed_width_has_null_[col] = true;
     }
-    src_null_cnt+=rb.column_data(col_idx)->GetNullCount();
   }
 
   int64_t prealloc_row_cnt =
@@ -1409,7 +1400,7 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
           auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid];
 
-          arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, lastoffset+8-(lastoffset&0x7),
+          arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, 8-(lastoffset&0x7),
                                     true);
         }
       }
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index fc2f6c37a..cc7440926 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -254,8 +254,6 @@ class Splitter {
   int64_t total_compress_time_ = 0;
   int64_t total_compute_pid_time_ = 0;
   int64_t peak_memory_allocated_ = 0;
-  int64_t src_null_cnt = 0;
-  int64_t dst_null_cnt = 0;
 
   std::vector<int64_t> partition_lengths_;
   std::vector<int64_t> raw_partition_lengths_;

From 63f77ed9d3a129de4860a81ab22ccbaf20954fc8 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Fri, 6 May 2022 16:01:33 +0800
Subject: [PATCH 13/19] add shuffle test

---
 .../cpp/src/tests/shuffle_split_test.cc       | 1134 +++++++++++++++++
 1 file changed, 1134 insertions(+)
 create mode 100644 native-sql-engine/cpp/src/tests/shuffle_split_test.cc

diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
new file mode 100644
index 000000000..d5d8de0bf
--- /dev/null
+++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
@@ -0,0 +1,1134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arrow/compute/api.h>
+#include <arrow/datum.h>
+#include <arrow/io/api.h>
+#include <arrow/ipc/reader.h>
+#include <arrow/pretty_print.h>
+#include <arrow/record_batch.h>
+#include <arrow/util/io_util.h>
+#include <execinfo.h>
+#include <gtest/gtest.h>
+
+#include <iostream>
+// Debug helper: prints the current call stack (up to MAX_SIZE frames) to stdout.
+void print_trace(void) {
+  enum Constexpr { MAX_SIZE = 1024 };
+  void* array[MAX_SIZE];
+  const int size = backtrace(array, MAX_SIZE);
+  char** strings = backtrace_symbols(array, size);
+  // backtrace_symbols() yields NULL when it cannot allocate; print nothing then.
+  for (int i = 0; strings != nullptr && i < size; i++) printf("    %s\n", strings[i]);
+  puts("");
+  free(strings);
+}
+
+#include "shuffle/splitter.h"
+#include "tests/test_utils.h"
+
+namespace sparkcolumnarplugin {
+namespace shuffle {
+
+// Test-only pool: delegates to the default Arrow pool, but reports OutOfMemory once
+// the bytes handed out through this wrapper would exceed `capacity`.
+class MyMemoryPool : public arrow::MemoryPool {
+ public:
+  explicit MyMemoryPool(int64_t capacity) : capacity_(capacity) {}
+
+  // Allocates `size` bytes unless that would push the tracked total past capacity_.
+  Status Allocate(int64_t size, uint8_t** out) override {
+    if (bytes_allocated() + size > capacity_) {
+      return Status::OutOfMemory("malloc of size ", size, " failed");
+    }
+    RETURN_NOT_OK(pool_->Allocate(size, out));
+    stats_.UpdateAllocatedBytes(size);
+    return arrow::Status::OK();
+  }
+
+  // Resizes *ptr. Like Allocate, the limit applies to the tracked total, so only the
+  // growth (new_size - old_size) is added on top of what is already outstanding.
+  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
+    if (bytes_allocated() - old_size + new_size > capacity_) {
+      return Status::OutOfMemory("malloc of size ", new_size, " failed");
+    }
+    RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr));
+    stats_.UpdateAllocatedBytes(new_size - old_size);
+    return arrow::Status::OK();
+  }
+
+  // Hands the buffer back to the wrapped pool and removes it from the tracked total.
+  void Free(uint8_t* buffer, int64_t size) override {
+    pool_->Free(buffer, size);
+    stats_.UpdateAllocatedBytes(-size);
+  }
+
+  // Bytes currently outstanding through this wrapper.
+  int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
+
+  // Forwarded unchanged from the wrapped pool.
+  int64_t max_memory() const override { return pool_->max_memory(); }
+
+  std::string backend_name() const override { return pool_->backend_name(); }
+
+ private:
+  // Pool that services every request.
+  MemoryPool* pool_ = arrow::default_memory_pool();
+  // Limit, in bytes, on the total this wrapper will hand out.
+  int64_t capacity_;
+  // Running total of bytes handed out through this wrapper.
+  arrow::internal::MemoryPoolStats stats_;
+};
+
+// Fixture shared by the splitter tests: builds the schema and two input batches.
+class SplitterTest : public ::testing::Test {
+ protected:
+  // Creates the fields/schema, two temp dirs exported via env var, and the inputs.
+  void SetUp() override {
+    auto f_na = field("f_na", arrow::null());
+    auto f_int8_a = field("f_int8_a", arrow::int8());
+    auto f_int8_b = field("f_int8_b", arrow::int8());
+    auto f_int32 = field("f_int32", arrow::int32());
+    auto f_uint64 = field("f_uint64", arrow::uint64());
+    auto f_double = field("f_double", arrow::float64());
+    auto f_bool = field("f_bool", arrow::boolean());
+    auto f_string = field("f_string", arrow::utf8());
+    auto f_nullable_string = field("f_nullable_string", arrow::utf8());
+    auto f_decimal = field("f_decimal128", arrow::decimal(10, 2));
+
+    ARROW_ASSIGN_OR_THROW(tmp_dir_1_,
+                          std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix)))
+    ARROW_ASSIGN_OR_THROW(tmp_dir_2_,
+                          std::move(arrow::internal::TemporaryDir::Make(tmp_dir_prefix)))
+    auto config_dirs =
+        tmp_dir_1_->path().ToString() + "," + tmp_dir_2_->path().ToString();
+    setenv("NATIVESQL_SPARK_LOCAL_DIRS", config_dirs.c_str(), 1);
+
+    schema_ = arrow::schema({f_na, f_int8_a, f_int8_b, f_int32, f_uint64, f_double,
+                             f_bool, f_string, f_nullable_string, f_decimal});
+
+    MakeInputBatch(input_data_1, schema_, &input_batch_1_);
+    MakeInputBatch(input_data_2, schema_, &input_batch_2_);
+    split_options_ = SplitOptions::Defaults();
+  }
+
+  void TearDown() override {
+    if (file_ != nullptr && !file_->closed()) {
+      file_->Close();
+    }
+  }
+
+  static void CheckFileExsists(const std::string& file_name) {
+    ASSERT_EQ(*arrow::internal::FileExists(
+                  *arrow::internal::PlatformFilename::FromString(file_name)),
+              true);
+  }
+
+  // Returns the rows of input_batch selected by the int32 index list in json_idx.
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> TakeRows(
+      const std::shared_ptr<arrow::RecordBatch>& input_batch,
+      const std::string& json_idx) {
+    std::shared_ptr<arrow::Array> take_idx;
+    ASSERT_NOT_OK(
+        arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), json_idx, &take_idx));
+    auto cntx = arrow::compute::ExecContext();
+    ARROW_ASSIGN_OR_RAISE(
+        arrow::Datum result,
+        arrow::compute::Take(arrow::Datum(input_batch), arrow::Datum(take_idx),
+                             arrow::compute::TakeOptions{}, &cntx));
+    return result.record_batch();
+  }
+
+  // Closes any open file_, reopens it on file_name and returns a stream reader on it.
+  arrow::Result<std::shared_ptr<arrow::ipc::RecordBatchReader>>
+  GetRecordBatchStreamReader(const std::string& file_name) {
+    if (file_ != nullptr && !file_->closed()) {
+      RETURN_NOT_OK(file_->Close());
+    }
+    ARROW_ASSIGN_OR_RAISE(file_, arrow::io::ReadableFile::Open(file_name))
+    ARROW_ASSIGN_OR_RAISE(auto file_reader,
+                          arrow::ipc::RecordBatchStreamReader::Open(file_))
+    return file_reader;
+  }
+
+  static const std::string tmp_dir_prefix;
+  static const std::vector<std::string> input_data_1;
+  static const std::vector<std::string> input_data_2;
+
+  std::shared_ptr<arrow::internal::TemporaryDir> tmp_dir_1_;
+  std::shared_ptr<arrow::internal::TemporaryDir> tmp_dir_2_;
+
+  std::shared_ptr<arrow::Schema> schema_;
+  std::shared_ptr<Splitter> splitter_;
+  SplitOptions split_options_;
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_1_;
+  std::shared_ptr<arrow::RecordBatch> input_batch_2_;
+
+  std::shared_ptr<arrow::io::ReadableFile> file_;
+};
+
+const std::string SplitterTest::tmp_dir_prefix = "columnar-shuffle-test";
+const std::vector<std::string> SplitterTest::input_data_1 = {
+    "[null, null, null, null, null, null, null, null, null, null]",
+    "[1, 2, 3, null, 4, null, 5, 6, null, 7]",
+    "[1, -1, null, null, -2, 2, null, null, 3, -3]",
+    "[1, 2, 3, 4, null, 5, 6, 7, 8, null]",
+    "[null, null, null, null, null, null, null, null, null, null]",
+    R"([-0.1234567, null, 0.1234567, null, -0.142857, null, 0.142857, 0.285714, 0.428617, null])",
+    "[null, true, false, null, true, true, false, true, null, null]",
+    R"(["alice0", "bob1", "alice2", "bob3", "Alice4", "Bob5", "AlicE6", "boB7", "ALICE8", "BOB9"])",
+    R"(["alice", "bob", null, null, "Alice", "Bob", null, "alicE", null, "boB"])",
+    R"(["-1.01", "2.01", "-3.01", null, "0.11", "3.14", "2.27", null, "-3.14", null])"};
+
+const std::vector<std::string> SplitterTest::input_data_2 = {
+    "[null, null]",    "[null, null]",
+    "[1, -1]",         "[100, null]",
+    "[1, 1]",          R"([0.142857, -0.142857])",
+    "[true, false]",   R"(["bob", "alice"])",
+    R"([null, null])", R"([null, null])"};
+
+TEST_F(SplitterTest, TestSingleSplitter) {  // rr splitter, single partition
+  split_options_.buffer_size = 10;
+
+  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("rr", schema_, 1, split_options_))
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  // the data file reported by the splitter must exist on disk
+  CheckFileExsists(splitter_->DataFile());
+
+  // one partition was requested, so exactly one length must be reported
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 1);
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // the stream read back must carry the input schema
+  ASSERT_EQ(*file_reader->schema(), *schema_);
+
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 3);
+
+  std::vector<arrow::RecordBatch*> expected = {input_batch_1_.get(), input_batch_2_.get(),
+                                               input_batch_1_.get()};
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+      ASSERT_TRUE(rb->column(j)->Equals(*expected[i]->column(j),
+                                        EqualOptions::Defaults().diff_sink(&std::cout)));
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+TEST_F(SplitterTest, TestRoundRobinSplitter) {  // rr split into two partitions
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", schema_, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; together they must add up to the data file size
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the stream must carry the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *schema_);
+
+  // partition 0 is expected to hold the even-indexed rows of every input batch
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  std::shared_ptr<arrow::RecordBatch> res_batch_1;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]"))
+  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get(), res_batch_1.get(),
+                                               res_batch_0.get()};
+
+  // read partition 0 with the reader opened above: one batch per Split() call
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 3);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // partition 1 is expected to hold the odd-indexed rows of every input batch
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]"))
+  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]"))
+  expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()};
+
+  // reopen the file and advance it by lengths[0] before reading the second partition
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *schema_);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 3);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+TEST_F(SplitterTest, TestSplitterMemoryLeak) {  // pool must be empty after Stop()
+  std::shared_ptr<arrow::MemoryPool> pool =
+      std::make_shared<MyMemoryPool>(9 * 1024 * 1024);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  split_options_.memory_pool = pool.get();
+  split_options_.write_schema = false;
+
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", schema_, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  ASSERT_TRUE(pool->bytes_allocated() == 0);  // nothing outstanding once stopped
+  splitter_.reset();
+  ASSERT_TRUE(pool->bytes_allocated() == 0);  // nor after the splitter is destroyed
+
+  split_options_.memory_pool = arrow::default_memory_pool();  // drop the local pool
+}
+
+TEST_F(SplitterTest, TestHashSplitter) {  // hash split on two expressions
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+
+  auto f_0 = TreeExprBuilder::MakeField(schema_->field(1));  // f_int8_a
+  auto f_1 = TreeExprBuilder::MakeField(schema_->field(2));  // f_int8_b
+  auto f_2 = TreeExprBuilder::MakeField(schema_->field(3));  // f_int32 (not f_uint64)
+
+  auto node_0 = TreeExprBuilder::MakeFunction("add", {f_0, f_1}, int8());
+  auto expr_0 = TreeExprBuilder::MakeExpression(node_0, field("res0", int8()));
+  auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64()));
+
+  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", schema_, num_partitions,
+                                                  {expr_0, expr_1}, split_options_))
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+
+  // the data file reported by the splitter must exist on disk
+  CheckFileExsists(splitter_->DataFile());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // the stream read back must carry the input schema
+  ASSERT_EQ(*file_reader->schema(), *schema_);
+
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+
+  for (const auto& rb : batches) {
+    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
+    for (auto i = 0; i < rb->num_columns(); ++i) {
+      ASSERT_EQ(rb->column(i)->length(), rb->num_rows());
+    }
+  }
+}
+
+TEST_F(SplitterTest, TestFallbackRangeSplitter) {  // range split on a pid column
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+
+  std::shared_ptr<arrow::Array> pid_arr_0;
+  ASSERT_NOT_OK(arrow::ipc::internal::json::ArrayFromJSON(
+      arrow::int32(), "[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]", &pid_arr_0));
+  std::shared_ptr<arrow::Array> pid_arr_1;
+  ASSERT_NOT_OK(
+      arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), "[0, 1]", &pid_arr_1));
+
+  std::shared_ptr<arrow::Schema> schema_w_pid;
+  std::shared_ptr<arrow::RecordBatch> input_batch_1_w_pid;
+  std::shared_ptr<arrow::RecordBatch> input_batch_2_w_pid;
+  ARROW_ASSIGN_OR_THROW(schema_w_pid,
+                        schema_->AddField(0, arrow::field("pid", arrow::int32())));
+  ARROW_ASSIGN_OR_THROW(input_batch_1_w_pid,
+                        input_batch_1_->AddColumn(0, "pid", pid_arr_0));
+  ARROW_ASSIGN_OR_THROW(input_batch_2_w_pid,
+                        input_batch_2_->AddColumn(0, "pid", pid_arr_1));
+
+  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("range", std::move(schema_w_pid),
+                                                  num_partitions, split_options_))
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_2_w_pid));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_1_w_pid));
+
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; together they must add up to the data file size
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the written schema must equal schema_, i.e. the one without the added pid column
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *schema_);
+
+  // partition 0 is expected to hold the rows whose pid is 0 (the even-indexed ones)
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  std::shared_ptr<arrow::RecordBatch> res_batch_1;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[0, 2, 4, 6, 8]"))
+  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[0]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get(), res_batch_1.get(),
+                                               res_batch_0.get()};
+
+  // read partition 0 with the reader opened above: one batch per Split() call
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 3);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // partition 1 is expected to hold the rows whose pid is 1 (the odd-indexed ones)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_1_, "[1, 3, 5, 7, 9]"))
+  ARROW_ASSIGN_OR_THROW(res_batch_1, TakeRows(input_batch_2_, "[1]"))
+  expected = {res_batch_0.get(), res_batch_1.get(), res_batch_0.get()};
+
+  // reopen the file and advance it by lengths[0] before reading the second partition
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *schema_);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 3);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), schema_->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+TEST_F(SplitterTest, TestSpillFailWithOutOfMemory) {
+  auto pool = std::make_unique<MyMemoryPool>(0);
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  split_options_.memory_pool = pool.get();
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", schema_, num_partitions, split_options_));
+  auto status = splitter_->Split(*input_batch_1_);
+  // should return OOM status because there's no partition buffer to spill
+  ASSERT_TRUE(status.IsOutOfMemory());
+  ASSERT_NOT_OK(splitter_->Stop());
+  // `pool` is a local: destroy the splitter first so it never outlives its pool.
+  splitter_.reset();
+}
+
+TEST_F(SplitterTest, TestSpillLargestPartition) {  // 300 uncompressed rr splits
+  std::shared_ptr<arrow::MemoryPool> pool =
+      std::make_shared<MyMemoryPool>(9 * 1024 * 1024);
+  //  pool = std::make_shared<arrow::LoggingMemoryPool>(pool.get());
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  // split_options_.memory_pool = pool.get();  NOTE(review): disabled, so pool is unused
+  split_options_.compression_type = arrow::Compression::UNCOMPRESSED;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", schema_, num_partitions, split_options_));
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+    ASSERT_NOT_OK(splitter_->Split(*input_batch_2_));
+    ASSERT_NOT_OK(splitter_->Split(*input_batch_1_));
+  }
+  ASSERT_NOT_OK(splitter_->Stop());
+}
+
+// Round-robin split of a batch whose columns are all list arrays.
+TEST_F(SplitterTest, TestRoundRobinListArraySplitter) {
+  auto f_arr_str = field("f_arr", arrow::list(arrow::utf8()));
+  auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean()));
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32()));
+  auto f_arr_double = field("f_double", arrow::list(arrow::float64()));
+  auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2)));
+
+  auto rb_schema =
+      arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])",
+      R"([[true, null], [true, true, true], [false], [true], [false], [false]])",
+      R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])",
+      R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6.1121], [null]])",
+      R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; together they must add up to the data file size
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the stream must carry the list schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // partition 0 is expected to hold the even-indexed rows
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // read partition 0 with the reader opened above
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // partition 1 is expected to hold the odd-indexed rows
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the file and advance it by lengths[0] before reading the second partition
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+TEST_F(SplitterTest, TestRoundRobinNestListArraySplitter) {  // list-of-list columns
+  auto f_arr_str = field("f_str", arrow::list(arrow::list(arrow::utf8())));
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
+
+  auto rb_schema = arrow::schema({f_arr_str, f_arr_int32});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])",
+      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; together they must add up to the data file size
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the stream must carry the nested-list schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // partition 0 is expected to hold the even-indexed rows
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // read partition 0 with the reader opened above
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // partition 1 is expected to hold the odd-indexed rows
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the file and advance it by lengths[0] before reading the second partition
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+// "rr" split, 2 partitions: 4 rows of large_list<list<utf8>>, large_list<list<int32>>.
+TEST_F(SplitterTest, TestRoundRobinNestLargeListArraySplitter) {
+  auto f_arr_str = field("f_str", arrow::large_list(arrow::list(arrow::utf8())));
+  auto f_arr_int32 = field("f_int32", arrow::large_list(arrow::list(arrow::int32())));
+  auto rb_schema = arrow::schema({f_arr_str, f_arr_int32});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([[["alice0", "bob1"]], [["alice2"], ["bob3"]], [["Alice4", "Bob5", "AlicE6"]], [["boB7"], ["ALICE8", "BOB9"]]])",
+      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; their sum must equal the size reported by file_
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the schema read back from the data file must equal the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // the first block is expected to hold input rows 0 and 2
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // the freshly opened reader must yield exactly one batch, equal to the expected rows
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // the second block is expected to hold input rows 1 and 3 (res_batch_0 is reused)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the reader, advance file_ by lengths[0], then expect the second block
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+// "rr" split, 2 partitions: 4 rows of list<list<int32>>, list<struct<a: int32, b: utf8>>.
+TEST_F(SplitterTest, TestRoundRobinListStructArraySplitter) {
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
+  auto f_arr_list_struct =
+      field("f_list_struct", list(struct_({field("a", int32()), field("b", utf8())})));
+  auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_struct});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
+      R"([[{"a": 4, "b": null}], [{"a": 42, "b": null}, {"a": null, "b": "foo2"}], [{"a": 43, "b": "foo3"}], [{"a": 44, "b": "foo4"}]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; their sum must equal the size reported by file_
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the schema read back from the data file must equal the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // the first block is expected to hold input rows 0 and 2
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // the freshly opened reader must yield exactly one batch, equal to the expected rows
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // the second block is expected to hold input rows 1 and 3 (res_batch_0 is reused)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the reader, advance file_ by lengths[0], then expect the second block
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+// "rr" split, 2 partitions: 4 rows of list<list<int32>>, list<map<utf8, utf8>>.
+TEST_F(SplitterTest, TestRoundRobinListMapArraySplitter) {
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
+  auto f_arr_list_map = field("f_list_map", list(map(utf8(), utf8())));
+  auto rb_schema = arrow::schema({f_arr_int32, f_arr_list_map});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
+      R"([[[["key1", "val_aa1"]]], [[["key1", "val_bb1"]], [["key2", "val_bb2"]]], [[["key1", "val_cc1"]]], [[["key1", "val_dd1"]]]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; their sum must equal the size reported by file_
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the schema read back from the data file must equal the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // the first block is expected to hold input rows 0 and 2
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // the freshly opened reader must yield exactly one batch, equal to the expected rows
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // the second block is expected to hold input rows 1 and 3 (res_batch_0 is reused)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the reader, advance file_ by lengths[0], then expect the second block
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+// "rr" split, 2 partitions: 4 rows of list<list<int32>>, struct<a: list<int32>, b: utf8>.
+TEST_F(SplitterTest, TestRoundRobinStructArraySplitter) {
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
+  auto f_arr_struct_list =
+      field("f_struct_list", struct_({field("a", list(int32())), field("b", utf8())}));
+  auto rb_schema = arrow::schema({f_arr_int32, f_arr_struct_list});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
+      R"([{"a": [1,1,1,1], "b": null}, {"a": null, "b": "foo2"}, {"a": [3,3,3,3], "b": "foo3"}, {"a": [4,4,4,4], "b": "foo4"}])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; their sum must equal the size reported by file_
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the schema read back from the data file must equal the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // the first block is expected to hold input rows 0 and 2
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // the freshly opened reader must yield exactly one batch, equal to the expected rows
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // the second block is expected to hold input rows 1 and 3 (res_batch_0 is reused)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the reader, advance file_ by lengths[0], then expect the second block
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+// "rr" split, 2 partitions: 4 rows of list<list<int32>>, map<utf8, utf8>.
+TEST_F(SplitterTest, TestRoundRobinMapArraySplitter) {
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::list(arrow::int32())));
+  auto f_arr_map = field("f_map", map(utf8(), utf8()));
+  auto rb_schema = arrow::schema({f_arr_int32, f_arr_map});
+
+  const std::vector<std::string> input_data_arr = {
+      R"([[[1, 2, 3]], [[9, 8], [null]], [[3, 1], [0]], [[1, 9, null]]])",
+      R"([[["key1", "val_aa1"]], [["key1", "val_bb1"], ["key2", "val_bb2"]], [["key1", "val_cc1"]], [["key1", "val_dd1"]]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; their sum must equal the size reported by file_
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the schema read back from the data file must equal the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // the first block is expected to hold input rows 0 and 2
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // the freshly opened reader must yield exactly one batch, equal to the expected rows
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // the second block is expected to hold input rows 1 and 3 (res_batch_0 is reused)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the reader, advance file_ by lengths[0], then expect the second block
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+// "hash" split on f_uint64, 5 partitions: 2 rows of uint64, list<utf8>; shapes only.
+TEST_F(SplitterTest, TestHashListArraySplitterWithMorePartitions) {
+  int32_t num_partitions = 5;
+  split_options_.buffer_size = 4;
+  auto f_uint64 = field("f_uint64", arrow::uint64());
+  auto f_arr_str = field("f_arr", arrow::list(arrow::utf8()));
+
+  auto rb_schema = arrow::schema({f_uint64, f_arr_str});
+
+  const std::vector<std::string> input_batch_1_data = {
+      R"([1, 2])", R"([["alice0", "bob1"], ["alice2"]])"};
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_batch_1_data, rb_schema, &input_batch_arr);
+  // expression handed to the "hash" splitter: the f_uint64 column itself
+  auto f_2 = TreeExprBuilder::MakeField(f_uint64);
+  auto expr_1 = TreeExprBuilder::MakeExpression(f_2, field("f_uint64", uint64()));
+
+  ARROW_ASSIGN_OR_THROW(splitter_, Splitter::Make("hash", rb_schema, num_partitions,
+                                                  {expr_1}, split_options_));
+
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 5);
+
+  CheckFileExsists(splitter_->DataFile());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  // contents are not compared here: only the column count and the column lengths
+  for (const auto& rb : batches) {
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto i = 0; i < rb->num_columns(); ++i) {
+      ASSERT_EQ(rb->column(i)->length(), rb->num_rows());
+    }
+  }
+}
+
+// "rr" split, 2 partitions, "lz4" codec: 6 rows of five list<T> columns.
+TEST_F(SplitterTest, TestRoundRobinListArraySplitterwithCompression) {
+  auto f_arr_str = field("f_arr", arrow::list(arrow::utf8()));
+  auto f_arr_bool = field("f_bool", arrow::list(arrow::boolean()));
+  auto f_arr_int32 = field("f_int32", arrow::list(arrow::int32()));
+  auto f_arr_double = field("f_double", arrow::list(arrow::float64()));
+  auto f_arr_decimal = field("f_decimal", arrow::list(arrow::decimal(10, 2)));
+  auto rb_schema =
+      arrow::schema({f_arr_str, f_arr_bool, f_arr_int32, f_arr_double, f_arr_decimal});
+  // one JSON document per column, six rows each
+  const std::vector<std::string> input_data_arr = {
+      R"([["alice0", "bob1"], ["alice2"], ["bob3"], ["Alice4", "Bob5", "AlicE6"], ["boB7"], ["ALICE8", "BOB9"]])",
+      R"([[true, null], [true, true, true], [false], [true], [false], [false]])",
+      R"([[1, 2, 3], [9, 8], [null], [3, 1], [0], [1, 9, null]])",
+      R"([[0.26121], [-9.12123, 6.111111], [8.121], [7.21, null], [3.2123, 6.1121], [null]])",
+      R"([["0.26"], ["-9.12", "6.11"], ["8.12"], ["7.21", null], ["3.21", "6.11"], [null]])"};
+
+  std::shared_ptr<arrow::RecordBatch> input_batch_arr;
+  MakeInputBatch(input_data_arr, rb_schema, &input_batch_arr);
+
+  int32_t num_partitions = 2;
+  split_options_.buffer_size = 4;
+  ARROW_ASSIGN_OR_THROW(splitter_,
+                        Splitter::Make("rr", rb_schema, num_partitions, split_options_));
+  // select the "lz4" codec on the splitter before any batch is split
+  auto compression_type = arrow::util::Codec::GetCompressionType("lz4");
+  ASSERT_NOT_OK(splitter_->SetCompressType(compression_type.MoveValueUnsafe()));
+  ASSERT_NOT_OK(splitter_->Split(*input_batch_arr));
+  ASSERT_NOT_OK(splitter_->Stop());
+
+  std::shared_ptr<arrow::ipc::RecordBatchReader> file_reader;
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+
+  // one length per partition; their sum must equal the size reported by file_
+  const auto& lengths = splitter_->PartitionLengths();
+  ASSERT_EQ(lengths.size(), 2);
+  ASSERT_EQ(*file_->GetSize(), lengths[0] + lengths[1]);
+
+  // the schema read back from the data file must equal the input schema
+  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+
+  // the first block is expected to hold input rows 0, 2 and 4
+  std::shared_ptr<arrow::RecordBatch> res_batch_0;
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[0, 2, 4]"))
+  std::vector<arrow::RecordBatch*> expected = {res_batch_0.get()};
+
+  // the freshly opened reader must yield exactly one batch, equal to the expected rows
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+
+  // the second block is expected to hold input rows 1, 3 and 5 (res_batch_0 is reused)
+  ARROW_ASSIGN_OR_THROW(res_batch_0, TakeRows(input_batch_arr, "[1, 3, 5]"))
+  expected = {res_batch_0.get()};
+
+  // reopen the reader, advance file_ by lengths[0], then expect the second block
+  batches.clear();
+  ARROW_ASSIGN_OR_THROW(file_reader, GetRecordBatchStreamReader(splitter_->DataFile()));
+  ASSERT_EQ(*file_reader->schema(), *rb_schema);
+  ASSERT_NOT_OK(file_->Advance(lengths[0]));
+  ASSERT_NOT_OK(file_reader->ReadAll(&batches));
+  ASSERT_EQ(batches.size(), 1);
+  for (auto i = 0; i < batches.size(); ++i) {
+    const auto& rb = batches[i];
+    ASSERT_EQ(rb->num_columns(), rb_schema->num_fields());
+    for (auto j = 0; j < rb->num_columns(); ++j) {
+      ASSERT_EQ(rb->column(j)->length(), rb->num_rows());
+    }
+    ASSERT_TRUE(rb->Equals(*expected[i]));
+  }
+}
+
+}  // namespace shuffle
+}  // namespace sparkcolumnarplugin

From 2bca314012bc3e2be93ee70483af9e45eded1d28 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Fri, 6 May 2022 17:19:14 +0800
Subject: [PATCH 14/19] remove unused variables

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index af8ca584e..bb9b9909b 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -1383,8 +1383,6 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
       partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size());
       std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(),
                 partition_buffer_idx_offset.begin());
-      std::vector<uint16_t> nullcnt;
-      nullcnt.resize(num_partitions_,0);
       for (auto row = 0; row < num_rows; ++row) {
         auto pid = partition_id_[row];
         auto dst_offset = partition_buffer_idx_offset[pid];

From 8d5a41efcebd2b9dccae8a1c1eb704fadb702057 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sat, 7 May 2022 17:16:44 +0800
Subject: [PATCH 15/19] allocate validity buffer from pool

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index bb9b9909b..8e4069509 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -621,16 +621,14 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
                   num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           }
 
+          if ()
+          arrays[i] = arrow::MakeArray(
+              arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
+                                      {buffers[0],buffers[1]}));
           if (reset_buffers) {
-            arrays[i] = arrow::MakeArray(
-                arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
-                                       {buffers[0],buffers[1]}));
             partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
                 nullptr;
             partition_fixed_width_value_addrs_[fixed_width_idx][partition_id] = nullptr;
-          } else {
-            arrays[i] = arrow::MakeArray(arrow::ArrayData::Make(
-                schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]}));
           }
           fixed_width_idx++;
           break;
@@ -937,6 +935,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
     auto col_idx = fixed_width_array_idx_[col];
     size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8;
     //check input_fixed_width_has_null_[col] is cheaper than GetNullCount()
+    // once input_fixed_width_has_null_ is set to true, we didn't reset it after spill
     if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) {
       input_fixed_width_has_null_[col] = true;
     }
@@ -1368,10 +1367,11 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
           // init bitmap if it's null, initialize the buffer as true
           auto new_size =
               std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size);
-          ARROW_ASSIGN_OR_RAISE(
-              auto validity_buffer,
-              arrow::AllocateResizableBuffer(arrow::BitUtil::BytesForBits(new_size),
-                                             options_.memory_pool));
+
+          std::shared_ptr<arrow::Buffer> validity_buffer;
+          auto status = AllocateBufferFromPool(validity_buffer,
+                                                arrow::BitUtil::BytesForBits(new_size));
+          ARROW_RETURN_NOT_OK(status);
           dst_addrs[pid] = const_cast<uint8_t*>(validity_buffer->data());
           arrow::BitUtil::SetBitsTo(dst_addrs[pid], 0, partition_buffer_idx_base_[pid],
                                     true);

From f2e2fb3f70c84ab0239eb8d9a340b3d58d812b9a Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Sun, 8 May 2022 21:18:21 +0800
Subject: [PATCH 16/19] fix bug: set validity buffer after allocation; fix bug
 in handling of last bits after processing validity buffer

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 8e4069509..a91ff96df 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -49,7 +49,7 @@ using arrow::internal::checked_cast;
 
 #ifndef SPLIT_BUFFER_SIZE
 // by default, allocate 8M block, 2M page size
-#define SPLIT_BUFFER_SIZE 8 * 1024 * 1024
+#define SPLIT_BUFFER_SIZE 8*1024*1024
 #endif
 
 template <typename T>
@@ -610,18 +610,17 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         default: {
           auto buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
           if (buffers[0] != nullptr) {
-            buffers[0] = arrow::SliceBuffer(buffers[0], 0, (num_rows >> 3) + 1);
+            buffers[0] = arrow::SliceBuffer(buffers[0], 0, arrow::BitUtil::BytesForBits(num_rows));
           }
           if (buffers[1] != nullptr) {
             if (column_type_id_[i]->id() == arrow::BooleanType::type_id)
-              buffers[1] = arrow::SliceBuffer(buffers[1], 0, (num_rows >> 3) + 1);
+              buffers[1] = arrow::SliceBuffer(buffers[1], 0, arrow::BitUtil::BytesForBits(num_rows));
             else
               buffers[1] = arrow::SliceBuffer(
                   buffers[1], 0,
                   num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           }
 
-          if ()
           arrays[i] = arrow::MakeArray(
               arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
                                       {buffers[0],buffers[1]}));
@@ -1367,18 +1366,15 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
           // init bitmap if it's null, initialize the buffer as true
           auto new_size =
               std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size);
-
           std::shared_ptr<arrow::Buffer> validity_buffer;
           auto status = AllocateBufferFromPool(validity_buffer,
                                                 arrow::BitUtil::BytesForBits(new_size));
           ARROW_RETURN_NOT_OK(status);
           dst_addrs[pid] = const_cast<uint8_t*>(validity_buffer->data());
-          arrow::BitUtil::SetBitsTo(dst_addrs[pid], 0, partition_buffer_idx_base_[pid],
-                                    true);
+          memset(validity_buffer->mutable_data(),0xff,validity_buffer->capacity());
           partition_fixed_width_buffers_[col][pid][0] = std::move(validity_buffer);
         }
       }
-
       auto src_addr = const_cast<uint8_t*>(rb.column_data(col_idx)->buffers[0]->data());
       partition_buffer_idx_offset.resize(partition_buffer_idx_base_.size());
       std::copy(partition_buffer_idx_base_.begin(), partition_buffer_idx_base_.end(),
@@ -1397,9 +1393,12 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
       {
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
           auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid];
-
-          arrow::BitUtil::SetBitsTo(dst_addrs[pid], lastoffset, 8-(lastoffset&0x7),
-                                    true);
+          uint8_t dst = dst_addrs[pid][lastoffset>>3];
+          uint8_t msk = 0x1 << (lastoffset & 0x7);
+          msk=~(msk-1);
+          msk &= ((lastoffset & 7) == 0)-1;
+          dst |= msk;
+          dst_addrs[pid][lastoffset>>3]=dst;
         }
       }
 

From 4f8dee59f909fd0d312a90fa6786f78da46d83d3 Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Mon, 9 May 2022 10:32:07 +0800
Subject: [PATCH 17/19] Add arrow check for batch size and partition number; use
 uint32 as row number size

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 31 ++++++++++++-------
 native-sql-engine/cpp/src/shuffle/splitter.h  | 20 ++++++------
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index a91ff96df..d3caabb03 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -303,6 +303,12 @@ arrow::Result<std::shared_ptr<Splitter>> Splitter::Make(
 }
 
 arrow::Status Splitter::Init() {
+
+  // partition number should be less than 64k
+  ARROW_CHECK_LE(num_partitions_,64*1024);
+  // split record batch size should be less than 32k
+  ARROW_CHECK_LE(options_.buffer_size,32*1024);
+
   const auto& fields = schema_->fields();
   ARROW_ASSIGN_OR_RAISE(column_type_id_, ToSplitterTypeId(schema_->fields()));
 
@@ -884,6 +890,9 @@ arrow::Result<int32_t> Splitter::SpillLargestPartition(int64_t* size) {
 
 arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
 
+  //buffer is allocated less than 64K
+  //ARROW_CHECK_LE(rb.num_rows(),64*1024);
+
 #ifdef PROCESSROW
 
   reducer_offsets_.resize(rb.num_rows());
@@ -901,7 +910,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
   }
   std::transform(reducer_offset_offset_.begin(), std::prev(reducer_offset_offset_.end()),
                  partition_id_cnt_.begin(), reducer_offset_offset_.begin(),
-                 [](uint16_t x, int16_t y) { return x - y; });
+                 [](row_offset_type x, row_offset_type y) { return x - y; });
 
 #endif
   // for the first input record batch, scan binary arrays and large binary
@@ -950,7 +959,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
   for (auto pid = 0; pid < num_partitions_; ++pid) {
     if (partition_id_cnt_[pid] > 0) {
       // make sure the size to be allocated is larger than the size to be filled
-      auto new_size = std::max((uint16_t)prealloc_row_cnt, partition_id_cnt_[pid]);
+      auto new_size = std::max((row_offset_type)prealloc_row_cnt, partition_id_cnt_[pid]);
       if (partition_buffer_size_[pid] == 0) {
         // allocate buffer if it's not yet allocated
         RETURN_NOT_OK(AllocatePartitionBuffers(pid, new_size));
@@ -1005,7 +1014,7 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
 arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb) {
   const auto num_rows = rb.num_rows();
   int64_t row;
-  std::vector<int16_t> partition_buffer_idx_offset;
+  std::vector<row_offset_type> partition_buffer_idx_offset;
 
   for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
     const auto& dst_addrs = partition_fixed_width_value_addrs_[col];
@@ -1020,7 +1029,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
   std::transform(partition_buffer_idx_offset_.begin(),                                   \
                  partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
                  partition_buffer_idx_offset_.begin(),                                   \
-                 [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); });          \
+                 [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); });          \
   for (auto pid = 0; pid < num_partitions_; pid++) {                                     \
     auto dst_pid_base =                                                                  \
         reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/            \
@@ -1039,7 +1048,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
   std::transform(partition_buffer_idx_offset_.begin(),                                   \
                  partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
                  partition_buffer_idx_offset_.begin(),                                   \
-                 [](uint8_t* x, int16_t y) { return x + y * sizeof(_CTYPE); });          \
+                 [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); });          \
   for (row = 0; row < num_rows; ++row) {                                                 \
     auto pid = partition_id_[row];                                                       \
     auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]);    \
@@ -1060,7 +1069,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
         std::transform(
             partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
             partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
-            [](uint8_t* x, int16_t y) { return x + y * sizeof(uint64_t); });
+            [](uint8_t* x, row_offset_type y) { return x + y * sizeof(uint64_t); });
         for (auto pid = 0; pid < num_partitions_; pid++) {
           auto dst_pid_base =
               reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/
@@ -1129,7 +1138,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
         std::transform(
             partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
             partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
-            [](uint8_t* x, int16_t y) { return x + y * 16; });
+            [](uint8_t* x, row_offset_type y) { return x + y * 16; });
         for (auto pid = 0; pid < num_partitions_; pid++) {
           auto dst_pid_base =
               reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid]); /*32k*/
@@ -1150,7 +1159,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
         std::transform(
             partition_buffer_idx_offset_.begin(), partition_buffer_idx_offset_.end(),
             partition_buffer_idx_base_.begin(), partition_buffer_idx_offset_.begin(),
-            [](uint8_t* x, int16_t y) { return x + y * 16; });
+            [](uint8_t* x, row_offset_type y) { return x + y * 16; });
         for (auto row = 0; row < num_rows; ++row) {
           auto pid = partition_id_[row];
           reinterpret_cast<uint64_t*>(partition_buffer_idx_offset_[pid])[0] =
@@ -1169,7 +1178,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
                   partition_buffer_idx_offset.begin());
         for (auto row = 0; row < num_rows; ++row) {
           auto pid = partition_id_[row];
-          uint16_t dst_offset = partition_buffer_idx_offset[pid];
+          row_offset_type dst_offset = partition_buffer_idx_offset[pid];
           dst_addrs[pid][dst_offset >> 3] ^=
               (dst_addrs[pid][dst_offset >> 3] >> (dst_offset & 7) ^
                src_addr[row >> 3] >> (row & 7))
@@ -1354,7 +1363,7 @@ arrow::Status Splitter::SplitFixedWidthValueBufferAVX(const arrow::RecordBatch&
 
 arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch& rb) {
   const auto num_rows = rb.num_rows();
-  std::vector<int16_t> partition_buffer_idx_offset;
+  std::vector<row_offset_type> partition_buffer_idx_offset;
 
   for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
     auto col_idx = fixed_width_array_idx_[col];
@@ -1365,7 +1374,7 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] == nullptr) {
           // init bitmap if it's null, initialize the buffer as true
           auto new_size =
-              std::max(partition_id_cnt_[pid], (uint16_t)options_.buffer_size);
+              std::max(partition_id_cnt_[pid], (row_offset_type)options_.buffer_size);
           std::shared_ptr<arrow::Buffer> validity_buffer;
           auto status = AllocateBufferFromPool(validity_buffer,
                                                 arrow::BitUtil::BytesForBits(new_size));
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index cc7440926..ace9e5661 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -48,6 +48,8 @@ class Splitter {
 
   virtual const std::shared_ptr<arrow::Schema>& input_schema() const { return schema_; }
 
+  typedef uint32_t row_offset_type;
+
   /**
    * Split input record batch into partition buffers according to the computed
    * partition id. The largest partition buffer will be spilled if memory
@@ -177,8 +179,8 @@ class Splitter {
 
   // partid
   std::vector<int32_t> partition_buffer_size_;
-  // partid
-  std::vector<uint16_t> partition_buffer_idx_base_;
+  // partid, value is reducer batch's offset, output rb rownum < 64k
+  std::vector<row_offset_type> partition_buffer_idx_base_;
   // partid
   // temp array to hold the destination pointer
   std::vector<uint8_t*> partition_buffer_idx_offset_;
@@ -231,14 +233,14 @@ class Splitter {
   std::vector<bool> input_fixed_width_has_null_;
 
   // updated for each input record batch
-  // col
+  // indexed by input row; value is the partition id of that row, part_num < 64k
   std::vector<uint16_t> partition_id_;
-  // [num_rows]
-  std::vector<uint16_t> reducer_offsets_;
-  // [num_partitions]
-  std::vector<uint16_t> reducer_offset_offset_;
-  // col
-  std::vector<uint16_t> partition_id_cnt_;
+  // [num_rows] ; value is offset in input record batch; input rb rownum < 64k
+  std::vector<row_offset_type> reducer_offsets_;
+  // [num_partitions]; value is offset of row in record batch; input rb rownum < 64k
+  std::vector<row_offset_type> reducer_offset_offset_;
+  // col  ; value is reducer's row number for each input record batch; output rb rownum < 64k
+  std::vector<row_offset_type> partition_id_cnt_;
 
   int32_t num_partitions_;
   std::shared_ptr<arrow::Schema> schema_;

From 791c0cc32d37ec009c71c83476b6822587fe3fbb Mon Sep 17 00:00:00 2001
From: binwei <binwei.yang@intel.com>
Date: Mon, 9 May 2022 11:23:42 +0800
Subject: [PATCH 18/19] format code

---
 native-sql-engine/cpp/src/shuffle/splitter.cc | 87 +++++++++----------
 native-sql-engine/cpp/src/shuffle/splitter.h  |  3 +-
 2 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index d3caabb03..3ee7ff014 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -49,7 +49,7 @@ using arrow::internal::checked_cast;
 
 #ifndef SPLIT_BUFFER_SIZE
 // by default, allocate 8M block, 2M page size
-#define SPLIT_BUFFER_SIZE 8*1024*1024
+#define SPLIT_BUFFER_SIZE 8 * 1024 * 1024
 #endif
 
 template <typename T>
@@ -303,11 +303,10 @@ arrow::Result<std::shared_ptr<Splitter>> Splitter::Make(
 }
 
 arrow::Status Splitter::Init() {
-
   // partition number should be less than 64k
-  ARROW_CHECK_LE(num_partitions_,64*1024);
+  ARROW_CHECK_LE(num_partitions_, 64 * 1024);
   // split record batch size should be less than 32k
-  ARROW_CHECK_LE(options_.buffer_size,32*1024);
+  ARROW_CHECK_LE(options_.buffer_size, 32 * 1024);
 
   const auto& fields = schema_->fields();
   ARROW_ASSIGN_OR_RAISE(column_type_id_, ToSplitterTypeId(schema_->fields()));
@@ -412,8 +411,8 @@ arrow::Status Splitter::Init() {
       arrow::util::Codec::CreateInt32(arrow::Compression::UNCOMPRESSED));
 
   // Allocate first buffer for split reducer
-  ARROW_ASSIGN_OR_RAISE(combine_buffer_, arrow::AllocateResizableBuffer(
-                                             0, options_.memory_pool));
+  ARROW_ASSIGN_OR_RAISE(combine_buffer_,
+                        arrow::AllocateResizableBuffer(0, options_.memory_pool));
   combine_buffer_->Resize(0, /*shrink_to_fit =*/false);
 
   return arrow::Status::OK();
@@ -616,20 +615,21 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
         default: {
           auto buffers = partition_fixed_width_buffers_[fixed_width_idx][partition_id];
           if (buffers[0] != nullptr) {
-            buffers[0] = arrow::SliceBuffer(buffers[0], 0, arrow::BitUtil::BytesForBits(num_rows));
+            buffers[0] =
+                arrow::SliceBuffer(buffers[0], 0, arrow::BitUtil::BytesForBits(num_rows));
           }
           if (buffers[1] != nullptr) {
             if (column_type_id_[i]->id() == arrow::BooleanType::type_id)
-              buffers[1] = arrow::SliceBuffer(buffers[1], 0, arrow::BitUtil::BytesForBits(num_rows));
+              buffers[1] = arrow::SliceBuffer(buffers[1], 0,
+                                              arrow::BitUtil::BytesForBits(num_rows));
             else
               buffers[1] = arrow::SliceBuffer(
                   buffers[1], 0,
                   num_rows * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           }
 
-          arrays[i] = arrow::MakeArray(
-              arrow::ArrayData::Make(schema_->field(i)->type(), num_rows,
-                                      {buffers[0],buffers[1]}));
+          arrays[i] = arrow::MakeArray(arrow::ArrayData::Make(
+              schema_->field(i)->type(), num_rows, {buffers[0], buffers[1]}));
           if (reset_buffers) {
             partition_fixed_width_validity_addrs_[fixed_width_idx][partition_id] =
                 nullptr;
@@ -661,7 +661,7 @@ arrow::Status Splitter::CacheRecordBatch(int32_t partition_id, bool reset_buffer
                        arrow::ipc::GetRecordBatchPayload(*batch, tiny_bach_write_options_,
                                                          payload.get()));
 #endif
-    
+
     partition_cached_recordbatch_size_[partition_id] += payload->body_length;
     partition_cached_recordbatch_[partition_id].push_back(std::move(payload));
     partition_buffer_idx_base_[partition_id] = 0;
@@ -728,21 +728,20 @@ arrow::Status Splitter::AllocatePartitionBuffers(int32_t partition_id, int32_t n
         std::shared_ptr<arrow::Buffer> value_buffer;
         if (column_type_id_[i]->id() == arrow::BooleanType::type_id) {
           auto status = AllocateBufferFromPool(value_buffer,
-                                                arrow::BitUtil::BytesForBits(new_size));
+                                               arrow::BitUtil::BytesForBits(new_size));
           ARROW_RETURN_NOT_OK(status);
         } else {
           auto status = AllocateBufferFromPool(
-              value_buffer,
-              new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
+              value_buffer, new_size * (arrow::bit_width(column_type_id_[i]->id()) >> 3));
           ARROW_RETURN_NOT_OK(status);
         }
         new_value_buffers.push_back(std::move(value_buffer));
         if (input_fixed_width_has_null_[fixed_width_idx]) {
           std::shared_ptr<arrow::Buffer> validity_buffer;
           auto status = AllocateBufferFromPool(validity_buffer,
-                                                arrow::BitUtil::BytesForBits(new_size));
+                                               arrow::BitUtil::BytesForBits(new_size));
           ARROW_RETURN_NOT_OK(status);
-          //initialize all true once allocated
+          // initialize all true once allocated
           memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity());
           new_validity_buffers.push_back(std::move(validity_buffer));
         } else {
@@ -811,7 +810,7 @@ arrow::Status Splitter::AllocateNew(int32_t partition_id, int32_t new_size) {
               << std::to_string(partition_id) << std::endl;
     int64_t spilled_size;
     ARROW_ASSIGN_OR_RAISE(auto partition_to_spill, SpillLargestPartition(&spilled_size));
-    if (partition_to_spill == -1) { 
+    if (partition_to_spill == -1) {
       std::cout << "Failed to allocate new buffer for partition "
                 << std::to_string(partition_id) << ". No partition buffer to spill."
                 << std::endl;
@@ -851,16 +850,16 @@ arrow::Status Splitter::SpillPartition(int32_t partition_id) {
   }
   TIME_NANO_OR_RAISE(total_spill_time_, partition_writer_[partition_id]->Spill());
 
-  //reset validity buffer after spill
+  // reset validity buffer after spill
   std::for_each(partition_fixed_width_buffers_.begin(),
-      partition_fixed_width_buffers_.end(),[partition_id](std::vector<arrow::BufferVector>& bufs){
-        if (bufs[partition_id][0]!=nullptr)
-        {
-          //initialize all true once allocated
-          auto addr = bufs[partition_id][0]->mutable_data();
-          memset(addr,0xff,bufs[partition_id][0]->capacity());
-        }
-      });
+                partition_fixed_width_buffers_.end(),
+                [partition_id](std::vector<arrow::BufferVector>& bufs) {
+                  if (bufs[partition_id][0] != nullptr) {
+                    // initialize all true once allocated
+                    auto addr = bufs[partition_id][0]->mutable_data();
+                    memset(addr, 0xff, bufs[partition_id][0]->capacity());
+                  }
+                });
 
   return arrow::Status::OK();
 }
@@ -889,9 +888,8 @@ arrow::Result<int32_t> Splitter::SpillLargestPartition(int64_t* size) {
 }
 
 arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
-
-  //buffer is allocated less than 64K
-  //ARROW_CHECK_LE(rb.num_rows(),64*1024);
+  // buffer is allocated less than 64K
+  // ARROW_CHECK_LE(rb.num_rows(),64*1024);
 
 #ifdef PROCESSROW
 
@@ -942,9 +940,10 @@ arrow::Status Splitter::DoSplit(const arrow::RecordBatch& rb) {
   for (auto col = 0; col < fixed_width_array_idx_.size(); ++col) {
     auto col_idx = fixed_width_array_idx_[col];
     size_per_row += arrow::bit_width(column_type_id_[col_idx]->id()) / 8;
-    //check input_fixed_width_has_null_[col] is cheaper than GetNullCount()
-    // once input_fixed_width_has_null_ is set to true, we didn't reset it after spill
-    if (input_fixed_width_has_null_[col]==false && rb.column_data(col_idx)->GetNullCount() != 0) {
+    // checking input_fixed_width_has_null_[col] is cheaper than calling GetNullCount();
+    // once input_fixed_width_has_null_ is set to true, it is not reset after a spill
+    if (input_fixed_width_has_null_[col] == false &&
+        rb.column_data(col_idx)->GetNullCount() != 0) {
       input_fixed_width_has_null_[col] = true;
     }
   }
@@ -1029,7 +1028,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
   std::transform(partition_buffer_idx_offset_.begin(),                                   \
                  partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
                  partition_buffer_idx_offset_.begin(),                                   \
-                 [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); });          \
+                 [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); });  \
   for (auto pid = 0; pid < num_partitions_; pid++) {                                     \
     auto dst_pid_base =                                                                  \
         reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]); /*32k*/            \
@@ -1048,7 +1047,7 @@ arrow::Status Splitter::SplitFixedWidthValueBuffer(const arrow::RecordBatch& rb)
   std::transform(partition_buffer_idx_offset_.begin(),                                   \
                  partition_buffer_idx_offset_.end(), partition_buffer_idx_base_.begin(), \
                  partition_buffer_idx_offset_.begin(),                                   \
-                 [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); });          \
+                 [](uint8_t* x, row_offset_type y) { return x + y * sizeof(_CTYPE); });  \
   for (row = 0; row < num_rows; ++row) {                                                 \
     auto pid = partition_id_[row];                                                       \
     auto dst_pid_base = reinterpret_cast<_CTYPE*>(partition_buffer_idx_offset_[pid]);    \
@@ -1377,10 +1376,10 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
               std::max(partition_id_cnt_[pid], (row_offset_type)options_.buffer_size);
           std::shared_ptr<arrow::Buffer> validity_buffer;
           auto status = AllocateBufferFromPool(validity_buffer,
-                                                arrow::BitUtil::BytesForBits(new_size));
+                                               arrow::BitUtil::BytesForBits(new_size));
           ARROW_RETURN_NOT_OK(status);
           dst_addrs[pid] = const_cast<uint8_t*>(validity_buffer->data());
-          memset(validity_buffer->mutable_data(),0xff,validity_buffer->capacity());
+          memset(validity_buffer->mutable_data(), 0xff, validity_buffer->capacity());
           partition_fixed_width_buffers_[col][pid][0] = std::move(validity_buffer);
         }
       }
@@ -1398,19 +1397,17 @@ arrow::Status Splitter::SplitFixedWidthValidityBuffer(const arrow::RecordBatch&
         partition_buffer_idx_offset[pid]++;
       }
       // the last row may update the following bits to 0, reinitialize it as 1
-      for(auto pid=0;pid<num_partitions_;pid++)
-      {
+      for (auto pid = 0; pid < num_partitions_; pid++) {
         if (partition_id_cnt_[pid] > 0 && dst_addrs[pid] != nullptr) {
-          auto lastoffset = partition_buffer_idx_base_[pid]+partition_id_cnt_[pid];
-          uint8_t dst = dst_addrs[pid][lastoffset>>3];
+          auto lastoffset = partition_buffer_idx_base_[pid] + partition_id_cnt_[pid];
+          uint8_t dst = dst_addrs[pid][lastoffset >> 3];
           uint8_t msk = 0x1 << (lastoffset & 0x7);
-          msk=~(msk-1);
-          msk &= ((lastoffset & 7) == 0)-1;
+          msk = ~(msk - 1);
+          msk &= ((lastoffset & 7) == 0) - 1;
           dst |= msk;
-          dst_addrs[pid][lastoffset>>3]=dst;
+          dst_addrs[pid][lastoffset >> 3] = dst;
         }
       }
-
     }
   }
   return arrow::Status::OK();
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.h b/native-sql-engine/cpp/src/shuffle/splitter.h
index ace9e5661..ab71446f9 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.h
+++ b/native-sql-engine/cpp/src/shuffle/splitter.h
@@ -239,7 +239,8 @@ class Splitter {
   std::vector<row_offset_type> reducer_offsets_;
   // [num_partitions]; value is offset of row in record batch; input rb rownum < 64k
   std::vector<row_offset_type> reducer_offset_offset_;
-  // col  ; value is reducer's row number for each input record batch; output rb rownum < 64k
+  // indexed by partition id; value is the number of rows of the current input record
+  // batch assigned to that partition; output rb rownum < 64k
   std::vector<row_offset_type> partition_id_cnt_;
 
   int32_t num_partitions_;

From 1d8b0d65ff18b60ded9f4348585b6b68364a5922 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Mon, 9 May 2022 13:09:28 +0800
Subject: [PATCH 19/19] fix format

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .../src/benchmarks/shuffle_split_benchmark.cc | 84 +++++++++----------
 native-sql-engine/cpp/src/shuffle/splitter.cc |  4 +-
 .../cpp/src/tests/shuffle_split_test.cc       | 18 ++--
 .../src/third_party/parallel_hashmap/btree.h  | 21 ++---
 .../src/third_party/parallel_hashmap/phmap.h  | 12 +--
 .../third_party/parallel_hashmap/phmap_base.h | 36 ++++----
 6 files changed, 88 insertions(+), 87 deletions(-)

diff --git a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
index 6f9a7f19e..106d7dba8 100644
--- a/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
+++ b/native-sql-engine/cpp/src/benchmarks/shuffle_split_benchmark.cc
@@ -143,7 +143,7 @@ class LargePageMemoryPool : public MemoryPool {
       pool_->Free(buffer, size, ALIGNMENT);
     }
 #else
-      pool_->Free(buffer, size);
+    pool_->Free(buffer, size);
 #endif
   }
 
@@ -321,27 +321,27 @@ class BenchmarkShuffleSplit_CacheScan_Benchmark : public BenchmarkShuffleSplit {
                 const int num_partitions, SplitOptions options, benchmark::State& state) {
     std::vector<int> local_column_indices;
     local_column_indices.push_back(0);
-/*    local_column_indices.push_back(0);
-    local_column_indices.push_back(1);
-    local_column_indices.push_back(2);
-    local_column_indices.push_back(4);
-    local_column_indices.push_back(5);
-    local_column_indices.push_back(6);
-    local_column_indices.push_back(7);*/
+    /*    local_column_indices.push_back(0);
+        local_column_indices.push_back(1);
+        local_column_indices.push_back(2);
+        local_column_indices.push_back(4);
+        local_column_indices.push_back(5);
+        local_column_indices.push_back(6);
+        local_column_indices.push_back(7);*/
 
     std::shared_ptr<arrow::Schema> local_schema;
     local_schema = std::make_shared<arrow::Schema>(*schema.get());
 
-/*    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8));
-    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3));
-*/
+    /*    ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(15));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(14));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(13));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(12));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(11));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(10));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(9));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(8));
+        ARROW_ASSIGN_OR_THROW(local_schema, local_schema->RemoveField(3));
+    */
     if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl;
 
     ARROW_ASSIGN_OR_THROW(splitter,
@@ -497,30 +497,30 @@ int main(int argc, char** argv) {
       ->MeasureProcessCPUTime()
       ->Unit(benchmark::kSecond);
 
-/*    sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark
-    bck(datafile);
-
-    benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
-      ->Iterations(1)
-        ->Args({96*2, arrow::Compression::FASTPFOR})
-        ->Args({96*4, arrow::Compression::FASTPFOR})
-        ->Args({96*8, arrow::Compression::FASTPFOR})
-        ->Args({96*16, arrow::Compression::FASTPFOR})
-        ->Args({96*32, arrow::Compression::FASTPFOR})
-        ->Threads(24)
-        ->Unit(benchmark::kSecond);
-
-    benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
-      ->Iterations(1)
-        ->Args({4096, arrow::Compression::FASTPFOR})
-        ->Threads(1)
-        ->Threads(2)
-        ->Threads(4)
-        ->Threads(8)
-        ->Threads(16)
-        ->Threads(24)
-        ->Unit(benchmark::kSecond);
-*/
+  /*    sparkcolumnarplugin::shuffle::BenchmarkShuffleSplit_IterateScan_Benchmark
+      bck(datafile);
+
+      benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
+        ->Iterations(1)
+          ->Args({96*2, arrow::Compression::FASTPFOR})
+          ->Args({96*4, arrow::Compression::FASTPFOR})
+          ->Args({96*8, arrow::Compression::FASTPFOR})
+          ->Args({96*16, arrow::Compression::FASTPFOR})
+          ->Args({96*32, arrow::Compression::FASTPFOR})
+          ->Threads(24)
+          ->Unit(benchmark::kSecond);
+
+      benchmark::RegisterBenchmark("BenchmarkShuffleSplit::IterateScan", bck)
+        ->Iterations(1)
+          ->Args({4096, arrow::Compression::FASTPFOR})
+          ->Threads(1)
+          ->Threads(2)
+          ->Threads(4)
+          ->Threads(8)
+          ->Threads(16)
+          ->Threads(24)
+          ->Unit(benchmark::kSecond);
+  */
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
diff --git a/native-sql-engine/cpp/src/shuffle/splitter.cc b/native-sql-engine/cpp/src/shuffle/splitter.cc
index 3ee7ff014..5213d607a 100644
--- a/native-sql-engine/cpp/src/shuffle/splitter.cc
+++ b/native-sql-engine/cpp/src/shuffle/splitter.cc
@@ -1622,8 +1622,8 @@ arrow::Status HashSplitter::ComputeAndCountPartitionId(const arrow::RecordBatch&
         "lea (%[num_partitions],%[pid],1),%[tmp]\n"
         "test %[pid],%[pid]\n"
         "cmovs %[tmp],%[pid]\n"
-        : [pid] "+r"(pid)
-        : [num_partitions] "r"(num_partitions_), [tmp] "r"(0));
+        : [ pid ] "+r"(pid)
+        : [ num_partitions ] "r"(num_partitions_), [ tmp ] "r"(0));
     partition_id_[i] = pid;
     partition_id_cnt_[pid]++;
   }
diff --git a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
index d5d8de0bf..fa03be61d 100644
--- a/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
+++ b/native-sql-engine/cpp/src/tests/shuffle_split_test.cc
@@ -54,8 +54,8 @@ class MyMemoryPool : public arrow::MemoryPool {
     }
     RETURN_NOT_OK(pool_->Allocate(size, out));
     stats_.UpdateAllocatedBytes(size);
-     //std::cout << "Allocate: size = " << size << " addr = " << std::hex <<
-     //(uint64_t)*out << std::dec << std::endl;
+    // std::cout << "Allocate: size = " << size << " addr = " << std::hex <<
+    //(uint64_t)*out << std::dec << std::endl;
     // print_trace();
     return arrow::Status::OK();
   }
@@ -67,19 +67,19 @@ class MyMemoryPool : public arrow::MemoryPool {
     auto old_ptr = *ptr;
     RETURN_NOT_OK(pool_->Reallocate(old_size, new_size, ptr));
     stats_.UpdateAllocatedBytes(new_size - old_size);
-     //std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex <<
-     //(uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " <<
-     //std::hex << (uint64_t)*ptr << std::dec << std::endl;
-    //print_trace();
+    // std::cout << "Reallocate: old_size = " << old_size << " old_ptr = " << std::hex <<
+    //(uint64_t)old_ptr << std::dec << " new_size = " << new_size << " addr = " <<
+    // std::hex << (uint64_t)*ptr << std::dec << std::endl;
+    // print_trace();
     return arrow::Status::OK();
   }
 
   void Free(uint8_t* buffer, int64_t size) override {
     pool_->Free(buffer, size);
     stats_.UpdateAllocatedBytes(-size);
-     //std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer
-     //<< std::dec << std::endl;
-    //print_trace();
+    // std::cout << "Free: size = " << size << " addr = " << std::hex << (uint64_t)buffer
+    //<< std::dec << std::endl;
+    // print_trace();
   }
 
   int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h
index 24c2d145b..b9b0d94da 100644
--- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h
+++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/btree.h
@@ -661,9 +661,9 @@ constexpr bool do_less_than_comparison(const Compare& compare, const K& x, const
 // SFINAE prevents implicit conversions to int (such as from bool).
 template <typename Int, phmap::enable_if_t<std::is_same<int, Int>::value, int> = 0>
 constexpr phmap::weak_ordering compare_result_as_ordering(const Int c) {
-  return c < 0    ? phmap::weak_ordering::less
-         : c == 0 ? phmap::weak_ordering::equivalent
-                  : phmap::weak_ordering::greater;
+  return c < 0
+             ? phmap::weak_ordering::less
+             : c == 0 ? phmap::weak_ordering::equivalent : phmap::weak_ordering::greater;
 }
 constexpr phmap::weak_ordering compare_result_as_ordering(const phmap::weak_ordering c) {
   return c;
@@ -685,9 +685,9 @@ template <
         int> = 0>
 constexpr phmap::weak_ordering do_three_way_comparison(const Compare& compare, const K& x,
                                                        const LK& y) {
-  return compare(x, y)   ? phmap::weak_ordering::less
-         : compare(y, x) ? phmap::weak_ordering::greater
-                         : phmap::weak_ordering::equivalent;
+  return compare(x, y) ? phmap::weak_ordering::less
+                       : compare(y, x) ? phmap::weak_ordering::greater
+                                       : phmap::weak_ordering::equivalent;
 }
 
 }  // namespace compare_internal
@@ -1063,10 +1063,11 @@ class btree_node {
   // Compute how many values we can fit onto a leaf node taking into account
   // padding.
   constexpr static size_type NodeTargetValues(const int begin, const int end) {
-    return begin == end ? begin
-           : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize
-               ? NodeTargetValues(begin, (begin + end) / 2)
-               : NodeTargetValues((begin + end) / 2 + 1, end);
+    return begin == end
+               ? begin
+               : SizeWithNValues((begin + end) / 2 + 1) > params_type::kTargetNodeSize
+                     ? NodeTargetValues(begin, (begin + end) / 2)
+                     : NodeTargetValues((begin + end) / 2 + 1, end);
   }
 
   enum {
diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h
index 05d227a43..4628cca30 100644
--- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h
+++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap.h
@@ -2156,13 +2156,13 @@ class raw_hash_map : public raw_hash_set<Policy, Hash, Eq, Alloc> {
   // incomplete types as values, as in unordered_map<K, IncompleteType>.
   // MappedReference<> may be a non-reference type.
   template <class P>
-  using MappedReference = decltype(P::value(
-      std::addressof(std::declval<typename raw_hash_map::reference>())));
+  using MappedReference = decltype(
+      P::value(std::addressof(std::declval<typename raw_hash_map::reference>())));
 
   // MappedConstReference<> may be a non-reference type.
   template <class P>
-  using MappedConstReference = decltype(P::value(
-      std::addressof(std::declval<typename raw_hash_map::const_reference>())));
+  using MappedConstReference = decltype(
+      P::value(std::addressof(std::declval<typename raw_hash_map::const_reference>())));
 
   using KeyArgImpl = KeyArg<IsTransparent<Eq>::value && IsTransparent<Hash>::value>;
 
@@ -3409,8 +3409,8 @@ class parallel_hash_map
   // incomplete types as values, as in unordered_map<K, IncompleteType>.
   // MappedReference<> may be a non-reference type.
   template <class P>
-  using MappedReference = decltype(P::value(
-      std::addressof(std::declval<typename parallel_hash_map::reference>())));
+  using MappedReference = decltype(
+      P::value(std::addressof(std::declval<typename parallel_hash_map::reference>())));
 
   // MappedConstReference<> may be a non-reference type.
   template <class P>
diff --git a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h
index 0f4e6375d..3b3b6b120 100644
--- a/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h
+++ b/native-sql-engine/cpp/src/third_party/parallel_hashmap/phmap_base.h
@@ -826,8 +826,8 @@ struct Invoker {
 
 // The result type of Invoke<F, Args...>.
 template <typename F, typename... Args>
-using InvokeT = decltype(Invoker<F, Args...>::type::Invoke(std::declval<F>(),
-                                                           std::declval<Args>()...));
+using InvokeT = decltype(
+    Invoker<F, Args...>::type::Invoke(std::declval<F>(), std::declval<Args>()...));
 
 // Invoke(f, args...) is an implementation of INVOKE(f, args...) from section
 // [func.require] of the C++ standard.
@@ -1002,10 +1002,9 @@ constexpr T&& forward(
 namespace utility_internal {
 // Helper method for expanding tuple into a called method.
 template <typename Functor, typename Tuple, std::size_t... Indexes>
-auto apply_helper(Functor&& functor, Tuple&& t, index_sequence<Indexes...>)
-    -> decltype(phmap::base_internal::Invoke(
-        phmap::forward<Functor>(functor),
-        std::get<Indexes>(phmap::forward<Tuple>(t))...)) {
+auto apply_helper(Functor&& functor, Tuple&& t, index_sequence<Indexes...>) -> decltype(
+    phmap::base_internal::Invoke(phmap::forward<Functor>(functor),
+                                 std::get<Indexes>(phmap::forward<Tuple>(t))...)) {
   return phmap::base_internal::Invoke(phmap::forward<Functor>(functor),
                                       std::get<Indexes>(phmap::forward<Tuple>(t))...);
 }
@@ -1888,18 +1887,19 @@ class optional_assign_base<copy_traits::non_movable> {
 
 template <typename T>
 constexpr copy_traits get_ctor_copy_traits() {
-  return std::is_copy_constructible<T>::value   ? copy_traits::copyable
-         : std::is_move_constructible<T>::value ? copy_traits::movable
-                                                : copy_traits::non_movable;
+  return std::is_copy_constructible<T>::value
+             ? copy_traits::copyable
+             : std::is_move_constructible<T>::value ? copy_traits::movable
+                                                    : copy_traits::non_movable;
 }
 
 template <typename T>
 constexpr copy_traits get_assign_copy_traits() {
   return phmap::is_copy_assignable<T>::value && std::is_copy_constructible<T>::value
              ? copy_traits::copyable
-         : phmap::is_move_assignable<T>::value && std::is_move_constructible<T>::value
-             ? copy_traits::movable
-             : copy_traits::non_movable;
+             : phmap::is_move_assignable<T>::value && std::is_move_constructible<T>::value
+                   ? copy_traits::movable
+                   : copy_traits::non_movable;
 }
 
 // Whether T is constructible or convertible from optional<U>.
@@ -2421,9 +2421,9 @@ constexpr optional<T> make_optional(std::initializer_list<U> il, Args&&... args)
 template <typename T, typename U>
 constexpr auto operator==(const optional<T>& x, const optional<U>& y)
     -> decltype(optional_internal::convertible_to_bool(*x == *y)) {
-  return static_cast<bool>(x) != static_cast<bool>(y) ? false
-         : static_cast<bool>(x) == false              ? true
-                                                      : static_cast<bool>(*x == *y);
+  return static_cast<bool>(x) != static_cast<bool>(y)
+             ? false
+             : static_cast<bool>(x) == false ? true : static_cast<bool>(*x == *y);
 }
 
 // Returns: If bool(x) != bool(y), true; otherwise, if bool(x) == false, false;
@@ -2431,9 +2431,9 @@ constexpr auto operator==(const optional<T>& x, const optional<U>& y)
 template <typename T, typename U>
 constexpr auto operator!=(const optional<T>& x, const optional<U>& y)
     -> decltype(optional_internal::convertible_to_bool(*x != *y)) {
-  return static_cast<bool>(x) != static_cast<bool>(y) ? true
-         : static_cast<bool>(x) == false              ? false
-                                                      : static_cast<bool>(*x != *y);
+  return static_cast<bool>(x) != static_cast<bool>(y)
+             ? true
+             : static_cast<bool>(x) == false ? false : static_cast<bool>(*x != *y);
 }
 // Returns: If !y, false; otherwise, if !x, true; otherwise *x < *y.
 template <typename T, typename U>