From 52b78aecf60713587d0c97ce46d5f323e6287e95 Mon Sep 17 00:00:00 2001
From: zouxiang
Date: Tue, 6 Jun 2023 20:34:45 +0800
Subject: [PATCH] refactor: optimize sst filter build to consume less CPU (#967)

## Rationale
When doing benchmark, xor filter build cost too much CPU.

## Detailed Changes
- Remove datum to_vec

## Test Plan

---------

Co-authored-by: jiacai2050
---
 analytic_engine/src/sst/parquet/writer.rs |  5 +-
 common_types/src/datum.rs                 | 77 +++++++++++++++++++++++
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs
index 7b7e14c109..5e1adddb05 100644
--- a/analytic_engine/src/sst/parquet/writer.rs
+++ b/analytic_engine/src/sst/parquet/writer.rs
@@ -153,8 +153,9 @@ impl RecordBatchGroupWriter {
             for (col_idx, column) in partial_batch.columns().iter().enumerate() {
                 for row in 0..column.num_rows() {
                     let datum = column.datum(row);
-                    let bytes = datum.to_bytes();
-                    builder.add_key(col_idx, &bytes);
+                    datum.do_with_bytes(|bytes| {
+                        builder.add_key(col_idx, bytes);
+                    });
                 }
             }
         }
diff --git a/common_types/src/datum.rs b/common_types/src/datum.rs
index b53c1a0f78..bf694a5c6d 100644
--- a/common_types/src/datum.rs
+++ b/common_types/src/datum.rs
@@ -562,6 +562,83 @@ impl Datum {
         }
     }
 
+    /// Invokes `f` exactly once with this datum's byte representation, borrowing
+    /// the bytes instead of allocating a `Vec`.
+    ///
+    /// Numeric variants, `Date` and `Time` pass the little-endian bytes of their
+    /// value; `Timestamp` passes the little-endian bytes of `as_i64()`; `Boolean`
+    /// passes `[1]` or `[0]`; `Null` passes `[0]`; `Varbinary` and `String` pass
+    /// their contents.
+    pub fn do_with_bytes<F>(&self, mut f: F)
+    where
+        F: FnMut(&[u8]),
+    {
+        match self {
+            Datum::Double(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Float(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::UInt64(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::UInt32(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::UInt16(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::UInt8(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Int64(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Int32(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Int16(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Int8(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Boolean(v) => {
+                if *v {
+                    f(&[1])
+                } else {
+                    f(&[0])
+                }
+            }
+            Datum::Null => f(&[0]),
+            Datum::Timestamp(v) => {
+                let arr = v.as_i64().to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Varbinary(v) => f(v.as_ref()),
+            Datum::String(v) => f(v.as_bytes()),
+            Datum::Date(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+            Datum::Time(v) => {
+                let arr = v.to_le_bytes();
+                f(arr.as_slice())
+            }
+        }
+    }
+
     pub fn to_bytes(&self) -> Vec<u8> {
         match self {
             Datum::Double(v) => v.to_le_bytes().to_vec(),