diff --git a/Cargo.lock b/Cargo.lock index 86e7b6767203b..99a32c432bf1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1440,7 +1440,6 @@ dependencies = [ "dashmap", "educe", "enum-as-inner", - "enum_dispatch", "ethnum", "futures", "goldenfile", @@ -1476,6 +1475,7 @@ dependencies = [ "common-settings", "jsonb", "lexical-core", + "match-template", "micromarshal", "num", "ordered-float 3.4.0", diff --git a/src/query/catalog/src/plan/pushdown.rs b/src/query/catalog/src/plan/pushdown.rs index 5dfc170feca50..345986594cae8 100644 --- a/src/query/catalog/src/plan/pushdown.rs +++ b/src/query/catalog/src/plan/pushdown.rs @@ -93,8 +93,7 @@ impl PushDownInfo { if let RemoteExpr::::ColumnRef { id, .. } = &order.0 { // TODO: support sub column of nested type. let field = schema.field_with_name(id).unwrap(); - let data_type: DataType = field.data_type().into(); - if !support(&data_type) { + if !support(&field.data_type().into()) { return None; } diff --git a/src/query/expression/Cargo.toml b/src/query/expression/Cargo.toml index d2089b0d829a7..465bde5e68d6b 100755 --- a/src/query/expression/Cargo.toml +++ b/src/query/expression/Cargo.toml @@ -27,7 +27,6 @@ comfy-table = "6" dashmap = "5.4" educe = "0.4" enum-as-inner = "0.5" -enum_dispatch = "0.3.8" ethnum = { version = "1.3", features = ["serde", "macros"] } futures = "0.3.24" hex = "0.4.3" diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs index bec57c05e3d79..eb04f545c702a 100644 --- a/src/query/expression/src/block.rs +++ b/src/query/expression/src/block.rs @@ -452,9 +452,8 @@ impl DataBlock { let column = if !block_column_ids.contains(&column_id) { let default_val = &default_vals[i]; let table_data_type = field.data_type(); - let data_type: DataType = table_data_type.into(); BlockEntry { - data_type, + data_type: table_data_type.into(), value: Value::Scalar(default_val.to_owned()), } } else { diff --git a/src/query/expression/src/deserializations/array.rs b/src/query/expression/src/deserializations/array.rs deleted file mode 100644 index d4245aa5cf146..0000000000000 --- a/src/query/expression/src/deserializations/array.rs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; - -use crate::types::array::ArrayColumn; -use crate::types::AnyType; -use crate::types::DataType; -use crate::types::ValueType; -use crate::Column; -use crate::ColumnBuilder; -use crate::Scalar; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; - -pub struct ArrayDeserializer { - pub inner: Box, - inner_ty: DataType, - offsets: Vec, -} - -impl ArrayDeserializer { - pub fn with_capacity(capacity: usize, inner_ty: &DataType) -> Self { - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(0); - Self { - inner: Box::new(TypeDeserializerImpl::with_capacity(inner_ty, capacity)), - inner_ty: inner_ty.clone(), - offsets, - } - } - - pub fn add_offset(&mut self, size: usize) { - self.offsets - .push(*self.offsets.last().unwrap() + size as u64); - } - - pub fn pop_offset(&mut self) -> Result { - if self.offsets.len() <= 1 { - return Err(ErrorCode::BadDataValueType("Array is empty".to_string())); - } - let total = self.offsets.pop().unwrap(); - Ok((total - *self.offsets.last().unwrap()) as usize) - } -} - -impl TypeDeserializer for ArrayDeserializer { - fn memory_size(&self) -> usize { - self.inner.memory_size() + self.offsets.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - self.offsets.len() - 1 - } - - fn de_binary(&mut self, reader: &mut &[u8], format: &FormatSettings) -> Result<()> { - let size = reader.read_uvarint()?; - for _i in 0..size { - self.inner.de_binary(reader, format)?; - } - self.add_offset(size as usize); - Ok(()) - } - - fn de_default(&mut self) { - self.add_offset(0); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let size = reader.read_uvarint()?; - for _i in 0..size { - self.inner.de_binary(&mut reader, format)?; - } - self.add_offset(size as usize); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::Array(vals) => { - for val in vals { - self.inner.de_json(val, format)?; - } - self.add_offset(vals.len()); - Ok(()) - } - _ => Err(ErrorCode::BadBytes("Incorrect json value, must be array")), - } - } - - fn append_data_value(&mut self, value: Scalar, format: &FormatSettings) -> Result<()> { - let value = value.as_array().unwrap(); - for val in AnyType::iter_column(value) { - self.inner.append_data_value(val.to_owned(), format)?; - } - self.add_offset(value.len()); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - let size = self.pop_offset()?; - let mut vals = Vec::with_capacity(size); - for _ in 0..size { - let val = self.inner.pop_data_value()?; - vals.push(val); - } - let mut builder = ColumnBuilder::with_capacity(&self.inner_ty, size); - while !vals.is_empty() { - builder.push(vals.pop().unwrap().as_ref()); - } - Ok(Scalar::Array(builder.build())) - } - - fn finish_to_column(&mut self) -> Column { - let values = self.inner.finish_to_column(); - let offsets = std::mem::take(&mut self.offsets); - Column::Array(Box::new(ArrayColumn { - values, - offsets: offsets.into(), - })) - } -} diff --git a/src/query/expression/src/deserializations/boolean.rs b/src/query/expression/src/deserializations/boolean.rs deleted file mode 100644 index 9dc2fa32f3836..0000000000000 --- a/src/query/expression/src/deserializations/boolean.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_arrow::arrow::bitmap::MutableBitmap; -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; - -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub type BooleanDeserializer = MutableBitmap; - -impl TypeDeserializer for BooleanDeserializer { - fn memory_size(&self) -> usize { - self.len() - } - - fn len(&self) -> usize { - MutableBitmap::len(self) - } - - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let value: bool = reader.read_scalar()?; - self.push(value); - Ok(()) - } - - fn de_default(&mut self) { - self.push(false); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let value: bool = reader.read_scalar()?; - self.push(value); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, _format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::Bool(v) => self.push(*v), - _ => return Err(ErrorCode::from("Incorrect boolean value")), - } - Ok(()) - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let v = value - .as_boolean() - .ok_or_else(|| ErrorCode::from("Unable to get boolean value"))?; - self.push(*v); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.pop() { - Some(v) => Ok(Scalar::Boolean(v)), - None => Err(ErrorCode::from( - "Boolean column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - self.shrink_to_fit(); - let bitmap = std::mem::replace(self, Self::with_capacity(0)); - Column::Boolean(bitmap.into()) - } -} diff --git a/src/query/expression/src/deserializations/date.rs b/src/query/expression/src/deserializations/date.rs deleted file mode 100644 index 41d86cc164d61..0000000000000 --- a/src/query/expression/src/deserializations/date.rs +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Cursor; - -use chrono::Datelike; -use chrono::NaiveDate; -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::cursor_ext::BufferReadDateTimeExt; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; - -use crate::types::date::check_date; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub struct DateDeserializer { - pub buffer: Vec, - pub builder: Vec, -} - -impl DateDeserializer { - pub fn with_capacity(capacity: usize) -> Self { - Self { - buffer: vec![], - builder: Vec::with_capacity(capacity), - } - } -} - -impl TypeDeserializer for DateDeserializer { - fn memory_size(&self) -> usize { - self.builder.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - self.builder.len() - } - - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let value: i32 = reader.read_scalar()?; - self.builder.push(value); - Ok(()) - } - - fn de_default(&mut self) { - self.builder.push(i32::default()); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let value: i32 = reader.read_scalar()?; - self.builder.push(value); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::String(v) => { - let mut reader = Cursor::new(v.as_bytes()); - let date = reader.read_date_text(&format.timezone)?; - let days = uniform_date(date); - self.builder.push(days); - Ok(()) - } - _ => Err(ErrorCode::from("Incorrect string value")), - } - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let v = value - .as_date() - .ok_or_else(|| "Unable to get date value".to_string())?; - check_date(*v as i64).map_err(ErrorCode::from_string)?; - self.builder.push(*v); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.builder.pop() { - Some(v) => Ok(Scalar::Date(v)), - None => Err(ErrorCode::from("Date column is empty when pop data value")), - } - } - - fn finish_to_column(&mut self) -> Column { - self.builder.shrink_to_fit(); - Column::Date(std::mem::take(&mut self.builder).into()) - } -} - -pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -#[inline] -pub fn uniform_date(date: NaiveDate) -> i32 { - date.num_days_from_ce() - EPOCH_DAYS_FROM_CE -} diff --git a/src/query/expression/src/deserializations/map.rs b/src/query/expression/src/deserializations/map.rs deleted file mode 100644 index f1fc29b072c44..0000000000000 --- a/src/query/expression/src/deserializations/map.rs +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright 2023 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashSet; - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; - -use crate::types::array::ArrayColumn; -use crate::types::map::KvPair; -use crate::types::AnyType; -use crate::types::DataType; -use crate::types::ValueType; -use crate::Column; -use crate::ColumnBuilder; -use crate::Scalar; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; - -pub struct MapDeserializer { - pub key: Box, - pub value: Box, - inner_ty: DataType, - offsets: Vec, -} - -impl MapDeserializer { - pub fn with_capacity(capacity: usize, inner_ty: &DataType) -> Self { - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(0); - match inner_ty { - DataType::Tuple(typs) => { - let key_ty = &typs[0]; - let value_ty = &typs[1]; - Self { - key: Box::new(TypeDeserializerImpl::with_capacity(key_ty, capacity)), - value: Box::new(TypeDeserializerImpl::with_capacity(value_ty, capacity)), - inner_ty: inner_ty.clone(), - offsets, - } - } - _ => unreachable!(), - } - } - - pub fn add_offset(&mut self, size: usize) { - self.offsets - .push(*self.offsets.last().unwrap() + size as u64); - } - - pub fn pop_offset(&mut self) -> Result { - if self.offsets.len() <= 1 { - return Err(ErrorCode::BadDataValueType("Map is empty".to_string())); - } - let total = self.offsets.pop().unwrap(); - Ok((total - *self.offsets.last().unwrap()) as usize) - } -} - -impl TypeDeserializer for MapDeserializer { - fn memory_size(&self) -> usize { - self.key.memory_size() - + self.value.memory_size() - + self.offsets.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - self.offsets.len() - 1 - } - - fn de_binary(&mut self, reader: &mut &[u8], format: &FormatSettings) -> Result<()> { - let size = reader.read_uvarint()?; - for _i in 0..size { - self.key.de_binary(reader, format)?; - self.value.de_binary(reader, format)?; - } - self.add_offset(size as usize); - Ok(()) - } - - fn de_default(&mut self) { - self.add_offset(0); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let size = reader.read_uvarint()?; - for _i in 0..size { - self.key.de_binary(&mut reader, format)?; - self.value.de_binary(&mut reader, format)?; - } - self.add_offset(size as usize); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::Object(obj) => { - for (key, val) in obj.iter() { - let key = serde_json::Value::String(key.to_string()); - self.key.de_json(&key, format)?; - self.value.de_json(val, format)?; - } - self.add_offset(obj.len()); - Ok(()) - } - _ => Err(ErrorCode::BadBytes("Incorrect json value, must be object")), - } - } - - fn append_data_value(&mut self, value: Scalar, format: &FormatSettings) -> Result<()> { - let col = value.as_map().unwrap(); - let kv_col = KvPair::::try_downcast_column(col).unwrap(); - let mut set = HashSet::new(); - for (key, val) in kv_col.iter() { - let key = key.to_owned(); - if set.contains(&key) { - return Err(ErrorCode::BadBytes( - "map keys have to be unique".to_string(), - )); - } - set.insert(key.clone()); - self.key.append_data_value(key, format)?; - self.value.append_data_value(val.to_owned(), format)?; - } - self.add_offset(col.len()); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - let size = self.pop_offset()?; - let mut keys = Vec::with_capacity(size); - let mut vals = Vec::with_capacity(size); - for _ in 0..size { - let key = self.key.pop_data_value()?; - keys.push(key); - let val = self.value.pop_data_value()?; - vals.push(val); - } - let mut builder = ColumnBuilder::with_capacity(&self.inner_ty, size); - while !keys.is_empty() && !vals.is_empty() { - let key = keys.pop().unwrap(); - let val = vals.pop().unwrap(); - let scalar = Scalar::Tuple(vec![key, val]); - builder.push(scalar.as_ref()); - } - Ok(Scalar::Map(builder.build())) - } - - fn finish_to_column(&mut self) -> Column { - let key_col = self.key.finish_to_column(); - let value_col = self.value.finish_to_column(); - let len = key_col.len(); - let values = Column::Tuple { - fields: vec![key_col, value_col], - len, - }; - let offsets = std::mem::take(&mut self.offsets); - Column::Map(Box::new(ArrayColumn { - values, - offsets: offsets.into(), - })) - } -} diff --git a/src/query/expression/src/deserializations/mod.rs b/src/query/expression/src/deserializations/mod.rs deleted file mode 100644 index 91fbe9a2cf81f..0000000000000 --- a/src/query/expression/src/deserializations/mod.rs +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_arrow::arrow::bitmap::MutableBitmap; -use common_io::prelude::*; - -mod array; -mod boolean; -mod date; -mod decimal; -mod map; -mod null; -mod nullable; -mod number; -mod string; -mod timestamp; -mod tuple; -mod variant; - -pub use array::*; -pub use boolean::*; -use common_exception::Result; -pub use date::*; -pub use decimal::*; -use enum_dispatch::enum_dispatch; -use ethnum::i256; -pub use map::*; -pub use null::*; -pub use nullable::*; -pub use number::*; -use serde_json::Value; -pub use string::*; -pub use timestamp::*; -pub use tuple::*; -pub use variant::*; - -use crate::types::number::F32; -use crate::types::number::F64; -use crate::types::string::StringColumnBuilder; -use crate::types::DataType; -use crate::types::DecimalDataType; -use crate::types::NumberDataType; -use crate::Column; -use crate::Scalar; - -#[enum_dispatch] -pub trait TypeDeserializer: Send + Sync { - fn memory_size(&self) -> usize; - fn len(&self) -> usize; - - fn de_binary(&mut self, reader: &mut &[u8], format: &FormatSettings) -> Result<()>; - - fn de_default(&mut self); - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - format: &FormatSettings, - ) -> Result<()>; - - fn de_json(&mut self, reader: &Value, format: &FormatSettings) -> Result<()>; - - fn de_null(&mut self, _format: &FormatSettings) -> bool { - false - } - - fn append_data_value(&mut self, value: Scalar, format: &FormatSettings) -> Result<()>; - - /// Note this method will return err only when inner builder is empty. - fn pop_data_value(&mut self) -> Result; - - fn finish_to_column(&mut self) -> Column; -} - -#[enum_dispatch(TypeDeserializer)] -pub enum TypeDeserializerImpl { - Null(NullDeserializer), - Nullable(NullableDeserializer), - Array(ArrayDeserializer), - Map(MapDeserializer), - Boolean(BooleanDeserializer), - Int8(NumberDeserializer), - Int16(NumberDeserializer), - Int32(NumberDeserializer), - Int64(NumberDeserializer), - UInt8(NumberDeserializer), - UInt16(NumberDeserializer), - UInt32(NumberDeserializer), - UInt64(NumberDeserializer), - Float32(NumberDeserializer), - Float64(NumberDeserializer), - Decimal128(DecimalDeserializer), - Decimal256(DecimalDeserializer), - - Date(DateDeserializer), - Timestamp(TimestampDeserializer), - String(StringDeserializer), - Struct(StructDeserializer), - Variant(VariantDeserializer), -} - -impl TypeDeserializerImpl { - pub fn with_capacity(ty: &DataType, capacity: usize) -> TypeDeserializerImpl { - match ty { - DataType::Null => 0.into(), - DataType::Boolean => MutableBitmap::with_capacity(capacity).into(), - DataType::String => StringColumnBuilder::with_capacity(capacity, capacity * 4).into(), - DataType::Number(num_ty) => match num_ty { - NumberDataType::UInt8 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::UInt16 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::UInt32 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::UInt64 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::Int8 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::Int16 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::Int32 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::Int64 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::Float32 => { - NumberDeserializer::::with_capacity(capacity).into() - } - NumberDataType::Float64 => { - NumberDeserializer::::with_capacity(capacity).into() - } - }, - DataType::Date => DateDeserializer::with_capacity(capacity).into(), - DataType::Timestamp => TimestampDeserializer::with_capacity(capacity).into(), - DataType::Nullable(inner_ty) => { - NullableDeserializer::with_capacity(capacity, inner_ty.as_ref()).into() - } - DataType::Variant => VariantDeserializer::with_capacity(capacity).into(), - DataType::Array(ty) => ArrayDeserializer::with_capacity(capacity, ty).into(), - DataType::Map(ty) => MapDeserializer::with_capacity(capacity, ty).into(), - DataType::Tuple(types) => TupleDeserializer::with_capacity(capacity, types).into(), - DataType::Decimal(types) => match types { - DecimalDataType::Decimal128(_) => { - DecimalDeserializer::::with_capacity(types, capacity).into() - } - DecimalDataType::Decimal256(_) => { - DecimalDeserializer::::with_capacity(types, capacity).into() - } - }, - _ => unimplemented!(), - } - } -} diff --git a/src/query/expression/src/deserializations/null.rs b/src/query/expression/src/deserializations/null.rs deleted file mode 100644 index 98749d47aa978..0000000000000 --- a/src/query/expression/src/deserializations/null.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::*; - -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub type NullDeserializer = usize; - -impl TypeDeserializer for NullDeserializer { - fn memory_size(&self) -> usize { - std::mem::size_of::() - } - - fn len(&self) -> usize { - *self - } - - fn de_binary(&mut self, _reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - *self += 1; - Ok(()) - } - - fn de_default(&mut self) { - *self += 1; - } - - fn de_fixed_binary_batch( - &mut self, - _reader: &[u8], - _step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for _ in 0..rows { - *self += 1; - } - Ok(()) - } - - fn de_json(&mut self, _value: &serde_json::Value, _format: &FormatSettings) -> Result<()> { - *self += 1; - Ok(()) - } - - fn append_data_value(&mut self, _value: Scalar, _format: &FormatSettings) -> Result<()> { - *self += 1; - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - if *self > 0 { - *self -= 1; - Ok(Scalar::Null) - } else { - Err(ErrorCode::from("Null column is empty when pop data value")) - } - } - - fn finish_to_column(&mut self) -> Column { - Column::Null { len: *self } - } -} diff --git a/src/query/expression/src/deserializations/nullable.rs b/src/query/expression/src/deserializations/nullable.rs deleted file mode 100644 index 814d2804c9164..0000000000000 --- a/src/query/expression/src/deserializations/nullable.rs +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_arrow::arrow::bitmap::MutableBitmap; -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::*; - -use crate::types::nullable::NullableColumn; -use crate::types::DataType; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; - -pub struct NullableDeserializer { - pub validity: MutableBitmap, - pub inner: Box, -} - -impl NullableDeserializer { - pub fn with_capacity(capacity: usize, inner_ty: &DataType) -> Self { - Self { - validity: MutableBitmap::new(), - inner: Box::new(TypeDeserializerImpl::with_capacity(inner_ty, capacity)), - } - } -} - -impl TypeDeserializer for NullableDeserializer { - fn memory_size(&self) -> usize { - self.inner.memory_size() + self.validity.as_slice().len() - } - - fn len(&self) -> usize { - self.inner.len() - } - - fn de_binary(&mut self, reader: &mut &[u8], format: &FormatSettings) -> Result<()> { - let valid: bool = reader.read_scalar()?; - if valid { - self.inner.de_binary(reader, format)?; - } else { - self.inner.de_default(); - } - self.validity.push(valid); - Ok(()) - } - - fn de_default(&mut self) { - self.inner.de_default(); - self.validity.push(false); - } - - fn de_fixed_binary_batch( - &mut self, - _reader: &[u8], - _step: usize, - _rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - Err(ErrorCode::from("unreachable")) - } - - fn de_json(&mut self, value: &serde_json::Value, format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::Null => { - self.de_null(format); - Ok(()) - } - other => { - self.validity.push(true); - self.inner.de_json(other, format) - } - } - } - - fn de_null(&mut self, _format: &FormatSettings) -> bool { - self.inner.de_default(); - self.validity.push(false); - true - } - - fn append_data_value(&mut self, value: Scalar, format: &FormatSettings) -> Result<()> { - match value { - Scalar::Null => { - self.validity.push(false); - self.inner.de_default(); - } - _ => { - self.validity.push(true); - self.inner.append_data_value(value, format)?; - } - } - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.validity.pop() { - Some(v) => { - if v { - self.inner.pop_data_value() - } else { - let _ = self.inner.pop_data_value(); - Ok(Scalar::Null) - } - } - None => Err(ErrorCode::from( - "Nullable column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - let col = NullableColumn { - column: self.inner.finish_to_column(), - validity: std::mem::take(&mut self.validity).into(), - }; - Column::Nullable(Box::new(col)) - } -} diff --git a/src/query/expression/src/deserializations/number.rs b/src/query/expression/src/deserializations/number.rs deleted file mode 100644 index 521655f11e0c8..0000000000000 --- a/src/query/expression/src/deserializations/number.rs +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Cursor; -use std::marker::PhantomData; - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::cursor_ext::*; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; -use common_io::prelude::StatBuffer; -use lexical_core::FromLexical; -use micromarshal::Unmarshal; - -use crate::types::number::Number; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub struct NumberDeserializer { - pub builder: Vec, - _p: PhantomData

, -} - -impl NumberDeserializer { - pub fn with_capacity(capacity: usize) -> Self { - Self { - builder: Vec::with_capacity(capacity), - _p: PhantomData, - } - } -} - -impl TypeDeserializer for NumberDeserializer -where - T: Number + Unmarshal + StatBuffer + From

, - P: Unmarshal

+ StatBuffer + FromLexical, -{ - fn memory_size(&self) -> usize { - self.builder.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - self.builder.len() - } - - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let value: T = reader.read_scalar()?; - self.builder.push(value); - Ok(()) - } - - fn de_default(&mut self) { - self.builder.push(T::default()); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let value: T = reader.read_scalar()?; - self.builder.push(value); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, _format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::Number(v) => { - let v = v.to_string(); - let mut reader = Cursor::new(v.as_bytes()); - - let v: P = if !T::FLOATING { - reader.read_int_text() - } else { - reader.read_float_text() - }?; - - self.builder.push(v.into()); - Ok(()) - } - _ => Err(ErrorCode::from("Incorrect json value, must be number")), - } - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let v = value - .as_number() - .ok_or_else(|| ErrorCode::from(format!("Unable to get number value {}", value)))?; - let num = T::try_downcast_scalar(v).unwrap(); - self.builder.push(num); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.builder.pop() { - Some(v) => Ok(Scalar::Number(T::upcast_scalar(v))), - None => Err(ErrorCode::from( - "Number column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - self.builder.shrink_to_fit(); - let col = T::upcast_column(std::mem::take(&mut self.builder).into()); - Column::Number(col) - } -} diff --git a/src/query/expression/src/deserializations/string.rs b/src/query/expression/src/deserializations/string.rs deleted file mode 100644 index 290dcd72f00b8..0000000000000 --- a/src/query/expression/src/deserializations/string.rs +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Read; - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; - -use crate::types::string::StringColumn; -use crate::types::string::StringColumnBuilder; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub type StringDeserializer = StringColumnBuilder; - -impl TypeDeserializer for StringDeserializer { - fn memory_size(&self) -> usize { - self.data.len() * std::mem::size_of::() - + self.offsets.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - StringColumnBuilder::len(self) - } - - // See GroupHash.rs for StringColumn - #[allow(clippy::uninit_vec)] - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let offset: u64 = reader.read_uvarint()?; - - self.data.resize(offset as usize + self.data.len(), 0); - let last = *self.offsets.last().unwrap() as usize; - reader.read_exact(&mut self.data[last..last + offset as usize])?; - - self.commit_row(); - Ok(()) - } - - fn de_default(&mut self) { - self.put_str(""); - self.commit_row(); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let reader = &reader[step * row..]; - self.put_slice(reader); - self.commit_row(); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, _format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::String(s) => { - self.put_str(s.as_str()); - self.commit_row(); - Ok(()) - } - _ => Err(ErrorCode::from("Incorrect json value, must be string")), - } - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let v = value - .as_string() - .ok_or_else(|| ErrorCode::from("Unable to get string value"))?; - self.put(v.as_slice()); - self.commit_row(); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.pop() { - Some(v) => Ok(Scalar::String(v)), - None => Err(ErrorCode::from( - "String column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - let col = StringColumn { - data: std::mem::take(&mut self.data).into(), - offsets: std::mem::take(&mut self.offsets).into(), - }; - Column::String(col) - } -} diff --git a/src/query/expression/src/deserializations/timestamp.rs b/src/query/expression/src/deserializations/timestamp.rs deleted file mode 100644 index c579d4200aac4..0000000000000 --- a/src/query/expression/src/deserializations/timestamp.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Cursor; - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::cursor_ext::*; -use common_io::prelude::BinaryRead; -use common_io::prelude::FormatSettings; - -use crate::types::timestamp::check_timestamp; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub struct TimestampDeserializer { - pub buffer: Vec, - pub builder: Vec, -} - -impl TimestampDeserializer { - pub fn with_capacity(capacity: usize) -> Self { - Self { - buffer: vec![], - builder: Vec::with_capacity(capacity), - } - } -} - -impl TypeDeserializer for TimestampDeserializer { - fn memory_size(&self) -> usize { - self.builder.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - self.builder.len() - } - - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let value: i64 = reader.read_scalar()?; - check_timestamp(value)?; - self.builder.push(value); - Ok(()) - } - - fn de_default(&mut self) { - self.builder.push(i64::default()); - } - - fn de_json(&mut self, value: &serde_json::Value, format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::String(v) => { - let v = v.clone(); - let mut reader = Cursor::new(v.as_bytes()); - let ts = reader.read_timestamp_text(&format.timezone)?; - - let micros = ts.timestamp_micros(); - check_timestamp(micros)?; - self.builder.push(micros); - Ok(()) - } - _ => Err(ErrorCode::from("Incorrect timestamp value")), - } - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let value: i64 = reader.read_scalar()?; - self.builder.push(value); - } - Ok(()) - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let v = value - .as_timestamp() - .ok_or_else(|| ErrorCode::from("Unable to get timestamp value"))?; - check_timestamp(*v)?; - self.builder.push(*v); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.builder.pop() { - Some(v) => Ok(Scalar::Timestamp(v)), - None => Err(ErrorCode::from( - "Timestamp column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - self.builder.shrink_to_fit(); - Column::Timestamp(std::mem::take(&mut self.builder).into()) - } -} diff --git a/src/query/expression/src/deserializations/tuple.rs b/src/query/expression/src/deserializations/tuple.rs deleted file mode 100644 index 40db7873063b2..0000000000000 --- a/src/query/expression/src/deserializations/tuple.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::FormatSettings; - -use crate::types::DataType; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; - -pub struct TupleDeserializer { - pub inners: Vec, -} - -pub type StructDeserializer = TupleDeserializer; - -impl TupleDeserializer { - pub fn with_capacity(capacity: usize, inners: &[DataType]) -> Self { - let inners = inners - .iter() - .map(|ty| TypeDeserializerImpl::with_capacity(ty, capacity)) - .collect(); - Self { inners } - } -} - -impl TypeDeserializer for TupleDeserializer { - fn memory_size(&self) -> usize { - self.inners.iter().map(|d| d.memory_size()).sum() - } - - fn len(&self) -> usize { - self.inners.first().map(|c| c.len()).unwrap_or_default() - } - - fn de_binary(&mut self, reader: &mut &[u8], format: &FormatSettings) -> Result<()> { - for inner in self.inners.iter_mut() { - inner.de_binary(reader, format)?; - } - Ok(()) - } - - fn de_default(&mut self) { - for inner in self.inners.iter_mut() { - inner.de_default(); - } - } - - fn de_json(&mut self, value: &serde_json::Value, format: &FormatSettings) -> Result<()> { - match value { - serde_json::Value::Array(obj) => { - if self.inners.len() != obj.len() { - return Err(ErrorCode::from_string(format!( - "Incorrect json value, expect {} values, but get {} values", - self.inners.len(), - obj.len() - ))); - } - for (inner, val) in self.inners.iter_mut().zip(obj.iter()) { - inner.de_json(val, format)?; - } - Ok(()) - } - _ => Err(ErrorCode::from("Incorrect tuple value")), - } - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - for inner in self.inners.iter_mut() { - inner.de_binary(&mut reader, format)?; - } - } - Ok(()) - } - - fn append_data_value(&mut self, value: Scalar, format: &FormatSettings) -> Result<()> { - let v = value - .as_tuple() - .ok_or_else(|| ErrorCode::from("Unable to get tuple value"))?; - - for (v, inner) in v.iter().zip(self.inners.iter_mut()) { - inner.append_data_value(v.clone(), format)?; - } - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - let mut vals = Vec::with_capacity(self.inners.len()); - for inner in self.inners.iter_mut() { - let val = inner.pop_data_value()?; - vals.push(val); - } - Ok(Scalar::Tuple(vals)) - } - - fn finish_to_column(&mut self) -> Column { - let fields: Vec = self - .inners - .iter_mut() - .map(|f| f.finish_to_column()) - .collect(); - let len = fields.iter().map(|f| f.len()).next().unwrap_or(0); - Column::Tuple { fields, len } - } -} diff --git a/src/query/expression/src/deserializations/variant.rs b/src/query/expression/src/deserializations/variant.rs deleted file mode 100644 index 8c57c2c6be46e..0000000000000 --- a/src/query/expression/src/deserializations/variant.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Read; - -use common_exception::ErrorCode; -use common_exception::Result; -use common_io::prelude::*; - -use crate::types::string::StringColumn; -use crate::types::string::StringColumnBuilder; -use crate::types::variant::JSONB_NULL; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub struct VariantDeserializer { - pub builder: StringColumnBuilder, -} - -impl VariantDeserializer { - pub fn with_capacity(capacity: usize) -> Self { - Self { - builder: StringColumnBuilder::with_capacity(capacity, capacity * 4), - } - } -} - -impl TypeDeserializer for VariantDeserializer { - fn memory_size(&self) -> usize { - self.builder.data.len() * std::mem::size_of::() - + self.builder.offsets.len() * std::mem::size_of::() - } - - fn len(&self) -> usize { - self.builder.len() - } - - fn de_default(&mut self) { - self.builder.put(JSONB_NULL); - self.builder.commit_row(); - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let v = value - .as_variant() - .ok_or_else(|| ErrorCode::from("Unable to get variant value"))?; - self.builder.put(v.as_slice()); - self.builder.commit_row(); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.builder.pop() { - Some(v) => Ok(Scalar::Variant(v)), - None => Err(ErrorCode::from( - "Variant column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - let col = StringColumn { - data: std::mem::take(&mut self.builder.data).into(), - offsets: std::mem::take(&mut self.builder.offsets).into(), - }; - Column::Variant(col) - } - - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let offset: u64 = reader.read_uvarint()?; - - self.builder - .data - .resize(offset as usize + self.builder.data.len(), 0); - let last = *self.builder.offsets.last().unwrap() as usize; - reader.read_exact(&mut self.builder.data[last..last + offset as usize])?; - - self.builder.commit_row(); - Ok(()) - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let val = &reader[step * row..]; - self.builder.put_slice(val); - self.builder.commit_row(); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, _format: &FormatSettings) -> Result<()> { - let v = jsonb::Value::from(value); - v.write_to_vec(&mut self.builder.data); - self.builder.commit_row(); - Ok(()) - } -} diff --git a/src/query/expression/src/evaluator.rs b/src/query/expression/src/evaluator.rs index 16f2c6ca5b21b..8ae9bf2be2610 100644 --- a/src/query/expression/src/evaluator.rs +++ b/src/query/expression/src/evaluator.rs @@ -188,10 +188,7 @@ impl<'a> Evaluator<'a> { } Value::Column(col) => { let len = col.len(); - Value::Column(Column::Tuple { - fields: vec![col, Column::Null { len }], - len, - }) + Value::Column(Column::Tuple(vec![col, Column::Null { len }])) } }, Err((_, value, bitmap, err)) => { @@ -210,10 +207,7 @@ impl<'a> Evaluator<'a> { column: err_col, validity: bitmap, })); - Value::Column(Column::Tuple { - fields: vec![value_col, err_col], - len: num_rows, - }) + Value::Column(Column::Tuple(vec![value_col, err_col])) } }), }; @@ -426,7 +420,7 @@ impl<'a> Evaluator<'a> { .collect::>>()?; Ok(Value::Scalar(Scalar::Tuple(new_fields))) } - Value::Column(Column::Tuple { fields, len }) => { + Value::Column(Column::Tuple(fields)) => { let new_fields = fields .into_iter() .zip(fields_src_ty.iter()) @@ -436,10 +430,7 @@ impl<'a> Evaluator<'a> { .map(|val| val.into_column().unwrap()) }) .collect::>()?; - Ok(Value::Column(Column::Tuple { - fields: new_fields, - len, - })) + Ok(Value::Column(Column::Tuple(new_fields))) } other => unreachable!("source: {}", other), } @@ -613,7 +604,7 @@ impl<'a> Evaluator<'a> { .collect::>()?; Ok(Value::Scalar(Scalar::Tuple(new_fields))) } - Value::Column(Column::Tuple { fields, len }) => { + Value::Column(Column::Tuple(fields)) => { let new_fields = fields .into_iter() .zip(fields_src_ty.iter()) @@ -625,10 +616,7 @@ impl<'a> Evaluator<'a> { .unwrap()) }) .collect::>()?; - let new_col = Column::Tuple { - fields: new_fields, - len, - }; + let new_col = Column::Tuple(new_fields); Ok(Value::Column(new_col)) } other => unreachable!("source: {}", other), diff --git a/src/query/expression/src/kernels/concat.rs b/src/query/expression/src/kernels/concat.rs index 8bd6b36bad841..c43262c59a290 100644 --- a/src/query/expression/src/kernels/concat.rs +++ b/src/query/expression/src/kernels/concat.rs @@ -43,8 +43,6 @@ use crate::BlockEntry; use crate::Column; use crate::ColumnBuilder; use crate::DataBlock; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; use crate::Value; impl DataBlock { @@ -143,10 +141,7 @@ impl Column { Column::Array(col) => { let mut offsets = Vec::with_capacity(capacity + 1); offsets.push(0); - let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&col.values.data_type(), capacity) - .finish_to_column(), - ); + let builder = ColumnBuilder::with_capacity(&col.values.data_type(), capacity); let builder = ArrayColumnBuilder { builder, offsets }; Self::concat_value_types::>(builder, columns) } @@ -154,11 +149,10 @@ impl Column { let mut offsets = Vec::with_capacity(capacity + 1); offsets.push(0); let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&col.values.data_type(), capacity) - .finish_to_column(), + ColumnBuilder::with_capacity(&col.values.data_type(), capacity).build(), ); let (key_builder, val_builder) = match builder { - ColumnBuilder::Tuple { fields, .. } => (fields[0].clone(), fields[1].clone()), + ColumnBuilder::Tuple(fields) => (fields[0].clone(), fields[1].clone()), _ => unreachable!(), }; let builder = KvColumnBuilder { @@ -183,20 +177,17 @@ impl Column { Column::Nullable(Box::new(NullableColumn { column, validity })) } - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { let fields = (0..fields.len()) .map(|idx| { let cs: Vec = columns .iter() - .map(|col| col.as_tuple().unwrap().0[idx].clone()) + .map(|col| col.as_tuple().unwrap()[idx].clone()) .collect(); Self::concat(&cs) }) .collect(); - Column::Tuple { - fields, - len: capacity, - } + Column::Tuple(fields) } Column::Variant(_) => { let data_capacity = columns.iter().map(|c| c.memory_size() - c.len() * 8).sum(); diff --git a/src/query/expression/src/kernels/filter.rs b/src/query/expression/src/kernels/filter.rs index 3e50aadb7c30b..7f9b69ea0fd0b 100644 --- a/src/query/expression/src/kernels/filter.rs +++ b/src/query/expression/src/kernels/filter.rs @@ -39,8 +39,6 @@ use crate::BlockEntry; use crate::Column; use crate::ColumnBuilder; use crate::DataBlock; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; use crate::Value; impl DataBlock { @@ -139,10 +137,7 @@ impl Column { Column::Array(column) => { let mut offsets = Vec::with_capacity(length + 1); offsets.push(0); - let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), length) - .finish_to_column(), - ); + let builder = ColumnBuilder::with_capacity(&column.values.data_type(), length); let builder = ArrayColumnBuilder { builder, offsets }; Self::filter_scalar_types::>(column, builder, filter) } @@ -150,11 +145,10 @@ impl Column { let mut offsets = Vec::with_capacity(length + 1); offsets.push(0); let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), length) - .finish_to_column(), + ColumnBuilder::with_capacity(&column.values.data_type(), length).build(), ); let (key_builder, val_builder) = match builder { - ColumnBuilder::Tuple { fields, .. } => (fields[0].clone(), fields[1].clone()), + ColumnBuilder::Tuple(fields) => (fields[0].clone(), fields[1].clone()), _ => unreachable!(), }; let builder = KvColumnBuilder { @@ -177,10 +171,9 @@ impl Column { validity: BooleanType::try_downcast_column(&validity).unwrap(), })) } - Column::Tuple { fields, .. } => { - let len = filter.len() - filter.unset_bits(); + Column::Tuple(fields) => { let fields = fields.iter().map(|c| c.filter(filter)).collect(); - Column::Tuple { fields, len } + Column::Tuple(fields) } Column::Variant(column) => { let bytes_per_row = column.data.len() / filter.len().max(1); diff --git a/src/query/expression/src/kernels/group_by_hash.rs b/src/query/expression/src/kernels/group_by_hash.rs index 7567493297245..a798010d371ce 100644 --- a/src/query/expression/src/kernels/group_by_hash.rs +++ b/src/query/expression/src/kernels/group_by_hash.rs @@ -23,7 +23,6 @@ use common_exception::ErrorCode; use common_exception::Result; use common_hashtable::FastHash; use common_io::prelude::BinaryWrite; -use common_io::prelude::FormatSettings; use ethnum::i256; use ethnum::u256; use ethnum::U256; @@ -46,8 +45,7 @@ use crate::with_decimal_mapped_type; use crate::with_integer_mapped_type; use crate::with_number_mapped_type; use crate::Column; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; +use crate::ColumnBuilder; #[derive(Debug)] pub enum KeysState { @@ -338,35 +336,28 @@ where T: Clone for (_, data_type) in sorted_group_items.iter() { let non_null_type = data_type.remove_nullable(); - let mut deserializer = TypeDeserializerImpl::with_capacity(&non_null_type, rows); + let mut column = ColumnBuilder::with_capacity(&non_null_type, rows); let reader = vec8.as_slice(); - let format = FormatSettings::default(); let col = match data_type.is_nullable() { false => { - deserializer.de_fixed_binary_batch(&reader[offsize..], step, rows, &format)?; - deserializer.finish_to_column() + column.push_fix_len_binaries(&reader[offsize..], step, rows)?; + column.build() } true => { - let mut bitmap_deserializer = - TypeDeserializerImpl::with_capacity(&DataType::Boolean, rows); - bitmap_deserializer.de_fixed_binary_batch( - &reader[null_offsize..], - step, - rows, - &format, - )?; + let mut bitmap_column = ColumnBuilder::with_capacity(&DataType::Boolean, rows); + bitmap_column.push_fix_len_binaries(&reader[null_offsize..], step, rows)?; null_offsize += 1; - let col = bitmap_deserializer.finish_to_column(); + let col = bitmap_column.build(); let col = BooleanType::try_downcast_column(&col).unwrap(); // we store 1 for nulls in fixed_hash let bitmap = col.not(); - deserializer.de_fixed_binary_batch(&reader[offsize..], step, rows, &format)?; - let inner = deserializer.finish_to_column(); + column.push_fix_len_binaries(&reader[offsize..], step, rows)?; + let inner = column.build(); Column::Nullable(Box::new(NullableColumn { column: inner, validity: bitmap, @@ -560,7 +551,7 @@ pub fn serialize_column_binary(column: &Column, row: usize, vec: &mut Vec) { serialize_column_binary(&c.column, row, vec); } } - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { for inner_col in fields.iter() { serialize_column_binary(inner_col, row, vec); } diff --git a/src/query/expression/src/kernels/scatter.rs b/src/query/expression/src/kernels/scatter.rs index 0791fc1dcae14..4a5b3d538ea90 100644 --- a/src/query/expression/src/kernels/scatter.rs +++ b/src/query/expression/src/kernels/scatter.rs @@ -41,8 +41,6 @@ use crate::Column; use crate::ColumnBuilder; use crate::DataBlock; use crate::Scalar; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; use crate::Value; impl DataBlock { @@ -197,10 +195,7 @@ impl Column { Column::Array(column) => { let mut offsets = Vec::with_capacity(length + 1); offsets.push(0); - let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), length) - .finish_to_column(), - ); + let builder = ColumnBuilder::with_capacity(&column.values.data_type(), length); let builder = ArrayColumnBuilder { builder, offsets }; Self::scatter_scalars::, _>( column, @@ -213,11 +208,10 @@ impl Column { let mut offsets = Vec::with_capacity(length + 1); offsets.push(0); let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), length) - .finish_to_column(), + ColumnBuilder::with_capacity(&column.values.data_type(), length).build(), ); let (key_builder, val_builder) = match builder { - ColumnBuilder::Tuple { fields, .. } => (fields[0].clone(), fields[1].clone()), + ColumnBuilder::Tuple(fields) => (fields[0].clone(), fields[1].clone()), _ => unreachable!(), }; let builder = KvColumnBuilder { @@ -252,7 +246,7 @@ impl Column { }) .collect() } - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { let fields_vs: Vec> = fields .iter() .map(|c| c.scatter(data_type, indices, scatter_size)) @@ -265,10 +259,7 @@ impl Column { for col in &fields_vs { fields.push(col[s].clone()); } - res.push(Column::Tuple { - len: fields.first().map_or(0, |f| f.len()), - fields, - }); + res.push(Column::Tuple(fields)); } res } diff --git a/src/query/expression/src/kernels/take.rs b/src/query/expression/src/kernels/take.rs index 7550d9d2c7600..47945b0e456a7 100644 --- a/src/query/expression/src/kernels/take.rs +++ b/src/query/expression/src/kernels/take.rs @@ -36,8 +36,6 @@ use crate::BlockEntry; use crate::Column; use crate::ColumnBuilder; use crate::DataBlock; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; use crate::Value; impl DataBlock { @@ -108,10 +106,7 @@ impl Column { Column::Array(column) => { let mut offsets = Vec::with_capacity(length + 1); offsets.push(0); - let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), self.len()) - .finish_to_column(), - ); + let builder = ColumnBuilder::with_capacity(&column.values.data_type(), self.len()); let builder = ArrayColumnBuilder { builder, offsets }; Self::take_value_types::, _>(column, builder, indices) } @@ -119,11 +114,10 @@ impl Column { let mut offsets = Vec::with_capacity(length + 1); offsets.push(0); let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), self.len()) - .finish_to_column(), + ColumnBuilder::with_capacity(&column.values.data_type(), self.len()).build(), ); let (key_builder, val_builder) = match builder { - ColumnBuilder::Tuple { fields, .. } => (fields[0].clone(), fields[1].clone()), + ColumnBuilder::Tuple(fields) => (fields[0].clone(), fields[1].clone()), _ => unreachable!(), }; let builder = KvColumnBuilder { @@ -142,12 +136,9 @@ impl Column { validity: BooleanType::try_downcast_column(&validity).unwrap(), })) } - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { let fields = fields.iter().map(|c| c.take(indices)).collect(); - Column::Tuple { - fields, - len: indices.len(), - } + Column::Tuple(fields) } Column::Variant(column) => Self::take_arg_types::(column, indices), } diff --git a/src/query/expression/src/kernels/take_chunks.rs b/src/query/expression/src/kernels/take_chunks.rs index bebbdeb3e093b..c9c82f5d2e868 100644 --- a/src/query/expression/src/kernels/take_chunks.rs +++ b/src/query/expression/src/kernels/take_chunks.rs @@ -39,8 +39,6 @@ use crate::Column; use crate::ColumnBuilder; use crate::DataBlock; use crate::Scalar; -use crate::TypeDeserializer; -use crate::TypeDeserializerImpl; use crate::Value; // Block idx, row idx in the block, repeat times @@ -242,10 +240,7 @@ impl Column { Column::Array(column) => { let mut offsets = Vec::with_capacity(result_size + 1); offsets.push(0); - let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), result_size) - .finish_to_column(), - ); + let builder = ColumnBuilder::with_capacity(&column.values.data_type(), result_size); let builder = ArrayColumnBuilder { builder, offsets }; Self::take_block_value_types::>(columns, builder, indices) } @@ -253,11 +248,10 @@ impl Column { let mut offsets = Vec::with_capacity(result_size + 1); offsets.push(0); let builder = ColumnBuilder::from_column( - TypeDeserializerImpl::with_capacity(&column.values.data_type(), result_size) - .finish_to_column(), + ColumnBuilder::with_capacity(&column.values.data_type(), result_size).build(), ); let (key_builder, val_builder) = match builder { - ColumnBuilder::Tuple { fields, .. } => (fields[0].clone(), fields[1].clone()), + ColumnBuilder::Tuple(fields) => (fields[0].clone(), fields[1].clone()), _ => unreachable!(), }; let builder = KvColumnBuilder { @@ -309,7 +303,7 @@ impl Column { let inner_columns = columns .iter() .map(|c| match c { - Column::Tuple { fields, .. } => fields.clone(), + Column::Tuple(fields) => fields.clone(), _ => unreachable!(), }) .collect::>(); @@ -326,10 +320,7 @@ impl Column { }) .collect(); - Column::Tuple { - fields, - len: result_size, - } + Column::Tuple(fields) } Column::Variant(_) => { let builder = VariantType::create_builder(result_size, &[]); diff --git a/src/query/expression/src/lib.rs b/src/query/expression/src/lib.rs index 21c76d5b12956..6d4b35a50e667 100755 --- a/src/query/expression/src/lib.rs +++ b/src/query/expression/src/lib.rs @@ -37,7 +37,6 @@ mod block; pub mod converts; -mod deserializations; mod evaluator; mod expression; mod function; @@ -53,7 +52,6 @@ pub mod values; pub use crate::block::BlockMetaInfo; pub use crate::block::BlockMetaInfoPtr; pub use crate::block::*; -pub use crate::deserializations::*; pub use crate::evaluator::*; pub use crate::expression::*; pub use crate::function::*; diff --git a/src/query/expression/src/schema.rs b/src/query/expression/src/schema.rs index 26bb10e920686..97bda93edc1c9 100644 --- a/src/query/expression/src/schema.rs +++ b/src/query/expression/src/schema.rs @@ -62,7 +62,6 @@ use crate::BlockEntry; use crate::Column; use crate::FromData; use crate::Scalar; -use crate::TypeDeserializerImpl; use crate::Value; use crate::ARROW_EXT_TYPE_EMPTY_ARRAY; use crate::ARROW_EXT_TYPE_EMPTY_MAP; @@ -254,17 +253,6 @@ impl DataSchema { ArrowSchema::from(fields).with_metadata(self.metadata.clone()) } - - pub fn create_deserializers(&self, capacity: usize) -> Vec { - let mut deserializers = Vec::with_capacity(self.num_fields()); - for field in self.fields() { - deserializers.push(TypeDeserializerImpl::with_capacity( - &field.data_type, - capacity, - )); - } - deserializers - } } impl TableSchema { @@ -730,15 +718,6 @@ impl TableSchema { ArrowSchema::from(fields).with_metadata(self.metadata.clone()) } - - pub fn create_deserializers(&self, capacity: usize) -> Vec { - let mut deserializers = Vec::with_capacity(self.num_fields()); - for field in self.fields() { - let data_type: DataType = field.data_type().into(); - deserializers.push(TypeDeserializerImpl::with_capacity(&data_type, capacity)); - } - deserializers - } } impl DataField { @@ -1166,7 +1145,7 @@ impl TableDataType { } BlockEntry { data_type: DataType::Tuple(types), - value: Value::Column(Column::Tuple { fields, len }), + value: Value::Column(Column::Tuple(fields)), } } TableDataType::Variant => { diff --git a/src/query/expression/src/types/array.rs b/src/query/expression/src/types/array.rs index 79336f17b2c64..f548ef63ac48d 100755 --- a/src/query/expression/src/types/array.rs +++ b/src/query/expression/src/types/array.rs @@ -355,3 +355,20 @@ impl ArrayColumnBuilder { } } } + +impl ArrayColumnBuilder { + pub fn pop(&mut self) -> Option { + if self.len() > 0 { + let pop_count = self.offsets[self.offsets.len() - 1] as usize + - self.offsets[self.offsets.len() - 2] as usize; + self.offsets.pop(); + let mut builder = ColumnBuilder::with_capacity(&self.builder.data_type(), pop_count); + for _ in 0..pop_count { + builder.push(self.builder.pop().unwrap().as_ref()); + } + Some(builder.build()) + } else { + None + } + } +} diff --git a/src/query/expression/src/types/map.rs b/src/query/expression/src/types/map.rs index 9c5c8d484727c..81efd3cf59f4f 100755 --- a/src/query/expression/src/types/map.rs +++ b/src/query/expression/src/types/map.rs @@ -66,7 +66,7 @@ impl ValueType for KvPair { fn try_downcast_column<'a>(col: &'a Column) -> Option { match col { - Column::Tuple { fields, .. } => Some(KvColumn { + Column::Tuple(fields) => Some(KvColumn { keys: K::try_downcast_column(&fields[0])?, values: V::try_downcast_column(&fields[1])?, }), @@ -92,10 +92,10 @@ impl ValueType for KvPair { } fn upcast_column(col: Self::Column) -> Column { - Column::Tuple { - len: col.len(), - fields: vec![K::upcast_column(col.keys), V::upcast_column(col.values)], - } + Column::Tuple(vec![ + K::upcast_column(col.keys), + V::upcast_column(col.values), + ]) } fn upcast_domain((): Self::Domain) -> Domain { diff --git a/src/query/expression/src/types/nullable.rs b/src/query/expression/src/types/nullable.rs index 68227ea9d9a4b..0c88d33616998 100755 --- a/src/query/expression/src/types/nullable.rs +++ b/src/query/expression/src/types/nullable.rs @@ -350,6 +350,17 @@ impl NullableColumnBuilder { } } +impl NullableColumnBuilder { + pub fn pop(&mut self) -> Option> { + if self.validity.pop()? { + Some(Some(self.builder.pop().unwrap())) + } else { + self.builder.pop().unwrap(); + Some(None) + } + } +} + #[derive(Debug, Clone, PartialEq)] pub struct NullableDomain { pub has_null: bool, diff --git a/src/query/expression/src/utils/display.rs b/src/query/expression/src/utils/display.rs index 8dc7478426197..b74956f78daf2 100755 --- a/src/query/expression/src/utils/display.rs +++ b/src/query/expression/src/utils/display.rs @@ -171,11 +171,7 @@ impl Debug for Column { Column::Array(col) => write!(f, "{col:?}"), Column::Map(col) => write!(f, "{col:?}"), Column::Nullable(col) => write!(f, "{col:?}"), - Column::Tuple { fields, len } => f - .debug_struct("Tuple") - .field("fields", fields) - .field("len", len) - .finish(), + Column::Tuple(fields) => f.debug_tuple("Tuple").field(fields).finish(), Column::Variant(col) => write!(f, "{col:?}"), } } diff --git a/src/query/expression/src/utils/mod.rs b/src/query/expression/src/utils/mod.rs index 5f4220d544583..808871f3cb9d4 100644 --- a/src/query/expression/src/utils/mod.rs +++ b/src/query/expression/src/utils/mod.rs @@ -20,19 +20,9 @@ mod column_from; pub mod date_helper; pub mod display; pub mod filter_helper; +pub mod serialize; use common_arrow::arrow::bitmap::Bitmap; -use common_arrow::arrow::chunk::Chunk as ArrowChunk; -use common_arrow::arrow::datatypes::DataType as ArrowDataType; -use common_arrow::arrow::io::parquet::write::transverse; -use common_arrow::arrow::io::parquet::write::RowGroupIterator; -use common_arrow::arrow::io::parquet::write::WriteOptions; -use common_arrow::parquet::compression::CompressionOptions; -use common_arrow::parquet::encoding::Encoding; -use common_arrow::parquet::metadata::ThriftFileMetaData; -use common_arrow::parquet::write::Version; -use common_arrow::write_parquet_file; -use common_exception::ErrorCode; use common_exception::Result; use common_exception::Span; @@ -46,7 +36,6 @@ use crate::Evaluator; use crate::FunctionContext; use crate::FunctionRegistry; use crate::RawExpr; -use crate::TableSchema; use crate::Value; /// A convenient shortcut to evaluate a scalar function. @@ -97,74 +86,3 @@ pub fn column_merge_validity(column: &Column, bitmap: Option) -> Option< _ => bitmap, } } - -pub fn serialize_to_parquet_with_compression( - blocks: Vec, - schema: impl AsRef, - buf: &mut Vec, - compression: CompressionOptions, -) -> Result<(u64, ThriftFileMetaData)> { - let arrow_schema = schema.as_ref().to_arrow(); - - let row_group_write_options = WriteOptions { - write_statistics: false, - compression, - version: Version::V2, - data_pagesize_limit: None, - }; - let batches = blocks - .into_iter() - .map(ArrowChunk::try_from) - .collect::>>()?; - - let encoding_map = |data_type: &ArrowDataType| match data_type { - ArrowDataType::Dictionary(..) => Encoding::RleDictionary, - _ => col_encoding(data_type), - }; - - let encodings: Vec> = arrow_schema - .fields - .iter() - .map(|f| transverse(&f.data_type, encoding_map)) - .collect::>(); - - let row_groups = RowGroupIterator::try_new( - batches.into_iter().map(Ok), - &arrow_schema, - row_group_write_options, - encodings, - )?; - - use common_arrow::parquet::write::WriteOptions as FileWriteOption; - let options = FileWriteOption { - write_statistics: false, - version: Version::V2, - }; - - match write_parquet_file(buf, row_groups, arrow_schema.clone(), options) { - Ok(result) => Ok(result), - Err(cause) => Err(ErrorCode::ParquetFileInvalid(cause.to_string())), - } -} - -pub fn serialize_to_parquet( - blocks: Vec, - schema: impl AsRef, - buf: &mut Vec, -) -> Result<(u64, ThriftFileMetaData)> { - serialize_to_parquet_with_compression(blocks, schema, buf, CompressionOptions::Lz4Raw) -} - -fn col_encoding(_data_type: &ArrowDataType) -> Encoding { - // Although encoding does work, parquet2 has not implemented decoding of DeltaLengthByteArray yet, we fallback to Plain - // From parquet2: Decoding "DeltaLengthByteArray"-encoded required V2 pages is not yet implemented for Binary. - // - // match data_type { - // ArrowDataType::Binary - // | ArrowDataType::LargeBinary - // | ArrowDataType::Utf8 - // | ArrowDataType::LargeUtf8 => Encoding::DeltaLengthByteArray, - // _ => Encoding::Plain, - //} - Encoding::Plain -} diff --git a/src/query/expression/src/deserializations/decimal.rs b/src/query/expression/src/utils/serialize.rs similarity index 54% rename from src/query/expression/src/deserializations/decimal.rs rename to src/query/expression/src/utils/serialize.rs index 2b845a77e1874..131679258c468 100644 --- a/src/query/expression/src/deserializations/decimal.rs +++ b/src/query/expression/src/utils/serialize.rs @@ -1,4 +1,4 @@ -// Copyright 2023 Datafuse Labs. +// Copyright 2022 Datafuse Labs. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,139 +14,102 @@ use std::cmp::Ordering; +use chrono::Datelike; +use chrono::NaiveDate; +use common_arrow::arrow::chunk::Chunk as ArrowChunk; +use common_arrow::arrow::datatypes::DataType as ArrowDataType; +use common_arrow::arrow::io::parquet::write::transverse; +use common_arrow::arrow::io::parquet::write::RowGroupIterator; +use common_arrow::arrow::io::parquet::write::WriteOptions; +use common_arrow::parquet::compression::CompressionOptions; +use common_arrow::parquet::encoding::Encoding; +use common_arrow::parquet::metadata::ThriftFileMetaData; +use common_arrow::parquet::write::Version; +use common_arrow::write_parquet_file; use common_exception::ErrorCode; use common_exception::Result; -use common_io::prelude::FormatSettings; use crate::types::decimal::Decimal; use crate::types::decimal::DecimalSize; -use crate::types::DecimalDataType; -use crate::Column; -use crate::Scalar; -use crate::TypeDeserializer; - -pub struct DecimalDeserializer { - pub values: Vec, - pub ty: DecimalDataType, - // for fast access - pub size: DecimalSize, -} +use crate::DataBlock; +use crate::TableSchema; + +pub fn serialize_to_parquet_with_compression( + blocks: Vec, + schema: impl AsRef, + buf: &mut Vec, + compression: CompressionOptions, +) -> Result<(u64, ThriftFileMetaData)> { + let arrow_schema = schema.as_ref().to_arrow(); + + let row_group_write_options = WriteOptions { + write_statistics: false, + compression, + version: Version::V2, + data_pagesize_limit: None, + }; + let batches = blocks + .into_iter() + .map(ArrowChunk::try_from) + .collect::>>()?; + + let encoding_map = |data_type: &ArrowDataType| match data_type { + ArrowDataType::Dictionary(..) => Encoding::RleDictionary, + _ => col_encoding(data_type), + }; -impl DecimalDeserializer { - pub fn with_capacity(ty: &DecimalDataType, capacity: usize) -> Self { - Self { - size: ty.size(), - ty: *ty, - values: Vec::with_capacity(capacity), - } - } -} + let encodings: Vec> = arrow_schema + .fields + .iter() + .map(|f| transverse(&f.data_type, encoding_map)) + .collect::>(); + + let row_groups = RowGroupIterator::try_new( + batches.into_iter().map(Ok), + &arrow_schema, + row_group_write_options, + encodings, + )?; + + use common_arrow::parquet::write::WriteOptions as FileWriteOption; + let options = FileWriteOption { + write_statistics: false, + version: Version::V2, + }; -impl DecimalDeserializer { - pub fn de_json_inner(&mut self, value: &serde_json::Value) -> Result<()> { - match value { - serde_json::Value::Number(n) => { - if n.is_i64() { - self.values.push( - T::from_i64(n.as_i64().unwrap()) - .with_size(self.size) - .ok_or_else(overflow_error)?, - ); - Ok(()) - } else if n.is_u64() { - self.values.push( - T::from_u64(n.as_u64().unwrap()) - .with_size(self.size) - .ok_or_else(overflow_error)?, - ); - Ok(()) - } else { - let f = n.as_f64().unwrap() * (10_f64).powi(self.size.scale as i32); - let n = T::from_float(f); - self.values.push(n); - Ok(()) - } - } - serde_json::Value::String(s) => { - let (n, _) = read_decimal_with_size::(s.as_bytes(), self.size, true)?; - self.values.push(n); - Ok(()) - } - _ => Err(ErrorCode::from("Incorrect json value for decimal")), - } + match write_parquet_file(buf, row_groups, arrow_schema.clone(), options) { + Ok(result) => Ok(result), + Err(cause) => Err(ErrorCode::ParquetFileInvalid(cause.to_string())), } } -impl TypeDeserializer for DecimalDeserializer { - fn memory_size(&self) -> usize { - self.values.len() * T::mem_size() - } - - fn len(&self) -> usize { - self.values.len() - } - - // See GroupHash.rs for StringColumn - #[allow(clippy::uninit_vec)] - fn de_binary(&mut self, reader: &mut &[u8], _format: &FormatSettings) -> Result<()> { - let t: T = T::de_binary(reader); - self.values.push(t); - Ok(()) - } - - fn de_default(&mut self) { - self.values.push(T::zero()); - } - - fn de_fixed_binary_batch( - &mut self, - reader: &[u8], - step: usize, - rows: usize, - _format: &FormatSettings, - ) -> Result<()> { - for row in 0..rows { - let mut row_reader = &reader[step * row..]; - let value: T = T::de_binary(&mut row_reader); - self.values.push(value); - } - Ok(()) - } - - fn de_json(&mut self, value: &serde_json::Value, _format: &FormatSettings) -> Result<()> { - self.de_json_inner(value) - } - - fn append_data_value(&mut self, value: Scalar, _format: &FormatSettings) -> Result<()> { - let d = value - .as_decimal() - .ok_or_else(|| ErrorCode::from("Unable to get decimal value"))?; - let i = T::try_downcast_scalar(d) - .ok_or_else(|| ErrorCode::from("Unable to get decimal value"))?; - self.values.push(i); - Ok(()) - } - - fn pop_data_value(&mut self) -> Result { - match self.values.pop() { - Some(v) => Ok(T::upcast_scalar(v, self.size)), - None => Err(ErrorCode::from( - "Decimal column is empty when pop data value", - )), - } - } - - fn finish_to_column(&mut self) -> Column { - Column::Decimal(T::to_column(std::mem::take(&mut self.values), self.size)) - } +pub fn serialize_to_parquet( + blocks: Vec, + schema: impl AsRef, + buf: &mut Vec, +) -> Result<(u64, ThriftFileMetaData)> { + serialize_to_parquet_with_compression(blocks, schema, buf, CompressionOptions::Lz4Raw) } -fn parse_error(msg: &str) -> ErrorCode { - ErrorCode::BadArguments(format!("bad decimal literal: {msg}")) +pub fn col_encoding(_data_type: &ArrowDataType) -> Encoding { + // Although encoding does work, parquet2 has not implemented decoding of DeltaLengthByteArray yet, we fallback to Plain + // From parquet2: Decoding "DeltaLengthByteArray"-encoded required V2 pages is not yet implemented for Binary. + // + // match data_type { + // ArrowDataType::Binary + // | ArrowDataType::LargeBinary + // | ArrowDataType::Utf8 + // | ArrowDataType::LargeUtf8 => Encoding::DeltaLengthByteArray, + // _ => Encoding::Plain, + //} + Encoding::Plain } -fn overflow_error() -> ErrorCode { - ErrorCode::Overflow("decimal overflow") +pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +#[inline] +pub fn uniform_date(date: NaiveDate) -> i32 { + date.num_days_from_ce() - EPOCH_DAYS_FROM_CE } pub fn read_decimal_with_size( @@ -156,18 +119,18 @@ pub fn read_decimal_with_size( ) -> Result<(T, usize)> { let (n, d, e, n_read) = read_decimal::(buf, size.precision as u32, exact)?; if d as i32 + e > (size.precision - size.scale).into() { - return Err(overflow_error()); + return Err(decimal_overflow_error()); } let scale_diff = e + size.scale as i32; let n = match scale_diff.cmp(&0) { Ordering::Less => { // e < 0, than -e is the actual scale, (-e) > scale means we need to cut more n.checked_div(T::e(-scale_diff as u32)) - .ok_or_else(overflow_error)? + .ok_or_else(decimal_overflow_error)? } Ordering::Greater => n .checked_mul(T::e(scale_diff as u32)) - .ok_or_else(overflow_error)?, + .ok_or_else(decimal_overflow_error)?, Ordering::Equal => n, }; Ok((n, n_read)) @@ -188,7 +151,7 @@ pub fn read_decimal( exact: bool, ) -> Result<(T, u8, i32, usize)> { if buf.is_empty() { - return Err(parse_error("empty")); + return Err(decimal_parse_error("empty")); } let mut n = T::zero(); @@ -227,16 +190,18 @@ pub fn read_decimal( b'0'..=b'9' => { digits += 1; if digits > max_digits { - return Err(overflow_error()); + return Err(decimal_overflow_error()); } else { let v = buf[pos]; if v == b'0' { zeros += 1; } else { - n = n.checked_mul(T::e(zeros + 1)).ok_or_else(overflow_error)?; + n = n + .checked_mul(T::e(zeros + 1)) + .ok_or_else(decimal_overflow_error)?; n = n .checked_add(T::from_u64((v - b'0') as u64)) - .ok_or_else(overflow_error)?; + .ok_or_else(decimal_overflow_error)?; zeros = 0; } } @@ -254,7 +219,7 @@ pub fn read_decimal( } _ => { if exact { - return Err(parse_error("unexpected char")); + return Err(decimal_parse_error("unexpected char")); } else { stop = pos as i32; break; @@ -265,7 +230,9 @@ pub fn read_decimal( } if zeros > 0 { - n = n.checked_mul(T::e(zeros)).ok_or_else(overflow_error)?; + n = n + .checked_mul(T::e(zeros)) + .ok_or_else(decimal_overflow_error)?; zeros = 0; } @@ -289,10 +256,12 @@ pub fn read_decimal( continue; } else { let v = buf[pos]; - n = n.checked_mul(T::e(zeros + 1)).ok_or_else(overflow_error)?; + n = n + .checked_mul(T::e(zeros + 1)) + .ok_or_else(decimal_overflow_error)?; n = n .checked_add(T::from_u64((v - b'0') as u64)) - .ok_or_else(overflow_error)?; + .ok_or_else(decimal_overflow_error)?; digits += zeros + 1; zeros = 0; } @@ -304,7 +273,7 @@ pub fn read_decimal( } _ => { if exact { - return Err(parse_error("unexpected char")); + return Err(decimal_parse_error("unexpected char")); } else { stop = pos as i32; break; @@ -317,7 +286,7 @@ pub fn read_decimal( if digits == 0 && zeros == 0 && !leading_zero { // these are ok: 0 0.0 0. .0 +0 - return Err(parse_error("no digits")); + return Err(decimal_parse_error("no digits")); } let mut exponent = if has_point { @@ -329,7 +298,7 @@ pub fn read_decimal( if has_e && stop < 0 { let mut exp = 0i32; if pos == len - 1 { - return Err(parse_error("empty exponent")); + return Err(decimal_parse_error("empty exponent")); } let exp_sign = match buf[pos] { @@ -345,7 +314,7 @@ pub fn read_decimal( }; if pos == len - 1 { - return Err(parse_error("bad exponent")); + return Err(decimal_parse_error("bad exponent")); } for (i, v) in buf[pos..].iter().enumerate() { @@ -356,7 +325,7 @@ pub fn read_decimal( } c => { if exact { - return Err(parse_error(&format!("unexpected char: {c}"))); + return Err(decimal_parse_error(&format!("unexpected char: {c}"))); } else { stop = (pos + i) as i32; break; @@ -367,7 +336,43 @@ pub fn read_decimal( exponent += exp * exp_sign; } - let n = n.checked_mul(sign).ok_or_else(overflow_error)?; + let n = n.checked_mul(sign).ok_or_else(decimal_overflow_error)?; let n_read = if stop > 0 { stop as usize } else { len }; Ok((n, digits as u8, exponent, n_read)) } + +pub fn read_decimal_from_json( + value: &serde_json::Value, + size: DecimalSize, +) -> Result { + match value { + serde_json::Value::Number(n) => { + if n.is_i64() { + Ok(T::from_i64(n.as_i64().unwrap()) + .with_size(size) + .ok_or_else(decimal_overflow_error)?) + } else if n.is_u64() { + Ok(T::from_u64(n.as_u64().unwrap()) + .with_size(size) + .ok_or_else(decimal_overflow_error)?) + } else { + let f = n.as_f64().unwrap() * (10_f64).powi(size.scale as i32); + let n = T::from_float(f); + Ok(n) + } + } + serde_json::Value::String(s) => { + let (n, _) = read_decimal_with_size::(s.as_bytes(), size, true)?; + Ok(n) + } + _ => Err(ErrorCode::from("Incorrect json value for decimal")), + } +} + +fn decimal_parse_error(msg: &str) -> ErrorCode { + ErrorCode::BadArguments(format!("bad decimal literal: {msg}")) +} + +fn decimal_overflow_error() -> ErrorCode { + ErrorCode::Overflow("decimal overflow") +} diff --git a/src/query/expression/src/values.rs b/src/query/expression/src/values.rs index 4ed11fc100378..59dc139f9c639 100755 --- a/src/query/expression/src/values.rs +++ b/src/query/expression/src/values.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::hash::Hash; +use std::io::Read; use std::ops::Range; use common_arrow::arrow::bitmap::and; @@ -25,7 +26,10 @@ use common_arrow::arrow::datatypes::DataType as ArrowType; use common_arrow::arrow::datatypes::TimeUnit; use common_arrow::arrow::offset::OffsetsBuffer; use common_arrow::arrow::trusted_len::TrustedLen; +use common_exception::Result; +use common_io::prelude::BinaryRead; use enum_as_inner::EnumAsInner; +use ethnum::i256; use itertools::Itertools; use ordered_float::OrderedFloat; use serde::de::Visitor; @@ -38,6 +42,7 @@ use crate::property::Domain; use crate::types::array::ArrayColumn; use crate::types::array::ArrayColumnBuilder; use crate::types::boolean::BooleanDomain; +use crate::types::decimal::Decimal; use crate::types::decimal::DecimalColumn; use crate::types::decimal::DecimalColumnBuilder; use crate::types::decimal::DecimalDataType; @@ -55,6 +60,7 @@ use crate::types::number::F64; use crate::types::string::StringColumn; use crate::types::string::StringColumnBuilder; use crate::types::string::StringDomain; +use crate::types::timestamp::check_timestamp; use crate::types::variant::JSONB_NULL; use crate::types::*; use crate::utils::arrow::append_bitmap; @@ -63,9 +69,10 @@ use crate::utils::arrow::buffer_into_mut; use crate::utils::arrow::constant_bitmap; use crate::utils::arrow::deserialize_column; use crate::utils::arrow::serialize_column; +use crate::with_decimal_mapped_type; use crate::with_decimal_type; +use crate::with_number_mapped_type; use crate::with_number_type; -use crate::TypeDeserializerImpl; #[derive(Debug, Clone, PartialEq, EnumAsInner)] pub enum Value { @@ -128,21 +135,15 @@ pub enum Column { Array(Box>), Map(Box>), Nullable(Box>), - Tuple { fields: Vec, len: usize }, + Tuple(Vec), Variant(StringColumn), } #[derive(Debug, Clone, EnumAsInner)] pub enum ColumnBuilder { - Null { - len: usize, - }, - EmptyArray { - len: usize, - }, - EmptyMap { - len: usize, - }, + Null { len: usize }, + EmptyArray { len: usize }, + EmptyMap { len: usize }, Number(NumberColumnBuilder), Decimal(DecimalColumnBuilder), Boolean(MutableBitmap), @@ -152,10 +153,7 @@ pub enum ColumnBuilder { Array(Box>), Map(Box>), Nullable(Box>), - Tuple { - fields: Vec, - len: usize, - }, + Tuple(Vec), Variant(StringColumnBuilder), } @@ -598,9 +596,7 @@ impl PartialOrd for Column { (Column::Nullable(col1), Column::Nullable(col2)) => { col1.iter().partial_cmp(col2.iter()) } - (Column::Tuple { fields: col1, .. }, Column::Tuple { fields: col2, .. }) => { - col1.partial_cmp(col2) - } + (Column::Tuple(fields1), Column::Tuple(fields2)) => fields1.partial_cmp(fields2), (Column::Variant(col1), Column::Variant(col2)) => col1 .iter() .partial_cmp_by(col2.iter(), |v1, v2| jsonb::compare(v1, v2).ok()), @@ -634,7 +630,7 @@ impl Column { Column::Array(col) => col.len(), Column::Map(col) => col.len(), Column::Nullable(col) => col.len(), - Column::Tuple { len, .. } => *len, + Column::Tuple(fields) => fields[0].len(), Column::Variant(col) => col.len(), } } @@ -653,7 +649,7 @@ impl Column { Column::Array(col) => Some(ScalarRef::Array(col.index(index)?)), Column::Map(col) => Some(ScalarRef::Map(col.index(index)?)), Column::Nullable(col) => Some(col.index(index)?.unwrap_or(ScalarRef::Null)), - Column::Tuple { fields, .. } => Some(ScalarRef::Tuple( + Column::Tuple(fields) => Some(ScalarRef::Tuple( fields .iter() .map(|field| field.index(index)) @@ -679,7 +675,7 @@ impl Column { Column::Array(col) => ScalarRef::Array(col.index_unchecked(index)), Column::Map(col) => ScalarRef::Map(col.index_unchecked(index)), Column::Nullable(col) => col.index_unchecked(index).unwrap_or(ScalarRef::Null), - Column::Tuple { fields, .. } => ScalarRef::Tuple( + Column::Tuple(fields) => ScalarRef::Tuple( fields .iter() .map(|field| field.index_unchecked(index)) @@ -698,9 +694,8 @@ impl Column { ); if range.is_empty() { - use crate::deserializations::TypeDeserializer; - let mut de = TypeDeserializerImpl::with_capacity(&self.data_type(), 0); - return de.finish_to_column(); + let builder = ColumnBuilder::with_capacity(&self.data_type(), 0); + return builder.build(); } match self { @@ -728,13 +723,12 @@ impl Column { Column::Array(col) => Column::Array(Box::new(col.slice(range))), Column::Map(col) => Column::Map(Box::new(col.slice(range))), Column::Nullable(col) => Column::Nullable(Box::new(col.slice(range))), - Column::Tuple { fields, .. } => Column::Tuple { - fields: fields + Column::Tuple(fields) => Column::Tuple( + fields .iter() .map(|field| field.slice(range.clone())) .collect(), - len: range.end - range.start, - }, + ), Column::Variant(col) => Column::Variant(col.slice(range)), } } @@ -812,7 +806,7 @@ impl Column { value: Some(Box::new(inner_domain)), }) } - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { let domains = fields.iter().map(|col| col.domain()).collect::>(); Domain::Tuple(domains) } @@ -848,7 +842,7 @@ impl Column { let inner = inner.column.data_type(); DataType::Nullable(Box::new(inner)) } - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { let inner = fields.iter().map(|col| col.data_type()).collect::>(); DataType::Tuple(inner) } @@ -1038,7 +1032,7 @@ impl Column { let offsets: Buffer = col.offsets.iter().map(|offset| *offset as i32).collect(); let values = match (&arrow_type, &col.values) { - (ArrowType::Map(inner_field, _), Column::Tuple { fields, .. }) => { + (ArrowType::Map(inner_field, _), Column::Tuple(fields)) => { let inner_type = inner_field.data_type.clone(); Box::new( common_arrow::arrow::array::StructArray::try_new( @@ -1065,7 +1059,7 @@ impl Column { let arrow_array = col.column.as_arrow(); Self::set_validity(arrow_array.clone(), &col.validity) } - Column::Tuple { fields, .. } => Box::new( + Column::Tuple(fields) => Box::new( common_arrow::arrow::array::StructArray::try_new( arrow_type, fields.iter().map(|field| field.as_arrow()).collect(), @@ -1132,7 +1126,6 @@ impl Column { arrow_col: &dyn common_arrow::arrow::array::Array, data_type: &DataType, ) -> Column { - use common_arrow::arrow::array::Array as _; use common_arrow::arrow::datatypes::DataType as ArrowDataType; let is_nullable = data_type.is_nullable(); @@ -1437,10 +1430,7 @@ impl Column { .zip(struct_type.iter()) .map(|(field, dt)| Column::from_arrow(&**field, dt)) .collect::>(); - Column::Tuple { - fields, - len: arrow_col.len(), - } + Column::Tuple(fields) } ArrowDataType::Decimal(precision, scale) => { let arrow_col = arrow_col @@ -1508,18 +1498,18 @@ impl Column { Column::Null { .. } => std::mem::size_of::(), Column::EmptyArray { .. } => std::mem::size_of::(), Column::EmptyMap { .. } => std::mem::size_of::(), - Column::Number(NumberColumn::UInt8(_)) => self.len(), - Column::Number(NumberColumn::UInt16(_)) => self.len() * 2, - Column::Number(NumberColumn::UInt32(_)) => self.len() * 4, - Column::Number(NumberColumn::UInt64(_)) => self.len() * 8, - Column::Number(NumberColumn::Float32(_)) => self.len() * 4, - Column::Number(NumberColumn::Float64(_)) => self.len() * 8, - Column::Number(NumberColumn::Int8(_)) => self.len(), - Column::Number(NumberColumn::Int16(_)) => self.len() * 2, - Column::Number(NumberColumn::Int32(_)) => self.len() * 4, - Column::Number(NumberColumn::Int64(_)) => self.len() * 8, - Column::Decimal(DecimalColumn::Decimal128(_, _)) => self.len() * 16, - Column::Decimal(DecimalColumn::Decimal256(_, _)) => self.len() * 32, + Column::Number(NumberColumn::UInt8(col)) => col.len(), + Column::Number(NumberColumn::UInt16(col)) => col.len() * 2, + Column::Number(NumberColumn::UInt32(col)) => col.len() * 4, + Column::Number(NumberColumn::UInt64(col)) => col.len() * 8, + Column::Number(NumberColumn::Float32(col)) => col.len() * 4, + Column::Number(NumberColumn::Float64(col)) => col.len() * 8, + Column::Number(NumberColumn::Int8(col)) => col.len(), + Column::Number(NumberColumn::Int16(col)) => col.len() * 2, + Column::Number(NumberColumn::Int32(col)) => col.len() * 4, + Column::Number(NumberColumn::Int64(col)) => col.len() * 8, + Column::Decimal(DecimalColumn::Decimal128(col, _)) => col.len() * 16, + Column::Decimal(DecimalColumn::Decimal256(col, _)) => col.len() * 32, Column::Boolean(c) => c.as_slice().0.len(), Column::String(col) => col.data.len() + col.offsets.len() * 8, Column::Timestamp(col) => col.len() * 8, @@ -1527,7 +1517,7 @@ impl Column { Column::Array(col) => col.values.memory_size() + col.offsets.len() * 8, Column::Map(col) => col.values.memory_size() + col.offsets.len() * 8, Column::Nullable(c) => c.column.memory_size() + c.validity.as_slice().0.len(), - Column::Tuple { fields, .. } => fields.iter().map(|f| f.memory_size()).sum(), + Column::Tuple(fields) => fields.iter().map(|f| f.memory_size()).sum(), Column::Variant(col) => col.data.len() + col.offsets.len() * 8, } } @@ -1607,13 +1597,12 @@ impl ColumnBuilder { Column::Nullable(box col) => { ColumnBuilder::Nullable(Box::new(NullableColumnBuilder::from_column(col))) } - Column::Tuple { fields, len } => ColumnBuilder::Tuple { - fields: fields + Column::Tuple(fields) => ColumnBuilder::Tuple( + fields .iter() .map(|col| ColumnBuilder::from_column(col.clone())) .collect(), - len, - }, + ), Column::Variant(col) => ColumnBuilder::Variant(StringColumnBuilder::from_column(col)), } } @@ -1666,14 +1655,13 @@ impl ColumnBuilder { DataType::Tuple(fields_ty) => fields_ty, _ => unreachable!(), }; - ColumnBuilder::Tuple { - fields: fields + ColumnBuilder::Tuple( + fields .iter() .zip(fields_ty) .map(|(field, ty)| ColumnBuilder::repeat(field, n, ty)) .collect(), - len: n, - } + ) } ScalarRef::Variant(s) => ColumnBuilder::Variant(StringColumnBuilder::repeat(s, n)), } @@ -1693,11 +1681,76 @@ impl ColumnBuilder { ColumnBuilder::Array(builder) => builder.len(), ColumnBuilder::Map(builder) => builder.len(), ColumnBuilder::Nullable(builder) => builder.len(), - ColumnBuilder::Tuple { len, .. } => *len, + ColumnBuilder::Tuple(fields) => fields[0].len(), ColumnBuilder::Variant(builder) => builder.len(), } } + pub fn memory_size(&self) -> usize { + match self { + ColumnBuilder::Null { .. } => std::mem::size_of::(), + ColumnBuilder::EmptyArray { .. } => std::mem::size_of::(), + ColumnBuilder::EmptyMap { .. } => std::mem::size_of::(), + ColumnBuilder::Number(NumberColumnBuilder::UInt8(builder)) => builder.len(), + ColumnBuilder::Number(NumberColumnBuilder::UInt16(builder)) => builder.len() * 2, + ColumnBuilder::Number(NumberColumnBuilder::UInt32(builder)) => builder.len() * 4, + ColumnBuilder::Number(NumberColumnBuilder::UInt64(builder)) => builder.len() * 8, + ColumnBuilder::Number(NumberColumnBuilder::Float32(builder)) => builder.len() * 4, + ColumnBuilder::Number(NumberColumnBuilder::Float64(builder)) => builder.len() * 8, + ColumnBuilder::Number(NumberColumnBuilder::Int8(builder)) => builder.len(), + ColumnBuilder::Number(NumberColumnBuilder::Int16(builder)) => builder.len() * 2, + ColumnBuilder::Number(NumberColumnBuilder::Int32(builder)) => builder.len() * 4, + ColumnBuilder::Number(NumberColumnBuilder::Int64(builder)) => builder.len() * 8, + ColumnBuilder::Decimal(DecimalColumnBuilder::Decimal128(builder, _)) => { + builder.len() * 16 + } + ColumnBuilder::Decimal(DecimalColumnBuilder::Decimal256(builder, _)) => { + builder.len() * 32 + } + ColumnBuilder::Boolean(c) => c.as_slice().len(), + ColumnBuilder::String(col) => col.data.len() + col.offsets.len() * 8, + ColumnBuilder::Timestamp(col) => col.len() * 8, + ColumnBuilder::Date(col) => col.len() * 4, + ColumnBuilder::Array(col) => col.builder.memory_size() + col.offsets.len() * 8, + ColumnBuilder::Map(col) => col.builder.memory_size() + col.offsets.len() * 8, + ColumnBuilder::Nullable(c) => c.builder.memory_size() + c.validity.as_slice().len(), + ColumnBuilder::Tuple(fields) => fields.iter().map(|f| f.memory_size()).sum(), + ColumnBuilder::Variant(col) => col.data.len() + col.offsets.len() * 8, + } + } + + pub fn data_type(&self) -> DataType { + match self { + ColumnBuilder::Null { .. } => DataType::Null, + ColumnBuilder::EmptyArray { .. } => DataType::EmptyArray, + ColumnBuilder::EmptyMap { .. } => DataType::EmptyMap, + ColumnBuilder::Number(col) => with_number_type!(|NUM_TYPE| match col { + NumberColumnBuilder::NUM_TYPE(_) => DataType::Number(NumberDataType::NUM_TYPE), + }), + ColumnBuilder::Decimal(col) => with_decimal_type!(|DECIMAL_TYPE| match col { + DecimalColumnBuilder::DECIMAL_TYPE(_, size) => + DataType::Decimal(DecimalDataType::DECIMAL_TYPE(*size)), + }), + ColumnBuilder::Boolean(_) => DataType::Boolean, + ColumnBuilder::String(_) => DataType::String, + ColumnBuilder::Timestamp(_) => DataType::Timestamp, + ColumnBuilder::Date(_) => DataType::Date, + ColumnBuilder::Array(col) => { + let inner = col.builder.data_type(); + DataType::Array(Box::new(inner)) + } + ColumnBuilder::Map(col) => { + let inner = col.builder.data_type(); + DataType::Map(Box::new(inner)) + } + ColumnBuilder::Nullable(col) => DataType::Nullable(Box::new(col.builder.data_type())), + ColumnBuilder::Tuple(fields) => { + DataType::Tuple(fields.iter().map(|f| f.data_type()).collect::>()) + } + ColumnBuilder::Variant(_) => DataType::Variant, + } + } + pub fn with_capacity(ty: &DataType, capacity: usize) -> ColumnBuilder { match ty { DataType::Null => ColumnBuilder::Null { len: 0 }, @@ -1735,13 +1788,15 @@ impl ColumnBuilder { offsets, })) } - DataType::Tuple(fields) => ColumnBuilder::Tuple { - fields: fields - .iter() - .map(|field| Self::with_capacity(field, capacity)) - .collect(), - len: 0, - }, + DataType::Tuple(fields) => { + assert!(!fields.is_empty()); + ColumnBuilder::Tuple( + fields + .iter() + .map(|field| Self::with_capacity(field, capacity)) + .collect(), + ) + } DataType::Variant => { ColumnBuilder::Variant(StringColumnBuilder::with_capacity(capacity, 0)) } @@ -1779,12 +1834,11 @@ impl ColumnBuilder { (ColumnBuilder::Nullable(builder), scalar) => { builder.push(scalar); } - (ColumnBuilder::Tuple { fields, len }, ScalarRef::Tuple(value)) => { + (ColumnBuilder::Tuple(fields), ScalarRef::Tuple(value)) => { assert_eq!(fields.len(), value.len()); for (field, scalar) in fields.iter_mut().zip(value.iter()) { field.push(scalar.clone()); } - *len += 1; } (ColumnBuilder::Variant(builder), ScalarRef::Variant(value)) => { builder.put_slice(value); @@ -1808,11 +1862,10 @@ impl ColumnBuilder { ColumnBuilder::Array(builder) => builder.push_default(), ColumnBuilder::Map(builder) => builder.push_default(), ColumnBuilder::Nullable(builder) => builder.push_null(), - ColumnBuilder::Tuple { fields, len } => { + ColumnBuilder::Tuple(fields) => { for field in fields { field.push_default(); } - *len += 1; } ColumnBuilder::Variant(builder) => { builder.put_slice(JSONB_NULL); @@ -1821,6 +1874,224 @@ impl ColumnBuilder { } } + pub fn push_binary(&mut self, reader: &mut &[u8]) -> Result<()> { + match self { + ColumnBuilder::Null { len } => *len += 1, + ColumnBuilder::EmptyArray { len } => *len += 1, + ColumnBuilder::EmptyMap { len } => *len += 1, + ColumnBuilder::Number(builder) => with_number_mapped_type!(|NUM_TYPE| match builder { + NumberColumnBuilder::NUM_TYPE(builder) => { + let value: NUM_TYPE = reader.read_scalar()?; + builder.push(value); + } + }), + ColumnBuilder::Decimal(builder) => { + with_decimal_mapped_type!(|DECIMAL_TYPE| match builder { + DecimalColumnBuilder::DECIMAL_TYPE(builder, _) => + builder.push(DECIMAL_TYPE::de_binary(reader)), + }) + } + ColumnBuilder::Boolean(builder) => { + let v: bool = reader.read_scalar()?; + builder.push(v); + } + ColumnBuilder::String(builder) | ColumnBuilder::Variant(builder) => { + let offset: u64 = reader.read_uvarint()?; + builder.data.resize(offset as usize + builder.data.len(), 0); + let last = *builder.offsets.last().unwrap() as usize; + reader.read_exact(&mut builder.data[last..last + offset as usize])?; + builder.commit_row(); + } + ColumnBuilder::Timestamp(builder) => { + let value: i64 = reader.read_scalar()?; + check_timestamp(value)?; + builder.push(value); + } + ColumnBuilder::Date(builder) => { + let value: i32 = reader.read_scalar()?; + builder.push(value); + } + ColumnBuilder::Array(builder) => { + let len = reader.read_uvarint()?; + for _ in 0..len { + builder.builder.push_binary(reader)?; + } + builder.commit_row(); + } + ColumnBuilder::Map(builder) => { + const KEY: usize = 0; + const VALUE: usize = 1; + let len = reader.read_uvarint()?; + let map_builder = builder.builder.as_tuple_mut().unwrap(); + for _ in 0..len { + map_builder[KEY].push_binary(reader)?; + map_builder[VALUE].push_binary(reader)?; + } + builder.commit_row(); + } + ColumnBuilder::Nullable(builder) => { + let valid: bool = reader.read_scalar()?; + if valid { + builder.builder.push_binary(reader)?; + builder.validity.push(true); + } else { + builder.push_null(); + } + } + ColumnBuilder::Tuple(fields) => { + for field in fields { + field.push_binary(reader)?; + } + } + }; + + Ok(()) + } + + pub fn push_fix_len_binaries(&mut self, reader: &[u8], step: usize, rows: usize) -> Result<()> { + match self { + ColumnBuilder::Null { len } => *len += rows, + ColumnBuilder::EmptyArray { len } => *len += rows, + ColumnBuilder::EmptyMap { len } => *len += rows, + ColumnBuilder::Number(builder) => with_number_mapped_type!(|NUM_TYPE| match builder { + NumberColumnBuilder::NUM_TYPE(builder) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + let value: NUM_TYPE = reader.read_scalar()?; + builder.push(value); + } + } + }), + ColumnBuilder::Decimal(builder) => { + with_decimal_mapped_type!(|DECIMAL_TYPE| match builder { + DecimalColumnBuilder::DECIMAL_TYPE(builder, _) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + builder.push(DECIMAL_TYPE::de_binary(&mut reader)); + } + } + }) + } + ColumnBuilder::Boolean(builder) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + let v: bool = reader.read_scalar()?; + builder.push(v); + } + } + ColumnBuilder::String(builder) | ColumnBuilder::Variant(builder) => { + for row in 0..rows { + let reader = &reader[step * row..]; + builder.put_slice(reader); + builder.commit_row(); + } + } + ColumnBuilder::Timestamp(builder) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + let value: i64 = reader.read_scalar()?; + check_timestamp(value)?; + builder.push(value); + } + } + ColumnBuilder::Date(builder) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + let value: i32 = reader.read_scalar()?; + builder.push(value); + } + } + ColumnBuilder::Array(builder) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + let len = reader.read_uvarint()?; + for _ in 0..len { + builder.builder.push_binary(&mut reader)?; + } + builder.commit_row(); + } + } + ColumnBuilder::Map(builder) => { + const KEY: usize = 0; + const VALUE: usize = 1; + for row in 0..rows { + let mut reader = &reader[step * row..]; + let map_builder = builder.builder.as_tuple_mut().unwrap(); + let len = reader.read_uvarint()?; + for _ in 0..len { + map_builder[KEY].push_binary(&mut reader)?; + map_builder[VALUE].push_binary(&mut reader)?; + } + builder.commit_row(); + } + } + ColumnBuilder::Nullable(_) => { + unimplemented!() + } + ColumnBuilder::Tuple(fields) => { + for row in 0..rows { + let mut reader = &reader[step * row..]; + for field in fields.iter_mut() { + field.push_binary(&mut reader)?; + } + } + } + } + + Ok(()) + } + + pub fn pop(&mut self) -> Option { + match self { + ColumnBuilder::Null { len } => { + if *len > 0 { + *len -= 1; + Some(Scalar::Null) + } else { + None + } + } + ColumnBuilder::EmptyArray { len } => { + if *len > 0 { + *len -= 1; + Some(Scalar::EmptyArray) + } else { + None + } + } + ColumnBuilder::EmptyMap { len } => { + if *len > 0 { + *len -= 1; + Some(Scalar::EmptyMap) + } else { + None + } + } + ColumnBuilder::Number(builder) => builder.pop().map(Scalar::Number), + ColumnBuilder::Decimal(builder) => builder.pop().map(Scalar::Decimal), + ColumnBuilder::Boolean(builder) => builder.pop().map(Scalar::Boolean), + ColumnBuilder::String(builder) => builder.pop().map(Scalar::String), + ColumnBuilder::Timestamp(builder) => builder.pop().map(Scalar::Timestamp), + ColumnBuilder::Date(builder) => builder.pop().map(Scalar::Date), + ColumnBuilder::Array(builder) => builder.pop().map(Scalar::Array), + ColumnBuilder::Map(builder) => builder.pop().map(Scalar::Map), + ColumnBuilder::Nullable(builder) => Some(builder.pop()?.unwrap_or(Scalar::Null)), + ColumnBuilder::Tuple(fields) => { + if fields[0].len() > 0 { + Some(Scalar::Tuple( + fields + .iter_mut() + .map(|field| field.pop().unwrap()) + .collect(), + )) + } else { + None + } + } + ColumnBuilder::Variant(builder) => builder.pop().map(Scalar::Variant), + } + } + pub fn append_column(&mut self, other: &Column) { match (self, other) { (ColumnBuilder::Null { len }, Column::Null { len: other_len }) => { @@ -1862,18 +2133,11 @@ impl ColumnBuilder { (ColumnBuilder::Nullable(builder), Column::Nullable(other)) => { builder.append_column(other); } - ( - ColumnBuilder::Tuple { fields, len }, - Column::Tuple { - fields: other_fields, - len: other_len, - }, - ) => { + (ColumnBuilder::Tuple(fields), Column::Tuple(other_fields)) => { assert_eq!(fields.len(), other_fields.len()); for (field, other_field) in fields.iter_mut().zip(other_fields.iter()) { field.append_column(other_field); } - *len += other_len; } (this, other) => unreachable!("unable append {other:?} into {this:?}"), } @@ -1893,10 +2157,10 @@ impl ColumnBuilder { ColumnBuilder::Array(builder) => Column::Array(Box::new(builder.build())), ColumnBuilder::Map(builder) => Column::Map(Box::new(builder.build())), ColumnBuilder::Nullable(builder) => Column::Nullable(Box::new(builder.build())), - ColumnBuilder::Tuple { fields, len } => Column::Tuple { - fields: fields.into_iter().map(|field| field.build()).collect(), - len, - }, + ColumnBuilder::Tuple(fields) => { + assert!(fields.iter().map(|field| field.len()).all_equal()); + Column::Tuple(fields.into_iter().map(|field| field.build()).collect()) + } ColumnBuilder::Variant(builder) => Column::Variant(builder.build()), } } @@ -1916,7 +2180,7 @@ impl ColumnBuilder { ColumnBuilder::Array(builder) => Scalar::Array(builder.build_scalar()), ColumnBuilder::Map(builder) => Scalar::Map(builder.build_scalar()), ColumnBuilder::Nullable(builder) => builder.build_scalar().unwrap_or(Scalar::Null), - ColumnBuilder::Tuple { fields, .. } => Scalar::Tuple( + ColumnBuilder::Tuple(fields) => Scalar::Tuple( fields .into_iter() .map(|field| field.build_scalar()) @@ -1955,7 +2219,7 @@ impl<'a> Iterator for ColumnIterator<'a> { unsafe impl<'a> TrustedLen for ColumnIterator<'a> {} #[macro_export] -macro_rules! for_all_number_varints{ +macro_rules! for_all_number_varints { ($macro:tt $(, $x:tt)*) => { $macro! { [$($x),*], diff --git a/src/query/expression/tests/it/decimal.rs b/src/query/expression/tests/it/decimal.rs index 53a718e3576fa..a80f588924399 100644 --- a/src/query/expression/tests/it/decimal.rs +++ b/src/query/expression/tests/it/decimal.rs @@ -13,8 +13,8 @@ // limitations under the License. use common_exception::Result; -use common_expression::read_decimal; -use common_expression::read_decimal_with_size; +use common_expression::serialize::read_decimal; +use common_expression::serialize::read_decimal_with_size; use common_expression::types::decimal::DecimalSize; #[test] diff --git a/src/query/formats/Cargo.toml b/src/query/formats/Cargo.toml index bbdf851c1ead3..efb3cb5a7c3c8 100644 --- a/src/query/formats/Cargo.toml +++ b/src/query/formats/Cargo.toml @@ -14,6 +14,7 @@ test = false bstr = "1.0.1" chrono-tz = { workspace = true } lexical-core = "0.8.5" +match-template = "0.0.1" micromarshal = "0.2.1" num = "0.4.0" ordered-float = { workspace = true } @@ -35,3 +36,6 @@ storages-common-table-meta = { path = "../storages/common/table-meta" } common-arrow = { path = "../../common/arrow" } pretty_assertions = "1.3.0" + +[package.metadata.cargo-machete] +ignored = ["match-template"] diff --git a/src/query/formats/src/field_decoder/csv.rs b/src/query/formats/src/field_decoder/csv.rs index 2dfff02b4c674..9c5366d304ba0 100644 --- a/src/query/formats/src/field_decoder/csv.rs +++ b/src/query/formats/src/field_decoder/csv.rs @@ -17,11 +17,10 @@ use std::io::BufRead; use std::io::Cursor; use common_exception::Result; -use common_expression::ArrayDeserializer; -use common_expression::MapDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::VariantDeserializer; +use common_expression::types::array::ArrayColumnBuilder; +use common_expression::types::string::StringColumnBuilder; +use common_expression::types::AnyType; +use common_expression::ColumnBuilder; use common_io::constants::FALSE_BYTES_LOWER; use common_io::constants::INF_BYTES_LOWER; use common_io::constants::NULL_BYTES_ESCAPE; @@ -85,7 +84,7 @@ impl FieldDecoderRowBased for FieldDecoderCSV { fn read_string>( &self, - column: &mut StringDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -99,13 +98,13 @@ impl FieldDecoderRowBased for FieldDecoderCSV { fn read_variant>( &self, - column: &mut VariantDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { let buf = reader.remaining_slice(); - column.builder.put_slice(buf); - column.builder.commit_row(); + column.put_slice(buf); + column.commit_row(); reader.consume(buf.len()); Ok(()) @@ -113,7 +112,7 @@ impl FieldDecoderRowBased for FieldDecoderCSV { fn read_array>( &self, - column: &mut ArrayDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -123,7 +122,7 @@ impl FieldDecoderRowBased for FieldDecoderCSV { fn read_map>( &self, - column: &mut MapDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -131,13 +130,13 @@ impl FieldDecoderRowBased for FieldDecoderCSV { Ok(()) } - fn read_struct>( + fn read_tuple>( &self, - column: &mut StructDeserializer, + fields: &mut Vec, reader: &mut Cursor, _raw: bool, ) -> Result<()> { - self.nested.read_struct(column, reader, false)?; + self.nested.read_tuple(fields, reader, false)?; Ok(()) } } diff --git a/src/query/formats/src/field_decoder/fast_values.rs b/src/query/formats/src/field_decoder/fast_values.rs index e178eccad2635..4fa7304ed80d1 100644 --- a/src/query/formats/src/field_decoder/fast_values.rs +++ b/src/query/formats/src/field_decoder/fast_values.rs @@ -19,28 +19,25 @@ use std::io::BufRead; use std::io::Cursor; use bstr::ByteSlice; +use common_arrow::arrow::bitmap::MutableBitmap; use common_exception::ErrorCode; use common_exception::Result; -use common_expression::read_decimal_with_size; +use common_expression::serialize::read_decimal_with_size; +use common_expression::serialize::uniform_date; +use common_expression::types::array::ArrayColumnBuilder; use common_expression::types::date::check_date; use common_expression::types::decimal::Decimal; +use common_expression::types::decimal::DecimalColumnBuilder; +use common_expression::types::decimal::DecimalSize; +use common_expression::types::nullable::NullableColumnBuilder; use common_expression::types::number::Number; +use common_expression::types::string::StringColumnBuilder; use common_expression::types::timestamp::check_timestamp; -use common_expression::uniform_date; -use common_expression::ArrayDeserializer; -use common_expression::BooleanDeserializer; -use common_expression::DateDeserializer; -use common_expression::DecimalDeserializer; -use common_expression::MapDeserializer; -use common_expression::NullDeserializer; -use common_expression::NullableDeserializer; -use common_expression::NumberDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::TimestampDeserializer; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; -use common_expression::VariantDeserializer; +use common_expression::types::AnyType; +use common_expression::types::NumberColumnBuilder; +use common_expression::with_decimal_type; +use common_expression::with_number_mapped_type; +use common_expression::ColumnBuilder; use common_io::constants::FALSE_BYTES_LOWER; use common_io::constants::INF_BYTES_LOWER; use common_io::constants::NAN_BYTES_LOWER; @@ -52,9 +49,7 @@ use common_io::cursor_ext::ReadBytesExt; use common_io::cursor_ext::ReadCheckPointExt; use common_io::cursor_ext::ReadNumberExt; use common_io::prelude::FormatSettings; -use common_io::prelude::StatBuffer; use lexical_core::FromLexical; -use micromarshal::Unmarshal; use num::cast::AsPrimitive; use crate::CommonSettings; @@ -63,7 +58,6 @@ use crate::FieldDecoder; #[derive(Clone)] pub struct FastFieldDecoderValues { pub common_settings: CommonSettings, - format: FormatSettings, } impl FieldDecoder for FastFieldDecoderValues { @@ -83,7 +77,6 @@ impl FastFieldDecoderValues { inf_bytes: INF_BYTES_LOWER.as_bytes().to_vec(), timezone: format.timezone, }, - format, } } @@ -108,39 +101,40 @@ impl FastFieldDecoderValues { pub fn read_field>( &self, - column: &mut TypeDeserializerImpl, + column: &mut ColumnBuilder, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { match column { - TypeDeserializerImpl::Null(c) => self.read_null(c, reader), - TypeDeserializerImpl::Nullable(c) => self.read_nullable(c, reader, positions), - TypeDeserializerImpl::Boolean(c) => self.read_bool(c, reader), - TypeDeserializerImpl::Int8(c) => self.read_int(c, reader), - TypeDeserializerImpl::Int16(c) => self.read_int(c, reader), - TypeDeserializerImpl::Int32(c) => self.read_int(c, reader), - TypeDeserializerImpl::Int64(c) => self.read_int(c, reader), - TypeDeserializerImpl::UInt8(c) => self.read_int(c, reader), - TypeDeserializerImpl::UInt16(c) => self.read_int(c, reader), - TypeDeserializerImpl::UInt32(c) => self.read_int(c, reader), - TypeDeserializerImpl::UInt64(c) => self.read_int(c, reader), - TypeDeserializerImpl::Float32(c) => self.read_float(c, reader), - TypeDeserializerImpl::Float64(c) => self.read_float(c, reader), - TypeDeserializerImpl::Decimal128(c) => self.read_decimal(c, reader), - TypeDeserializerImpl::Decimal256(c) => self.read_decimal(c, reader), - TypeDeserializerImpl::Date(c) => self.read_date(c, reader, positions), - TypeDeserializerImpl::Timestamp(c) => self.read_timestamp(c, reader, positions), - TypeDeserializerImpl::String(c) => self.read_string(c, reader, positions), - TypeDeserializerImpl::Array(c) => self.read_array(c, reader, positions), - TypeDeserializerImpl::Map(c) => self.read_map(c, reader, positions), - TypeDeserializerImpl::Struct(c) => self.read_struct(c, reader, positions), - TypeDeserializerImpl::Variant(c) => self.read_variant(c, reader, positions), + ColumnBuilder::Null { len } => self.read_null(len, reader), + ColumnBuilder::Nullable(c) => self.read_nullable(c, reader, positions), + ColumnBuilder::Boolean(c) => self.read_bool(c, reader), + ColumnBuilder::Number(c) => with_number_mapped_type!(|NUM_TYPE| match c { + NumberColumnBuilder::NUM_TYPE(c) => { + if NUM_TYPE::FLOATING { + self.read_float(c, reader) + } else { + self.read_int(c, reader) + } + } + }), + ColumnBuilder::Decimal(c) => with_decimal_type!(|DECIMAL_TYPE| match c { + DecimalColumnBuilder::DECIMAL_TYPE(c, size) => self.read_decimal(c, *size, reader), + }), + ColumnBuilder::Date(c) => self.read_date(c, reader, positions), + ColumnBuilder::Timestamp(c) => self.read_timestamp(c, reader, positions), + ColumnBuilder::String(c) => self.read_string(c, reader, positions), + ColumnBuilder::Array(c) => self.read_array(c, reader, positions), + ColumnBuilder::Map(c) => self.read_map(c, reader, positions), + ColumnBuilder::Tuple(fields) => self.read_tuple(fields, reader, positions), + ColumnBuilder::Variant(c) => self.read_variant(c, reader, positions), + _ => unimplemented!(), } } fn read_bool>( &self, - column: &mut BooleanDeserializer, + column: &mut MutableBitmap, reader: &mut Cursor, ) -> Result<()> { if self.match_bytes(reader, &self.common_settings().true_bytes) { @@ -159,69 +153,59 @@ impl FastFieldDecoderValues { } } - fn read_null>( - &self, - column: &mut NullDeserializer, - _reader: &mut Cursor, - ) -> Result<()> { - column.de_default(); + fn read_null>(&self, len: &mut usize, _reader: &mut Cursor) -> Result<()> { + *len += 1; Ok(()) } fn read_nullable>( &self, - column: &mut NullableDeserializer, + column: &mut NullableColumnBuilder, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { - if reader.eof() { - column.de_default(); - } else if reader.ignore_bytes(b"NULL") || reader.ignore_bytes(b"null") { - column.de_default(); - return Ok(()); + if reader.eof() || reader.ignore_bytes(b"NULL") || reader.ignore_bytes(b"null") { + column.push_null(); } else { - self.read_field(column.inner.as_mut(), reader, positions)?; + self.read_field(&mut column.builder, reader, positions)?; column.validity.push(true); } Ok(()) } - fn read_int>( - &self, - column: &mut NumberDeserializer, - reader: &mut Cursor, - ) -> Result<()> + fn read_int>(&self, column: &mut Vec, reader: &mut Cursor) -> Result<()> where - T: Number + Unmarshal + StatBuffer + From

, - P: Unmarshal

+ StatBuffer + FromLexical, + T: Number + From, + T::Native: FromLexical, { - let v: P = reader.read_int_text()?; - column.builder.push(v.into()); + let v: T::Native = reader.read_int_text()?; + column.push(v.into()); Ok(()) } - fn read_float>( + fn read_float>( &self, - column: &mut NumberDeserializer, + column: &mut Vec, reader: &mut Cursor, ) -> Result<()> where - T: Number + Unmarshal + StatBuffer + From

, - P: Unmarshal

+ StatBuffer + FromLexical, + T: Number + From, + T::Native: FromLexical, { - let v: P = reader.read_float_text()?; - column.builder.push(v.into()); + let v: T::Native = reader.read_float_text()?; + column.push(v.into()); Ok(()) } fn read_decimal, D: Decimal>( &self, - column: &mut DecimalDeserializer, + column: &mut Vec, + size: DecimalSize, reader: &mut Cursor, ) -> Result<()> { let buf = reader.remaining_slice(); - let (n, n_read) = read_decimal_with_size(buf, column.size, false)?; - column.values.push(n); + let (n, n_read) = read_decimal_with_size(buf, size, false)?; + column.push(n); reader.consume(n_read); Ok(()) } @@ -238,7 +222,7 @@ impl FastFieldDecoderValues { fn read_string>( &self, - column: &mut StringDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { @@ -249,32 +233,32 @@ impl FastFieldDecoderValues { fn read_date>( &self, - column: &mut DateDeserializer, + column: &mut Vec, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { - column.buffer.clear(); - self.read_string_inner(reader, &mut column.buffer, positions)?; - let mut buffer_readr = Cursor::new(&column.buffer); + let mut buf = Vec::new(); + self.read_string_inner(reader, &mut buf, positions)?; + let mut buffer_readr = Cursor::new(&buf); let date = buffer_readr.read_date_text(&self.common_settings().timezone)?; let days = uniform_date(date); check_date(days as i64)?; - column.builder.push(days); + column.push(days); Ok(()) } fn read_timestamp>( &self, - column: &mut TimestampDeserializer, + column: &mut Vec, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { - column.buffer.clear(); - self.read_string_inner(reader, &mut column.buffer, positions)?; - let mut buffer_readr = Cursor::new(&column.buffer); + let mut buf = Vec::new(); + self.read_string_inner(reader, &mut buf, positions)?; + let mut buffer_readr = Cursor::new(&buf); let ts = buffer_readr.read_timestamp_text(&self.common_settings().timezone)?; if !buffer_readr.eof() { - let data = column.buffer.to_str().unwrap_or("not utf8"); + let data = buf.to_str().unwrap_or("not utf8"); let msg = format!( "fail to deserialize timestamp, unexpected end at pos {} of {}", buffer_readr.position(), @@ -284,19 +268,18 @@ impl FastFieldDecoderValues { } let micros = ts.timestamp_micros(); check_timestamp(micros)?; - column.builder.push(micros.as_()); + column.push(micros.as_()); Ok(()) } fn read_array>( &self, - column: &mut ArrayDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { reader.must_ignore_byte(b'[')?; - let mut idx = 0; - loop { + for idx in 0.. { let _ = reader.ignore_white_spaces(); if reader.ignore_byte(b']') { break; @@ -305,24 +288,24 @@ impl FastFieldDecoderValues { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(column.inner.as_mut(), reader, positions)?; - idx += 1; + self.read_field(&mut column.builder, reader, positions)?; } - - column.add_offset(idx); + column.commit_row(); Ok(()) } fn read_map>( &self, - column: &mut MapDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { + const KEY: usize = 0; + const VALUE: usize = 1; reader.must_ignore_byte(b'{')?; - let mut idx = 0; let mut set = HashSet::new(); - loop { + let map_builder = column.builder.as_tuple_mut().unwrap(); + for idx in 0.. { let _ = reader.ignore_white_spaces(); if reader.ignore_byte(b'}') { break; @@ -331,42 +314,40 @@ impl FastFieldDecoderValues { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(column.key.as_mut(), reader, positions)?; + self.read_field(&mut map_builder[KEY], reader, positions)?; // check duplicate map keys - let key = column.key.pop_data_value().unwrap(); + let key = map_builder[KEY].pop().unwrap(); if set.contains(&key) { - column.add_offset(idx); + column.commit_row(); return Err(ErrorCode::BadBytes( "map keys have to be unique".to_string(), )); } set.insert(key.clone()); - column.key.append_data_value(key, &self.format)?; + map_builder[KEY].push(key.as_ref()); let _ = reader.ignore_white_spaces(); reader.must_ignore_byte(b':')?; let _ = reader.ignore_white_spaces(); - self.read_field(column.value.as_mut(), reader, positions)?; - idx += 1; + self.read_field(&mut map_builder[VALUE], reader, positions)?; } - - column.add_offset(idx); + column.commit_row(); Ok(()) } - fn read_struct>( + fn read_tuple>( &self, - column: &mut StructDeserializer, + fields: &mut [ColumnBuilder], reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { reader.must_ignore_byte(b'(')?; - for (idx, inner) in column.inners.iter_mut().enumerate() { + for (idx, field) in fields.iter_mut().enumerate() { let _ = reader.ignore_white_spaces(); if idx != 0 { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(inner, reader, positions)?; + self.read_field(field, reader, positions)?; } reader.must_ignore_byte(b')')?; Ok(()) @@ -374,12 +355,12 @@ impl FastFieldDecoderValues { fn read_variant>( &self, - column: &mut VariantDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, positions: &mut VecDeque, ) -> Result<()> { - self.read_string_inner(reader, &mut column.builder.data, positions)?; - column.builder.commit_row(); + self.read_string_inner(reader, &mut column.data, positions)?; + column.commit_row(); Ok(()) } } diff --git a/src/query/formats/src/field_decoder/json_ast.rs b/src/query/formats/src/field_decoder/json_ast.rs index 9977e095a56f4..17e6d815acd7c 100644 --- a/src/query/formats/src/field_decoder/json_ast.rs +++ b/src/query/formats/src/field_decoder/json_ast.rs @@ -16,33 +16,28 @@ use std::any::Any; use std::io::Cursor; use chrono_tz::Tz; +use common_arrow::arrow::bitmap::MutableBitmap; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::serialize::read_decimal_from_json; +use common_expression::serialize::uniform_date; +use common_expression::types::array::ArrayColumnBuilder; use common_expression::types::date::check_date; use common_expression::types::decimal::Decimal; +use common_expression::types::decimal::DecimalColumnBuilder; +use common_expression::types::decimal::DecimalSize; +use common_expression::types::nullable::NullableColumnBuilder; use common_expression::types::number::Number; +use common_expression::types::string::StringColumnBuilder; use common_expression::types::timestamp::check_timestamp; -use common_expression::uniform_date; -use common_expression::ArrayDeserializer; -use common_expression::BooleanDeserializer; -use common_expression::DateDeserializer; -use common_expression::DecimalDeserializer; -use common_expression::MapDeserializer; -use common_expression::NullDeserializer; -use common_expression::NullableDeserializer; -use common_expression::NumberDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::TimestampDeserializer; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; -use common_expression::VariantDeserializer; +use common_expression::types::AnyType; +use common_expression::types::NumberColumnBuilder; +use common_expression::with_decimal_type; +use common_expression::with_number_mapped_type; +use common_expression::ColumnBuilder; use common_io::cursor_ext::BufferReadDateTimeExt; use common_io::cursor_ext::ReadNumberExt; -use common_io::prelude::FormatSettings; -use common_io::prelude::StatBuffer; use lexical_core::FromLexical; -use micromarshal::Unmarshal; use num::cast::AsPrimitive; use serde_json::Value; @@ -68,34 +63,35 @@ impl FieldJsonAstDecoder { } } - pub fn read_field(&self, column: &mut TypeDeserializerImpl, value: &Value) -> Result<()> { + pub fn read_field(&self, column: &mut ColumnBuilder, value: &Value) -> Result<()> { match column { - TypeDeserializerImpl::Null(c) => self.read_null(c, value), - TypeDeserializerImpl::Nullable(c) => self.read_nullable(c, value), - TypeDeserializerImpl::Boolean(c) => self.read_bool(c, value), - TypeDeserializerImpl::Int8(c) => self.read_int(c, value), - TypeDeserializerImpl::Int16(c) => self.read_int(c, value), - TypeDeserializerImpl::Int32(c) => self.read_int(c, value), - TypeDeserializerImpl::Int64(c) => self.read_int(c, value), - TypeDeserializerImpl::UInt8(c) => self.read_int(c, value), - TypeDeserializerImpl::UInt16(c) => self.read_int(c, value), - TypeDeserializerImpl::UInt32(c) => self.read_int(c, value), - TypeDeserializerImpl::UInt64(c) => self.read_int(c, value), - TypeDeserializerImpl::Float32(c) => self.read_float(c, value), - TypeDeserializerImpl::Float64(c) => self.read_float(c, value), - TypeDeserializerImpl::Decimal128(c) => self.read_decimal(c, value), - TypeDeserializerImpl::Decimal256(c) => self.read_decimal(c, value), - TypeDeserializerImpl::Date(c) => self.read_date(c, value), - TypeDeserializerImpl::Timestamp(c) => self.read_timestamp(c, value), - TypeDeserializerImpl::String(c) => self.read_string(c, value), - TypeDeserializerImpl::Array(c) => self.read_array(c, value), - TypeDeserializerImpl::Map(c) => self.read_map(c, value), - TypeDeserializerImpl::Struct(c) => self.read_struct(c, value), - TypeDeserializerImpl::Variant(c) => self.read_variant(c, value), + ColumnBuilder::Null { len } => self.read_null(len, value), + ColumnBuilder::Nullable(c) => self.read_nullable(c, value), + ColumnBuilder::Boolean(c) => self.read_bool(c, value), + ColumnBuilder::Number(c) => with_number_mapped_type!(|NUM_TYPE| match c { + NumberColumnBuilder::NUM_TYPE(c) => { + if NUM_TYPE::FLOATING { + self.read_float(c, value) + } else { + self.read_int(c, value) + } + } + }), + ColumnBuilder::Decimal(c) => with_decimal_type!(|DECIMAL_TYPE| match c { + DecimalColumnBuilder::DECIMAL_TYPE(c, size) => self.read_decimal(c, *size, value), + }), + ColumnBuilder::Date(c) => self.read_date(c, value), + ColumnBuilder::Timestamp(c) => self.read_timestamp(c, value), + ColumnBuilder::String(c) => self.read_string(c, value), + ColumnBuilder::Array(c) => self.read_array(c, value), + ColumnBuilder::Map(c) => self.read_map(c, value), + ColumnBuilder::Tuple(fields) => self.read_tuple(fields, value), + ColumnBuilder::Variant(c) => self.read_variant(c, value), + _ => unimplemented!(), } } - fn read_bool(&self, column: &mut BooleanDeserializer, value: &Value) -> Result<()> { + fn read_bool(&self, column: &mut MutableBitmap, value: &Value) -> Result<()> { match value { Value::Bool(v) => column.push(*v), _ => return Err(ErrorCode::BadBytes("Incorrect boolean value")), @@ -103,60 +99,66 @@ impl FieldJsonAstDecoder { Ok(()) } - fn read_null(&self, column: &mut NullDeserializer, _value: &Value) -> Result<()> { - column.de_default(); + fn read_null(&self, len: &mut usize, _value: &Value) -> Result<()> { + *len += 1; Ok(()) } - fn read_nullable(&self, column: &mut NullableDeserializer, value: &Value) -> Result<()> { + fn read_nullable( + &self, + column: &mut NullableColumnBuilder, + value: &Value, + ) -> Result<()> { match value { Value::Null => { - column.validity.push(false); - column.inner.de_default(); + column.push_null(); } other => { - self.read_field(column.inner.as_mut(), other)?; + self.read_field(&mut column.builder, other)?; column.validity.push(true); } } Ok(()) } - fn read_int(&self, column: &mut NumberDeserializer, value: &Value) -> Result<()> - where T: Number + Unmarshal + StatBuffer + FromLexical { + fn read_int(&self, column: &mut Vec, value: &Value) -> Result<()> + where + T: Number + From, + T::Native: FromLexical, + { match value { Value::Number(v) => { let v = v.to_string(); let mut reader = Cursor::new(v.as_bytes()); - let v: T = if !T::FLOATING { + let v: T::Native = if !T::FLOATING { reader.read_int_text() } else { reader.read_float_text() }?; - column.builder.push(v); + column.push(v.into()); Ok(()) } _ => Err(ErrorCode::BadBytes("Incorrect json value, must be number")), } } - fn read_float(&self, column: &mut NumberDeserializer, value: &Value) -> Result<()> + fn read_float(&self, column: &mut Vec, value: &Value) -> Result<()> where - T: Number + Unmarshal + StatBuffer + From

, - P: Unmarshal

+ StatBuffer + FromLexical, + T: Number + From, + T::Native: FromLexical, { match value { Value::Number(v) => { let v = v.to_string(); let mut reader = Cursor::new(v.as_bytes()); - let v: P = if !T::FLOATING { + let v: T::Native = if !T::FLOATING { reader.read_int_text() } else { reader.read_float_text() }?; - column.builder.push(v.into()); + column.push(v.into()); Ok(()) } _ => Err(ErrorCode::BadBytes("Incorrect json value, must be number")), @@ -165,14 +167,15 @@ impl FieldJsonAstDecoder { fn read_decimal( &self, - column: &mut DecimalDeserializer, + column: &mut Vec, + size: DecimalSize, value: &Value, - ) -> Result<()> -where { - column.de_json_inner(value) + ) -> Result<()> { + column.push(read_decimal_from_json(value, size)?); + Ok(()) } - fn read_string(&self, column: &mut StringDeserializer, value: &Value) -> Result<()> { + fn read_string(&self, column: &mut StringColumnBuilder, value: &Value) -> Result<()> { match value { Value::String(s) => { column.put_str(s.as_str()); @@ -183,20 +186,20 @@ where { } } - fn read_date(&self, column: &mut DateDeserializer, value: &Value) -> Result<()> { + fn read_date(&self, column: &mut Vec, value: &Value) -> Result<()> { match value { Value::String(v) => { let mut reader = Cursor::new(v.as_bytes()); let date = reader.read_date_text(&self.timezone)?; let days = uniform_date(date); check_date(days as i64)?; - column.builder.push(days); + column.push(days); Ok(()) } Value::Number(number) => match number.as_i64() { Some(n) => { let n = check_date(n)?; - column.builder.push(n); + column.push(n); Ok(()) } None => Err(ErrorCode::BadArguments("Incorrect date value")), @@ -205,7 +208,7 @@ where { } } - fn read_timestamp(&self, column: &mut TimestampDeserializer, value: &Value) -> Result<()> { + fn read_timestamp(&self, column: &mut Vec, value: &Value) -> Result<()> { match value { Value::String(v) => { let v = v.clone(); @@ -214,13 +217,13 @@ where { let micros = ts.timestamp_micros(); check_timestamp(micros)?; - column.builder.push(micros.as_()); + column.push(micros.as_()); Ok(()) } Value::Number(number) => match number.as_i64() { Some(n) => { check_timestamp(n)?; - column.builder.push(n); + column.push(n); Ok(()) } None => Err(ErrorCode::BadArguments( @@ -231,52 +234,57 @@ where { } } - fn read_variant(&self, column: &mut VariantDeserializer, value: &Value) -> Result<()> { - column.de_json(value, &FormatSettings::default())?; + fn read_variant(&self, column: &mut StringColumnBuilder, value: &Value) -> Result<()> { + let v = jsonb::Value::from(value); + v.write_to_vec(&mut column.data); + column.commit_row(); Ok(()) } - fn read_array(&self, column: &mut ArrayDeserializer, value: &Value) -> Result<()> { + fn read_array(&self, column: &mut ArrayColumnBuilder, value: &Value) -> Result<()> { match value { Value::Array(vals) => { for val in vals { - self.read_field(column.inner.as_mut(), val)?; + self.read_field(&mut column.builder, val)?; } - column.add_offset(vals.len()); + column.commit_row(); Ok(()) } _ => Err(ErrorCode::BadBytes("Incorrect json value, must be array")), } } - fn read_map(&self, column: &mut MapDeserializer, value: &Value) -> Result<()> { + fn read_map(&self, column: &mut ArrayColumnBuilder, value: &Value) -> Result<()> { + const KEY: usize = 0; + const VALUE: usize = 1; + let map_builder = column.builder.as_tuple_mut().unwrap(); match value { Value::Object(obj) => { for (key, val) in obj.iter() { let key = Value::String(key.to_string()); - self.read_field(column.key.as_mut(), &key)?; - self.read_field(column.value.as_mut(), val)?; + self.read_field(&mut map_builder[KEY], &key)?; + self.read_field(&mut map_builder[VALUE], val)?; } - column.add_offset(obj.len()); + column.commit_row(); Ok(()) } _ => Err(ErrorCode::BadBytes("Incorrect json value, must be object")), } } - fn read_struct(&self, column: &mut StructDeserializer, value: &Value) -> Result<()> { + fn read_tuple(&self, fields: &mut Vec, value: &Value) -> Result<()> { match value { Value::Object(obj) => { - if column.inners.len() != obj.len() { + if fields.len() != obj.len() { return Err(ErrorCode::BadBytes(format!( "Incorrect json value, expect {} values, but get {} values", - column.inners.len(), + fields.len(), obj.len() ))); } - for (inner, item) in column.inners.iter_mut().zip(obj.iter()) { + for (field, item) in fields.iter_mut().zip(obj.iter()) { let (_, val) = item; - self.read_field(inner, val)?; + self.read_field(field, val)?; } Ok(()) } diff --git a/src/query/formats/src/field_decoder/row_based.rs b/src/query/formats/src/field_decoder/row_based.rs index 00e18047188c3..49ed7c4cfe92d 100644 --- a/src/query/formats/src/field_decoder/row_based.rs +++ b/src/query/formats/src/field_decoder/row_based.rs @@ -18,36 +18,30 @@ use std::io::Seek; use std::io::SeekFrom; use bstr::ByteSlice; +use common_arrow::arrow::bitmap::MutableBitmap; use common_exception::ErrorCode; use common_exception::Result; -use common_expression::read_decimal_with_size; +use common_expression::serialize::read_decimal_with_size; +use common_expression::serialize::uniform_date; +use common_expression::types::array::ArrayColumnBuilder; use common_expression::types::date::check_date; use common_expression::types::decimal::Decimal; +use common_expression::types::decimal::DecimalColumnBuilder; +use common_expression::types::decimal::DecimalSize; +use common_expression::types::nullable::NullableColumnBuilder; use common_expression::types::number::Number; +use common_expression::types::string::StringColumnBuilder; use common_expression::types::timestamp::check_timestamp; -use common_expression::uniform_date; -use common_expression::ArrayDeserializer; -use common_expression::BooleanDeserializer; -use common_expression::DateDeserializer; -use common_expression::DecimalDeserializer; -use common_expression::MapDeserializer; -use common_expression::NullDeserializer; -use common_expression::NullableDeserializer; -use common_expression::NumberDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::TimestampDeserializer; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; -use common_expression::VariantDeserializer; +use common_expression::types::AnyType; +use common_expression::types::NumberColumnBuilder; +use common_expression::with_decimal_type; +use common_expression::with_number_mapped_type; +use common_expression::ColumnBuilder; use common_io::cursor_ext::BufferReadDateTimeExt; use common_io::cursor_ext::ReadBytesExt; use common_io::cursor_ext::ReadCheckPointExt; use common_io::cursor_ext::ReadNumberExt; -use common_io::prelude::StatBuffer; use lexical_core::FromLexical; -use micromarshal::Unmarshal; -use num::cast::AsPrimitive; use crate::field_decoder::FieldDecoder; use crate::CommonSettings; @@ -69,39 +63,41 @@ pub trait FieldDecoderRowBased: FieldDecoder { fn read_field>( &self, - column: &mut TypeDeserializerImpl, + column: &mut ColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()> { match column { - TypeDeserializerImpl::Null(c) => self.read_null(c, reader, raw), - TypeDeserializerImpl::Nullable(c) => self.read_nullable(c, reader, raw), - TypeDeserializerImpl::Boolean(c) => self.read_bool(c, reader, raw), - TypeDeserializerImpl::Int8(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::Int16(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::Int32(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::Int64(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::UInt8(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::UInt16(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::UInt32(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::UInt64(c) => self.read_int(c, reader, raw), - TypeDeserializerImpl::Float32(c) => self.read_float(c, reader, raw), - TypeDeserializerImpl::Float64(c) => self.read_float(c, reader, raw), - TypeDeserializerImpl::Decimal128(c) => self.read_decimal(c, reader, raw), - TypeDeserializerImpl::Decimal256(c) => self.read_decimal(c, reader, raw), - TypeDeserializerImpl::Date(c) => self.read_date(c, reader, raw), - TypeDeserializerImpl::Timestamp(c) => self.read_timestamp(c, reader, raw), - TypeDeserializerImpl::String(c) => self.read_string(c, reader, raw), - TypeDeserializerImpl::Array(c) => self.read_array(c, reader, raw), - TypeDeserializerImpl::Map(c) => self.read_map(c, reader, raw), - TypeDeserializerImpl::Struct(c) => self.read_struct(c, reader, raw), - TypeDeserializerImpl::Variant(c) => self.read_variant(c, reader, raw), + ColumnBuilder::Null { len } => self.read_null(len, reader, raw), + ColumnBuilder::Nullable(c) => self.read_nullable(c, reader, raw), + ColumnBuilder::Boolean(c) => self.read_bool(c, reader, raw), + ColumnBuilder::Number(c) => with_number_mapped_type!(|NUM_TYPE| match c { + NumberColumnBuilder::NUM_TYPE(c) => { + if NUM_TYPE::FLOATING { + self.read_float(c, reader, raw) + } else { + self.read_int(c, reader, raw) + } + } + }), + ColumnBuilder::Decimal(c) => with_decimal_type!(|DECIMAL_TYPE| match c { + DecimalColumnBuilder::DECIMAL_TYPE(c, size) => + self.read_decimal(c, *size, reader, raw), + }), + ColumnBuilder::Date(c) => self.read_date(c, reader, raw), + ColumnBuilder::Timestamp(c) => self.read_timestamp(c, reader, raw), + ColumnBuilder::String(c) => self.read_string(c, reader, raw), + ColumnBuilder::Array(c) => self.read_array(c, reader, raw), + ColumnBuilder::Map(c) => self.read_map(c, reader, raw), + ColumnBuilder::Tuple(fields) => self.read_tuple(fields, reader, raw), + ColumnBuilder::Variant(c) => self.read_variant(c, reader, raw), + _ => unimplemented!(), } } fn read_bool>( &self, - column: &mut BooleanDeserializer, + column: &mut MutableBitmap, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -123,29 +119,29 @@ pub trait FieldDecoderRowBased: FieldDecoder { fn read_null>( &self, - column: &mut NullDeserializer, + column_len: &mut usize, _reader: &mut Cursor, _raw: bool, ) -> Result<()> { - column.de_default(); + *column_len += 1; Ok(()) } fn read_nullable>( &self, - column: &mut NullableDeserializer, + column: &mut NullableColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()> { if reader.eof() { - column.de_default(); + column.push_null(); } else if self.match_bytes(reader, &self.common_settings().null_bytes) && self.ignore_field_end(reader) { - column.de_default(); + column.push_null(); return Ok(()); } else { - self.read_field(column.inner.as_mut(), reader, raw)?; + self.read_field(&mut column.builder, reader, raw)?; column.validity.push(true); } Ok(()) @@ -158,81 +154,82 @@ pub trait FieldDecoderRowBased: FieldDecoder { raw: bool, ) -> Result<()>; - fn read_int>( + fn read_int>( &self, - column: &mut NumberDeserializer, + column: &mut Vec, reader: &mut Cursor, _raw: bool, ) -> Result<()> where - T: Number + Unmarshal + StatBuffer + From

, - P: Unmarshal

+ StatBuffer + FromLexical, + T: Number + From, + T::Native: FromLexical, { - let v: P = reader.read_int_text()?; - column.builder.push(v.into()); + let v: T::Native = reader.read_int_text()?; + column.push(v.into()); Ok(()) } - fn read_float>( + fn read_float>( &self, - column: &mut NumberDeserializer, + column: &mut Vec, reader: &mut Cursor, _raw: bool, ) -> Result<()> where - T: Number + Unmarshal + StatBuffer + From

, - P: Unmarshal

+ StatBuffer + FromLexical, + T: Number + From, + T::Native: FromLexical, { - let v: P = reader.read_float_text()?; - column.builder.push(v.into()); + let v: T::Native = reader.read_float_text()?; + column.push(v.into()); Ok(()) } fn read_decimal, D: Decimal>( &self, - column: &mut DecimalDeserializer, + column: &mut Vec, + size: DecimalSize, reader: &mut Cursor, _raw: bool, ) -> Result<()> { let buf = reader.remaining_slice(); - let (n, n_read) = read_decimal_with_size(buf, column.size, false)?; - column.values.push(n); + let (n, n_read) = read_decimal_with_size(buf, size, false)?; + column.push(n); reader.consume(n_read); Ok(()) } fn read_string>( &self, - column: &mut StringDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()>; fn read_date>( &self, - column: &mut DateDeserializer, + column: &mut Vec, reader: &mut Cursor, raw: bool, ) -> Result<()> { - column.buffer.clear(); - self.read_string_inner(reader, &mut column.buffer, raw)?; - let mut buffer_readr = Cursor::new(&column.buffer); + let mut buf = Vec::new(); + self.read_string_inner(reader, &mut buf, raw)?; + let mut buffer_readr = Cursor::new(&buf); let date = buffer_readr.read_date_text(&self.common_settings().timezone)?; let days = uniform_date(date); check_date(days as i64)?; - column.builder.push(days); + column.push(days); Ok(()) } fn read_timestamp>( &self, - column: &mut TimestampDeserializer, + column: &mut Vec, reader: &mut Cursor, raw: bool, ) -> Result<()> { - column.buffer.clear(); - self.read_string_inner(reader, &mut column.buffer, raw)?; - let mut buffer_readr = Cursor::new(&column.buffer); + let mut buf = Vec::new(); + self.read_string_inner(reader, &mut buf, raw)?; + let mut buffer_readr = Cursor::new(&buf); let pos = buffer_readr.position(); let ts_result = buffer_readr.read_num_text_exact(); let ts = match ts_result { @@ -242,7 +239,7 @@ pub trait FieldDecoderRowBased: FieldDecoder { .expect("buffer reader seek must success"); let t = buffer_readr.read_timestamp_text(&self.common_settings().timezone)?; if !buffer_readr.eof() { - let data = column.buffer.to_str().unwrap_or("not utf8"); + let data = buf.to_str().unwrap_or("not utf8"); let msg = format!( "fail to deserialize timestamp, unexpected end at pos {} of {}", buffer_readr.position(), @@ -255,38 +252,38 @@ pub trait FieldDecoderRowBased: FieldDecoder { Ok(t) => t, }; check_timestamp(ts)?; - column.builder.push(ts.as_()); + column.push(ts); Ok(()) } fn read_variant>( &self, - column: &mut VariantDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()> { - self.read_string_inner(reader, &mut column.builder.data, raw)?; - column.builder.commit_row(); + self.read_string_inner(reader, &mut column.data, raw)?; + column.commit_row(); Ok(()) } fn read_array>( &self, - column: &mut ArrayDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()>; fn read_map>( &self, - column: &mut MapDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()>; - fn read_struct>( + fn read_tuple>( &self, - column: &mut StructDeserializer, + fields: &mut Vec, reader: &mut Cursor, raw: bool, ) -> Result<()>; diff --git a/src/query/formats/src/field_decoder/tsv.rs b/src/query/formats/src/field_decoder/tsv.rs index 1311d1f7e7bcd..a964b68d191e7 100644 --- a/src/query/formats/src/field_decoder/tsv.rs +++ b/src/query/formats/src/field_decoder/tsv.rs @@ -18,11 +18,10 @@ use std::io::Cursor; use common_exception::ErrorCode; use common_exception::Result; -use common_expression::ArrayDeserializer; -use common_expression::MapDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::TypeDeserializer; +use common_expression::types::array::ArrayColumnBuilder; +use common_expression::types::string::StringColumnBuilder; +use common_expression::types::AnyType; +use common_expression::ColumnBuilder; use common_io::constants::FALSE_BYTES_NUM; use common_io::constants::INF_BYTES_LOWER; use common_io::constants::NAN_BYTES_LOWER; @@ -30,7 +29,6 @@ use common_io::constants::NULL_BYTES_ESCAPE; use common_io::constants::TRUE_BYTES_NUM; use common_io::cursor_ext::BufferReadStringExt; use common_io::cursor_ext::ReadBytesExt; -use common_io::prelude::FormatSettings; use crate::field_decoder::row_based::FieldDecoderRowBased; use crate::CommonSettings; @@ -41,7 +39,6 @@ use crate::FileFormatOptionsExt; pub struct FieldDecoderTSV { pub common_settings: CommonSettings, pub quote_char: u8, - format: FormatSettings, } impl FieldDecoderTSV { @@ -56,9 +53,6 @@ impl FieldDecoderTSV { timezone: options.timezone, }, quote_char: options.get_quote_char(), - format: FormatSettings { - timezone: options.timezone, - }, } } } @@ -94,7 +88,7 @@ impl FieldDecoderRowBased for FieldDecoderTSV { fn read_string>( &self, - column: &mut StringDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()> { @@ -105,13 +99,12 @@ impl FieldDecoderRowBased for FieldDecoderTSV { fn read_array>( &self, - column: &mut ArrayDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { reader.must_ignore_byte(b'[')?; - let mut idx = 0; - loop { + for idx in 0.. { let _ = reader.ignore_white_spaces(); if reader.ignore_byte(b']') { break; @@ -120,23 +113,24 @@ impl FieldDecoderRowBased for FieldDecoderTSV { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(column.inner.as_mut(), reader, false)?; - idx += 1; + self.read_field(&mut column.builder, reader, false)?; } - column.add_offset(idx); + column.commit_row(); Ok(()) } fn read_map>( &self, - column: &mut MapDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { + const KEY: usize = 0; + const VALUE: usize = 1; reader.must_ignore_byte(b'{')?; - let mut idx = 0; let mut set = HashSet::new(); - loop { + let map_builder = column.builder.as_tuple_mut().unwrap(); + for idx in 0.. { let _ = reader.ignore_white_spaces(); if reader.ignore_byte(b'}') { break; @@ -145,41 +139,40 @@ impl FieldDecoderRowBased for FieldDecoderTSV { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(column.key.as_mut(), reader, false)?; + self.read_field(&mut map_builder[KEY], reader, false)?; // check duplicate map keys - let key = column.key.pop_data_value().unwrap(); + let key = map_builder[KEY].pop().unwrap(); if set.contains(&key) { - column.add_offset(idx); + column.commit_row(); return Err(ErrorCode::BadBytes( "map keys have to be unique".to_string(), )); } - set.insert(key.clone()); - column.key.append_data_value(key, &self.format)?; + map_builder[KEY].push(key.as_ref()); + set.insert(key); let _ = reader.ignore_white_spaces(); reader.must_ignore_byte(b':')?; let _ = reader.ignore_white_spaces(); - self.read_field(column.value.as_mut(), reader, false)?; - idx += 1; + self.read_field(&mut map_builder[VALUE], reader, false)?; } - column.add_offset(idx); + column.commit_row(); Ok(()) } - fn read_struct>( + fn read_tuple>( &self, - column: &mut StructDeserializer, + fields: &mut Vec, reader: &mut Cursor, _raw: bool, ) -> Result<()> { reader.must_ignore_byte(b'(')?; - for (idx, inner) in column.inners.iter_mut().enumerate() { + for (idx, field) in fields.iter_mut().enumerate() { let _ = reader.ignore_white_spaces(); if idx != 0 { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(inner, reader, false)?; + self.read_field(field, reader, false)?; } reader.must_ignore_byte(b')')?; Ok(()) diff --git a/src/query/formats/src/field_decoder/values.rs b/src/query/formats/src/field_decoder/values.rs index 0bebee35aad9c..b98a8edfed4a1 100644 --- a/src/query/formats/src/field_decoder/values.rs +++ b/src/query/formats/src/field_decoder/values.rs @@ -19,12 +19,11 @@ use std::io::Cursor; use chrono_tz::Tz; use common_exception::ErrorCode; use common_exception::Result; -use common_expression::ArrayDeserializer; -use common_expression::MapDeserializer; -use common_expression::NullableDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::TypeDeserializer; +use common_expression::types::array::ArrayColumnBuilder; +use common_expression::types::nullable::NullableColumnBuilder; +use common_expression::types::string::StringColumnBuilder; +use common_expression::types::AnyType; +use common_expression::ColumnBuilder; use common_io::constants::FALSE_BYTES_LOWER; use common_io::constants::INF_BYTES_LOWER; use common_io::constants::NAN_BYTES_LOWER; @@ -32,7 +31,6 @@ use common_io::constants::NULL_BYTES_UPPER; use common_io::constants::TRUE_BYTES_LOWER; use common_io::cursor_ext::BufferReadStringExt; use common_io::cursor_ext::ReadBytesExt; -use common_io::prelude::FormatSettings; use crate::field_decoder::row_based::FieldDecoderRowBased; use crate::CommonSettings; @@ -42,7 +40,6 @@ use crate::FileFormatOptionsExt; #[derive(Clone)] pub struct FieldDecoderValues { pub common_settings: CommonSettings, - format: FormatSettings, } impl FieldDecoderValues { @@ -56,9 +53,6 @@ impl FieldDecoderValues { inf_bytes: INF_BYTES_LOWER.as_bytes().to_vec(), timezone: options.timezone, }, - format: FormatSettings { - timezone: options.timezone, - }, } } @@ -72,7 +66,6 @@ impl FieldDecoderValues { inf_bytes: INF_BYTES_LOWER.as_bytes().to_vec(), timezone, }, - format: FormatSettings { timezone }, } } } @@ -95,19 +88,19 @@ impl FieldDecoderRowBased for FieldDecoderValues { fn read_nullable>( &self, - column: &mut NullableDeserializer, + column: &mut NullableColumnBuilder, reader: &mut Cursor, raw: bool, ) -> Result<()> { if reader.eof() { - column.de_default(); + column.push_null(); } else if (raw && (self.match_bytes(reader, b"NULL") || self.match_bytes(reader, b"null"))) || (!raw && (reader.ignore_bytes(b"NULL") || reader.ignore_bytes(b"null"))) { - column.de_default(); + column.push_null(); return Ok(()); } else { - self.read_field(column.inner.as_mut(), reader, raw)?; + self.read_field(&mut column.builder, reader, raw)?; column.validity.push(true); } Ok(()) @@ -125,7 +118,7 @@ impl FieldDecoderRowBased for FieldDecoderValues { fn read_string>( &self, - column: &mut StringDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -136,13 +129,12 @@ impl FieldDecoderRowBased for FieldDecoderValues { fn read_array>( &self, - column: &mut ArrayDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { reader.must_ignore_byte(b'[')?; - let mut idx = 0; - loop { + for idx in 0.. { let _ = reader.ignore_white_spaces(); if reader.ignore_byte(b']') { break; @@ -151,23 +143,24 @@ impl FieldDecoderRowBased for FieldDecoderValues { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(column.inner.as_mut(), reader, false)?; - idx += 1; + self.read_field(&mut column.builder, reader, false)?; } - column.add_offset(idx); + column.commit_row(); Ok(()) } fn read_map>( &self, - column: &mut MapDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { + const KEY: usize = 0; + const VALUE: usize = 1; reader.must_ignore_byte(b'{')?; - let mut idx = 0; let mut set = HashSet::new(); - loop { + let map_builder = column.builder.as_tuple_mut().unwrap(); + for idx in 0.. { let _ = reader.ignore_white_spaces(); if reader.ignore_byte(b'}') { break; @@ -176,41 +169,40 @@ impl FieldDecoderRowBased for FieldDecoderValues { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(column.key.as_mut(), reader, false)?; + self.read_field(&mut map_builder[KEY], reader, false)?; // check duplicate map keys - let key = column.key.pop_data_value().unwrap(); + let key = map_builder[KEY].pop().unwrap(); if set.contains(&key) { - column.add_offset(idx); + column.commit_row(); return Err(ErrorCode::BadBytes( "map keys have to be unique".to_string(), )); } - set.insert(key.clone()); - column.key.append_data_value(key, &self.format)?; + map_builder[KEY].push(key.as_ref()); + set.insert(key); let _ = reader.ignore_white_spaces(); reader.must_ignore_byte(b':')?; let _ = reader.ignore_white_spaces(); - self.read_field(column.value.as_mut(), reader, false)?; - idx += 1; + self.read_field(&mut map_builder[VALUE], reader, false)?; } - column.add_offset(idx); + column.commit_row(); Ok(()) } - fn read_struct>( + fn read_tuple>( &self, - column: &mut StructDeserializer, + fields: &mut Vec, reader: &mut Cursor, _raw: bool, ) -> Result<()> { reader.must_ignore_byte(b'(')?; - for (idx, inner) in column.inners.iter_mut().enumerate() { + for (idx, field) in fields.iter_mut().enumerate() { let _ = reader.ignore_white_spaces(); if idx != 0 { reader.must_ignore_byte(b',')?; } let _ = reader.ignore_white_spaces(); - self.read_field(inner, reader, false)?; + self.read_field(field, reader, false)?; } reader.must_ignore_byte(b')')?; Ok(()) diff --git a/src/query/formats/src/field_decoder/xml.rs b/src/query/formats/src/field_decoder/xml.rs index 0a320d77d138c..af848ff25cbf1 100644 --- a/src/query/formats/src/field_decoder/xml.rs +++ b/src/query/formats/src/field_decoder/xml.rs @@ -17,11 +17,10 @@ use std::io::BufRead; use std::io::Cursor; use common_exception::Result; -use common_expression::ArrayDeserializer; -use common_expression::MapDeserializer; -use common_expression::StringDeserializer; -use common_expression::StructDeserializer; -use common_expression::VariantDeserializer; +use common_expression::types::array::ArrayColumnBuilder; +use common_expression::types::string::StringColumnBuilder; +use common_expression::types::AnyType; +use common_expression::ColumnBuilder; use common_io::constants::FALSE_BYTES_LOWER; use common_io::constants::INF_BYTES_LOWER; use common_io::constants::NAN_BYTES_LOWER; @@ -88,7 +87,7 @@ impl FieldDecoderRowBased for FieldDecoderXML { fn read_string>( &self, - column: &mut StringDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -101,23 +100,21 @@ impl FieldDecoderRowBased for FieldDecoderXML { fn read_variant>( &self, - column: &mut VariantDeserializer, + column: &mut StringColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { let buf = reader.remaining_slice(); + column.put_slice(buf); + column.commit_row(); - column.builder.put_slice(buf); - column.builder.commit_row(); - - let len = buf.len(); - reader.consume(len); + reader.consume(buf.len()); Ok(()) } fn read_array>( &self, - column: &mut ArrayDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -127,7 +124,7 @@ impl FieldDecoderRowBased for FieldDecoderXML { fn read_map>( &self, - column: &mut MapDeserializer, + column: &mut ArrayColumnBuilder, reader: &mut Cursor, _raw: bool, ) -> Result<()> { @@ -135,13 +132,13 @@ impl FieldDecoderRowBased for FieldDecoderXML { Ok(()) } - fn read_struct>( + fn read_tuple>( &self, - column: &mut StructDeserializer, + fields: &mut Vec, reader: &mut Cursor, _raw: bool, ) -> Result<()> { - self.nested.read_struct(column, reader, false)?; + self.nested.read_tuple(fields, reader, false)?; Ok(()) } } diff --git a/src/query/formats/src/field_encoder/json.rs b/src/query/formats/src/field_encoder/json.rs index df5f188ec0f2e..dd61dcafb2c89 100644 --- a/src/query/formats/src/field_encoder/json.rs +++ b/src/query/formats/src/field_encoder/json.rs @@ -113,7 +113,7 @@ impl FieldEncoderRowBased for FieldEncoderJSON { out_buf.push(b'{'); let inner = &T::upcast_column(column.values.clone()); match inner { - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { for i in start..end { if i != start { out_buf.extend_from_slice(b","); diff --git a/src/query/formats/src/field_encoder/row_based.rs b/src/query/formats/src/field_encoder/row_based.rs index 780e5b684e01e..27d7325d23226 100644 --- a/src/query/formats/src/field_encoder/row_based.rs +++ b/src/query/formats/src/field_encoder/row_based.rs @@ -63,7 +63,7 @@ pub trait FieldEncoderRowBased { Column::Nullable(box c) => self.write_nullable(c, row_index, out_buf, raw), Column::Array(box c) => self.write_array(c, row_index, out_buf, raw), Column::Map(box c) => self.write_map(c, row_index, out_buf, raw), - Column::Tuple { fields, .. } => self.write_tuple(fields, row_index, out_buf, raw), + Column::Tuple(fields) => self.write_tuple(fields, row_index, out_buf, raw), Column::Variant(c) => self.write_variant(c, row_index, out_buf, raw), } } diff --git a/src/query/formats/src/field_encoder/tsv.rs b/src/query/formats/src/field_encoder/tsv.rs index 45f4268bc5afd..9480188989752 100644 --- a/src/query/formats/src/field_encoder/tsv.rs +++ b/src/query/formats/src/field_encoder/tsv.rs @@ -93,7 +93,7 @@ impl FieldEncoderRowBased for FieldEncoderTSV { out_buf.push(b'{'); let inner = &T::upcast_column(column.values.clone()); match inner { - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { for i in start..end { if i != start { out_buf.extend_from_slice(b","); diff --git a/src/query/formats/src/field_encoder/values.rs b/src/query/formats/src/field_encoder/values.rs index eba4ac949d26e..a36a8fdb489f4 100644 --- a/src/query/formats/src/field_encoder/values.rs +++ b/src/query/formats/src/field_encoder/values.rs @@ -129,7 +129,7 @@ impl FieldEncoderRowBased for FieldEncoderValues { out_buf.push(b'{'); let inner = &T::upcast_column(column.values.clone()); match inner { - Column::Tuple { fields, .. } => { + Column::Tuple(fields) => { for i in start..end { if i != start { out_buf.extend_from_slice(b","); diff --git a/src/query/functions/src/scalars/arithmetic.rs b/src/query/functions/src/scalars/arithmetic.rs index 5adc0128ae873..a9e1d3866a2c0 100644 --- a/src/query/functions/src/scalars/arithmetic.rs +++ b/src/query/functions/src/scalars/arithmetic.rs @@ -57,7 +57,6 @@ use common_expression::FunctionProperty; use common_expression::FunctionRegistry; use common_expression::FunctionSignature; use common_expression::Scalar; -use common_expression::TypeDeserializer; use common_io::display_decimal_128; use common_io::display_decimal_256; use ethnum::i256; @@ -743,34 +742,30 @@ fn decimal_to_string( let from_type = from_type.as_decimal().unwrap(); - let result = match from_type { + let column = match from_type { DecimalDataType::Decimal128(_) => { let (buffer, from_size) = i128::try_downcast_column(&column).unwrap(); - let mut builder = StringColumnBuilder::with_capacity(buffer.len(), buffer.len() * 10); for x in buffer { builder.put_str(&display_decimal_128(x, from_size.scale)); builder.commit_row(); } - builder.finish_to_column() + builder } - DecimalDataType::Decimal256(_) => { let (buffer, from_size) = i256::try_downcast_column(&column).unwrap(); - let mut builder = StringColumnBuilder::with_capacity(buffer.len(), buffer.len() * 10); for x in buffer { builder.put_str(&display_decimal_256(x, from_size.scale)); builder.commit_row(); } - builder.finish_to_column() + builder } }; if is_scalar { - let scalar = result.index(0).unwrap(); - Value::Scalar(scalar.to_owned()) + Value::Scalar(Scalar::String(column.build_scalar())) } else { - Value::Column(result) + Value::Column(Column::String(column.build())) } } diff --git a/src/query/functions/src/scalars/comparison.rs b/src/query/functions/src/scalars/comparison.rs index 54ed622b9d167..7bab069f803e1 100644 --- a/src/query/functions/src/scalars/comparison.rs +++ b/src/query/functions/src/scalars/comparison.rs @@ -462,7 +462,7 @@ fn register_tuple_cmp(registry: &mut FunctionRegistry) { ValueRef::Scalar(ScalarRef::Tuple(fields)) => { fields.iter().cloned().map(ValueRef::Scalar).collect() } - ValueRef::Column(Column::Tuple { fields, .. }) => { + ValueRef::Column(Column::Tuple(fields)) => { fields.iter().cloned().map(ValueRef::Column).collect() } _ => unreachable!(), @@ -471,7 +471,7 @@ fn register_tuple_cmp(registry: &mut FunctionRegistry) { ValueRef::Scalar(ScalarRef::Tuple(fields)) => { fields.iter().cloned().map(ValueRef::Scalar).collect() } - ValueRef::Column(Column::Tuple { fields, .. }) => { + ValueRef::Column(Column::Tuple(fields)) => { fields.iter().cloned().map(ValueRef::Column).collect() } _ => unreachable!(), diff --git a/src/query/functions/src/scalars/control.rs b/src/query/functions/src/scalars/control.rs index 9a72677050543..4349302c5a608 100644 --- a/src/query/functions/src/scalars/control.rs +++ b/src/query/functions/src/scalars/control.rs @@ -49,7 +49,7 @@ fn unwrap_error<'a>( (ValueRef::Scalar(value), ValueRef::Scalar(error)) } ValueRef::Column(col) => { - let (inner_col, _) = col.as_tuple().unwrap(); + let inner_col = col.as_tuple().unwrap(); let value = ValueRef::Column(inner_col.first().unwrap().clone()); let error = ValueRef::Column(inner_col.last().unwrap().clone()); (value, error) diff --git a/src/query/functions/src/scalars/decimal.rs b/src/query/functions/src/scalars/decimal.rs index af940c3d288b5..0f9b6ca642f91 100644 --- a/src/query/functions/src/scalars/decimal.rs +++ b/src/query/functions/src/scalars/decimal.rs @@ -17,7 +17,7 @@ use std::ops::*; use std::sync::Arc; use common_arrow::arrow::buffer::Buffer; -use common_expression::read_decimal_with_size; +use common_expression::serialize::read_decimal_with_size; use common_expression::types::decimal::*; use common_expression::types::string::StringColumn; use common_expression::types::*; diff --git a/src/query/functions/src/scalars/geo.rs b/src/query/functions/src/scalars/geo.rs index 8175a28062e21..d709329ada786 100644 --- a/src/query/functions/src/scalars/geo.rs +++ b/src/query/functions/src/scalars/geo.rs @@ -365,7 +365,7 @@ fn point_in_polygon_fn(args: &[ValueRef], _: &mut EvalContext) -> Value _ => unreachable!(), }) .collect(), - ValueRef::Column(Column::Tuple { fields, .. }) => fields + ValueRef::Column(Column::Tuple(fields)) => fields .iter() .cloned() .map(|c| ValueRef::Column(Float64Type::try_downcast_column(&c).unwrap())) diff --git a/src/query/functions/src/scalars/tuple.rs b/src/query/functions/src/scalars/tuple.rs index 79ee0ff81ca3f..8d6c6052e26ca 100644 --- a/src/query/functions/src/scalars/tuple.rs +++ b/src/query/functions/src/scalars/tuple.rs @@ -62,7 +62,7 @@ pub fn register(registry: &mut FunctionRegistry) { ValueRef::Column(col) => col.clone(), }) .collect(); - Value::Column(Column::Tuple { fields, len }) + Value::Column(Column::Tuple(fields)) } else { // All args are scalars, so we return a scalar as result let fields = args @@ -103,9 +103,7 @@ pub fn register(registry: &mut FunctionRegistry) { }), eval: Box::new(move |args, _| match &args[0] { ValueRef::Scalar(ScalarRef::Tuple(fields)) => Value::Scalar(fields[idx].to_owned()), - ValueRef::Column(Column::Tuple { fields, .. }) => { - Value::Column(fields[idx].to_owned()) - } + ValueRef::Column(Column::Tuple(fields)) => Value::Column(fields[idx].to_owned()), _ => unreachable!(), }), })) @@ -156,7 +154,7 @@ pub fn register(registry: &mut FunctionRegistry) { ValueRef::Scalar(ScalarRef::Null) => Value::Scalar(Scalar::Null), ValueRef::Scalar(ScalarRef::Tuple(fields)) => Value::Scalar(fields[idx].to_owned()), ValueRef::Column(Column::Nullable(box NullableColumn { - column: Column::Tuple { fields, .. }, + column: Column::Tuple(fields), validity, })) => { let field_col = fields[idx].as_nullable().unwrap(); diff --git a/src/query/functions/tests/it/scalars/testdata/cast.txt b/src/query/functions/tests/it/scalars/testdata/cast.txt index dbe8b6927989d..eef261683d309 100644 --- a/src/query/functions/tests/it/scalars/testdata/cast.txt +++ b/src/query/functions/tests/it/scalars/testdata/cast.txt @@ -1967,7 +1967,7 @@ evaluation (internal): +--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | a | UInt64([0, 1, 255, 65535, 4294967295, 18446744073709551615]) | | b | Float64([0, 4294967295, 18446744073709551616, -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, inf]) | -| Output | Tuple { fields: [NullableColumn { column: Float32([0, 1, 255, 65535, 4294967296, 18446744073709551616]), validity: [0b__111111] }, NullableColumn { column: Int32([0, 1, 255, 65535, 0, 0]), validity: [0b__001111] }, NullableColumn { column: Float32([0, 4294967296, 18446744073709551616, -inf, inf, inf]), validity: [0b__111111] }, NullableColumn { column: Int32([0, 0, 0, 0, 0, 0]), validity: [0b__000001] }], len: 6 } | +| Output | Tuple([NullableColumn { column: Float32([0, 1, 255, 65535, 4294967296, 18446744073709551616]), validity: [0b__111111] }, NullableColumn { column: Int32([0, 1, 255, 65535, 0, 0]), validity: [0b__001111] }, NullableColumn { column: Float32([0, 4294967296, 18446744073709551616, -inf, inf, inf]), validity: [0b__111111] }, NullableColumn { column: Int32([0, 0, 0, 0, 0, 0]), validity: [0b__000001] }]) | +--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -2013,13 +2013,13 @@ evaluation: | Row 4 | 256 | -129 | (NULL, NULL, NULL) | +--------+-----------+------------+-------------------------------------------------+ evaluation (internal): -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Column | Data | -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| a | Int16([0, 1, 2, 127, 256]) | -| b | Int16([0, 1, -127, -128, -129]) | -| Output | Tuple { fields: [NullableColumn { column: Int8([0, 1, 2, 127, 0]), validity: [0b___01111] }, NullableColumn { column: UInt8([0, 1, 0, 0, 0]), validity: [0b___00011] }, NullableColumn { column: Boolean([0b___00000]), validity: [0b___00000] }], len: 5 } | -+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| a | Int16([0, 1, 2, 127, 256]) | +| b | Int16([0, 1, -127, -128, -129]) | +| Output | Tuple([NullableColumn { column: Int8([0, 1, 2, 127, 0]), validity: [0b___01111] }, NullableColumn { column: UInt8([0, 1, 0, 0, 0]), validity: [0b___00011] }, NullableColumn { column: Boolean([0b___00000]), validity: [0b___00000] }]) | ++--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ast : TRY_CAST(a AS INT16) diff --git a/src/query/functions/tests/it/scalars/testdata/map.txt b/src/query/functions/tests/it/scalars/testdata/map.txt index e0ee4e9c8f336..d9009f00879cd 100644 --- a/src/query/functions/tests/it/scalars/testdata/map.txt +++ b/src/query/functions/tests/it/scalars/testdata/map.txt @@ -69,17 +69,17 @@ evaluation: | Row 2 | 3 | 6 | 9 | "c" | NULL | "g" | {3:"c", 6:NULL, 9:"g"} | +--------+---------+---------+---------+-------------+---------------------+---------------------+------------------------+ evaluation (internal): -+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Column | Data | -+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| a_col | Int8([1, 2, 3]) | -| b_col | Int8([4, 5, 6]) | -| c_col | Int8([7, 8, 9]) | -| d_col | NullableColumn { column: StringColumn { data: 0x616263, offsets: [0, 1, 2, 3] }, validity: [0b_____111] } | -| e_col | NullableColumn { column: StringColumn { data: 0x6465, offsets: [0, 1, 2, 2] }, validity: [0b_____011] } | -| f_col | NullableColumn { column: StringColumn { data: 0x6667, offsets: [0, 1, 1, 2] }, validity: [0b_____101] } | -| Output | ArrayColumn { values: Tuple { fields: [Int8([1, 4, 7, 2, 5, 8, 3, 6, 9]), NullableColumn { column: StringColumn { data: 0x61646662656367, offsets: [0, 1, 2, 3, 4, 5, 5, 6, 6, 7] }, validity: [0b01011111, 0b_______1] }], len: 9 }, offsets: [0, 3, 6, 9] } | -+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| a_col | Int8([1, 2, 3]) | +| b_col | Int8([4, 5, 6]) | +| c_col | Int8([7, 8, 9]) | +| d_col | NullableColumn { column: StringColumn { data: 0x616263, offsets: [0, 1, 2, 3] }, validity: [0b_____111] } | +| e_col | NullableColumn { column: StringColumn { data: 0x6465, offsets: [0, 1, 2, 2] }, validity: [0b_____011] } | +| f_col | NullableColumn { column: StringColumn { data: 0x6667, offsets: [0, 1, 1, 2] }, validity: [0b_____101] } | +| Output | ArrayColumn { values: Tuple([Int8([1, 4, 7, 2, 5, 8, 3, 6, 9]), NullableColumn { column: StringColumn { data: 0x61646662656367, offsets: [0, 1, 2, 3, 4, 5, 5, 6, 6, 7] }, validity: [0b01011111, 0b_______1] }]), offsets: [0, 3, 6, 9] } | ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ast : map(['k1', 'k2'], [a_col, b_col]) @@ -97,13 +97,13 @@ evaluation: | Row 2 | 3 | 6 | {"k1":3, "k2":6} | +--------+---------+---------+-------------------+ evaluation (internal): -+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Column | Data | -+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| a_col | Int8([1, 2, 3]) | -| b_col | Int8([4, 5, 6]) | -| Output | ArrayColumn { values: Tuple { fields: [StringColumn { data: 0x6b316b326b316b326b316b32, offsets: [0, 2, 4, 6, 8, 10, 12] }, Int8([1, 4, 2, 5, 3, 6])], len: 6 }, offsets: [0, 2, 4, 6] } | -+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| a_col | Int8([1, 2, 3]) | +| b_col | Int8([4, 5, 6]) | +| Output | ArrayColumn { values: Tuple([StringColumn { data: 0x6b316b326b316b326b316b32, offsets: [0, 2, 4, 6, 8, 10, 12] }, Int8([1, 4, 2, 5, 3, 6])]), offsets: [0, 2, 4, 6] } | ++--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ast : map([],[])[1] diff --git a/src/query/functions/tests/it/scalars/testdata/tuple.txt b/src/query/functions/tests/it/scalars/testdata/tuple.txt index d19b1b0d9278d..e5c2cb831dbc5 100644 --- a/src/query/functions/tests/it/scalars/testdata/tuple.txt +++ b/src/query/functions/tests/it/scalars/testdata/tuple.txt @@ -49,12 +49,12 @@ evaluation: | Row 3 | "d" | ("d", "d") | +--------+----------------------+----------------------------------------------+ evaluation (internal): -+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Column | Data | -+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| s | NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____1011] } | -| Output | Tuple { fields: [NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____1011] }, NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____1011] }], len: 4 } | -+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| s | NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____1011] } | +| Output | Tuple([NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____1011] }, NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____1011] }]) | ++--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ error: @@ -164,11 +164,11 @@ evaluation: | Row 3 | NULL | NULL | +--------+---------------------------------+----------------------+ evaluation (internal): -+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Column | Data | -+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| col | NullableColumn { column: Tuple { fields: [NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____0011] }], len: 4 }, validity: [0b____0101] } | -| Output | NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____0001] } | -+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| col | NullableColumn { column: Tuple([NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____0011] }]), validity: [0b____0101] } | +| Output | NullableColumn { column: StringColumn { data: 0x61626364, offsets: [0, 1, 2, 3, 4] }, validity: [0b____0001] } | ++--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/src/query/functions/tests/it/scalars/tuple.rs b/src/query/functions/tests/it/scalars/tuple.rs index 8e4cebb2501d6..1f93f2e253742 100644 --- a/src/query/functions/tests/it/scalars/tuple.rs +++ b/src/query/functions/tests/it/scalars/tuple.rs @@ -58,13 +58,10 @@ fn test_get(file: &mut impl Write) { run_ast(file, "col.1", &[( "col", Column::Nullable(Box::new(NullableColumn { - column: Column::Tuple { - fields: vec![StringType::from_data_with_validity( - &["a", "b", "c", "d"], - vec![true, true, false, false], - )], - len: 4, - }, + column: Column::Tuple(vec![StringType::from_data_with_validity( + &["a", "b", "c", "d"], + vec![true, true, false, false], + )]), validity: vec![true, false, true, false].into(), })), )]); diff --git a/src/query/pipeline/sources/src/input_formats/impls/input_format_csv.rs b/src/query/pipeline/sources/src/input_formats/impls/input_format_csv.rs index b970b5b2e550d..ca8151d0e7afd 100644 --- a/src/query/pipeline/sources/src/input_formats/impls/input_format_csv.rs +++ b/src/query/pipeline/sources/src/input_formats/impls/input_format_csv.rs @@ -20,9 +20,8 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::ColumnBuilder; use common_expression::TableSchemaRef; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; use common_formats::FieldDecoder; use common_formats::FieldDecoderCSV; use common_formats::FieldDecoderRowBased; @@ -54,19 +53,19 @@ impl InputFormatCSV { fn read_row( field_decoder: &FieldDecoderCSV, buf: &[u8], - deserializers: &mut [TypeDeserializerImpl], + columns: &mut [ColumnBuilder], schema: &TableSchemaRef, field_ends: &[usize], ) -> Result<()> { let mut field_start = 0; - for (c, deserializer) in deserializers.iter_mut().enumerate() { + for (c, column) in columns.iter_mut().enumerate() { let field_end = field_ends[c]; let col_data = &buf[field_start..field_end]; let mut reader = Cursor::new(col_data); if reader.eof() { - deserializer.de_default(); + column.push_default(); } else { - if let Err(e) = field_decoder.read_field(deserializer, &mut reader, true) { + if let Err(e) = field_decoder.read_field(column, &mut reader, true) { let err_msg = format_column_error(schema, c, col_data, &e.message()); return Err(ErrorCode::BadBytes(err_msg)); }; diff --git a/src/query/pipeline/sources/src/input_formats/impls/input_format_ndjson.rs b/src/query/pipeline/sources/src/input_formats/impls/input_format_ndjson.rs index 22a5dcdbe55d4..ca9dbea9cf23e 100644 --- a/src/query/pipeline/sources/src/input_formats/impls/input_format_ndjson.rs +++ b/src/query/pipeline/sources/src/input_formats/impls/input_format_ndjson.rs @@ -19,8 +19,8 @@ use std::sync::Arc; use bstr::ByteSlice; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::ColumnBuilder; use common_expression::TableSchemaRef; -use common_expression::TypeDeserializerImpl; use common_formats::FieldDecoder; use common_formats::FieldJsonAstDecoder; use common_formats::FileFormatOptionsExt; @@ -42,7 +42,7 @@ impl InputFormatNDJson { fn read_row( field_decoder: &FieldJsonAstDecoder, buf: &[u8], - deserializers: &mut [TypeDeserializerImpl], + columns: &mut [ColumnBuilder], schema: &TableSchemaRef, ) -> Result<()> { let mut json: serde_json::Value = serde_json::from_reader(buf)?; @@ -54,13 +54,13 @@ impl InputFormatNDJson { } } - for (f, deser) in schema.fields().iter().zip(deserializers.iter_mut()) { + for (f, column) in schema.fields().iter().zip(columns.iter_mut()) { let value = if field_decoder.ident_case_sensitive { &json[f.name().to_owned()] } else { &json[f.name().to_lowercase()] }; - field_decoder.read_field(deser, value).map_err(|e| { + field_decoder.read_field(column, value).map_err(|e| { let value_str = format!("{:?}", value); ErrorCode::BadBytes(format!( "{}. column={} value={}", diff --git a/src/query/pipeline/sources/src/input_formats/impls/input_format_tsv.rs b/src/query/pipeline/sources/src/input_formats/impls/input_format_tsv.rs index 6902b46da8ecd..a6e9590899f79 100644 --- a/src/query/pipeline/sources/src/input_formats/impls/input_format_tsv.rs +++ b/src/query/pipeline/sources/src/input_formats/impls/input_format_tsv.rs @@ -18,9 +18,8 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::ColumnBuilder; use common_expression::TableSchemaRef; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; use common_formats::FieldDecoder; use common_formats::FieldDecoderRowBased; use common_formats::FieldDecoderTSV; @@ -46,10 +45,10 @@ impl InputFormatTSV { field_delimiter: u8, field_decoder: &FieldDecoderTSV, buf: &[u8], - deserializers: &mut Vec, + columns: &mut Vec, schema: &TableSchemaRef, ) -> Result<()> { - let num_columns = deserializers.len(); + let num_columns = columns.len(); let mut column_index = 0; let mut field_start = 0; let mut pos = 0; @@ -59,14 +58,12 @@ impl InputFormatTSV { if pos == buf_len || buf[pos] == field_delimiter { let col_data = &buf[field_start..pos]; if col_data.is_empty() { - deserializers[column_index].de_default(); + columns[column_index].push_default(); } else { let mut reader = Cursor::new(col_data); - if let Err(e) = field_decoder.read_field( - &mut deserializers[column_index], - &mut reader, - true, - ) { + if let Err(e) = + field_decoder.read_field(&mut columns[column_index], &mut reader, true) + { err_msg = Some(format_column_error( schema, column_index, diff --git a/src/query/pipeline/sources/src/input_formats/impls/input_format_xml.rs b/src/query/pipeline/sources/src/input_formats/impls/input_format_xml.rs index 095f80e72debb..8de36ef2624f8 100644 --- a/src/query/pipeline/sources/src/input_formats/impls/input_format_xml.rs +++ b/src/query/pipeline/sources/src/input_formats/impls/input_format_xml.rs @@ -17,9 +17,8 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::ColumnBuilder; use common_expression::TableSchemaRef; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; use common_formats::FieldDecoder; use common_formats::FieldDecoderRowBased; use common_formats::FieldDecoderXML; @@ -47,7 +46,7 @@ impl InputFormatXML { fn read_row( field_decoder: &FieldDecoderXML, row_data: &mut HashMap>, - deserializers: &mut [TypeDeserializerImpl], + columns: &mut [ColumnBuilder], schema: &TableSchemaRef, path: &str, row_index: usize, @@ -61,7 +60,7 @@ impl InputFormatXML { row_data.clone() }; - for (field, deserializer) in schema.fields().iter().zip(deserializers.iter_mut()) { + for (field, column) in schema.fields().iter().zip(columns.iter_mut()) { let value = if field_decoder.ident_case_sensitive { raw_data.get(field.name()) } else { @@ -70,9 +69,9 @@ impl InputFormatXML { if let Some(value) = value { let mut reader = Cursor::new(&**value); if reader.eof() { - deserializer.de_default(); + column.push_default(); } else { - if let Err(e) = field_decoder.read_field(deserializer, &mut reader, true) { + if let Err(e) = field_decoder.read_field(column, &mut reader, true) { let value_str = format!("{:?}", value); let err_msg = format!("{}. column={} value={}", e, field.name(), value_str); return Err(xml_error(&err_msg, path, row_index)); @@ -85,7 +84,7 @@ impl InputFormatXML { } } } else { - deserializer.de_default(); + column.push_default(); } } Ok(()) diff --git a/src/query/pipeline/sources/src/input_formats/input_format_text.rs b/src/query/pipeline/sources/src/input_formats/input_format_text.rs index a6f7fc40e48b7..7b4f16e0f1bfc 100644 --- a/src/query/pipeline/sources/src/input_formats/input_format_text.rs +++ b/src/query/pipeline/sources/src/input_formats/input_format_text.rs @@ -25,10 +25,9 @@ use common_compress::DecompressState; use common_exception::ErrorCode; use common_exception::Result; use common_expression::Column; +use common_expression::ColumnBuilder; use common_expression::DataBlock; use common_expression::TableSchemaRef; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; use common_formats::FieldDecoder; use common_formats::FileFormatOptionsExt; use common_meta_app::principal::StageFileFormatType; @@ -234,7 +233,7 @@ pub trait InputFormatTextBase: Sized + Send + Sync + 'static { ) -> Result>; fn on_error_continue( - columns: &mut Vec, + columns: &mut Vec, num_rows: usize, e: ErrorCode, error_map: &mut HashMap, @@ -242,7 +241,7 @@ pub trait InputFormatTextBase: Sized + Send + Sync + 'static { columns.iter_mut().for_each(|c| { // check if parts of columns inserted data, if so, pop it. if c.len() > num_rows { - c.pop_data_value().expect("must success"); + c.pop().expect("must success"); } }); error_map @@ -252,7 +251,7 @@ pub trait InputFormatTextBase: Sized + Send + Sync + 'static { } fn on_error_abort( - columns: &mut Vec, + columns: &mut Vec, num_rows: usize, abort_num: u64, error_count: &AtomicU64, @@ -264,7 +263,7 @@ pub trait InputFormatTextBase: Sized + Send + Sync + 'static { columns.iter_mut().for_each(|c| { // check if parts of columns inserted data, if so, pop it. if c.len() > num_rows { - c.pop_data_value().expect("must success"); + c.pop().expect("must success"); } }); Ok(()) @@ -453,7 +452,7 @@ impl AligningStateTrait for AligningStateMaybeCompressed pub struct BlockBuilder { pub field_decoder: Arc, pub ctx: Arc, - pub mutable_columns: Vec, + pub mutable_columns: Vec, pub num_rows: usize, phantom: PhantomData, } @@ -463,13 +462,15 @@ impl BlockBuilder { let columns: Vec = self .mutable_columns .iter_mut() - .map(|deserializer| deserializer.finish_to_column()) + .map(|col| { + let empty_builder = ColumnBuilder::with_capacity( + &col.data_type(), + self.ctx.block_compact_thresholds.min_rows_per_block, + ); + std::mem::replace(col, empty_builder).build() + }) .collect(); - self.mutable_columns = self - .ctx - .schema - .create_deserializers(self.ctx.block_compact_thresholds.min_rows_per_block); self.num_rows = 0; if columns.is_empty() || columns[0].len() == 0 { @@ -503,7 +504,15 @@ impl BlockBuilderTrait for BlockBuilder { fn create(ctx: Arc) -> Self { let columns = ctx .schema - .create_deserializers(ctx.block_compact_thresholds.min_rows_per_block); + .fields() + .iter() + .map(|f| { + ColumnBuilder::with_capacity( + &f.data_type().into(), + ctx.block_compact_thresholds.min_rows_per_block, + ) + }) + .collect(); let field_decoder = T::create_field_decoder(&ctx.format_options); BlockBuilder { diff --git a/src/query/service/src/interpreters/interpreter_insert.rs b/src/query/service/src/interpreters/interpreter_insert.rs index 74adf463d17e7..7349e43403b5e 100644 --- a/src/query/service/src/interpreters/interpreter_insert.rs +++ b/src/query/service/src/interpreters/interpreter_insert.rs @@ -39,14 +39,13 @@ use common_expression::types::number::NumberScalar; use common_expression::types::DataType; use common_expression::types::NumberDataType; use common_expression::BlockEntry; +use common_expression::ColumnBuilder; use common_expression::DataBlock; use common_expression::DataField; use common_expression::DataSchema; use common_expression::DataSchemaRef; use common_expression::Expr; -use common_expression::Scalar as DataScalar; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; +use common_expression::Scalar; use common_expression::Value; use common_formats::FastFieldDecoderValues; use common_io::cursor_ext::ReadBytesExt; @@ -154,7 +153,7 @@ impl InsertInterpreter { } } - async fn prepared_values(&self, values_str: &str) -> Result<(DataSchemaRef, Vec)> { + async fn prepared_values(&self, values_str: &str) -> Result<(DataSchemaRef, Vec)> { let settings = self.ctx.get_settings(); let sql_dialect = settings.get_sql_dialect()?; let tokens = tokenize_sql(values_str)?; @@ -582,48 +581,41 @@ impl ValueSource { reader: &mut Cursor, positions: &mut VecDeque, ) -> Result { - let mut desers = self + let mut columns = self .schema .fields() .iter() - .map(|f| TypeDeserializerImpl::with_capacity(f.data_type(), estimated_rows)) + .map(|f| ColumnBuilder::with_capacity(f.data_type(), estimated_rows)) .collect::>(); - let mut rows = 0; let format = self.ctx.get_format_settings()?; let field_decoder = FastFieldDecoderValues::create_for_insert(format); - loop { + for row in 0.. { let _ = reader.ignore_white_spaces(); if reader.eof() { break; } // Not the first row - if rows != 0 { + if row != 0 { reader.must_ignore_byte(b',')?; } self.parse_next_row( &field_decoder, reader, - &mut desers, + &mut columns, positions, &self.bind_context, self.metadata.clone(), ) .await?; - rows += 1; } - if rows == 0 { - return Ok(DataBlock::empty_with_schema(self.schema.clone())); - } - - let columns = desers - .iter_mut() - .map(|deser| deser.finish_to_column()) + let columns = columns + .into_iter() + .map(|col| col.build()) .collect::>(); - Ok(DataBlock::new_from_columns(columns)) } @@ -632,13 +624,13 @@ impl ValueSource { &self, field_decoder: &FastFieldDecoderValues, reader: &mut Cursor, - desers: &mut [TypeDeserializerImpl], + columns: &mut [ColumnBuilder], positions: &mut VecDeque, bind_context: &BindContext, metadata: MetadataRef, ) -> Result<()> { let _ = reader.ignore_white_spaces(); - let col_size = desers.len(); + let col_size = columns.len(); let start_pos_of_row = reader.checkpoint(); // Start of the row --- '(' @@ -660,12 +652,12 @@ impl ValueSource { let _ = reader.ignore_white_spaces(); let col_end = if col_idx + 1 == col_size { b')' } else { b',' }; - let deser = desers + let col = columns .get_mut(col_idx) - .ok_or_else(|| ErrorCode::Internal("Deserializer is None"))?; + .ok_or_else(|| ErrorCode::Internal("ColumnBuilder is None"))?; let (need_fallback, pop_count) = field_decoder - .read_field(deser, reader, positions) + .read_field(col, reader, positions) .map(|_| { let _ = reader.ignore_white_spaces(); let need_fallback = reader.ignore_byte(col_end).not(); @@ -673,10 +665,10 @@ impl ValueSource { }) .unwrap_or((true, col_idx)); - // Deserializer and expr-parser both will eat the end ')' of the row. + // ColumnBuilder and expr-parser both will eat the end ')' of the row. if need_fallback { - for deser in desers.iter_mut().take(pop_count) { - deser.pop_data_value()?; + for col in columns.iter_mut().take(pop_count) { + col.pop(); } skip_to_next_row(reader, 1)?; let end_pos_of_row = reader.position(); @@ -704,9 +696,8 @@ impl ValueSource { ) .await?; - let format = self.ctx.get_format_settings()?; - for (append_idx, deser) in desers.iter_mut().enumerate().take(col_size) { - deser.append_data_value(values[append_idx].clone(), &format)?; + for (col, scalar) in columns.iter_mut().zip(values) { + col.push(scalar.as_ref()); } reader.set_position(end_pos_of_row); return Ok(()); @@ -801,13 +792,13 @@ async fn fill_default_value( if field.data_type().is_nullable() { let expr = Expr::Constant { span: None, - scalar: DataScalar::Null, + scalar: Scalar::Null, data_type: field.data_type().clone(), }; map_exprs.push(expr); } else { let data_type = field.data_type().clone(); - let default_value = DataScalar::default_value(&data_type); + let default_value = Scalar::default_value(&data_type); let expr = Expr::Constant { span: None, scalar: default_value, @@ -826,7 +817,7 @@ async fn exprs_to_scalar( name_resolution_ctx: &NameResolutionContext, bind_context: &BindContext, metadata: MetadataRef, -) -> Result> { +) -> Result> { let schema_fields_len = schema.fields().len(); if exprs.len() != schema_fields_len { return Err(ErrorCode::TableSchemaMismatch(format!( @@ -873,7 +864,7 @@ async fn exprs_to_scalar( let one_row_chunk = DataBlock::new( vec![BlockEntry { data_type: DataType::Number(NumberDataType::UInt8), - value: Value::Scalar(DataScalar::Number(NumberScalar::UInt8(1))), + value: Value::Scalar(Scalar::Number(NumberScalar::UInt8(1))), }], 1, ); @@ -883,13 +874,13 @@ async fn exprs_to_scalar( ctx: func_ctx, }; let res = expression_transform.transform(one_row_chunk)?; - let data_scalars: Vec = res + let scalars: Vec = res .columns() .iter() .skip(1) .map(|col| unsafe { col.value.as_ref().index_unchecked(0).to_owned() }) .collect(); - Ok(data_scalars) + Ok(scalars) } // TODO:(everpcpc) tmp copy from src/query/sql/src/planner/binder/copy.rs diff --git a/src/query/service/src/interpreters/interpreter_table_describe.rs b/src/query/service/src/interpreters/interpreter_table_describe.rs index 18f6cd07fa936..afc92c4d36644 100644 --- a/src/query/service/src/interpreters/interpreter_table_describe.rs +++ b/src/query/service/src/interpreters/interpreter_table_describe.rs @@ -17,7 +17,6 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; use common_expression::infer_table_schema; -use common_expression::types::DataType; use common_expression::types::StringType; use common_expression::DataBlock; use common_expression::DataSchemaRef; @@ -97,8 +96,7 @@ impl Interpreter for DescribeTableInterpreter { } None => { - let data_type: DataType = field.data_type().into(); - let value = Scalar::default_value(&data_type); + let value = Scalar::default_value(&field.data_type().into()); default_exprs.push(value.to_string().as_bytes().to_vec()); } } diff --git a/src/query/service/src/pipelines/processors/transforms/group_by/aggregator_groups_builder.rs b/src/query/service/src/pipelines/processors/transforms/group_by/aggregator_groups_builder.rs index c5afa88d61d17..a4cebec798095 100644 --- a/src/query/service/src/pipelines/processors/transforms/group_by/aggregator_groups_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/group_by/aggregator_groups_builder.rs @@ -18,10 +18,8 @@ use common_exception::Result; use common_expression::types::string::StringColumnBuilder; use common_expression::types::DataType; use common_expression::Column; +use common_expression::ColumnBuilder; use common_expression::HashMethodFixedKeys; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; -use common_io::prelude::FormatSettings; use crate::pipelines::processors::AggregatorParams; @@ -115,23 +113,22 @@ impl<'a> GroupColumnsBuilder for SerializedKeysGroupColumnsBuilder<'a> { } fn finish(mut self) -> Result> { - if let Some(mut builder) = self.single_builder.take() { - let col = builder.finish_to_column(); - return Ok(vec![col]); + if let Some(builder) = self.single_builder.take() { + let col = builder.build(); + return Ok(vec![Column::String(col)]); } let rows = self.data.len(); let keys = self.data.as_mut_slice(); let mut res = Vec::with_capacity(self.group_data_types.len()); - let format = FormatSettings::default(); for data_type in self.group_data_types.iter() { - let mut deserializer = TypeDeserializerImpl::with_capacity(data_type, rows); + let mut column = ColumnBuilder::with_capacity(data_type, rows); for (_, key) in keys.iter_mut().enumerate() { - deserializer.de_binary(key, &format)?; + column.push_binary(key)?; } - res.push(deserializer.finish_to_column()); + res.push(column.build()); } Ok(res) diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_block.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_block.rs index 044372a955574..3936520980cf7 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_block.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_block.rs @@ -19,12 +19,10 @@ use async_channel::Receiver; use common_exception::ErrorCode; use common_exception::Result; use common_expression::BlockEntry; +use common_expression::ColumnBuilder; use common_expression::DataBlock; use common_expression::DataSchemaRef; -use common_expression::TypeDeserializer; -use common_expression::TypeDeserializerImpl; use common_expression::Value; -use common_io::prelude::FormatSettings; use common_pipeline_core::processors::port::InputPort; use common_pipeline_core::processors::port::OutputPort; use common_pipeline_core::processors::processor::Event; @@ -108,14 +106,13 @@ impl TransformMergeBlock { if left_data_type.remove_nullable() == right_data_type.remove_nullable() { let origin_column = block.get_by_offset(index).clone(); - let mut builder = TypeDeserializerImpl::with_capacity(left_data_type, block.num_rows()); - let settings = FormatSettings::default(); + let mut builder = ColumnBuilder::with_capacity(left_data_type, block.num_rows()); let value = origin_column.value.as_ref(); for idx in 0..block.num_rows() { let scalar = value.index(idx).unwrap(); - builder.append_data_value(scalar.to_owned(), &settings)?; + builder.push(scalar); } - let col = builder.finish_to_column(); + let col = builder.build(); Ok(BlockEntry { data_type: left_data_type.clone(), value: Value::Column(col), diff --git a/src/query/service/tests/it/storages/fuse/operations/alter_table.rs b/src/query/service/tests/it/storages/fuse/operations/alter_table.rs index 5e96ce29e1786..b9c6d6f29c6d1 100644 --- a/src/query/service/tests/it/storages/fuse/operations/alter_table.rs +++ b/src/query/service/tests/it/storages/fuse/operations/alter_table.rs @@ -188,10 +188,7 @@ async fn test_fuse_table_optimize_alter_table() -> Result<()> { let column0 = Int32Type::from_data(vec![1, 2]); let column3 = UInt64Type::from_data(vec![3, 4]); let column4 = Float64Type::from_data(vec![13.0, 14.0]); - let tuple_column = Column::Tuple { - fields: vec![column3, column4], - len: 2, - }; + let tuple_column = Column::Tuple(vec![column3, column4]); DataBlock::new_from_columns(vec![column0, tuple_column]) }; diff --git a/src/query/service/tests/it/storages/fuse/statistics.rs b/src/query/service/tests/it/storages/fuse/statistics.rs index 2d967d39a446f..9e4fe112178ee 100644 --- a/src/query/service/tests/it/storages/fuse/statistics.rs +++ b/src/query/service/tests/it/storages/fuse/statistics.rs @@ -127,10 +127,7 @@ fn test_ft_tuple_stats_block_stats() -> common_exception::Result<()> { Int32Type::from_data(vec![1, 2, 3]), Int32Type::from_data(vec![4, 5, 6]), ]; - let column = Column::Tuple { - fields: inner_columns, - len: 3, - }; + let column = Column::Tuple(inner_columns); let block = DataBlock::new_from_columns(vec![column]); diff --git a/src/query/service/tests/it/storages/fuse/table_test_fixture.rs b/src/query/service/tests/it/storages/fuse/table_test_fixture.rs index 72edb886a6084..7664a8d94e629 100644 --- a/src/query/service/tests/it/storages/fuse/table_test_fixture.rs +++ b/src/query/service/tests/it/storages/fuse/table_test_fixture.rs @@ -259,10 +259,7 @@ impl TestFixture { .collect::>(), ); let tuple_inner_columns = vec![column1, column2]; - let tuple_column = Column::Tuple { - fields: tuple_inner_columns, - len: rows_per_block, - }; + let tuple_column = Column::Tuple(tuple_inner_columns); let columns = vec![column0, tuple_column]; diff --git a/src/query/service/tests/it/storages/statistics/column_statistics.rs b/src/query/service/tests/it/storages/statistics/column_statistics.rs index e16c0f338abb7..11f01db8fa4e3 100644 --- a/src/query/service/tests/it/storages/statistics/column_statistics.rs +++ b/src/query/service/tests/it/storages/statistics/column_statistics.rs @@ -69,14 +69,8 @@ fn gen_sample_block() -> (DataBlock, Vec, TableSchemaRef) { let col_g = Float64Type::from_data(vec![10.0f64, 11., 12.]); // inner/root nodes - let col_b = Column::Tuple { - fields: vec![col_c.clone(), col_d.clone()], - len: 3, - }; - let col_a = Column::Tuple { - fields: vec![col_b, col_e.clone()], - len: 3, - }; + let col_b = Column::Tuple(vec![col_c.clone(), col_d.clone()]); + let col_a = Column::Tuple(vec![col_b, col_e.clone()]); let columns = vec![col_a, col_f.clone(), col_g.clone()]; ( diff --git a/src/query/storages/common/blocks/src/block.rs b/src/query/storages/common/blocks/src/block.rs index 0bd86d3bcf527..01bfa49d393e5 100644 --- a/src/query/storages/common/blocks/src/block.rs +++ b/src/query/storages/common/blocks/src/block.rs @@ -23,6 +23,7 @@ use common_arrow::parquet::write::Version; use common_arrow::write_parquet_file; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::serialize::col_encoding; use common_expression::DataBlock; use common_expression::TableSchema; use storages_common_table_meta::table::TableCompression; @@ -79,8 +80,3 @@ pub fn blocks_to_parquet( ))), } } - -// fallback to plain encoding due to performance issue -fn col_encoding(_data_type: &ArrowDataType) -> Encoding { - Encoding::Plain -} diff --git a/src/query/storages/common/index/tests/it/filters/bloom_filter.rs b/src/query/storages/common/index/tests/it/filters/bloom_filter.rs index 5508a83d5403c..9c6419f38598e 100644 --- a/src/query/storages/common/index/tests/it/filters/bloom_filter.rs +++ b/src/query/storages/common/index/tests/it/filters/bloom_filter.rs @@ -81,13 +81,10 @@ fn test_bloom_filter() -> Result<()> { }, BlockEntry { data_type: map_ty.clone(), - value: Value::Scalar(Scalar::Map(Column::Tuple { - fields: vec![ - UInt8Type::from_data(vec![1, 2]), - StringType::from_data(vec!["a", "b"]), - ], - len: 2, - })), + value: Value::Scalar(Scalar::Map(Column::Tuple(vec![ + UInt8Type::from_data(vec![1, 2]), + StringType::from_data(vec!["a", "b"]), + ]))), }, ], 2, diff --git a/src/query/storages/fuse/src/statistics/column_statistic.rs b/src/query/storages/fuse/src/statistics/column_statistic.rs index b3d85f09f8b1b..5ecd4a40eee67 100644 --- a/src/query/storages/fuse/src/statistics/column_statistic.rs +++ b/src/query/storages/fuse/src/statistics/column_statistic.rs @@ -154,7 +154,7 @@ pub fn gen_col_stats_lite( match data_type { DataType::Tuple(inner_types) => { if let Some((col, val)) = col_scalar { - let (inner_columns, _) = col.as_tuple().unwrap(); + let inner_columns = col.as_tuple().unwrap(); let inner_scalars = val.as_tuple().unwrap(); for ((inner_column, inner_type), inner_scalar) in inner_columns.iter().zip(inner_types).zip(inner_scalars) @@ -256,7 +256,7 @@ pub mod traverse { match data_type.remove_nullable() { DataType::Tuple(inner_types) => match (data_type.is_nullable(), column) { (false, Some(column)) => { - let (inner_columns, _) = column.as_tuple().unwrap(); + let inner_columns = column.as_tuple().unwrap(); for (inner_column, inner_type) in inner_columns.iter().zip(inner_types.iter()) { traverse_recursive(None, Some(inner_column), inner_type, leaves)?; } diff --git a/src/query/storages/memory/src/memory_table.rs b/src/query/storages/memory/src/memory_table.rs index 703222b4d1255..032ebbfbf66bd 100644 --- a/src/query/storages/memory/src/memory_table.rs +++ b/src/query/storages/memory/src/memory_table.rs @@ -327,7 +327,7 @@ impl MemoryTableSource { match &entry.data_type { DataType::Tuple(inner_tys) => { let col = entry.value.clone().into_column().unwrap(); - let (inner_columns, _) = col.into_tuple().unwrap(); + let inner_columns = col.into_tuple().unwrap(); let mut values = Vec::with_capacity(inner_tys.len()); for (col, ty) in inner_columns.iter().zip(inner_tys.iter()) { values.push(BlockEntry {