From 3827974aa0fc35a199620af6a7ddccc3b37c7912 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Thu, 10 Oct 2024 16:45:52 +0200 Subject: [PATCH 01/15] [logical-types] add NativeType and LogicalType --- datafusion/common/src/lib.rs | 1 + datafusion/common/src/types/builtin.rs | 22 +++ datafusion/common/src/types/logical.rs | 41 +++++ datafusion/common/src/types/mod.rs | 7 + datafusion/common/src/types/native.rs | 237 +++++++++++++++++++++++++ 5 files changed, 308 insertions(+) create mode 100644 datafusion/common/src/types/builtin.rs create mode 100644 datafusion/common/src/types/logical.rs create mode 100644 datafusion/common/src/types/mod.rs create mode 100644 datafusion/common/src/types/native.rs diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 10541e01914a..b8ba1ed4e8cb 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -43,6 +43,7 @@ pub mod scalar; pub mod stats; pub mod test_util; pub mod tree_node; +pub mod types; pub mod utils; /// Reexport arrow crate diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs new file mode 100644 index 000000000000..4dbbec21833f --- /dev/null +++ b/datafusion/common/src/types/builtin.rs @@ -0,0 +1,22 @@ +use super::{LogicalType, NativeType}; + +#[derive(Debug)] +pub struct BuiltinType { + native: NativeType, +} + +impl LogicalType for BuiltinType { + fn native(&self) -> &NativeType { + &self.native + } + + fn name(&self) -> Option<&str> { + None + } +} + +impl From for BuiltinType { + fn from(native: NativeType) -> Self { + Self { native } + } +} diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs new file mode 100644 index 000000000000..a5b66db573c3 --- /dev/null +++ b/datafusion/common/src/types/logical.rs @@ -0,0 +1,41 @@ +use core::fmt; +use std::{cmp::Ordering, hash::Hash, sync::Arc}; + +use super::NativeType; + +/// A reference counted [`LogicalType`] +pub type LogicalTypeRef = Arc; + +pub trait LogicalType: fmt::Debug { + fn native(&self) -> &NativeType; + fn name(&self) -> Option<&str>; +} + +impl PartialEq for dyn LogicalType { + fn eq(&self, other: &Self) -> bool { + self.native().eq(other.native()) && self.name().eq(&other.name()) + } +} + +impl Eq for dyn LogicalType {} + +impl PartialOrd for dyn LogicalType { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for dyn LogicalType { + fn cmp(&self, other: &Self) -> Ordering { + self.name() + .cmp(&other.name()) + .then(self.native().cmp(other.native())) + } +} + +impl Hash for dyn LogicalType { + fn hash(&self, state: &mut H) { + self.name().hash(state); + self.native().hash(state); + } +} diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs new file mode 100644 index 000000000000..afce5e89fb9c --- /dev/null +++ b/datafusion/common/src/types/mod.rs @@ -0,0 +1,7 @@ +mod builtin; +mod logical; +mod native; + +pub use builtin::*; +pub use logical::*; +pub use native::*; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs new file mode 100644 index 000000000000..b8f1a031885c --- /dev/null +++ b/datafusion/common/src/types/native.rs @@ -0,0 +1,237 @@ +use std::sync::Arc; + +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum NativeType { + /// Null type + Null, + /// A boolean datatype representing the values `true` and `false`. + Boolean, + /// A signed 8-bit integer. + Int8, + /// A signed 16-bit integer. + Int16, + /// A signed 32-bit integer. + Int32, + /// A signed 64-bit integer. + Int64, + /// An unsigned 8-bit integer. + UInt8, + /// An unsigned 16-bit integer. + UInt16, + /// An unsigned 32-bit integer. + UInt32, + /// An unsigned 64-bit integer. + UInt64, + /// A 16-bit floating point number. + Float16, + /// A 32-bit floating point number. + Float32, + /// A 64-bit floating point number. + Float64, + /// A timestamp with an optional timezone. + /// + /// Time is measured as a Unix epoch, counting the seconds from + /// 00:00:00.000 on 1 January 1970, excluding leap seconds, + /// as a signed 64-bit integer. + /// + /// The time zone is a string indicating the name of a time zone, one of: + /// + /// * As used in the Olson time zone database (the "tz database" or + /// "tzdata"), such as "America/New_York" + /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// + /// Timestamps with a non-empty timezone + /// ------------------------------------ + /// + /// If a Timestamp column has a non-empty timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone + /// (the Unix epoch), regardless of the Timestamp's own timezone. + /// + /// Therefore, timestamp values with a non-empty timezone correspond to + /// physical points in time together with some additional information about + /// how the data was obtained and/or how to display it (the timezone). + /// + /// For example, the timestamp value 0 with the timezone string "Europe/Paris" + /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the + /// application may prefer to display it as "January 1st 1970, 01h00" in + /// the Europe/Paris timezone (which is the same physical point in time). + /// + /// One consequence is that timestamp values with a non-empty timezone + /// can be compared and ordered directly, since they all share the same + /// well-known point of reference (the Unix epoch). + /// + /// Timestamps with an unset / empty timezone + /// ----------------------------------------- + /// + /// If a Timestamp column has no timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. + /// + /// Therefore, timestamp values without a timezone cannot be meaningfully + /// interpreted as physical points in time, but only as calendar / clock + /// indications ("wall clock time") in an unspecified timezone. + /// + /// For example, the timestamp value 0 with an empty timezone string + /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there + /// is not enough information to interpret it as a well-defined physical + /// point in time. + /// + /// One consequence is that timestamp values without a timezone cannot + /// be reliably compared or ordered, since they may have different points of + /// reference. In particular, it is *not* possible to interpret an unset + /// or empty timezone as the same as "UTC". + /// + /// Conversion between timezones + /// ---------------------------- + /// + /// If a Timestamp column has a non-empty timezone, changing the timezone + /// to a different non-empty value is a metadata-only operation: + /// the timestamp values need not change as their point of reference remains + /// the same (the Unix epoch). + /// + /// However, if a Timestamp column has no timezone value, changing it to a + /// non-empty value requires to think about the desired semantics. + /// One possibility is to assume that the original timestamp values are + /// relative to the epoch of the timezone being set; timestamp values should + /// then adjusted to the Unix epoch (for example, changing the timezone from + /// empty to "Europe/Paris" would require converting the timestamp values + /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is + /// nevertheless correct). + /// + /// ``` + /// # use arrow_schema::{DataType, TimeUnit}; + /// DataType::Timestamp(TimeUnit::Second, None); + /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); + /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); + /// ``` + Timestamp(TimeUnit, Option>), + /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// in days. + Date, + /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. + /// Must be either seconds or milliseconds. + Time32(TimeUnit), + /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. + /// Must be either microseconds or nanoseconds. + Time64(TimeUnit), + /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. + Duration(TimeUnit), + /// A "calendar" interval which models types that don't necessarily + /// have a precise duration without the context of a base timestamp (e.g. + /// days can differ in length during day light savings time transitions). + Interval(IntervalUnit), + /// Opaque binary data of variable length. + Binary, + /// Opaque binary data of fixed size. + /// Enum parameter specifies the number of bytes per value. + FixedSizeBinary(i32), + /// A variable-length string in Unicode with UTF-8 encoding. + Utf8, + /// A list of some logical data type with variable length. + List(Box), + /// A list of some logical data type with fixed length. + FixedSizeList(Box, i32), + /// A nested datatype that contains a number of sub-fields. + Struct(Box<[(String, NativeType)]>), + /// A nested datatype that can represent slots of differing types. + Union(Box<[(i8, NativeType)]>), + /// Exact 128-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale could be negative number. For + /// negative scale, it is the number of padding 0 to the right + /// of the digits. + /// + /// For example the number 12300 could be treated as a decimal + /// has precision 3 and scale -2. + Decimal128(u8, i8), + /// Exact 256-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale could be negative number. For + /// negative scale, it is the number of padding 0 to the right + /// of the digits. + /// + /// For example the number 12300 could be treated as a decimal + /// has precision 3 and scale -2. + Decimal256(u8, i8), + /// A Map is a logical nested type that is represented as + /// + /// `List>` + /// + /// The keys and values are each respectively contiguous. + /// The key and value types are not constrained, but keys should be + /// hashable and unique. + /// Whether the keys are sorted can be set in the `bool` after the `Field`. + /// + /// In a field with Map type, the field has a child Struct field, which then + /// has two children: key type and the second the value type. The names of the + /// child fields may be respectively "entries", "key", and "value", but this is + /// not enforced. + Map(Box), +} + +impl From for NativeType { + fn from(value: DataType) -> Self { + use NativeType::*; + match value { + DataType::Null => Null, + DataType::Boolean => Boolean, + DataType::Int8 => Int8, + DataType::Int16 => Int16, + DataType::Int32 => Int32, + DataType::Int64 => Int64, + DataType::UInt8 => UInt8, + DataType::UInt16 => UInt16, + DataType::UInt32 => UInt32, + DataType::UInt64 => UInt64, + DataType::Float16 => Float16, + DataType::Float32 => Float32, + DataType::Float64 => Float64, + DataType::Timestamp(time_unit, arc) => Timestamp(time_unit, arc), + DataType::Date32 | DataType::Date64 => Date, + DataType::Time32(time_unit) => Time32(time_unit), + DataType::Time64(time_unit) => Time64(time_unit), + DataType::Duration(time_unit) => Duration(time_unit), + DataType::Interval(interval_unit) => Interval(interval_unit), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary, + DataType::FixedSizeBinary(size) => FixedSizeBinary(size), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Utf8, + DataType::List(field) + | DataType::ListView(field) + | DataType::LargeList(field) + | DataType::LargeListView(field) => { + List(Box::new(field.data_type().clone().into())) + } + DataType::FixedSizeList(field, size) => { + FixedSizeList(Box::new(field.data_type().clone().into()), size) + } + DataType::Struct(fields) => Struct( + fields + .into_iter() + .map(|field| (field.name().clone(), field.data_type().clone().into())) + .collect(), + ), + DataType::Union(union_fields, _) => Union( + union_fields + .iter() + .map(|(i, field)| (i, field.data_type().clone().into())) + .collect(), + ), + DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), + DataType::Decimal128(p, s) => Decimal128(p, s), + DataType::Decimal256(p, s) => Decimal256(p, s), + DataType::Map(field, _) => Map(Box::new(field.data_type().clone().into())), + DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), + } + } +} From 17a70d858a94bfce2be9a1f7af0521e6f71ae79e Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Thu, 10 Oct 2024 16:55:54 +0200 Subject: [PATCH 02/15] Add license header --- datafusion/common/src/types/builtin.rs | 17 +++++++++++++++++ datafusion/common/src/types/logical.rs | 17 +++++++++++++++++ datafusion/common/src/types/mod.rs | 17 +++++++++++++++++ datafusion/common/src/types/native.rs | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs index 4dbbec21833f..51fca8f21ebe 100644 --- a/datafusion/common/src/types/builtin.rs +++ b/datafusion/common/src/types/builtin.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use super::{LogicalType, NativeType}; #[derive(Debug)] diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index a5b66db573c3..312051726b4e 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use core::fmt; use std::{cmp::Ordering, hash::Hash, sync::Arc}; diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index afce5e89fb9c..84779fcbe3ac 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + mod builtin; mod logical; mod native; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index b8f1a031885c..0b80fcb3af3e 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::sync::Arc; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; From 3dba963f8619434c5ae4b5b385e80fe2e4330efe Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Tue, 15 Oct 2024 09:38:51 +0200 Subject: [PATCH 03/15] Add NativeField and derivates --- datafusion/common/src/types/native.rs | 141 +++++++++++++++----------- 1 file changed, 83 insertions(+), 58 deletions(-) diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 0b80fcb3af3e..ef625154c51f 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -17,7 +17,22 @@ use std::sync::Arc; -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NativeField { + name: String, + native_type: NativeType, + nullable: bool, +} + +pub type NativeFieldRef = Arc; + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NativeFields(Arc<[NativeFieldRef]>); + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NativeUnionFields(Arc<[(i8, NativeFieldRef)]>); #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum NativeType { @@ -123,15 +138,11 @@ pub enum NativeType { /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); /// ``` Timestamp(TimeUnit, Option>), - /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// A signed date representing the elapsed time since UNIX epoch (1970-01-01) /// in days. Date, - /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. - /// Must be either seconds or milliseconds. - Time32(TimeUnit), - /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. - /// Must be either microseconds or nanoseconds. - Time64(TimeUnit), + /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`. + Time(TimeUnit), /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. Duration(TimeUnit), /// A "calendar" interval which models types that don't necessarily @@ -146,28 +157,14 @@ pub enum NativeType { /// A variable-length string in Unicode with UTF-8 encoding. Utf8, /// A list of some logical data type with variable length. - List(Box), + List(NativeFieldRef), /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), + FixedSizeList(NativeFieldRef, i32), /// A nested datatype that contains a number of sub-fields. - Struct(Box<[(String, NativeType)]>), + Struct(NativeFields), /// A nested datatype that can represent slots of differing types. - Union(Box<[(i8, NativeType)]>), - /// Exact 128-bit width decimal value with precision and scale - /// - /// * precision is the total number of digits - /// * scale is the number of digits past the decimal - /// - /// For example the number 123.45 has precision 5 and scale 2. - /// - /// In certain situations, scale could be negative number. For - /// negative scale, it is the number of padding 0 to the right - /// of the digits. - /// - /// For example the number 12300 could be treated as a decimal - /// has precision 3 and scale -2. - Decimal128(u8, i8), - /// Exact 256-bit width decimal value with precision and scale + Union(NativeUnionFields), + /// Decimal value with precision and scale /// /// * precision is the total number of digits /// * scale is the number of digits past the decimal @@ -180,23 +177,23 @@ pub enum NativeType { /// /// For example the number 12300 could be treated as a decimal /// has precision 3 and scale -2. - Decimal256(u8, i8), - /// A Map is a logical nested type that is represented as + Decimal(u8, i8), + /// A Map is a type that an association between a key and a value. /// - /// `List>` - /// - /// The keys and values are each respectively contiguous. /// The key and value types are not constrained, but keys should be /// hashable and unique. - /// Whether the keys are sorted can be set in the `bool` after the `Field`. /// /// In a field with Map type, the field has a child Struct field, which then /// has two children: key type and the second the value type. The names of the /// child fields may be respectively "entries", "key", and "value", but this is /// not enforced. - Map(Box), + Map(NativeFieldRef), } +// The following From, From, ... implementations are temporary +// mapping solutions to provide backwards compatibility while transitioning from +// the purely physical system to a logical / physical system. + impl From for NativeType { fn from(value: DataType) -> Self { use NativeType::*; @@ -214,41 +211,69 @@ impl From for NativeType { DataType::Float16 => Float16, DataType::Float32 => Float32, DataType::Float64 => Float64, - DataType::Timestamp(time_unit, arc) => Timestamp(time_unit, arc), + DataType::Timestamp(tu, tz) => Timestamp(tu, tz), DataType::Date32 | DataType::Date64 => Date, - DataType::Time32(time_unit) => Time32(time_unit), - DataType::Time64(time_unit) => Time64(time_unit), - DataType::Duration(time_unit) => Duration(time_unit), - DataType::Interval(interval_unit) => Interval(interval_unit), + DataType::Time32(tu) | DataType::Time64(tu) => Time(tu), + DataType::Duration(tu) => Duration(tu), + DataType::Interval(iu) => Interval(iu), DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary, DataType::FixedSizeBinary(size) => FixedSizeBinary(size), DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Utf8, DataType::List(field) | DataType::ListView(field) | DataType::LargeList(field) - | DataType::LargeListView(field) => { - List(Box::new(field.data_type().clone().into())) - } + | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())), DataType::FixedSizeList(field, size) => { - FixedSizeList(Box::new(field.data_type().clone().into()), size) + FixedSizeList(Arc::new(field.as_ref().into()), size) + } + DataType::Struct(fields) => Struct(NativeFields::from(&fields)), + DataType::Union(union_fields, _) => { + Union(NativeUnionFields::from(&union_fields)) } - DataType::Struct(fields) => Struct( - fields - .into_iter() - .map(|field| (field.name().clone(), field.data_type().clone().into())) - .collect(), - ), - DataType::Union(union_fields, _) => Union( - union_fields - .iter() - .map(|(i, field)| (i, field.data_type().clone().into())) - .collect(), - ), DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), - DataType::Decimal128(p, s) => Decimal128(p, s), - DataType::Decimal256(p, s) => Decimal256(p, s), - DataType::Map(field, _) => Map(Box::new(field.data_type().clone().into())), + DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), + DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), } } } + +impl From<&Field> for NativeField { + fn from(value: &Field) -> Self { + Self { + name: value.name().clone(), + native_type: value.data_type().clone().into(), + nullable: value.is_nullable(), + } + } +} + +impl From<&Fields> for NativeFields { + fn from(value: &Fields) -> Self { + value + .iter() + .map(|field| Arc::new(NativeField::from(field.as_ref()))) + .collect() + } +} + +impl FromIterator for NativeFields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl From<&UnionFields> for NativeUnionFields { + fn from(value: &UnionFields) -> Self { + value + .iter() + .map(|(i, field)| (i, Arc::new(NativeField::from(field.as_ref())))) + .collect() + } +} + +impl FromIterator<(i8, NativeFieldRef)> for NativeUnionFields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} From 42179705fb071627fc498cec6ef198c4b7658545 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Tue, 15 Oct 2024 12:30:49 +0200 Subject: [PATCH 04/15] Support TypeSignatures --- datafusion/common/src/types/builtin.rs | 39 ------------- datafusion/common/src/types/logical.rs | 78 ++++++++++++++++++++++++-- datafusion/common/src/types/mod.rs | 2 - datafusion/common/src/types/native.rs | 12 ++++ 4 files changed, 84 insertions(+), 47 deletions(-) delete mode 100644 datafusion/common/src/types/builtin.rs diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs deleted file mode 100644 index 51fca8f21ebe..000000000000 --- a/datafusion/common/src/types/builtin.rs +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::{LogicalType, NativeType}; - -#[derive(Debug)] -pub struct BuiltinType { - native: NativeType, -} - -impl LogicalType for BuiltinType { - fn native(&self) -> &NativeType { - &self.native - } - - fn name(&self) -> Option<&str> { - None - } -} - -impl From for BuiltinType { - fn from(native: NativeType) -> Self { - Self { native } - } -} diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 312051726b4e..c99b78ecfaeb 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -20,17 +20,46 @@ use std::{cmp::Ordering, hash::Hash, sync::Arc}; use super::NativeType; +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum TypeSignature<'a> { + /// Represents a built-in native type. + Native(&'a NativeType), + /// Represents an arrow-compatible extension type. + /// (https://arrow.apache.org/docs/format/Columnar.html#extension-types) + /// + /// The `name` should contain the same value as 'ARROW:extension:name'. + Extension { + name: &'a str, + parameters: &'a [TypeParameter<'a>], + }, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum TypeParameter<'a> { + Type(TypeSignature<'a>), + Number(i128), +} + /// A reference counted [`LogicalType`] pub type LogicalTypeRef = Arc; -pub trait LogicalType: fmt::Debug { +pub trait LogicalType: Sync + Send { fn native(&self) -> &NativeType; - fn name(&self) -> Option<&str>; + fn signature(&self) -> TypeSignature<'_>; +} + +impl fmt::Debug for dyn LogicalType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("LogicalType") + .field(&self.signature()) + .field(&self.native()) + .finish() + } } impl PartialEq for dyn LogicalType { fn eq(&self, other: &Self) -> bool { - self.native().eq(other.native()) && self.name().eq(&other.name()) + self.native().eq(other.native()) && self.signature().eq(&other.signature()) } } @@ -44,15 +73,52 @@ impl PartialOrd for dyn LogicalType { impl Ord for dyn LogicalType { fn cmp(&self, other: &Self) -> Ordering { - self.name() - .cmp(&other.name()) + self.signature() + .cmp(&other.signature()) .then(self.native().cmp(other.native())) } } impl Hash for dyn LogicalType { fn hash(&self, state: &mut H) { - self.name().hash(state); + self.signature().hash(state); self.native().hash(state); } } + +#[cfg(test)] +mod test { + #![allow(dead_code)] + + use super::{LogicalType, TypeParameter, TypeSignature}; + use crate::types::NativeType; + + struct MagicalType {} + + impl LogicalType for MagicalType { + fn native(&self) -> &NativeType { + &NativeType::Utf8 + } + + fn signature(&self) -> TypeSignature<'_> { + TypeSignature::Extension { + name: "MagicalType", + parameters: &[TypeParameter::Type(TypeSignature::Native( + &NativeType::Boolean, + ))], + } + } + } + + fn test(logical_type: &dyn LogicalType) { + match logical_type.signature() { + TypeSignature::Extension { + name: "MagicalType", + parameters: + [TypeParameter::Type(TypeSignature::Native(NativeType::Boolean))], + } => {} + TypeSignature::Native(NativeType::Binary) => todo!(), + _ => unimplemented!(), + }; + } +} diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index 84779fcbe3ac..4e1bcb75cb56 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -mod builtin; mod logical; mod native; -pub use builtin::*; pub use logical::*; pub use native::*; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index ef625154c51f..71b9c8e26361 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -19,6 +19,8 @@ use std::sync::Arc; use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields}; +use super::{LogicalType, TypeSignature}; + #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct NativeField { name: String, @@ -190,6 +192,16 @@ pub enum NativeType { Map(NativeFieldRef), } +impl LogicalType for NativeType { + fn native(&self) -> &NativeType { + self + } + + fn signature(&self) -> TypeSignature<'_> { + TypeSignature::Native(self) + } +} + // The following From, From, ... implementations are temporary // mapping solutions to provide backwards compatibility while transitioning from // the purely physical system to a logical / physical system. From 01f0089c16c58e7b2d81b3f18f3022b92f38770d Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Tue, 15 Oct 2024 12:42:19 +0200 Subject: [PATCH 05/15] Fix doc --- datafusion/common/src/types/logical.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index c99b78ecfaeb..076a4998fefd 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -25,7 +25,7 @@ pub enum TypeSignature<'a> { /// Represents a built-in native type. Native(&'a NativeType), /// Represents an arrow-compatible extension type. - /// (https://arrow.apache.org/docs/format/Columnar.html#extension-types) + /// () /// /// The `name` should contain the same value as 'ARROW:extension:name'. Extension { From 5b5f4c13beb61f37f2d21ea0ec747275dc4a5123 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Thu, 17 Oct 2024 15:54:16 +0200 Subject: [PATCH 06/15] Add documentation --- datafusion/common/src/types/logical.rs | 30 ++++++++++++++++- datafusion/common/src/types/native.rs | 45 +++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 076a4998fefd..9da26b050d59 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -40,9 +40,37 @@ pub enum TypeParameter<'a> { Number(i128), } -/// A reference counted [`LogicalType`] +/// A reference counted [`LogicalType`]. pub type LogicalTypeRef = Arc; +/// Representation of a logical type with its signature and its native backing +/// type. +/// +/// The logical type is meant to be used during the DataFusion logical planning +/// phase in order to reason about logical types without worrying about their +/// underlying physical implementation. +/// +/// ### Extension types +/// +/// [`LogicalType`] is a trait in order to allow the possibility of declaring +/// extension types: +/// +/// ``` +/// struct JSON {} +/// +/// impl LogicalType for JSON { +/// fn native(&self) -> &NativeType { +/// &NativeType::Utf8 +/// } +/// +/// fn signature(&self) -> TypeSignature<'_> { +/// TypeSignature::Extension { +/// name: "JSON", +/// parameters: &[], +/// } +/// } +/// } +/// ``` pub trait LogicalType: Sync + Send { fn native(&self) -> &NativeType; fn signature(&self) -> TypeSignature<'_>; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 71b9c8e26361..1ebf4e19b01e 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -15,12 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +use std::{ops::Deref, sync::Arc}; use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields}; use super::{LogicalType, TypeSignature}; +/// A record of a native type, its name and its nullability. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct NativeField { name: String, @@ -28,19 +29,55 @@ pub struct NativeField { nullable: bool, } +impl NativeField { + pub fn name(&self) -> &str { + &self.name + } + + pub fn native_type(&self) -> &NativeType { + &self.native_type + } + + pub fn nullable(&self) -> bool { + self.nullable + } +} + +/// A reference counted [`NativeField`]. pub type NativeFieldRef = Arc; +/// A cheaply cloneable, owned collection of [`NativeFieldRef`]. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct NativeFields(Arc<[NativeFieldRef]>); +impl Deref for NativeFields { + type Target = [NativeFieldRef]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +/// A cheaply cloneable, owned collection of [`NativeFieldRef`] and their +/// corresponding type ids. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct NativeUnionFields(Arc<[(i8, NativeFieldRef)]>); +impl Deref for NativeUnionFields { + type Target = [(i8, NativeFieldRef)]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +/// Representation of a type that DataFusion can handle natively. It is a subset +/// of the physical variants in Arrow's native [`DataType`]. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum NativeType { /// Null type Null, - /// A boolean datatype representing the values `true` and `false`. + /// A boolean type representing the values `true` and `false`. Boolean, /// A signed 8-bit integer. Int8, @@ -162,9 +199,9 @@ pub enum NativeType { List(NativeFieldRef), /// A list of some logical data type with fixed length. FixedSizeList(NativeFieldRef, i32), - /// A nested datatype that contains a number of sub-fields. + /// A nested type that contains a number of sub-fields. Struct(NativeFields), - /// A nested datatype that can represent slots of differing types. + /// A nested type that can represent slots of differing types. Union(NativeUnionFields), /// Decimal value with precision and scale /// From 88e1b3cb685313972ea405fd7aaf04cd74eab15b Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Thu, 17 Oct 2024 16:51:36 +0200 Subject: [PATCH 07/15] Fix doc tests --- datafusion/common/src/types/logical.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 9da26b050d59..0aac09df3e87 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -56,6 +56,8 @@ pub type LogicalTypeRef = Arc; /// extension types: /// /// ``` +/// use datafusion_common::types::{LogicalType, NativeType, TypeSignature}; +/// /// struct JSON {} /// /// impl LogicalType for JSON { From ab16a2dab656215b14334d9d87eca30c33956035 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Thu, 17 Oct 2024 19:57:43 +0200 Subject: [PATCH 08/15] Remove dummy test --- datafusion/common/src/types/logical.rs | 38 +------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 0aac09df3e87..0121e33d9d5e 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -20,6 +20,7 @@ use std::{cmp::Ordering, hash::Hash, sync::Arc}; use super::NativeType; +/// Signature that uniquely identifies a type among other types. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum TypeSignature<'a> { /// Represents a built-in native type. @@ -115,40 +116,3 @@ impl Hash for dyn LogicalType { self.native().hash(state); } } - -#[cfg(test)] -mod test { - #![allow(dead_code)] - - use super::{LogicalType, TypeParameter, TypeSignature}; - use crate::types::NativeType; - - struct MagicalType {} - - impl LogicalType for MagicalType { - fn native(&self) -> &NativeType { - &NativeType::Utf8 - } - - fn signature(&self) -> TypeSignature<'_> { - TypeSignature::Extension { - name: "MagicalType", - parameters: &[TypeParameter::Type(TypeSignature::Native( - &NativeType::Boolean, - ))], - } - } - } - - fn test(logical_type: &dyn LogicalType) { - match logical_type.signature() { - TypeSignature::Extension { - name: "MagicalType", - parameters: - [TypeParameter::Type(TypeSignature::Native(NativeType::Boolean))], - } => {} - TypeSignature::Native(NativeType::Binary) => todo!(), - _ => unimplemented!(), - }; - } -} From 7ed789176369c30972a41c76aaa32e3635a13222 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Tue, 22 Oct 2024 22:53:54 +0200 Subject: [PATCH 09/15] From NativeField to LogicalField --- datafusion/common/src/types/field.rs | 114 ++++++++++++++++++++++++ datafusion/common/src/types/logical.rs | 2 +- datafusion/common/src/types/mod.rs | 2 + datafusion/common/src/types/native.rs | 115 +++---------------------- 4 files changed, 130 insertions(+), 103 deletions(-) create mode 100644 datafusion/common/src/types/field.rs diff --git a/datafusion/common/src/types/field.rs b/datafusion/common/src/types/field.rs new file mode 100644 index 000000000000..85c7c157272a --- /dev/null +++ b/datafusion/common/src/types/field.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::{Field, Fields, UnionFields}; +use std::hash::{Hash, Hasher}; +use std::{ops::Deref, sync::Arc}; + +use super::{LogicalTypeRef, NativeType}; + +/// A record of a logical type, its name and its nullability. +#[derive(Debug, Clone, Eq, PartialOrd, Ord)] +pub struct LogicalField { + pub name: String, + pub logical_type: LogicalTypeRef, + pub nullable: bool, +} + +impl PartialEq for LogicalField { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + && self.logical_type.eq(&other.logical_type) + && self.nullable == other.nullable + } +} + +impl Hash for LogicalField { + fn hash(&self, state: &mut H) { + self.name.hash(state); + self.logical_type.hash(state); + self.nullable.hash(state); + } +} + +impl From<&Field> for LogicalField { + fn from(value: &Field) -> Self { + Self { + name: value.name().clone(), + logical_type: Arc::new(NativeType::from(value.data_type().clone())), + nullable: value.is_nullable(), + } + } +} + +/// A reference counted [`LogicalField`]. +pub type LogicalFieldRef = Arc; + +/// A cheaply cloneable, owned collection of [`LogicalFieldRef`]. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct LogicalFields(Arc<[LogicalFieldRef]>); + +impl Deref for LogicalFields { + type Target = [LogicalFieldRef]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl From<&Fields> for LogicalFields { + fn from(value: &Fields) -> Self { + value + .iter() + .map(|field| Arc::new(LogicalField::from(field.as_ref()))) + .collect() + } +} + +impl FromIterator for LogicalFields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +/// A cheaply cloneable, owned collection of [`LogicalFieldRef`] and their +/// corresponding type ids. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct LogicalUnionFields(Arc<[(i8, LogicalFieldRef)]>); + +impl Deref for LogicalUnionFields { + type Target = [(i8, LogicalFieldRef)]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl From<&UnionFields> for LogicalUnionFields { + fn from(value: &UnionFields) -> Self { + value + .iter() + .map(|(i, field)| (i, Arc::new(LogicalField::from(field.as_ref())))) + .collect() + } +} + +impl FromIterator<(i8, LogicalFieldRef)> for LogicalUnionFields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 0121e33d9d5e..fe609fd4873b 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -80,7 +80,7 @@ pub trait LogicalType: Sync + Send { } impl fmt::Debug for dyn LogicalType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("LogicalType") .field(&self.signature()) .field(&self.native()) diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index 4e1bcb75cb56..cc6d30f4f62c 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. +mod field; mod logical; mod native; +pub use field::*; pub use logical::*; pub use native::*; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 1ebf4e19b01e..66e2e6feae6b 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -15,61 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::{ops::Deref, sync::Arc}; +use std::sync::Arc; -use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields}; +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; -use super::{LogicalType, TypeSignature}; - -/// A record of a native type, its name and its nullability. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct NativeField { - name: String, - native_type: NativeType, - nullable: bool, -} - -impl NativeField { - pub fn name(&self) -> &str { - &self.name - } - - pub fn native_type(&self) -> &NativeType { - &self.native_type - } - - pub fn nullable(&self) -> bool { - self.nullable - } -} - -/// A reference counted [`NativeField`]. -pub type NativeFieldRef = Arc; - -/// A cheaply cloneable, owned collection of [`NativeFieldRef`]. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct NativeFields(Arc<[NativeFieldRef]>); - -impl Deref for NativeFields { - type Target = [NativeFieldRef]; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } -} - -/// A cheaply cloneable, owned collection of [`NativeFieldRef`] and their -/// corresponding type ids. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct NativeUnionFields(Arc<[(i8, NativeFieldRef)]>); - -impl Deref for NativeUnionFields { - type Target = [(i8, NativeFieldRef)]; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } -} +use super::{ + LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, TypeSignature, +}; /// Representation of a type that DataFusion can handle natively. It is a subset /// of the physical variants in Arrow's native [`DataType`]. @@ -196,13 +148,13 @@ pub enum NativeType { /// A variable-length string in Unicode with UTF-8 encoding. Utf8, /// A list of some logical data type with variable length. - List(NativeFieldRef), + List(LogicalFieldRef), /// A list of some logical data type with fixed length. - FixedSizeList(NativeFieldRef, i32), + FixedSizeList(LogicalFieldRef, i32), /// A nested type that contains a number of sub-fields. - Struct(NativeFields), + Struct(LogicalFields), /// A nested type that can represent slots of differing types. - Union(NativeUnionFields), + Union(LogicalUnionFields), /// Decimal value with precision and scale /// /// * precision is the total number of digits @@ -222,11 +174,10 @@ pub enum NativeType { /// The key and value types are not constrained, but keys should be /// hashable and unique. /// - /// In a field with Map type, the field has a child Struct field, which then - /// has two children: key type and the second the value type. The names of the + /// In a field with Map type, key type and the second the value type. The names of the /// child fields may be respectively "entries", "key", and "value", but this is /// not enforced. - Map(NativeFieldRef), + Map(LogicalFieldRef), } impl LogicalType for NativeType { @@ -275,9 +226,9 @@ impl From for NativeType { DataType::FixedSizeList(field, size) => { FixedSizeList(Arc::new(field.as_ref().into()), size) } - DataType::Struct(fields) => Struct(NativeFields::from(&fields)), + DataType::Struct(fields) => Struct(LogicalFields::from(&fields)), DataType::Union(union_fields, _) => { - Union(NativeUnionFields::from(&union_fields)) + Union(LogicalUnionFields::from(&union_fields)) } DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), @@ -286,43 +237,3 @@ impl From for NativeType { } } } - -impl From<&Field> for NativeField { - fn from(value: &Field) -> Self { - Self { - name: value.name().clone(), - native_type: value.data_type().clone().into(), - nullable: value.is_nullable(), - } - } -} - -impl From<&Fields> for NativeFields { - fn from(value: &Fields) -> Self { - value - .iter() - .map(|field| Arc::new(NativeField::from(field.as_ref()))) - .collect() - } -} - -impl FromIterator for NativeFields { - fn from_iter>(iter: T) -> Self { - Self(iter.into_iter().collect()) - } -} - -impl From<&UnionFields> for NativeUnionFields { - fn from(value: &UnionFields) -> Self { - value - .iter() - .map(|(i, field)| (i, Arc::new(NativeField::from(field.as_ref())))) - .collect() - } -} - -impl FromIterator<(i8, NativeFieldRef)> for NativeUnionFields { - fn from_iter>(iter: T) -> Self { - Self(iter.into_iter().collect()) - } -} From bdd0155595ad960d79d9eeb812e0a16b3bd6d77d Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Fri, 25 Oct 2024 16:10:29 +0200 Subject: [PATCH 10/15] Add default_cast_for --- datafusion/common/src/types/logical.rs | 16 ++- datafusion/common/src/types/native.rs | 155 ++++++++++++++++++++++++- 2 files changed, 162 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index fe609fd4873b..f9649c82a2a7 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. +use super::NativeType; +use crate::error::Result; +use arrow_schema::DataType; use core::fmt; use std::{cmp::Ordering, hash::Hash, sync::Arc}; -use super::NativeType; - /// Signature that uniquely identifies a type among other types. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum TypeSignature<'a> { @@ -75,8 +76,17 @@ pub type LogicalTypeRef = Arc; /// } /// ``` pub trait LogicalType: Sync + Send { + /// Get the native backing type of this logical type. fn native(&self) -> &NativeType; + /// Get the unique type signature for this logical type. Logical types with identical + /// signatures are considered equal. fn signature(&self) -> TypeSignature<'_>; + + /// Get the default physical type to cast `origin` to in order to obtain a physical type + /// that is logically compatible with this logical type. + fn default_cast_for(&self, origin: &DataType) -> Result { + self.native().default_cast_for(origin) + } } impl fmt::Debug for dyn LogicalType { @@ -90,7 +100,7 @@ impl fmt::Debug for dyn LogicalType { impl PartialEq for dyn LogicalType { fn eq(&self, other: &Self) -> bool { - self.native().eq(other.native()) && self.signature().eq(&other.signature()) + self.signature().eq(&other.signature()) } } diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 66e2e6feae6b..c11957ac7038 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; - -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; - use super::{ - LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, TypeSignature, + LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, + TypeSignature, }; +use crate::error::{Result, _internal_err}; +use arrow_schema::{ + DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, +}; +use std::sync::Arc; /// Representation of a type that DataFusion can handle natively. It is a subset /// of the physical variants in Arrow's native [`DataType`]. @@ -188,6 +190,147 @@ impl LogicalType for NativeType { fn signature(&self) -> TypeSignature<'_> { TypeSignature::Native(self) } + + fn default_cast_for(&self, origin: &DataType) -> Result { + use DataType::*; + + fn default_field_cast(to: &LogicalField, from: &Field) -> Result { + Ok(Arc::new(Field::new( + to.name.clone(), + to.logical_type.default_cast_for(from.data_type())?, + to.nullable, + ))) + } + + Ok(match (self, origin) { + (Self::Null, _) => Null, + (Self::Boolean, _) => Boolean, + (Self::Int8, _) => Int8, + (Self::Int16, _) => Int16, + (Self::Int32, _) => Int32, + (Self::Int64, _) => Int64, + (Self::UInt8, _) => UInt8, + (Self::UInt16, _) => UInt16, + (Self::UInt32, _) => UInt32, + (Self::UInt64, _) => UInt64, + (Self::Float16, _) => Float16, + (Self::Float32, _) => Float32, + (Self::Float64, _) => Float64, + (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s), + (Self::Decimal(p, s), _) => Decimal256(*p, *s), + (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()), + (Self::Date, _) => Date32, + (Self::Time(tu), _) => match tu { + TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu), + TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu), + }, + (Self::Duration(tu), _) => Duration(*tu), + (Self::Interval(iu), _) => Interval(*iu), + (Self::Binary, LargeUtf8) => LargeBinary, + (Self::Binary, Utf8View) => BinaryView, + (Self::Binary, _) => Binary, + (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size), + (Self::Utf8, LargeBinary) => LargeUtf8, + (Self::Utf8, BinaryView) => Utf8View, + (Self::Utf8, _) => Utf8, + (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => { + List(default_field_cast(to_field, from_field)?) + } + (Self::List(to_field), LargeList(from_field)) => { + LargeList(default_field_cast(to_field, from_field)?) + } + (Self::List(to_field), ListView(from_field)) => { + ListView(default_field_cast(to_field, from_field)?) + } + (Self::List(to_field), LargeListView(from_field)) => { + LargeListView(default_field_cast(to_field, from_field)?) + } + // List array where each element is a len 1 list of the origin type + (Self::List(field), _) => List(Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(origin)?, + field.nullable, + ))), + ( + Self::FixedSizeList(to_field, to_size), + FixedSizeList(from_field, from_size), + ) if from_size == to_size => { + FixedSizeList(default_field_cast(to_field, from_field)?, *to_size) + } + ( + Self::FixedSizeList(to_field, size), + List(from_field) + | LargeList(from_field) + | ListView(from_field) + | LargeListView(from_field), + ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size), + // FixedSizeList array where each element is a len 1 list of the origin type + (Self::FixedSizeList(field, size), _) => FixedSizeList( + Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(origin)?, + field.nullable, + )), + *size, + ), + // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196 + (Self::Struct(to_fields), Struct(from_fields)) + if from_fields.len() == to_fields.len() => + { + Struct( + from_fields + .iter() + .zip(to_fields.iter()) + .map(|(from, to)| default_field_cast(to, from)) + .collect::>()?, + ) + } + (Self::Struct(to_fields), Null) => Struct( + to_fields + .iter() + .map(|field| { + Ok(Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(&Null)?, + field.nullable, + ))) + }) + .collect::>()?, + ), + (Self::Map(to_field), Map(from_field, sorted)) => { + Map(default_field_cast(to_field, from_field)?, *sorted) + } + (Self::Map(field), Null) => Map( + Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(&Null)?, + field.nullable, + )), + false, + ), + (Self::Union(to_fields), Union(from_fields, mode)) + if from_fields.len() == to_fields.len() => + { + Union( + from_fields + .iter() + .zip(to_fields.iter()) + .map(|((_, from), (i, to))| { + Ok((*i, default_field_cast(to, from)?)) + }) + .collect::>()?, + *mode, + ) + } + _ => { + return _internal_err!( + "Unavailable default cast for native type {:?} from physical type {:?}", + self, + origin + ) + } + }) + } } // The following From, From, ... implementations are temporary @@ -230,9 +373,9 @@ impl From for NativeType { DataType::Union(union_fields, _) => { Union(LogicalUnionFields::from(&union_fields)) } - DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), + DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), } } From 056a8f129d3a5c3c88da6654f0a7a8e78e112f56 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Fri, 25 Oct 2024 17:30:49 +0200 Subject: [PATCH 11/15] Add type order with can_cast_types --- datafusion/common/src/types/native.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index c11957ac7038..0fc0a654394a 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -20,6 +20,7 @@ use super::{ TypeSignature, }; use crate::error::{Result, _internal_err}; +use arrow::compute::can_cast_types; use arrow_schema::{ DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, }; @@ -228,11 +229,19 @@ impl LogicalType for NativeType { (Self::Interval(iu), _) => Interval(*iu), (Self::Binary, LargeUtf8) => LargeBinary, (Self::Binary, Utf8View) => BinaryView, - (Self::Binary, _) => Binary, + (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => { + BinaryView + } + (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => { + LargeBinary + } + (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary, (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size), (Self::Utf8, LargeBinary) => LargeUtf8, (Self::Utf8, BinaryView) => Utf8View, - (Self::Utf8, _) => Utf8, + (Self::Utf8, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View, + (Self::Utf8, data_type) if can_cast_types(data_type, &LargeUtf8) => LargeUtf8, + (Self::Utf8, data_type) if can_cast_types(data_type, &Utf8) => Utf8, (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => { List(default_field_cast(to_field, from_field)?) } From 117f23b73e8cae014fc427fc3287d54507d809a8 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Sat, 26 Oct 2024 18:06:17 +0200 Subject: [PATCH 12/15] Rename NativeType Utf8 to String --- datafusion/common/src/types/logical.rs | 2 +- datafusion/common/src/types/native.rs | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index f9649c82a2a7..bde393992a0c 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -64,7 +64,7 @@ pub type LogicalTypeRef = Arc; /// /// impl LogicalType for JSON { /// fn native(&self) -> &NativeType { -/// &NativeType::Utf8 +/// &NativeType::String /// } /// /// fn signature(&self) -> TypeSignature<'_> { diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 0fc0a654394a..34324a1e60ec 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -149,7 +149,7 @@ pub enum NativeType { /// Enum parameter specifies the number of bytes per value. FixedSizeBinary(i32), /// A variable-length string in Unicode with UTF-8 encoding. - Utf8, + String, /// A list of some logical data type with variable length. List(LogicalFieldRef), /// A list of some logical data type with fixed length. @@ -237,11 +237,13 @@ impl LogicalType for NativeType { } (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary, (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size), - (Self::Utf8, LargeBinary) => LargeUtf8, - (Self::Utf8, BinaryView) => Utf8View, - (Self::Utf8, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View, - (Self::Utf8, data_type) if can_cast_types(data_type, &LargeUtf8) => LargeUtf8, - (Self::Utf8, data_type) if can_cast_types(data_type, &Utf8) => Utf8, + (Self::String, LargeBinary) => LargeUtf8, + (Self::String, BinaryView) => Utf8View, + (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View, + (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => { + LargeUtf8 + } + (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8, (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => { List(default_field_cast(to_field, from_field)?) } @@ -370,7 +372,7 @@ impl From for NativeType { DataType::Interval(iu) => Interval(iu), DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary, DataType::FixedSizeBinary(size) => FixedSizeBinary(size), - DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Utf8, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String, DataType::List(field) | DataType::ListView(field) | DataType::LargeList(field) From e0923e2e68e4f20aaa6b686a3ee1038501edb929 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Sat, 26 Oct 2024 18:15:15 +0200 Subject: [PATCH 13/15] NativeType from &DataType --- datafusion/common/src/types/native.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 34324a1e60ec..bfb546783ea2 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -391,3 +391,9 @@ impl From for NativeType { } } } + +impl From<&DataType> for NativeType { + fn from(value: &DataType) -> Self { + value.clone().into() + } +} From f1ff9631c49a8d2a7ec22267f2277609af772d1b Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Sat, 26 Oct 2024 18:38:57 +0200 Subject: [PATCH 14/15] Add builtin types --- datafusion/common/src/types/builtin.rs | 44 ++++++++++++++++++++++++++ datafusion/common/src/types/mod.rs | 2 ++ 2 files changed, 46 insertions(+) create mode 100644 datafusion/common/src/types/builtin.rs diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs new file mode 100644 index 000000000000..1fc89aef6e76 --- /dev/null +++ b/datafusion/common/src/types/builtin.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::{LogicalTypeRef, NativeType}; +use std::sync::{Arc, LazyLock}; + +macro_rules! singleton { + ($name:ident, $ty:ident) => { + #[doc = concat!("Singleton instance of a logical type representing [`NativeType::", stringify!($ty), "`].")] + pub static $name: LazyLock = + LazyLock::new(|| Arc::new(NativeType::$ty)); + }; +} + +singleton!(LOGICAL_NULL, Null); +singleton!(LOGICAL_BOOLEAN, Boolean); +singleton!(LOGICAL_INT8, Int8); +singleton!(LOGICAL_INT16, Int16); +singleton!(LOGICAL_INT32, Int32); +singleton!(LOGICAL_INT64, Int64); +singleton!(LOGICAL_UINT8, UInt8); +singleton!(LOGICAL_UINT16, UInt16); +singleton!(LOGICAL_UINT32, UInt32); +singleton!(LOGICAL_UINT64, UInt64); +singleton!(LOGICAL_FLOAT16, Float16); +singleton!(LOGICAL_FLOAT32, Float32); +singleton!(LOGICAL_FLOAT64, Float64); +singleton!(LOGICAL_DATE, Date); +singleton!(LOGICAL_BINARY, Binary); +singleton!(LOGICAL_STRING, String); diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index cc6d30f4f62c..2f9ce4ce0282 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +mod builtin; mod field; mod logical; mod native; +pub use builtin::*; pub use field::*; pub use logical::*; pub use native::*; From 7942f91be3db8bc43cf0e5c717f89aa0f5d70d99 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Sat, 26 Oct 2024 20:22:11 +0200 Subject: [PATCH 15/15] From LazyLock to OnceLock --- datafusion/common/src/types/builtin.rs | 47 ++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs index 1fc89aef6e76..c6105d37c3bd 100644 --- a/datafusion/common/src/types/builtin.rs +++ b/datafusion/common/src/types/builtin.rs @@ -16,29 +16,34 @@ // under the License. use crate::types::{LogicalTypeRef, NativeType}; -use std::sync::{Arc, LazyLock}; +use std::sync::{Arc, OnceLock}; macro_rules! singleton { - ($name:ident, $ty:ident) => { - #[doc = concat!("Singleton instance of a logical type representing [`NativeType::", stringify!($ty), "`].")] - pub static $name: LazyLock = - LazyLock::new(|| Arc::new(NativeType::$ty)); + ($name:ident, $getter:ident, $ty:ident) => { + // TODO: Use LazyLock instead of getter function when MSRV gets bumped + static $name: OnceLock = OnceLock::new(); + + #[doc = "Getter for singleton instance of a logical type representing"] + #[doc = concat!("[`NativeType::", stringify!($ty), "`].")] + pub fn $getter() -> LogicalTypeRef { + Arc::clone($name.get_or_init(|| Arc::new(NativeType::$ty))) + } }; } -singleton!(LOGICAL_NULL, Null); -singleton!(LOGICAL_BOOLEAN, Boolean); -singleton!(LOGICAL_INT8, Int8); -singleton!(LOGICAL_INT16, Int16); -singleton!(LOGICAL_INT32, Int32); -singleton!(LOGICAL_INT64, Int64); -singleton!(LOGICAL_UINT8, UInt8); -singleton!(LOGICAL_UINT16, UInt16); -singleton!(LOGICAL_UINT32, UInt32); -singleton!(LOGICAL_UINT64, UInt64); -singleton!(LOGICAL_FLOAT16, Float16); -singleton!(LOGICAL_FLOAT32, Float32); -singleton!(LOGICAL_FLOAT64, Float64); -singleton!(LOGICAL_DATE, Date); -singleton!(LOGICAL_BINARY, Binary); -singleton!(LOGICAL_STRING, String); +singleton!(LOGICAL_NULL, logical_null, Null); +singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean); +singleton!(LOGICAL_INT8, logical_int8, Int8); +singleton!(LOGICAL_INT16, logical_int16, Int16); +singleton!(LOGICAL_INT32, logical_int32, Int32); +singleton!(LOGICAL_INT64, logical_int64, Int64); +singleton!(LOGICAL_UINT8, logical_uint8, UInt8); +singleton!(LOGICAL_UINT16, logical_uint16, UInt16); +singleton!(LOGICAL_UINT32, logical_uint32, UInt32); +singleton!(LOGICAL_UINT64, logical_uint64, UInt64); +singleton!(LOGICAL_FLOAT16, logical_float16, Float16); +singleton!(LOGICAL_FLOAT32, logical_float32, Float32); +singleton!(LOGICAL_FLOAT64, logical_float64, Float64); +singleton!(LOGICAL_DATE, logical_date, Date); +singleton!(LOGICAL_BINARY, logical_binary, Binary); +singleton!(LOGICAL_STRING, logical_string, String);