diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs index 86839c67124d..521c1959aaa1 100644 --- a/arrow-data/src/data/bytes.rs +++ b/arrow-data/src/data/bytes.rs @@ -73,7 +73,7 @@ mod private { } /// Types backed by a variable length slice of bytes -pub trait Bytes: private::BytesSealed { +pub trait Bytes: private::BytesSealed + std::fmt::Debug { const TYPE: BytesType; } @@ -195,6 +195,7 @@ impl private::BytesOffsetSealed for i64 { } /// An enumeration of the types of [`ArrayDataBytesOffset`] +#[derive(Debug, Clone)] pub enum ArrayDataBytes { Binary(ArrayDataBytesOffset<[u8]>), Utf8(ArrayDataBytesOffset), @@ -217,18 +218,29 @@ impl ArrayDataBytes { } /// An enumeration of the types of [`BytesArrayData`] +#[derive(Debug)] pub enum ArrayDataBytesOffset { Small(BytesArrayData), Large(BytesArrayData), } +impl Clone for ArrayDataBytesOffset { + fn clone(&self) -> Self { + match self { + Self::Small(v) => Self::Small(v.clone()), + Self::Large(v) => Self::Large(v.clone()), + } + } +} + impl From> for ArrayDataBytes { fn from(value: BytesArrayData) -> Self { B::upcast(O::upcast(value)) } } -/// ArrayData for arrays of [`Bytes`] +/// ArrayData for [variable-sized arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) of [`Bytes`] +#[derive(Debug)] pub struct BytesArrayData { data_type: DataType, nulls: Option, @@ -237,13 +249,25 @@ pub struct BytesArrayData { phantom: PhantomData, } -impl BytesArrayData { +impl Clone for BytesArrayData { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), + offsets: self.offsets.clone(), + values: self.values.clone(), + phantom: Default::default(), + } + } +} + +impl BytesArrayData { /// Creates a new [`BytesArrayData`] /// /// # Safety /// /// - Each consecutive window of `offsets` must identify a valid slice of `values` - /// - `nulls.len() == offsets.len() + 1` + /// - `nulls.len() == offsets.len() - 1` /// - `data_type` must be valid 
for this layout pub unsafe fn new_unchecked( data_type: DataType, @@ -270,7 +294,7 @@ impl BytesArrayData { /// Returns the offsets #[inline] - pub fn value_offsets(&self) -> &[O] { + pub fn offsets(&self) -> &[O] { &self.offsets } @@ -286,3 +310,49 @@ impl BytesArrayData { &self.data_type } } + +/// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes +#[derive(Debug, Clone)] +pub struct FixedSizeBinaryArrayData { + data_type: DataType, + nulls: Option, + values: Buffer, +} + +impl FixedSizeBinaryArrayData { + /// Creates a new [`FixedSizeBinaryArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `nulls.len() == values.len() / element_size` + pub unsafe fn new_unchecked( + data_type: DataType, + values: Buffer, + nulls: Option, + ) -> Self { + Self { + data_type, + nulls, + values, + } + } + + /// Returns the raw byte data + #[inline] + pub fn values(&self) -> &[u8] { + &self.values + } + + /// Returns the null buffer if any + #[inline] + pub fn null_buffer(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/dictionary.rs b/arrow-data/src/data/dictionary.rs new file mode 100644 index 000000000000..2ec4ee005287 --- /dev/null +++ b/arrow-data/src/data/dictionary.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::DictionaryKeyType; +use crate::ArrayData; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::DataType; + +mod private { + use super::*; + + pub trait DictionaryKeySealed { + /// Downcast [`ArrayDataDictionary`] to [`DictionaryArrayData`] + fn downcast_ref(data: &ArrayDataDictionary) -> Option<&DictionaryArrayData> + where + Self: DictionaryKey; + + /// Downcast [`ArrayDataDictionary`] to [`DictionaryArrayData`] + fn downcast(data: ArrayDataDictionary) -> Option> + where + Self: DictionaryKey; + + /// Cast [`DictionaryArrayData`] to [`ArrayDataDictionary`] + fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary + where + Self: DictionaryKey; + } +} + +/// Types of dictionary key used by dictionary arrays +pub trait DictionaryKey: private::DictionaryKeySealed + ArrowNativeType { + const TYPE: DictionaryKeyType; +} + +macro_rules! 
dictionary { + ($t:ty,$v:ident) => { + impl DictionaryKey for $t { + const TYPE: DictionaryKeyType = DictionaryKeyType::$v; + } + impl private::DictionaryKeySealed for $t { + fn downcast_ref( + data: &ArrayDataDictionary, + ) -> Option<&DictionaryArrayData> { + match data { + ArrayDataDictionary::$v(v) => Some(v), + _ => None, + } + } + + fn downcast(data: ArrayDataDictionary) -> Option> { + match data { + ArrayDataDictionary::$v(v) => Some(v), + _ => None, + } + } + + fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary { + ArrayDataDictionary::$v(v) + } + } + }; +} + +dictionary!(i8, Int8); +dictionary!(i16, Int16); +dictionary!(i32, Int32); +dictionary!(i64, Int64); +dictionary!(u8, UInt8); +dictionary!(u16, UInt16); +dictionary!(u32, UInt32); +dictionary!(u64, UInt64); + +/// An enumeration of the types of [`DictionaryArrayData`] +#[derive(Debug, Clone)] +pub enum ArrayDataDictionary { + Int8(DictionaryArrayData), + Int16(DictionaryArrayData), + Int32(DictionaryArrayData), + Int64(DictionaryArrayData), + UInt8(DictionaryArrayData), + UInt16(DictionaryArrayData), + UInt32(DictionaryArrayData), + UInt64(DictionaryArrayData), +} + +impl ArrayDataDictionary { + /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] + pub fn downcast_ref(&self) -> Option<&DictionaryArrayData> { + K::downcast_ref(self) + } + + /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] + pub fn downcast(self) -> Option> { + K::downcast(self) + } +} + +impl From> for ArrayDataDictionary { + fn from(value: DictionaryArrayData) -> Self { + K::upcast(value) + } +} + +/// ArrayData for [dictionary arrays](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) +#[derive(Debug, Clone)] +pub struct DictionaryArrayData { + data_type: DataType, + nulls: Option, + keys: ScalarBuffer, + child: Box, +} + +impl DictionaryArrayData { + /// Create a new [`DictionaryArrayData`] + /// + /// # Safety + /// + /// - 
`data_type` must be valid for this layout + /// - child must have a type matching `data_type` + /// - all values in `keys` must be `0 <= v < child.len()` or be a null according to `nulls` + /// - `nulls` must have the same length as `keys` + pub unsafe fn new_unchecked( + data_type: DataType, + keys: ScalarBuffer, + nulls: Option, + child: ArrayData, + ) -> Self { + Self { + data_type, + nulls, + keys, + child: Box::new(child), + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the keys + #[inline] + pub fn keys(&self) -> &[K] { + &self.keys + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs new file mode 100644 index 000000000000..59909289e933 --- /dev/null +++ b/arrow-data/src/data/list.rs @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::data::types::OffsetType; +use crate::ArrayData; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_schema::DataType; + +mod private { + use super::*; + + pub trait ListOffsetSealed { + /// Downcast [`ArrayDataList`] to [`ListArrayData`] + fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> + where + Self: ListOffset; + + /// Downcast [`ArrayDataList`] to [`ListArrayData`] + fn downcast(data: ArrayDataList) -> Option> + where + Self: ListOffset; + + /// Cast [`ListArrayData`] to [`ArrayDataList`] + fn upcast(v: ListArrayData) -> ArrayDataList + where + Self: ListOffset; + } +} + +/// Types of offset used by variable length list arrays +pub trait ListOffset: private::ListOffsetSealed + ArrowNativeType { + const TYPE: OffsetType; +} + +impl ListOffset for i32 { + const TYPE: OffsetType = OffsetType::Int32; +} + +impl private::ListOffsetSealed for i32 { + fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(v) => Some(v), + ArrayDataList::Large(_) => None, + } + } + + fn downcast(data: ArrayDataList) -> Option> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(v) => Some(v), + ArrayDataList::Large(_) => None, + } + } + + fn upcast(v: ListArrayData) -> ArrayDataList + where + Self: ListOffset, + { + ArrayDataList::Small(v) + } +} + +impl ListOffset for i64 { + const TYPE: OffsetType = OffsetType::Int64; +} + +impl private::ListOffsetSealed for i64 { + fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(_) => None, + ArrayDataList::Large(v) => Some(v), + } + } + + fn downcast(data: ArrayDataList) -> Option> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(_) => None, + ArrayDataList::Large(v) => Some(v), + } + } + + fn upcast(v: ListArrayData) -> ArrayDataList + where + Self: 
ListOffset, + { + ArrayDataList::Large(v) + } +} + +/// An enumeration of the types of [`ListArrayData`] +#[derive(Debug, Clone)] +pub enum ArrayDataList { + Small(ListArrayData), + Large(ListArrayData), +} + +impl ArrayDataList { + /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] + pub fn downcast_ref(&self) -> Option<&ListArrayData> { + O::downcast_ref(self) + } + + /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] + pub fn downcast(self) -> Option> { + O::downcast(self) + } +} + +impl From> for ArrayDataList { + fn from(value: ListArrayData) -> Self { + O::upcast(value) + } +} + +/// ArrayData for [variable-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) +#[derive(Debug, Clone)] +pub struct ListArrayData { + data_type: DataType, + nulls: Option, + offsets: ScalarBuffer, + child: Box, +} + +impl ListArrayData { + /// Create a new [`ListArrayData`] + /// + /// # Safety + /// + /// - Each consecutive window of `offsets` must identify a valid slice of `child` + /// - `nulls.len() == offsets.len() - 1` + /// - `data_type` must be valid for this layout + pub unsafe fn new_unchecked( + data_type: DataType, + offsets: ScalarBuffer, + nulls: Option, + child: ArrayData, + ) -> Self { + Self { + data_type, + nulls, + offsets, + child: Box::new(child), + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the offsets + #[inline] + pub fn offsets(&self) -> &[O] { + &self.offsets + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} + +/// ArrayData for [fixed-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) +#[derive(Debug, Clone)] +pub struct FixedSizeListArrayData 
{ + data_type: DataType, + nulls: Option, + child: Box, +} + +impl FixedSizeListArrayData { + /// Create a new [`FixedSizeListArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `nulls.len() == child.len() / element_size` + pub unsafe fn new_unchecked( + data_type: DataType, + nulls: Option, + child: ArrayData, + ) -> Self { + Self { + data_type, + nulls, + child: Box::new(child), + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index eb1fe2bcffa2..2f9e142b1d96 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -32,9 +32,19 @@ use crate::equal; #[allow(unused)] // Private until ready (#1176) mod bytes; #[allow(unused)] // Private until ready (#1176) +mod dictionary; +#[allow(unused)] // Private until ready (#1176) +mod list; +#[allow(unused)] // Private until ready (#1176) mod primitive; #[allow(unused)] // Private until ready (#1176) +mod run; +#[allow(unused)] // Private until ready (#1176) +mod r#struct; +#[allow(unused)] // Private until ready (#1176) mod types; +#[allow(unused)] // Private until ready (#1176) +mod union; #[inline] pub(crate) fn contains_nulls( diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs index d34ef42dbbb7..058b3e822056 100644 --- a/arrow-data/src/data/primitive.rs +++ b/arrow-data/src/data/primitive.rs @@ -43,13 +43,13 @@ mod private { } pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { - const VARIANT: PrimitiveType; + const TYPE: PrimitiveType; } macro_rules! 
primitive { ($t:ty,$v:ident) => { impl Primitive for $t { - const VARIANT: PrimitiveType = PrimitiveType::$v; + const TYPE: PrimitiveType = PrimitiveType::$v; } impl private::PrimitiveSealed for $t { fn downcast_ref( @@ -118,7 +118,13 @@ impl ArrayDataPrimitive { } } -/// ArrayData for arrays of [`Primitive`] +impl From> for ArrayDataPrimitive { + fn from(value: PrimitiveArrayData

) -> Self { + P::upcast(value) + } +} + +/// ArrayData for [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of [`Primitive`] #[derive(Debug, Clone)] pub struct PrimitiveArrayData { data_type: DataType, @@ -126,12 +132,6 @@ pub struct PrimitiveArrayData { values: ScalarBuffer, } -impl From> for ArrayDataPrimitive { - fn from(value: PrimitiveArrayData

) -> Self { - P::upcast(value) - } -} - impl PrimitiveArrayData { /// Create a new [`PrimitiveArrayData`] /// @@ -147,10 +147,10 @@ impl PrimitiveArrayData { ) -> Self { let physical = PhysicalType::from(&data_type); assert!( - matches!(physical, PhysicalType::Primitive(p) if p == T::VARIANT), + matches!(physical, PhysicalType::Primitive(p) if p == T::TYPE), "Illegal physical type for PrimitiveArrayData of datatype {:?}, expected {:?} got {:?}", data_type, - T::VARIANT, + T::TYPE, physical ); diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs new file mode 100644 index 000000000000..cd993de1bf25 --- /dev/null +++ b/arrow-data/src/data/run.rs @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::data::types::RunEndType; +use crate::ArrayData; +use arrow_buffer::buffer::ScalarBuffer; +use arrow_buffer::ArrowNativeType; +use arrow_schema::DataType; +use std::marker::PhantomData; + +mod private { + use super::*; + + pub trait RunEndSealed { + /// Downcast [`ArrayDataRun`] to [`RunArrayData`] + fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> + where + Self: RunEnd; + + /// Downcast [`ArrayDataRun`] to [`RunArrayData`] + fn downcast(data: ArrayDataRun) -> Option> + where + Self: RunEnd; + + /// Cast [`RunArrayData`] to [`ArrayDataRun`] + fn upcast(v: RunArrayData) -> ArrayDataRun + where + Self: RunEnd; + } +} + +pub trait RunEnd: private::RunEndSealed + ArrowNativeType { + const TYPE: RunEndType; +} + +macro_rules! run_end { + ($t:ty,$v:ident) => { + impl RunEnd for $t { + const TYPE: RunEndType = RunEndType::$v; + } + impl private::RunEndSealed for $t { + fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> { + match data { + ArrayDataRun::$v(v) => Some(v), + _ => None, + } + } + + fn downcast(data: ArrayDataRun) -> Option> { + match data { + ArrayDataRun::$v(v) => Some(v), + _ => None, + } + } + + fn upcast(v: RunArrayData) -> ArrayDataRun { + ArrayDataRun::$v(v) + } + } + }; +} + +run_end!(i16, Int16); +run_end!(i32, Int32); +run_end!(i64, Int64); + +/// An enumeration of the types of [`RunArrayData`] +pub enum ArrayDataRun { + Int16(RunArrayData), + Int32(RunArrayData), + Int64(RunArrayData), +} + +impl ArrayDataRun { + /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] + pub fn downcast_ref(&self) -> Option<&RunArrayData> { + E::downcast_ref(self) + } + + /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] + pub fn downcast(self) -> Option> { + E::downcast(self) + } +} + +impl From> for ArrayDataRun { + fn from(value: RunArrayData) -> Self { + E::upcast(value) + } +} + +/// ArrayData for [run-end encoded 
arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) +pub struct RunArrayData { + data_type: DataType, + run_ends: ScalarBuffer, + child: Box, +} + +impl RunArrayData { + /// Create a new [`RunArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `run_ends` must contain monotonically increasing, positive values `<= child.len()` + pub unsafe fn new_unchecked( + data_type: DataType, + run_ends: ScalarBuffer, + child: ArrayData, + ) -> Self { + Self { + data_type, + run_ends, + child: Box::new(child), + } + } + + /// Returns the run ends + #[inline] + pub fn run_ends(&self) -> &[E] { + &self.run_ends + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } +} diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs new file mode 100644 index 000000000000..d9999261902e --- /dev/null +++ b/arrow-data/src/data/struct.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::ArrayData; +use arrow_buffer::buffer::NullBuffer; +use arrow_schema::DataType; + +/// ArrayData for [struct arrays](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) +#[derive(Debug, Clone)] +pub struct StructArrayData { + data_type: DataType, + len: usize, + nulls: Option, + children: Vec, +} + +impl StructArrayData { + /// Create a new [`StructArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be a struct type with fields matching `children` + /// - all child data and nulls must have length matching `len` + pub unsafe fn new_unchecked( + data_type: DataType, + len: usize, + nulls: Option, + children: Vec, + ) -> Self { + Self { + data_type, + len, + nulls, + children, + } + } + + /// Returns the length of this [`StructArrayData`] + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns `true` if this [`StructArrayData`] has zero length + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the child data + #[inline] + pub fn children(&self) -> &[ArrayData] { + &self.children + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs index 09e169f6aa61..3414e481ca66 100644 --- a/arrow-data/src/data/types.rs +++ b/arrow-data/src/data/types.rs @@ -80,7 +80,6 @@ pub enum PhysicalType { Bytes(OffsetType, BytesType), FixedSizeList, List(OffsetType), - Map, Struct, Union, Dictionary(DictionaryKeyType), @@ -141,7 +140,7 @@ impl From<&DataType> for PhysicalType { DataType::UInt64 => Self::Dictionary(DictionaryKeyType::UInt64), d => panic!("illegal dictionary key data type {d}"), }, - DataType::Map(_, _) => Self::Map, + DataType::Map(_, _) => Self::List(OffsetType::Int32), DataType::RunEndEncoded(f, _) => match 
f.data_type() { DataType::Int16 => Self::Run(RunEndType::Int16), DataType::Int32 => Self::Run(RunEndType::Int32), diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs new file mode 100644 index 000000000000..7861bd154e71 --- /dev/null +++ b/arrow-data/src/data/union.rs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::ArrayData; +use arrow_buffer::buffer::ScalarBuffer; +use arrow_schema::DataType; + +/// ArrayData for [union arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout) +#[derive(Debug, Clone)] +pub struct UnionArrayData { + data_type: DataType, + type_ids: ScalarBuffer, + offsets: Option>, + children: Vec, +} + +impl UnionArrayData { + /// Creates a new [`UnionArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `type_ids` must only contain values corresponding to a field in `data_type` + /// - `children` must match the field definitions in `data_type` + /// - For each type id in `type_ids`, the corresponding offset must be in bounds for the child + pub unsafe fn new_unchecked( + data_type: DataType, + type_ids: ScalarBuffer, + offsets: Option>, + children: Vec, + ) -> Self { + Self { + data_type, + type_ids, + offsets, + children, + } + } + + /// Returns the type ids for this array + #[inline] + pub fn type_ids(&self) -> &[i8] { + &self.type_ids + } + + /// Returns the offsets for this array if this is a dense union + #[inline] + pub fn offsets(&self) -> Option<&[i32]> { + self.offsets.as_deref() + } + + /// Returns the children of this array + #[inline] + pub fn children(&self) -> &[ArrayData] { + &self.children + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +}