From 9b8d7ae5db6d7e696fbeada8655f32d827e65d1a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 21 Mar 2023 17:22:59 +0000 Subject: [PATCH] Revert structured ArrayData (#1799) --- arrow-data/src/data/boolean.rs | 139 -------- arrow-data/src/data/buffers.rs | 10 - arrow-data/src/data/bytes.rs | 559 ------------------------------ arrow-data/src/data/dictionary.rs | 289 --------------- arrow-data/src/data/list.rs | 422 ---------------------- arrow-data/src/data/mod.rs | 230 ++++-------- arrow-data/src/data/null.rs | 104 ------ arrow-data/src/data/primitive.rs | 304 ---------------- arrow-data/src/data/run.rs | 277 --------------- arrow-data/src/data/struct.rs | 129 ------- arrow-data/src/data/types.rs | 152 -------- arrow-data/src/data/union.rs | 171 --------- 12 files changed, 72 insertions(+), 2714 deletions(-) delete mode 100644 arrow-data/src/data/boolean.rs delete mode 100644 arrow-data/src/data/bytes.rs delete mode 100644 arrow-data/src/data/dictionary.rs delete mode 100644 arrow-data/src/data/list.rs delete mode 100644 arrow-data/src/data/null.rs delete mode 100644 arrow-data/src/data/primitive.rs delete mode 100644 arrow-data/src/data/run.rs delete mode 100644 arrow-data/src/data/struct.rs delete mode 100644 arrow-data/src/data/types.rs delete mode 100644 arrow-data/src/data/union.rs diff --git a/arrow-data/src/data/boolean.rs b/arrow-data/src/data/boolean.rs deleted file mode 100644 index 258624cc1c66..000000000000 --- a/arrow-data/src/data/boolean.rs +++ /dev/null @@ -1,139 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::PhysicalType; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; -use arrow_schema::DataType; - -#[derive(Debug, Clone)] -pub struct BooleanArrayData { - data_type: DataType, - values: BooleanBuffer, - nulls: Option, -} - -impl BooleanArrayData { - /// Create a new [`BooleanArrayData`] - /// - /// # Panics - /// - /// Panics if - /// - `nulls` and `values` are different lengths - /// - `PhysicalType::from(&data_type) != PhysicalType::Boolean` - pub fn new( - data_type: DataType, - values: BooleanBuffer, - nulls: Option, - ) -> Self { - let physical = PhysicalType::from(&data_type); - assert_eq!( - physical, PhysicalType::Boolean, - "Illegal physical type for BooleanArrayData of datatype {:?}, expected {:?} got {:?}", - data_type, - PhysicalType::Boolean, - physical - ); - - if let Some(n) = nulls.as_ref() { - assert_eq!(values.len(), n.len()) - } - Self { - data_type, - values, - nulls, - } - } - - /// Create a new [`BooleanArrayData`] - /// - /// # Safety - /// - /// - `nulls` and `values` are the same lengths - /// - `PhysicalType::from(&data_type) == PhysicalType::Boolean` - pub unsafe fn new_unchecked( - data_type: DataType, - values: BooleanBuffer, - nulls: Option, - ) -> Self { - Self { - data_type, - values, - nulls, - } - } - - /// Creates a new [`BooleanArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`BooleanArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let values = builder.buffers.into_iter().next().unwrap(); - let values = BooleanBuffer::new(values, builder.offset, builder.len); - Self { - values, - data_type: builder.data_type, - nulls: builder.nulls, - } - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the boolean values - #[inline] - pub fn values(&self) -> &BooleanBuffer { - &self.values - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`BooleanArrayData`] - pub fn into_parts(self) -> (DataType, BooleanBuffer, Option) { - (self.data_type, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - values: self.values.slice(offset, len), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.values.len(), - offset: self.values.offset(), - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.values().inner()), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs index 8a498d319aae..883e92e36d82 100644 --- a/arrow-data/src/data/buffers.rs +++ b/arrow-data/src/data/buffers.rs @@ -33,16 +33,6 @@ impl<'a> Buffers<'a> { } } - #[inline] - pub(crate) fn one(b: &'a Buffer) -> Self { - Self([Some(b), None]) - } - - #[inline] - pub(crate) fn two(a: &'a Buffer, b: &'a Buffer) -> Self { - Self([Some(a), Some(b)]) - } - /// Returns the number of [`Buffer`] in this collection #[inline] pub fn len(&self) -> usize { diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs deleted file mode 100644 index 9ac267130b7a..000000000000 --- a/arrow-data/src/data/bytes.rs +++ /dev/null @@ -1,559 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::{BytesType, OffsetType}; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow_buffer::{ArrowNativeType, Buffer}; -use arrow_schema::DataType; -use std::marker::PhantomData; - -mod private { - use super::*; - - pub trait BytesSealed { - /// Create from bytes without performing any validation - /// - /// # Safety - /// - /// If `str`, `b` must be a valid UTF-8 sequence - unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self; - - /// Downcast [`ArrayDataBytes`] to `[ArrayDataBytesOffset`] - fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> - where - Self: Bytes; - - /// Downcast [`ArrayDataBytes`] to `[ArrayDataBytesOffset`] - fn downcast(data: ArrayDataBytes) -> Option> - where - Self: Bytes; - - /// Cast [`ArrayDataBytesOffset`] to [`ArrayDataBytes`] - fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes - where - Self: Bytes; - } - - pub trait BytesOffsetSealed { - /// Downcast [`ArrayDataBytesOffset`] to `[BytesArrayData`] - fn downcast_ref( - data: &ArrayDataBytesOffset, - ) -> Option<&BytesArrayData> - where - Self: BytesOffset; - - /// Downcast [`ArrayDataBytesOffset`] to `[BytesArrayData`] - fn downcast( - data: ArrayDataBytesOffset, - ) -> Option> - where - Self: BytesOffset; - - /// Cast [`BytesArrayData`] to [`ArrayDataBytesOffset`] - fn upcast( - v: BytesArrayData, - ) -> ArrayDataBytesOffset - where - Self: BytesOffset; - } -} - -/// Types backed by a variable length slice of bytes -pub trait Bytes: private::BytesSealed + std::fmt::Debug { - const TYPE: BytesType; -} - -impl Bytes for [u8] { - const TYPE: BytesType = BytesType::Binary; -} - -impl private::BytesSealed for [u8] { - unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { - b - } - - fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> { - match data { - ArrayDataBytes::Binary(v) => Some(v), - ArrayDataBytes::Utf8(_) => None, - } - } - - fn downcast(data: ArrayDataBytes) -> Option> { - match data { - ArrayDataBytes::Binary(v) => Some(v), - ArrayDataBytes::Utf8(_) => None, - } - } - - fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes { - ArrayDataBytes::Binary(v) - } -} - -impl Bytes for str { - const TYPE: BytesType = BytesType::Utf8; -} - -impl private::BytesSealed for str { - unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { - std::str::from_utf8_unchecked(b) - } - - fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> { - match data { - ArrayDataBytes::Binary(_) => None, - ArrayDataBytes::Utf8(v) => Some(v), - } - } - - fn downcast(data: ArrayDataBytes) -> Option> { - match data { - ArrayDataBytes::Binary(_) => None, - ArrayDataBytes::Utf8(v) => Some(v), - } - } - - fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes { - ArrayDataBytes::Utf8(v) - } -} - -/// Types of offset used by variable length byte arrays -pub trait BytesOffset: private::BytesOffsetSealed + ArrowNativeType { - const TYPE: OffsetType; -} - -impl BytesOffset for i32 { - const TYPE: OffsetType = OffsetType::Int32; -} - -impl private::BytesOffsetSealed for i32 { - fn downcast_ref( - data: &ArrayDataBytesOffset, - ) -> Option<&BytesArrayData> { - match data { - ArrayDataBytesOffset::Small(v) => Some(v), - ArrayDataBytesOffset::Large(_) => None, - } - } - - fn downcast( - data: ArrayDataBytesOffset, - ) -> Option> { - match data { - ArrayDataBytesOffset::Small(v) => Some(v), - ArrayDataBytesOffset::Large(_) => None, - } - } - - fn upcast(v: BytesArrayData) -> ArrayDataBytesOffset { - ArrayDataBytesOffset::Small(v) - } -} - -impl BytesOffset for i64 { - const TYPE: OffsetType = OffsetType::Int64; -} - -impl private::BytesOffsetSealed for i64 { - fn downcast_ref( - data: &ArrayDataBytesOffset, - ) -> Option<&BytesArrayData> { - match data { - ArrayDataBytesOffset::Small(_) => None, - ArrayDataBytesOffset::Large(v) => Some(v), - } - } - - fn downcast( - data: ArrayDataBytesOffset, - ) -> Option> { - match data { - ArrayDataBytesOffset::Small(_) => None, - ArrayDataBytesOffset::Large(v) => Some(v), - } - } - - fn upcast(v: BytesArrayData) -> ArrayDataBytesOffset { - ArrayDataBytesOffset::Large(v) - } -} - -/// Applies op to each variant of [`ArrayDataBytes`] -macro_rules! bytes_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataBytes::Binary($array) => match $array { - ArrayDataBytesOffset::Small($array) => $op - ArrayDataBytesOffset::Large($array) => $op - } - ArrayDataBytes::Utf8($array) => match $array { - ArrayDataBytesOffset::Small($array) => $op - ArrayDataBytesOffset::Large($array) => $op - } - } - }; -} - -/// An enumeration of the types of [`ArrayDataBytesOffset`] -#[derive(Debug, Clone)] -pub enum ArrayDataBytes { - Binary(ArrayDataBytesOffset<[u8]>), - Utf8(ArrayDataBytesOffset), -} - -impl ArrayDataBytes { - /// Downcast this [`ArrayDataBytes`] to the corresponding [`BytesArrayData`] - pub fn downcast_ref( - &self, - ) -> Option<&BytesArrayData> { - O::downcast_ref(B::downcast_ref(self)?) - } - - /// Downcast this [`ArrayDataBytes`] to the corresponding [`BytesArrayData`] - pub fn downcast( - self, - ) -> Option> { - O::downcast(B::downcast(self)?) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - bytes_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - bytes_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataBytes`] from raw buffers - /// - /// # Safety - /// - /// See [`BytesArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw( - builder: ArrayDataBuilder, - offset: OffsetType, - bytes: BytesType, - ) -> Self { - match bytes { - BytesType::Binary => Self::Binary(match offset { - OffsetType::Int32 => { - ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) - } - OffsetType::Int64 => { - ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) - } - }), - BytesType::Utf8 => Self::Utf8(match offset { - OffsetType::Int32 => { - ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) - } - OffsetType::Int64 => { - ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) - } - }), - } - } -} - -/// An enumeration of the types of [`BytesArrayData`] -#[derive(Debug)] -pub enum ArrayDataBytesOffset { - Small(BytesArrayData), - Large(BytesArrayData), -} - -impl Clone for ArrayDataBytesOffset { - fn clone(&self) -> Self { - match self { - Self::Small(v) => Self::Small(v.clone()), - Self::Large(v) => Self::Large(v.clone()), - } - } -} - -impl From> for ArrayDataBytes { - fn from(value: BytesArrayData) -> Self { - B::upcast(O::upcast(value)) - } -} - -/// ArrayData for [variable-sized arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) of [`Bytes`] -#[derive(Debug)] -pub struct BytesArrayData { - data_type: DataType, - offsets: OffsetBuffer, - values: Buffer, - nulls: Option, - phantom: PhantomData, -} - -impl Clone for BytesArrayData { - fn clone(&self) -> Self { - Self { - data_type: self.data_type.clone(), - nulls: self.nulls.clone(), - offsets: self.offsets.clone(), - values: self.values.clone(), - phantom: Default::default(), - } - } -} - -impl BytesArrayData { - /// Creates a new [`BytesArrayData`] - /// - /// # Safety - /// - /// - Each consecutive window of `offsets` must identify a valid slice of `values` - /// - `nulls.len() == offsets.len() - 1` - /// - `PhysicalType::from(&data_type) == PhysicalType::Bytes(O::TYPE, B::TYPE)` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: OffsetBuffer, - values: Buffer, - nulls: Option, - ) -> Self { - Self { - data_type, - nulls, - offsets, - values, - phantom: Default::default(), - } - } - - /// Creates a new [`BytesArrayData`] from an [`ArrayDataBuilder`] - /// - /// # Safety - /// - /// See [`Self::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let mut iter = builder.buffers.into_iter(); - let offsets = iter.next().unwrap(); - let values = iter.next().unwrap(); - - let offsets = match builder.len { - 0 => OffsetBuffer::new_empty(), - _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( - offsets, - builder.offset, - builder.len + 1, - )), - }; - - Self { - values, - offsets, - data_type: builder.data_type, - nulls: builder.nulls, - phantom: Default::default(), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.offsets.len().wrapping_sub(1) - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.offsets.len() <= 1 - } - - /// Returns the raw byte data - #[inline] - pub fn values(&self) -> &B { - // Safety: - // Bytes must be valid - unsafe { B::from_bytes_unchecked(self.values.as_slice()) } - } - - /// Returns the offsets - #[inline] - pub fn offsets(&self) -> &OffsetBuffer { - &self.offsets - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`BytesArrayData`] - pub fn into_parts(self) -> (DataType, OffsetBuffer, Buffer, Option) { - (self.data_type, self.offsets, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - values: self.values.clone(), - offsets: self.offsets.slice(offset, len), - data_type: self.data_type.clone(), - nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), - phantom: Default::default(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.offsets.len().wrapping_sub(1), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::two(self.offsets.inner().inner(), &self.values), - child_data: &[], - } - } -} - -/// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes -#[derive(Debug, Clone)] -pub struct FixedSizeBinaryArrayData { - data_type: DataType, - len: usize, - element_size: usize, - values: Buffer, - nulls: Option, -} - -impl FixedSizeBinaryArrayData { - /// Creates a new [`FixedSizeBinaryArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::FixedSizeBinary(element_size)` - /// - `nulls.len() == values.len() / element_size == len` - pub unsafe fn new_unchecked( - data_type: DataType, - len: usize, - element_size: usize, - values: Buffer, - nulls: Option, - ) -> Self { - Self { - data_type, - nulls, - values, - len, - element_size, - } - } - - /// Creates a new [`FixedSizeBinaryArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`FixedSizeBinaryArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { - let values = builder.buffers[0] - .slice_with_length(builder.offset * size, builder.len * size); - Self { - values, - data_type: builder.data_type, - len: builder.len, - element_size: size, - nulls: builder.nulls, - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the size of each element - #[inline] - pub fn element_size(&self) -> usize { - self.element_size - } - - /// Returns the raw byte data - #[inline] - pub fn values(&self) -> &[u8] { - &self.values - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`FixedSizeBinaryArrayData`] - pub fn into_parts(self) -> (DataType, Buffer, Option) { - (self.data_type, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let offset_element = offset.checked_mul(self.element_size).expect("overflow"); - let len_element = len.checked_mul(self.element_size).expect("overflow"); - let values = self.values.slice_with_length(offset_element, len_element); - - Self { - len, - values, - data_type: self.data_type.clone(), - element_size: self.element_size, - nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(&self.values), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/dictionary.rs b/arrow-data/src/data/dictionary.rs deleted file mode 100644 index c95ee464b608..000000000000 --- a/arrow-data/src/data/dictionary.rs +++ /dev/null @@ -1,289 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::DictionaryKeyType; -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::DataType; - -mod private { - use super::*; - - pub trait DictionaryKeySealed { - /// Downcast [`ArrayDataDictionary`] to `[DictionaryArrayData`] - fn downcast_ref(data: &ArrayDataDictionary) -> Option<&DictionaryArrayData> - where - Self: DictionaryKey; - - /// Downcast [`ArrayDataDictionary`] to `[DictionaryArrayData`] - fn downcast(data: ArrayDataDictionary) -> Option> - where - Self: DictionaryKey; - - /// Cast [`DictionaryArrayData`] to [`ArrayDataDictionary`] - fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary - where - Self: DictionaryKey; - } -} - -/// Types of dictionary key used by dictionary arrays -pub trait DictionaryKey: private::DictionaryKeySealed + ArrowNativeType { - const TYPE: DictionaryKeyType; -} - -macro_rules! dictionary { - ($t:ty,$v:ident) => { - impl DictionaryKey for $t { - const TYPE: DictionaryKeyType = DictionaryKeyType::$v; - } - impl private::DictionaryKeySealed for $t { - fn downcast_ref( - data: &ArrayDataDictionary, - ) -> Option<&DictionaryArrayData> { - match data { - ArrayDataDictionary::$v(v) => Some(v), - _ => None, - } - } - - fn downcast(data: ArrayDataDictionary) -> Option> { - match data { - ArrayDataDictionary::$v(v) => Some(v), - _ => None, - } - } - - fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary { - ArrayDataDictionary::$v(v) - } - } - }; -} - -dictionary!(i8, Int8); -dictionary!(i16, Int16); -dictionary!(i32, Int32); -dictionary!(i64, Int64); -dictionary!(u8, UInt8); -dictionary!(u16, UInt16); -dictionary!(u32, UInt32); -dictionary!(u64, UInt64); - -/// Applies op to each variant of [`ArrayDataDictionary`] -macro_rules! dictionary_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataDictionary::Int8($array) => $op - ArrayDataDictionary::Int16($array) => $op - ArrayDataDictionary::Int32($array) => $op - ArrayDataDictionary::Int64($array) => $op - ArrayDataDictionary::UInt8($array) => $op - ArrayDataDictionary::UInt16($array) => $op - ArrayDataDictionary::UInt32($array) => $op - ArrayDataDictionary::UInt64($array) => $op - } - }; -} - -/// An enumeration of the types of [`DictionaryArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataDictionary { - Int8(DictionaryArrayData), - Int16(DictionaryArrayData), - Int32(DictionaryArrayData), - Int64(DictionaryArrayData), - UInt8(DictionaryArrayData), - UInt16(DictionaryArrayData), - UInt32(DictionaryArrayData), - UInt64(DictionaryArrayData), -} - -impl ArrayDataDictionary { - /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] - pub fn downcast_ref(&self) -> Option<&DictionaryArrayData> { - K::downcast_ref(self) - } - - /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] - pub fn downcast(self) -> Option> { - K::downcast(self) - } - - /// Returns the values of this dictionary - pub fn values(&self) -> &ArrayData { - let s = self; - dictionary_op!(s, { s.values() }) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - dictionary_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - dictionary_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataDictionary`] from raw buffers - /// - /// # Safety - /// - /// See [`DictionaryArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw( - builder: ArrayDataBuilder, - key: DictionaryKeyType, - ) -> Self { - use DictionaryKeyType::*; - match key { - Int8 => Self::Int8(DictionaryArrayData::from_raw(builder)), - Int16 => Self::Int16(DictionaryArrayData::from_raw(builder)), - Int32 => Self::Int32(DictionaryArrayData::from_raw(builder)), - Int64 => Self::Int64(DictionaryArrayData::from_raw(builder)), - UInt8 => Self::UInt8(DictionaryArrayData::from_raw(builder)), - UInt16 => Self::UInt16(DictionaryArrayData::from_raw(builder)), - UInt32 => Self::UInt32(DictionaryArrayData::from_raw(builder)), - UInt64 => Self::UInt64(DictionaryArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataDictionary { - fn from(value: DictionaryArrayData) -> Self { - K::upcast(value) - } -} - -/// ArrayData for [dictionary arrays](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) -#[derive(Debug, Clone)] -pub struct DictionaryArrayData { - data_type: DataType, - nulls: Option, - keys: ScalarBuffer, - values: Box, -} - -impl DictionaryArrayData { - /// Create a new [`DictionaryArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Dictionary(K::TYPE)` - /// - child must have a type matching `data_type` - /// - all values in `keys` must be `0 < v < child.len()` or be a null according to `nulls` - /// - `nulls` must have the same length as `child` - pub unsafe fn new_unchecked( - data_type: DataType, - keys: ScalarBuffer, - nulls: Option, - child: ArrayData, - ) -> Self { - Self { - data_type, - nulls, - keys, - values: Box::new(child), - } - } - - /// Creates a new [`DictionaryArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`Self::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let keys = builder.buffers.into_iter().next().unwrap(); - let keys = ScalarBuffer::new(keys, builder.offset, builder.len); - let values = builder.child_data.into_iter().next().unwrap(); - Self { - keys, - data_type: builder.data_type, - nulls: builder.nulls, - values: Box::new(values), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.keys.len() - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.keys.is_empty() - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the keys - #[inline] - pub fn keys(&self) -> &[K] { - &self.keys - } - - /// Returns the values data - #[inline] - pub fn values(&self) -> &ArrayData { - self.values.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`DictionaryArrayData`] - pub fn into_parts( - self, - ) -> (DataType, ScalarBuffer, Option, ArrayData) { - (self.data_type, self.keys, self.nulls, *self.values) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - keys: self.keys.slice(offset, len), - data_type: self.data_type.clone(), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - values: self.values.clone(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.keys.len(), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.keys.inner()), - child_data: std::slice::from_ref(self.values.as_ref()), - } - } -} diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs deleted file mode 100644 index bcc89f8ba2ca..000000000000 --- a/arrow-data/src/data/list.rs +++ /dev/null @@ -1,422 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::OffsetType; -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::DataType; - -mod private { - use super::*; - - pub trait ListOffsetSealed { - /// Downcast [`ArrayDataList`] to `[ListArrayData`] - fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> - where - Self: ListOffset; - - /// Downcast [`ArrayDataList`] to `[ListArrayData`] - fn downcast(data: ArrayDataList) -> Option> - where - Self: ListOffset; - - /// Cast [`ListArrayData`] to [`ArrayDataList`] - fn upcast(v: ListArrayData) -> ArrayDataList - where - Self: ListOffset; - } -} - -/// Types of offset used by variable length list arrays -pub trait ListOffset: private::ListOffsetSealed + ArrowNativeType { - const TYPE: OffsetType; -} - -impl ListOffset for i32 { - const TYPE: OffsetType = OffsetType::Int32; -} - -impl private::ListOffsetSealed for i32 { - fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(v) => Some(v), - ArrayDataList::Large(_) => None, - } - } - - fn downcast(data: ArrayDataList) -> Option> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(v) => Some(v), - ArrayDataList::Large(_) => None, - } - } - - fn upcast(v: ListArrayData) -> ArrayDataList - where - Self: ListOffset, - { - ArrayDataList::Small(v) - } -} - -impl ListOffset for i64 { - const TYPE: OffsetType = OffsetType::Int64; -} - -impl private::ListOffsetSealed for i64 { - fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(_) => None, - ArrayDataList::Large(v) => Some(v), - } - } - - fn downcast(data: ArrayDataList) -> Option> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(_) => None, - ArrayDataList::Large(v) => Some(v), - } - } - - fn upcast(v: ListArrayData) -> ArrayDataList - where - Self: ListOffset, - { - ArrayDataList::Large(v) - } -} - -/// Applies op to each variant of [`ListArrayData`] -macro_rules! list_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataList::Small($array) => $op - ArrayDataList::Large($array) => $op - } - }; -} - -/// An enumeration of the types of [`ListArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataList { - Small(ListArrayData), - Large(ListArrayData), -} - -impl ArrayDataList { - /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] - pub fn downcast_ref(&self) -> Option<&ListArrayData> { - O::downcast_ref(self) - } - - /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] - pub fn downcast(self) -> Option> { - O::downcast(self) - } - - /// Returns the values of this [`ArrayDataList`] - pub fn values(&self) -> &ArrayData { - let s = self; - list_op!(s, { s.values() }) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - list_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - list_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataList`] from raw buffers - /// - /// # Safety - /// - /// See [`ListArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, offset: OffsetType) -> Self { - match offset { - OffsetType::Int32 => Self::Small(ListArrayData::from_raw(builder)), - OffsetType::Int64 => Self::Large(ListArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataList { - fn from(value: ListArrayData) -> Self { - O::upcast(value) - } -} - -/// ArrayData for [variable-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) -#[derive(Debug, Clone)] -pub struct ListArrayData { - data_type: DataType, - nulls: Option, - offsets: OffsetBuffer, - values: Box, -} - -impl ListArrayData { - /// Create a new [`ListArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::List(O::TYPE)` - /// - Each consecutive window of `offsets` must identify a valid slice of `child` - /// - `nulls.len() == offsets.len() - 1` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: OffsetBuffer, - nulls: Option, - values: ArrayData, - ) -> Self { - Self { - data_type, - nulls, - offsets, - values: Box::new(values), - } - } - - /// Creates a new [`ListArrayData`] from an [`ArrayDataBuilder`] - /// - /// # Safety - /// - /// See [`Self::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let offsets = builder.buffers.into_iter().next().unwrap(); - let values = builder.child_data.into_iter().next().unwrap(); - - let offsets = match builder.len { - 0 => OffsetBuffer::new_empty(), - _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( - offsets, - builder.offset, - builder.len + 1, - )), - }; - - Self { - offsets, - data_type: builder.data_type, - nulls: builder.nulls, - values: Box::new(values), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.offsets.len().wrapping_sub(1) - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.offsets.len() <= 1 - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the offsets - #[inline] - pub fn offsets(&self) -> &OffsetBuffer { - &self.offsets - } - - /// Returns the values of this [`ListArrayData`] - #[inline] - pub fn values(&self) -> &ArrayData { - self.values.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`ListArrayData`] - pub fn into_parts( - self, - ) -> (DataType, OffsetBuffer, Option, ArrayData) { - (self.data_type, self.offsets, self.nulls, *self.values) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - offsets: self.offsets.slice(offset, len), - values: self.values.clone(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len(), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.offsets.inner().inner()), - child_data: std::slice::from_ref(self.values.as_ref()), - } - } -} - -/// ArrayData for [fixed-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) -#[derive(Debug, Clone)] -pub struct FixedSizeListArrayData { - data_type: DataType, - len: usize, - element_size: usize, - nulls: Option, - child: Box, -} - -impl FixedSizeListArrayData { - /// Create a new [`FixedSizeListArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::FixedSizeList(element_size)` - /// - `nulls.len() == values.len() / element_size == len` - pub unsafe fn new_unchecked( - data_type: DataType, - len: usize, - element_size: usize, - nulls: Option, - child: ArrayData, - ) -> Self { - Self { - data_type, - len, - element_size, - nulls, - child: Box::new(child), - } - } - - /// Creates a new [`FixedSizeListArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`FixedSizeListArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { - let child = - builder.child_data[0].slice(builder.offset * size, builder.len * size); - Self { - data_type: builder.data_type, - len: builder.len, - element_size: size, - nulls: builder.nulls, - child: Box::new(child), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the size of each element - #[inline] - pub fn element_size(&self) -> usize { - self.element_size - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the child data - #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`FixedSizeListArrayData`] - pub fn into_parts(self) -> (DataType, Option, ArrayData) { - (self.data_type, self.nulls, *self.child) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let offset_element = offset.checked_mul(self.element_size).expect("overflow"); - let len_element = len.checked_mul(self.element_size).expect("overflow"); - let child = self.child.slice(offset_element, len_element); - - Self { - len, - data_type: self.data_type.clone(), - element_size: self.element_size, - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - child: Box::new(child), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::default(), - child_data: std::slice::from_ref(self.child.as_ref()), - } - } -} diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 784911dc0a85..cc908d639553 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -33,27 +33,6 @@ use crate::equal; mod buffers; pub use buffers::*; -#[allow(unused)] // Private until ready (#1799) -mod boolean; -#[allow(unused)] // Private until ready (#1799) -mod bytes; -#[allow(unused)] // Private until ready (#1799) -mod dictionary; -#[allow(unused)] // Private until ready (#1799) -mod list; -#[allow(unused)] // Private until ready (#1799) -mod null; -#[allow(unused)] // Private until ready (#1799) -mod primitive; -#[allow(unused)] // Private until ready (#1799) -mod run; -#[allow(unused)] // Private until ready (#1799) -mod r#struct; -#[allow(unused)] // Private until ready (#1799) -mod types; -#[allow(unused)] // Private until ready (#1799) -mod union; - #[inline] pub(crate) fn contains_nulls( null_bit_buffer: Option<&NullBuffer>, @@ -351,7 +330,7 @@ impl ArrayData { // We don't need to validate children as we can assume that the // [`ArrayData`] in `child_data` have already been validated through // a call to `ArrayData::try_new` or created using unsafe - ArrayDataLayout::new(&new_self).validate_data()?; + new_self.validate_data()?; Ok(new_self) } @@ -441,15 +420,14 @@ impl ArrayData { /// If multiple [`ArrayData`]s refer to the same underlying /// [`Buffer`]s they will both report the same size. pub fn get_buffer_memory_size(&self) -> usize { - let s = ArrayDataLayout::new(self); let mut size = 0; - for buffer in s.buffers { + for buffer in &self.buffers { size += buffer.capacity(); } - if let Some(bitmap) = s.nulls { + if let Some(bitmap) = &self.nulls { size += bitmap.buffer().capacity() } - for child in s.child_data { + for child in &self.child_data { size += child.get_buffer_memory_size(); } size @@ -468,15 +446,14 @@ impl ArrayData { /// first `20` elements, then [`Self::get_slice_memory_size`] on the /// sliced [`ArrayData`] would return `20 * 8 = 160`. pub fn get_slice_memory_size(&self) -> Result { - let s = ArrayDataLayout::new(self); let mut result: usize = 0; - let layout = layout(s.data_type); + let layout = layout(&self.data_type); for spec in layout.buffers.iter() { match spec { BufferSpec::FixedWidth { byte_width } => { let buffer_size = - s.len.checked_mul(*byte_width).ok_or_else(|| { + self.len.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( "Integer overflow computing buffer size".to_string(), ) @@ -485,26 +462,26 @@ impl ArrayData { } BufferSpec::VariableWidth => { let buffer_len: usize; - match s.data_type { + match self.data_type { DataType::Utf8 | DataType::Binary => { - let offsets = s.typed_offsets::()?; - buffer_len = (offsets[s.len] - offsets[0]) as usize; + let offsets = self.typed_offsets::()?; + buffer_len = (offsets[self.len] - offsets[0] ) as usize; } DataType::LargeUtf8 | DataType::LargeBinary => { - let offsets = s.typed_offsets::()?; - buffer_len = (offsets[s.len] - offsets[0]) as usize; + let offsets = self.typed_offsets::()?; + buffer_len = (offsets[self.len] - offsets[0]) as usize; } _ => { return Err(ArrowError::NotYetImplemented(format!( - "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", - s.data_type + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", + self.data_type ))) } }; result += buffer_len; } BufferSpec::BitMap => { - let buffer_size = bit_util::ceil(s.len, 8); + let buffer_size = bit_util::ceil(self.len, 8); result += buffer_size; } BufferSpec::AlwaysNull => { @@ -513,11 +490,11 @@ impl ArrayData { } } - if s.nulls.is_some() { - result += bit_util::ceil(s.len, 8); + if self.nulls().is_some() { + result += bit_util::ceil(self.len, 8); } - for child in s.child_data { + for child in &self.child_data { result += child.get_slice_memory_size()?; } Ok(result) @@ -532,18 +509,17 @@ impl ArrayData { /// [`Self::get_buffer_memory_size`] + /// `size_of_val(child)` for all children pub fn get_array_memory_size(&self) -> usize { - let s = ArrayDataLayout::new(self); let mut size = mem::size_of_val(self); // Calculate rest of the fields top down which contain actual data - for buffer in s.buffers { + for buffer in &self.buffers { size += mem::size_of::(); size += buffer.capacity(); } - if let Some(nulls) = s.nulls { + if let Some(nulls) = &self.nulls { size += nulls.buffer().capacity(); } - for child in s.child_data { + for child in &self.child_data { size += child.get_array_memory_size(); } @@ -730,101 +706,11 @@ impl ArrayData { /// See [ArrayData::validate_data] to validate fully the offset content /// and the validity of utf8 data pub fn validate(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate() - } - - /// Validate that the data contained within this [`ArrayData`] is valid - /// - /// 1. Null count is correct - /// 2. All offsets are valid - /// 3. All String data is valid UTF-8 - /// 4. All dictionary offsets are valid - /// - /// Internally this calls: - /// - /// * [`Self::validate`] - /// * [`Self::validate_nulls`] - /// * [`Self::validate_values`] - /// - /// Note: this does not recurse into children, for a recursive variant - /// see [`Self::validate_full`] - pub fn validate_data(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_data() - } - - /// Performs a full recursive validation of this [`ArrayData`] and all its children - /// - /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] - /// and all its children recursively - pub fn validate_full(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_full() - } - - /// Validates the values stored within this [`ArrayData`] are valid - /// without recursing into child [`ArrayData`] - /// - /// Does not (yet) check - /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - /// Validates the the null count is correct and that any - /// nullability requirements of its children are correct - pub fn validate_nulls(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_nulls() - } - - /// Validates the values stored within this [`ArrayData`] are valid - /// without recursing into child [`ArrayData`] - /// - /// Does not (yet) check - /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - pub fn validate_values(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_values() - } - - /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons - /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may - /// return false when the arrays are logically equal - pub fn ptr_eq(&self, other: &Self) -> bool { - ArrayDataLayout::new(self).ptr_eq(&ArrayDataLayout::new(other)) - } - - /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] - pub fn into_builder(self) -> ArrayDataBuilder { - self.into() - } -} - -/// A flat representation of [`ArrayData`] -/// -/// This is temporary measure to bridge the gap between the strongly-typed -/// ArrayData enumeration and the older-style struct representation (#1799) -#[derive(Copy, Clone)] -pub(crate) struct ArrayDataLayout<'a> { - data_type: &'a DataType, - len: usize, - offset: usize, - nulls: Option<&'a NullBuffer>, - buffers: Buffers<'a>, - child_data: &'a [ArrayData], -} - -impl<'a> ArrayDataLayout<'a> { - fn new(data: &'a ArrayData) -> Self { - Self { - data_type: &data.data_type, - len: data.len, - offset: data.offset, - nulls: data.nulls.as_ref(), - buffers: Buffers::from_slice(&data.buffers), - child_data: &data.child_data, - } - } - - fn validate(&self) -> Result<(), ArrowError> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; // Check that the data layout conforms to the spec - let layout = layout(self.data_type); + let layout = layout(&self.data_type); if !layout.can_contain_null_mask && self.nulls.is_some() { return Err(ArrowError::InvalidArgumentError(format!( @@ -879,7 +765,7 @@ impl<'a> ArrayDataLayout<'a> { } // check null bit buffer size - if let Some(nulls) = self.nulls { + if let Some(nulls) = self.nulls() { if nulls.null_count() > self.len { return Err(ArrowError::InvalidArgumentError(format!( "null_count {} for an array exceeds length of {} elements", @@ -1141,7 +1027,7 @@ impl<'a> ArrayDataLayout<'a> { fn get_single_valid_child_data( &self, expected_type: &DataType, - ) -> Result, ArrowError> { + ) -> Result<&ArrayData, ArrowError> { self.validate_num_child_data(1)?; self.get_valid_child_data(0, expected_type) } @@ -1166,7 +1052,7 @@ impl<'a> ArrayDataLayout<'a> { &self, i: usize, expected_type: &DataType, - ) -> Result { + ) -> Result<&ArrayData, ArrowError> { let values_data = self.child_data .get(i) .ok_or_else(|| { @@ -1175,9 +1061,8 @@ impl<'a> ArrayDataLayout<'a> { self.data_type, i+1, self.child_data.len() )) })?; - let values_data = ArrayDataLayout::new(values_data); - if expected_type != values_data.data_type { + if expected_type != &values_data.data_type { return Err(ArrowError::InvalidArgumentError(format!( "Child type mismatch for {}. Expected {} but child data had {}", self.data_type, expected_type, values_data.data_type @@ -1188,7 +1073,22 @@ impl<'a> ArrayDataLayout<'a> { Ok(values_data) } - fn validate_data(&self) -> Result<(), ArrowError> { + /// Validate that the data contained within this [`ArrayData`] is valid + /// + /// 1. Null count is correct + /// 2. All offsets are valid + /// 3. All String data is valid UTF-8 + /// 4. All dictionary offsets are valid + /// + /// Internally this calls: + /// + /// * [`Self::validate`] + /// * [`Self::validate_nulls`] + /// * [`Self::validate_values`] + /// + /// Note: this does not recurse into children, for a recursive variant + /// see [`Self::validate_full`] + pub fn validate_data(&self) -> Result<(), ArrowError> { self.validate()?; self.validate_nulls()?; @@ -1196,7 +1096,11 @@ impl<'a> ArrayDataLayout<'a> { Ok(()) } - fn validate_full(&self) -> Result<(), ArrowError> { + /// Performs a full recursive validation of this [`ArrayData`] and all its children + /// + /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] + /// and all its children recursively + pub fn validate_full(&self) -> Result<(), ArrowError> { self.validate_data()?; // validate all children recursively self.child_data @@ -1213,7 +1117,14 @@ impl<'a> ArrayDataLayout<'a> { Ok(()) } - fn validate_nulls(&self) -> Result<(), ArrowError> { + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + /// Validates the the null count is correct and that any + /// nullability requirements of its children are correct + pub fn validate_nulls(&self) -> Result<(), ArrowError> { if let Some(nulls) = &self.nulls { let actual = nulls.len() - nulls.inner().count_set_bits(); if actual != nulls.null_count() { @@ -1231,12 +1142,11 @@ impl<'a> ArrayDataLayout<'a> { match &self.data_type { DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { if !f.is_nullable() { - let child = ArrayDataLayout::new(&self.child_data[0]); - self.validate_non_nullable(None, 0, child)? + self.validate_non_nullable(None, 0, &self.child_data[0])? } } DataType::FixedSizeList(field, len) => { - let child = ArrayDataLayout::new(&self.child_data[0]); + let child = &self.child_data[0]; if !field.is_nullable() { match &self.nulls { Some(nulls) => { @@ -1265,8 +1175,7 @@ impl<'a> ArrayDataLayout<'a> { } } DataType::Struct(fields) => { - for (field, child) in fields.iter().zip(self.child_data) { - let child = ArrayDataLayout::new(child); + for (field, child) in fields.iter().zip(&self.child_data) { if !field.is_nullable() { match &self.nulls { Some(n) => self.validate_non_nullable( @@ -1290,11 +1199,11 @@ impl<'a> ArrayDataLayout<'a> { &self, mask: Option<&Buffer>, offset: usize, - child: ArrayDataLayout<'_>, + child: &ArrayData, ) -> Result<(), ArrowError> { let mask = match mask { Some(mask) => mask.as_ref(), - None => return match child.nulls.map(|x| x.null_count()).unwrap_or_default() { + None => return match child.null_count() { 0 => Ok(()), _ => Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent {}", @@ -1304,7 +1213,7 @@ impl<'a> ArrayDataLayout<'a> { }, }; - match child.nulls { + match child.nulls() { Some(nulls) => { let mask = BitChunks::new(mask, offset, child.len); let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len); @@ -1333,7 +1242,7 @@ impl<'a> ArrayDataLayout<'a> { /// /// Does not (yet) check /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - fn validate_values(&self) -> Result<(), ArrowError> { + pub fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), @@ -1343,11 +1252,11 @@ impl<'a> ArrayDataLayout<'a> { } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len()) + self.validate_offsets_full::(child.len) } DataType::LargeList(_) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len()) + self.validate_offsets_full::(child.len) } DataType::Union(_, _, _) => { // Validate Union Array as part of implementing new Union semantics @@ -1358,7 +1267,7 @@ impl<'a> ArrayDataLayout<'a> { Ok(()) } DataType::Dictionary(key_type, _value_type) => { - let dictionary_length: i64 = self.child_data[0].len().try_into().unwrap(); + let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); let max_value = dictionary_length - 1; match key_type.as_ref() { DataType::UInt8 => self.check_bounds::(max_value), @@ -1373,7 +1282,7 @@ impl<'a> ArrayDataLayout<'a> { } } DataType::RunEndEncoded(run_ends, _values) => { - let run_ends_data = ArrayDataLayout::new(&self.child_data[0]); + let run_ends_data = self.child_data()[0].clone(); match run_ends.data_type() { DataType::Int16 => run_ends_data.check_run_ends::(), DataType::Int32 => run_ends_data.check_run_ends::(), @@ -1517,7 +1426,7 @@ impl<'a> ArrayDataLayout<'a> { indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) - if self.nulls.map(|x| x.is_null(i)).unwrap_or_default() { + if self.is_null(i) { return Ok(()); } let dict_index: i64 = dict_index.try_into().map_err(|_| { @@ -1605,6 +1514,11 @@ impl<'a> ArrayDataLayout<'a> { .zip(other.child_data.iter()) .all(|(a, b)| a.ptr_eq(b)) } + + /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] + pub fn into_builder(self) -> ArrayDataBuilder { + self.into() + } } /// Return the expected [`DataTypeLayout`] Arrays of this data @@ -1889,7 +1803,7 @@ impl ArrayDataBuilder { pub fn build(self) -> Result { let data = unsafe { self.build_unchecked() }; #[cfg(not(feature = "force_validate"))] - ArrayDataLayout::new(&data).validate_data()?; + data.validate_data()?; Ok(data) } } diff --git a/arrow-data/src/data/null.rs b/arrow-data/src/data/null.rs deleted file mode 100644 index b8a4d7270833..000000000000 --- a/arrow-data/src/data/null.rs +++ /dev/null @@ -1,104 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::PhysicalType; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_schema::DataType; - -/// ArrayData for [null arrays](https://arrow.apache.org/docs/format/Columnar.html#null-layout) -#[derive(Debug, Clone)] -pub struct NullArrayData { - data_type: DataType, - len: usize, -} - -impl NullArrayData { - /// Create a new [`NullArrayData`] - /// - /// # Panic - /// - /// - `PhysicalType::from(&data_type) != PhysicalType::Null` - pub fn new(data_type: DataType, len: usize) -> Self { - assert_eq!( - PhysicalType::from(&data_type), - PhysicalType::Null, - "Illegal physical type for NullArrayData of datatype {data_type:?}", - ); - Self { data_type, len } - } - - /// Create a new [`NullArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Null` - pub unsafe fn new_unchecked(data_type: DataType, len: usize) -> Self { - Self { data_type, len } - } - - /// Creates a new [`NullArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`NullArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - Self { - data_type: builder.data_type, - len: builder.len, - } - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the length of this array - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns the [`DataType`] and length of this [`NullArrayData`] - pub fn into_parts(self) -> (DataType, usize) { - (self.data_type, self.len) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let new_len = offset.saturating_add(len); - assert!(new_len <= self.len); - Self { - data_type: self.data_type.clone(), - len, - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: None, - buffers: Buffers::default(), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs deleted file mode 100644 index ed8ed8d7aabb..000000000000 --- a/arrow-data/src/data/primitive.rs +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::{PhysicalType, PrimitiveType}; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; -use arrow_buffer::{i256, ArrowNativeType}; -use arrow_schema::DataType; -use half::f16; - -mod private { - use super::*; - - pub trait PrimitiveSealed { - /// Downcast [`ArrayDataPrimitive`] to `[PrimitiveArrayData`] - fn downcast_ref(data: &ArrayDataPrimitive) -> Option<&PrimitiveArrayData> - where - Self: Primitive; - - /// Downcast [`ArrayDataPrimitive`] to `[PrimitiveArrayData`] - fn downcast(data: ArrayDataPrimitive) -> Option> - where - Self: Primitive; - - /// Cast [`ArrayDataPrimitive`] to [`ArrayDataPrimitive`] - fn upcast(v: PrimitiveArrayData) -> ArrayDataPrimitive - where - Self: Primitive; - } -} - -pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { - const TYPE: PrimitiveType; -} - -/// Applies op to each variant of [`ArrayDataPrimitive`] -macro_rules! primitive_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataPrimitive::Int8($array) => $op - ArrayDataPrimitive::Int16($array) => $op - ArrayDataPrimitive::Int32($array) => $op - ArrayDataPrimitive::Int64($array) => $op - ArrayDataPrimitive::Int128($array) => $op - ArrayDataPrimitive::Int256($array) => $op - ArrayDataPrimitive::UInt8($array) => $op - ArrayDataPrimitive::UInt16($array) => $op - ArrayDataPrimitive::UInt32($array) => $op - ArrayDataPrimitive::UInt64($array) => $op - ArrayDataPrimitive::Float16($array) => $op - ArrayDataPrimitive::Float32($array) => $op - ArrayDataPrimitive::Float64($array) => $op - } - }; -} - -macro_rules! primitive { - ($t:ty,$v:ident) => { - impl Primitive for $t { - const TYPE: PrimitiveType = PrimitiveType::$v; - } - impl private::PrimitiveSealed for $t { - fn downcast_ref( - data: &ArrayDataPrimitive, - ) -> Option<&PrimitiveArrayData> { - match data { - ArrayDataPrimitive::$v(v) => Some(v), - _ => None, - } - } - - fn downcast(data: ArrayDataPrimitive) -> Option> { - match data { - ArrayDataPrimitive::$v(v) => Some(v), - _ => None, - } - } - - fn upcast(v: PrimitiveArrayData) -> ArrayDataPrimitive { - ArrayDataPrimitive::$v(v) - } - } - }; -} - -primitive!(i8, Int8); -primitive!(i16, Int16); -primitive!(i32, Int32); -primitive!(i64, Int64); -primitive!(i128, Int128); -primitive!(i256, Int256); -primitive!(u8, UInt8); -primitive!(u16, UInt16); -primitive!(u32, UInt32); -primitive!(u64, UInt64); -primitive!(f16, Float16); -primitive!(f32, Float32); -primitive!(f64, Float64); - -/// An enumeration of the types of [`PrimitiveArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataPrimitive { - Int8(PrimitiveArrayData), - Int16(PrimitiveArrayData), - Int32(PrimitiveArrayData), - Int64(PrimitiveArrayData), - Int128(PrimitiveArrayData), - Int256(PrimitiveArrayData), - UInt8(PrimitiveArrayData), - UInt16(PrimitiveArrayData), - UInt32(PrimitiveArrayData), - UInt64(PrimitiveArrayData), - Float16(PrimitiveArrayData), - Float32(PrimitiveArrayData), - Float64(PrimitiveArrayData), -} - -impl ArrayDataPrimitive { - /// Downcast this [`ArrayDataPrimitive`] to the corresponding [`PrimitiveArrayData`] - pub fn downcast_ref(&self) -> Option<&PrimitiveArrayData

> { - P::downcast_ref(self) - } - - /// Downcast this [`ArrayDataPrimitive`] to the corresponding [`PrimitiveArrayData`] - pub fn downcast(self) -> Option> { - P::downcast(self) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - primitive_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - primitive_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataPrimitive`] from raw buffers - /// - /// # Safety - /// - /// See [`PrimitiveArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw( - builder: ArrayDataBuilder, - primitive: PrimitiveType, - ) -> Self { - use PrimitiveType::*; - match primitive { - Int8 => Self::Int8(PrimitiveArrayData::from_raw(builder)), - Int16 => Self::Int16(PrimitiveArrayData::from_raw(builder)), - Int32 => Self::Int32(PrimitiveArrayData::from_raw(builder)), - Int64 => Self::Int64(PrimitiveArrayData::from_raw(builder)), - Int128 => Self::Int128(PrimitiveArrayData::from_raw(builder)), - Int256 => Self::Int256(PrimitiveArrayData::from_raw(builder)), - UInt8 => Self::UInt8(PrimitiveArrayData::from_raw(builder)), - UInt16 => Self::UInt16(PrimitiveArrayData::from_raw(builder)), - UInt32 => Self::UInt32(PrimitiveArrayData::from_raw(builder)), - UInt64 => Self::UInt64(PrimitiveArrayData::from_raw(builder)), - Float16 => Self::Float16(PrimitiveArrayData::from_raw(builder)), - Float32 => Self::Float32(PrimitiveArrayData::from_raw(builder)), - Float64 => Self::Float64(PrimitiveArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataPrimitive { - fn from(value: PrimitiveArrayData

) -> Self { - P::upcast(value) - } -} - -/// ArrayData for [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of [`Primitive`] -#[derive(Debug, Clone)] -pub struct PrimitiveArrayData { - data_type: DataType, - values: ScalarBuffer, - nulls: Option, -} - -impl PrimitiveArrayData { - /// Create a new [`PrimitiveArrayData`] - /// - /// # Panics - /// - /// Panics if - /// - `PhysicalType::from(&data_type) != PhysicalType::Primitive(T::TYPE)` - /// - `nulls` and `values` are different lengths - pub fn new( - data_type: DataType, - values: ScalarBuffer, - nulls: Option, - ) -> Self { - assert_eq!( - PhysicalType::from(&data_type), - PhysicalType::Primitive(T::TYPE), - "Illegal physical type for PrimitiveArrayData of datatype {data_type:?}", - ); - - if let Some(n) = nulls.as_ref() { - assert_eq!(values.len(), n.len()) - } - - Self { - data_type, - values, - nulls, - } - } - - /// Create a new [`PrimitiveArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Primitive(T::TYPE)` - /// - `nulls` and `values` must be the same length - pub unsafe fn new_unchecked( - data_type: DataType, - values: ScalarBuffer, - nulls: Option, - ) -> Self { - Self { - data_type, - values, - nulls, - } - } - - /// Creates a new [`PrimitiveArrayData`] from an [`ArrayDataBuilder`] - /// - /// # Safety - /// - /// See [`PrimitiveArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let values = builder.buffers.into_iter().next().unwrap(); - let values = ScalarBuffer::new(values, builder.offset, builder.len); - Self { - values, - data_type: builder.data_type, - nulls: builder.nulls, - } - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the primitive values - #[inline] - pub fn values(&self) -> &ScalarBuffer { - &self.values - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`PrimitiveArrayData`] - pub fn into_parts(self) -> (DataType, ScalarBuffer, Option) { - (self.data_type, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - values: self.values.slice(offset, len), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.values.len(), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.values.inner()), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs deleted file mode 100644 index 7f80206a70fa..000000000000 --- a/arrow-data/src/data/run.rs +++ /dev/null @@ -1,277 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::primitive::{Primitive, PrimitiveArrayData}; -use crate::data::types::RunEndType; -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{RunEndBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::DataType; - -mod private { - use super::*; - - pub trait RunEndSealed { - const ENDS_TYPE: DataType; - - /// Downcast [`ArrayDataRun`] to `[RunArrayData`] - fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> - where - Self: RunEnd; - - /// Downcast [`ArrayDataRun`] to `[RunArrayData`] - fn downcast(data: ArrayDataRun) -> Option> - where - Self: RunEnd; - - /// Cast [`RunArrayData`] to [`ArrayDataRun`] - fn upcast(v: RunArrayData) -> ArrayDataRun - where - Self: RunEnd; - } -} - -pub trait RunEnd: private::RunEndSealed + ArrowNativeType + Primitive { - const TYPE: RunEndType; -} - -macro_rules! run_end { - ($t:ty,$v:ident) => { - impl RunEnd for $t { - const TYPE: RunEndType = RunEndType::$v; - } - impl private::RunEndSealed for $t { - const ENDS_TYPE: DataType = DataType::$v; - - fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> { - match data { - ArrayDataRun::$v(v) => Some(v), - _ => None, - } - } - - fn downcast(data: ArrayDataRun) -> Option> { - match data { - ArrayDataRun::$v(v) => Some(v), - _ => None, - } - } - - fn upcast(v: RunArrayData) -> ArrayDataRun { - ArrayDataRun::$v(v) - } - } - }; -} - -run_end!(i16, Int16); -run_end!(i32, Int32); -run_end!(i64, Int64); - -/// Applies op to each variant of [`ArrayDataRun`] -macro_rules! run_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataRun::Int16($array) => $op - ArrayDataRun::Int32($array) => $op - ArrayDataRun::Int64($array) => $op - } - }; -} - -/// An enumeration of the types of [`RunArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataRun { - Int16(RunArrayData), - Int32(RunArrayData), - Int64(RunArrayData), -} - -impl ArrayDataRun { - /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] - pub fn downcast_ref(&self) -> Option<&RunArrayData> { - ::downcast_ref(self) - } - - /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] - pub fn downcast(self) -> Option> { - ::downcast(self) - } - - /// Returns the values of this [`ArrayDataRun`] - #[inline] - pub fn values(&self) -> &ArrayData { - let s = self; - run_op!(s, { s.values() }) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - run_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - run_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataRun`] from raw buffers - /// - /// # Safety - /// - /// See [`RunArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, run: RunEndType) -> Self { - use RunEndType::*; - match run { - Int16 => Self::Int16(RunArrayData::from_raw(builder)), - Int32 => Self::Int32(RunArrayData::from_raw(builder)), - Int64 => Self::Int64(RunArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataRun { - fn from(value: RunArrayData) -> Self { - ::upcast(value) - } -} - -/// ArrayData for [run-end encoded arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) -#[derive(Debug, Clone)] -pub struct RunArrayData { - data_type: DataType, - run_ends: RunEndBuffer, - /// The children of this RunArrayData: - /// 1: the run ends - /// 2: the values - /// - /// We store an array so that a slice can be returned in [`RunArrayData::layout`] - children: Box<[ArrayData; 2]>, -} - -impl RunArrayData { - /// Create a new [`RunArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Run(E::TYPE)` - /// - `run_ends` must contain monotonically increasing, positive values `<= len` - /// - `run_ends.get_end_physical_index() < values.len()` - pub unsafe fn new_unchecked( - data_type: DataType, - run_ends: RunEndBuffer, - values: ArrayData, - ) -> Self { - let inner = run_ends.inner(); - let child = ArrayDataBuilder::new(E::ENDS_TYPE) - .len(inner.len()) - .buffers(vec![inner.inner().clone()]) - .build_unchecked(); - - Self { - data_type, - run_ends, - children: Box::new([child, values]), - } - } - - /// Creates a new [`RunArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`RunArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let mut iter = builder.child_data.into_iter(); - let child1 = iter.next().unwrap(); - let child2 = iter.next().unwrap(); - - let p = ScalarBuffer::new(child1.buffers[0].clone(), child1.offset, child1.len); - let run_ends = RunEndBuffer::new_unchecked(p, builder.offset, builder.len); - - Self { - run_ends, - data_type: builder.data_type, - children: Box::new([child1, child2]), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.run_ends.len() - } - - /// Returns the offset - #[inline] - pub fn offset(&self) -> usize { - self.run_ends.offset() - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.run_ends.is_empty() - } - - /// Returns the run ends - #[inline] - pub fn run_ends(&self) -> &RunEndBuffer { - &self.run_ends - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the child data - #[inline] - pub fn values(&self) -> &ArrayData { - &self.children[1] - } - - /// Returns the underlying parts of this [`RunArrayData`] - pub fn into_parts(self) -> (DataType, RunEndBuffer, ArrayData) { - let child = self.children.into_iter().nth(1).unwrap(); - (self.data_type, self.run_ends, child) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - run_ends: self.run_ends.slice(offset, len), - children: self.children.clone(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.run_ends.len(), - offset: self.run_ends.offset(), - nulls: None, - buffers: Buffers::default(), - child_data: self.children.as_ref(), - } - } -} diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs deleted file mode 100644 index 229c10912a59..000000000000 --- a/arrow-data/src/data/struct.rs +++ /dev/null @@ -1,129 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::NullBuffer; -use arrow_schema::DataType; - -/// ArrayData for [struct arrays](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) -#[derive(Debug, Clone)] -pub struct StructArrayData { - data_type: DataType, - len: usize, - nulls: Option, - children: Vec, -} - -impl StructArrayData { - /// Create a new [`StructArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Struct` - /// - all child data and nulls must have length matching `len` - pub unsafe fn new_unchecked( - data_type: DataType, - len: usize, - nulls: Option, - children: Vec, - ) -> Self { - Self { - data_type, - len, - nulls, - children, - } - } - - /// Creates a new [`StructArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`StructArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let children = builder - .child_data - .into_iter() - .map(|x| x.slice(builder.offset, builder.len)) - .collect(); - - Self { - data_type: builder.data_type, - len: builder.len, - nulls: builder.nulls, - children, - } - } - - /// Returns the length of this [`StructArrayData`] - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns `true` if this [`StructArrayData`] has zero length - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the primitive values - #[inline] - pub fn children(&self) -> &[ArrayData] { - &self.children - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`StructArrayData`] - pub fn into_parts(self) -> (DataType, Option, Vec) { - (self.data_type, self.nulls, self.children) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - len, - data_type: self.data_type.clone(), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - children: self.children.iter().map(|c| c.slice(offset, len)).collect(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::default(), - child_data: &self.children, - } - } -} diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs deleted file mode 100644 index bb65b42124f3..000000000000 --- a/arrow-data/src/data/types.rs +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_schema::{DataType, IntervalUnit, UnionMode}; - -/// An enumeration of the primitive types implementing [`ArrowNativeType`](arrow_buffer::ArrowNativeType) -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum PrimitiveType { - Int8, - Int16, - Int32, - Int64, - Int128, - Int256, - UInt8, - UInt16, - UInt32, - UInt64, - Float16, - Float32, - Float64, -} - -/// An enumeration of the types of offsets for variable length encodings -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum OffsetType { - Int32, - Int64, -} - -/// An enumeration of the types of variable length byte arrays -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum BytesType { - Binary, - Utf8, -} - -/// An enumeration of the types of dictionary key -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum DictionaryKeyType { - Int8, - Int16, - Int32, - Int64, - UInt8, - UInt16, - UInt32, - UInt64, -} - -/// An enumeration of the types of run key -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum RunEndType { - Int16, - Int32, - Int64, -} - -/// Describes the physical representation of a given [`DataType`] -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum PhysicalType { - Null, - Boolean, - Primitive(PrimitiveType), - FixedSizeBinary(usize), - Bytes(OffsetType, BytesType), - FixedSizeList(usize), - List(OffsetType), - Struct, - Union(UnionMode), - Dictionary(DictionaryKeyType), - Run(RunEndType), -} - -impl From<&DataType> for PhysicalType { - fn from(value: &DataType) -> Self { - match value { - DataType::Null => Self::Null, - DataType::Boolean => Self::Boolean, - DataType::Int8 => Self::Primitive(PrimitiveType::Int8), - DataType::Int16 => Self::Primitive(PrimitiveType::Int16), - DataType::Int32 => Self::Primitive(PrimitiveType::Int32), - DataType::Int64 => Self::Primitive(PrimitiveType::Int64), - DataType::UInt8 => Self::Primitive(PrimitiveType::UInt8), - DataType::UInt16 => Self::Primitive(PrimitiveType::UInt16), - DataType::UInt32 => Self::Primitive(PrimitiveType::UInt32), - DataType::UInt64 => Self::Primitive(PrimitiveType::UInt64), - DataType::Float16 => Self::Primitive(PrimitiveType::Float16), - DataType::Float32 => Self::Primitive(PrimitiveType::Float32), - DataType::Float64 => Self::Primitive(PrimitiveType::Float64), - DataType::Timestamp(_, _) => Self::Primitive(PrimitiveType::Int64), - DataType::Date32 => Self::Primitive(PrimitiveType::Int32), - DataType::Date64 => Self::Primitive(PrimitiveType::Int64), - DataType::Time32(_) => Self::Primitive(PrimitiveType::Int32), - DataType::Time64(_) => Self::Primitive(PrimitiveType::Int64), - DataType::Duration(_) => Self::Primitive(PrimitiveType::Int64), - DataType::Decimal128(_, _) => Self::Primitive(PrimitiveType::Int128), - DataType::Decimal256(_, _) => Self::Primitive(PrimitiveType::Int256), - DataType::Interval(IntervalUnit::YearMonth) => { - Self::Primitive(PrimitiveType::Int32) - } - DataType::Interval(IntervalUnit::DayTime) => { - Self::Primitive(PrimitiveType::Int64) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - Self::Primitive(PrimitiveType::Int128) - } - DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(*size as usize), - DataType::Binary => Self::Bytes(OffsetType::Int32, BytesType::Binary), - DataType::LargeBinary => Self::Bytes(OffsetType::Int64, BytesType::Binary), - DataType::Utf8 => Self::Bytes(OffsetType::Int32, BytesType::Utf8), - DataType::LargeUtf8 => Self::Bytes(OffsetType::Int64, BytesType::Utf8), - DataType::List(_) => Self::List(OffsetType::Int32), - DataType::FixedSizeList(_, size) => Self::FixedSizeList(*size as usize), - DataType::LargeList(_) => Self::List(OffsetType::Int64), - DataType::Struct(_) => Self::Struct, - DataType::Union(_, _, mode) => Self::Union(*mode), - DataType::Dictionary(k, _) => match k.as_ref() { - DataType::Int8 => Self::Dictionary(DictionaryKeyType::Int8), - DataType::Int16 => Self::Dictionary(DictionaryKeyType::Int16), - DataType::Int32 => Self::Dictionary(DictionaryKeyType::Int32), - DataType::Int64 => Self::Dictionary(DictionaryKeyType::Int64), - DataType::UInt8 => Self::Dictionary(DictionaryKeyType::UInt8), - DataType::UInt16 => Self::Dictionary(DictionaryKeyType::UInt16), - DataType::UInt32 => Self::Dictionary(DictionaryKeyType::UInt32), - DataType::UInt64 => Self::Dictionary(DictionaryKeyType::UInt64), - d => panic!("illegal dictionary key data type {d}"), - }, - DataType::Map(_, _) => Self::List(OffsetType::Int32), - DataType::RunEndEncoded(f, _) => match f.data_type() { - DataType::Int16 => Self::Run(RunEndType::Int16), - DataType::Int32 => Self::Run(RunEndType::Int32), - DataType::Int64 => Self::Run(RunEndType::Int64), - d => panic!("illegal run end data type {d}"), - }, - } - } -} diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs deleted file mode 100644 index 7d53a1f18067..000000000000 --- a/arrow-data/src/data/union.rs +++ /dev/null @@ -1,171 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::ScalarBuffer; -use arrow_schema::{DataType, UnionMode}; - -/// ArrayData for [union arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout) -#[derive(Debug, Clone)] -pub struct UnionArrayData { - data_type: DataType, - type_ids: ScalarBuffer, - offsets: Option>, - children: Vec, -} - -impl UnionArrayData { - /// Creates a new [`UnionArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Union(mode)` - /// - `offsets` is `Some` iff the above `mode == UnionMode::Sparse` - /// - `type_ids` must only contain values corresponding to a field in `data_type` - /// - `children` must match the field definitions in `data_type` - /// - For each value id in type_ids, the corresponding offset, must be in bounds for the child - pub unsafe fn new_unchecked( - data_type: DataType, - type_ids: ScalarBuffer, - offsets: Option>, - children: Vec, - ) -> Self { - Self { - data_type, - type_ids, - offsets, - children, - } - } - - /// Creates a new [`UnionArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`UnionArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, mode: UnionMode) -> Self { - match mode { - UnionMode::Sparse => { - let type_ids = builder.buffers.into_iter().next().unwrap(); - let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); - let children = builder - .child_data - .into_iter() - .map(|x| x.slice(builder.offset, builder.len)) - .collect(); - - Self { - type_ids, - children, - data_type: builder.data_type, - offsets: None, - } - } - UnionMode::Dense => { - let mut iter = builder.buffers.into_iter(); - let type_ids = iter.next().unwrap(); - let offsets = iter.next().unwrap(); - let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); - let offsets = ScalarBuffer::new(offsets, builder.offset, builder.len); - - Self { - type_ids, - data_type: builder.data_type, - offsets: Some(offsets), - children: builder.child_data, - } - } - } - } - - /// Returns the length of this array - #[inline] - pub fn len(&self) -> usize { - self.type_ids.len() - } - - /// Returns the type ids for this array - #[inline] - pub fn type_ids(&self) -> &ScalarBuffer { - &self.type_ids - } - - /// Returns the offsets for this array if this is a dense union - #[inline] - pub fn offsets(&self) -> Option<&ScalarBuffer> { - self.offsets.as_ref() - } - - /// Returns the children of this array - #[inline] - pub fn children(&self) -> &[ArrayData] { - &self.children - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`UnionArrayData`] - pub fn into_parts( - self, - ) -> ( - DataType, - ScalarBuffer, - Option>, - Vec, - ) { - (self.data_type, self.type_ids, self.offsets, self.children) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let (offsets, children) = match &self.offsets { - Some(offsets) => (Some(offsets.slice(offset, len)), self.children.clone()), - None => ( - None, - self.children.iter().map(|c| c.slice(offset, len)).collect(), - ), - }; - Self { - data_type: self.data_type.clone(), - type_ids: self.type_ids.slice(offset, len), - offsets, - children, - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let buffers = match &self.offsets { - Some(offsets) => Buffers::two(self.type_ids.inner(), offsets.inner()), - None => Buffers::one(self.type_ids.inner()), - }; - - ArrayDataLayout { - data_type: &self.data_type, - len: self.type_ids.len(), - offset: 0, - nulls: None, - buffers, - child_data: &self.children, - } - } -}