From c2c61cf217461f720289b5d686cee7a7573d839b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 12 Apr 2023 10:22:19 +0100 Subject: [PATCH 1/4] Remove ArrayData from Array (#3880) --- arrow-arith/src/aggregate.rs | 7 +- arrow-array/src/array/boolean_array.rs | 82 ++++++--- arrow-array/src/array/byte_array.rs | 90 +++++++--- arrow-array/src/array/dictionary_array.rs | 99 +++++++--- .../src/array/fixed_size_binary_array.rs | 101 ++++++++--- .../src/array/fixed_size_list_array.rs | 149 ++++++++------- arrow-array/src/array/list_array.rs | 76 ++++++-- arrow-array/src/array/map_array.rs | 67 +++++-- arrow-array/src/array/mod.rs | 119 +++--------- arrow-array/src/array/null_array.rs | 54 ++++-- arrow-array/src/array/primitive_array.rs | 122 ++++++++----- arrow-array/src/array/run_array.rs | 89 +++++++-- arrow-array/src/array/struct_array.rs | 105 +++++++---- arrow-array/src/array/union_array.rs | 105 +++++++++-- arrow-array/src/record_batch.rs | 4 +- arrow-cast/src/cast.rs | 2 - arrow-ipc/src/writer.rs | 169 ++++-------------- arrow-select/src/nullif.rs | 6 +- 18 files changed, 884 insertions(+), 562 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index a9944db13ee1..9ed6dee516a4 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -1241,13 +1241,12 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = sliced_input.as_boolean(); - assert_eq!(sliced_input, &input); + assert_eq!(sliced_input, input); - let actual = min_boolean(sliced_input); + let actual = min_boolean(&sliced_input); assert_eq!(actual, expected); - let actual = max_boolean(sliced_input); + let actual = max_boolean(&sliced_input); assert_eq!(actual, expected); } diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index dea5c07da281..d03f0fd040f2 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -20,7 +20,7 @@ use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef}; use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -66,8 +66,8 @@ use std::sync::Arc; /// ``` #[derive(Clone)] pub struct BooleanArray { - data: ArrayData, values: BooleanBuffer, + nulls: Option, } impl std::fmt::Debug for BooleanArray { @@ -90,27 +90,25 @@ impl BooleanArray { if let Some(n) = nulls.as_ref() { assert_eq!(values.len(), n.len()); } - - // TODO: Don't store ArrayData inside arrays (#3880) - let data = unsafe { - ArrayData::builder(DataType::Boolean) - .len(values.len()) - .offset(values.offset()) - .nulls(nulls) - .buffers(vec![values.inner().clone()]) - .build_unchecked() - }; - Self { data, values } + Self { values, nulls } } /// Returns the length of this array. pub fn len(&self) -> usize { - self.data.len() + self.values.len() } /// Returns whether this array is empty. pub fn is_empty(&self) -> bool { - self.data.is_empty() + self.values.is_empty() + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + values: self.values.slice(offset, length), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + } } /// Returns a new boolean array builder @@ -125,7 +123,7 @@ impl BooleanArray { /// Returns the number of non null, true values within this array pub fn true_count(&self) -> usize { - match self.data.nulls() { + match self.nulls() { Some(nulls) => { let null_chunks = nulls.inner().bit_chunks().iter_padded(); let value_chunks = self.values().bit_chunks().iter_padded(); @@ -247,25 +245,48 @@ impl Array for BooleanArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &DataType::Boolean + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.values.len() + } + + fn is_empty(&self) -> bool { + self.values.is_empty() + } + + fn offset(&self) -> usize { + self.values.offset() } fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.values.inner().capacity(); + if let Some(x) = &self.nulls { + sum += x.buffer().capacity() + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -324,13 +345,22 @@ impl From for BooleanArray { let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); - Self { data, values } + Self { + values, + nulls: data.nulls().cloned(), + } } } impl From for ArrayData { fn from(array: BooleanArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(DataType::Boolean) + .len(array.values.len()) + .offset(array.values.offset()) + .nulls(array.nulls) + .buffers(vec![array.values.into_inner()]); + + unsafe { builder.build_unchecked() } } } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index f0e43e6949e9..e23079ef9be9 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -23,7 +23,7 @@ use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_buffer::{NullBuffer, OffsetBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -39,17 +39,19 @@ use std::sync::Arc; /// [`BinaryArray`]: crate::BinaryArray /// [`LargeBinaryArray`]: crate::LargeBinaryArray pub struct GenericByteArray { - data: ArrayData, + data_type: DataType, value_offsets: OffsetBuffer, value_data: Buffer, + nulls: Option, } impl Clone for GenericByteArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), value_offsets: self.value_offsets.clone(), value_data: self.value_data.clone(), + nulls: self.nulls.clone(), } } } @@ -135,7 +137,7 @@ impl GenericByteArray { /// Panics if index `i` is out of bounds. pub fn value(&self, i: usize) -> &T::Native { assert!( - i < self.data.len(), + i < self.len(), "Trying to access an element at index {} from a {}{}Array of length {}", i, T::Offset::PREFIX, @@ -154,29 +156,33 @@ impl GenericByteArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_data: self.value_data.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + } } /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying /// offset and data buffers are not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced()); + let value_len = + T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); + + let data = self.into_data(); + let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); - let offset_buffer = self.data.buffers()[0] - .slice_with_length(self.data.offset() * element_len, (len + 1) * element_len); + let offset_buffer = data.buffers()[0] + .slice_with_length(data.offset() * element_len, (len + 1) * element_len); let element_len = std::mem::size_of::(); - let value_len = - T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); - let value_buffer = self.data.buffers()[1] - .slice_with_length(self.data.offset() * element_len, value_len * element_len); + let value_buffer = data.buffers()[1] + .slice_with_length(data.offset() * element_len, value_len * element_len); - drop(self.data); - drop(self.value_data); - drop(self.value_offsets); + drop(data); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -258,24 +264,49 @@ impl Array for GenericByteArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.value_offsets.len() - 1 + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.value_offsets.inner().inner().capacity(); + sum += self.value_data.capacity(); + if let Some(x) = &self.nulls { + sum += x.buffer().capacity() + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -313,18 +344,25 @@ impl From for GenericByteArray { let value_offsets = unsafe { get_offsets(&data) }; let value_data = data.buffers()[1].clone(); Self { - data, - // SAFETY: - // ArrayData must be valid, and validated data type above value_offsets, value_data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), } } } impl From> for ArrayData { fn from(array: GenericByteArray) -> Self { - array.data + let len = array.len(); + + let offsets = array.value_offsets.into_inner().into_inner(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .buffers(vec![offsets, array.value_data]) + .nulls(array.nulls); + + unsafe { builder.build_unchecked() } } } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index dd6213d543ea..f25a077a81ba 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -208,9 +208,7 @@ pub type UInt64DictionaryArray = DictionaryArray; /// assert_eq!(&array, &expected); /// ``` pub struct DictionaryArray { - /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface, - /// as, in the current implementation, `values` below are the first child of this struct. - data: ArrayData, + data_type: DataType, /// The keys of this dictionary. These are constructed from the /// buffer and null bitmap of `data`. Also, note that these do @@ -228,7 +226,7 @@ pub struct DictionaryArray { impl Clone for DictionaryArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), keys: self.keys.clone(), values: self.values.clone(), is_ordered: self.is_ordered, @@ -325,8 +323,12 @@ impl DictionaryArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + keys: self.keys.slice(offset, length), + values: self.values.clone(), + is_ordered: self.is_ordered, + } } /// Downcast this dictionary to a [`TypedDictionaryArray`] @@ -390,8 +392,7 @@ impl DictionaryArray { assert!(values.len() >= self.values.len()); let builder = self - .data - .clone() + .to_data() .into_builder() .data_type(DataType::Dictionary( Box::new(K::DATA_TYPE), @@ -419,7 +420,6 @@ impl DictionaryArray { let key_array = self.keys().clone(); let value_array = self.values().as_primitive::().clone(); - drop(self.data); drop(self.keys); drop(self.values); @@ -504,20 +504,22 @@ impl From for DictionaryArray { key_data_type ); + let values = make_array(data.child_data()[0].clone()); + let data_type = data.data_type().clone(); + // create a zero-copy of the keys' data // SAFETY: // ArrayData is valid and verified type above let keys = PrimitiveArray::::from(unsafe { - data.clone() - .into_builder() + data.into_builder() .data_type(T::DATA_TYPE) .child_data(vec![]) .build_unchecked() }); - let values = make_array(data.child_data()[0].clone()); + Self { - data, + data_type, keys, values, is_ordered: false, @@ -530,7 +532,14 @@ impl From for DictionaryArray { impl From> for ArrayData { fn from(array: DictionaryArray) -> Self { - array.data + let builder = array + .keys + .into_data() + .into_builder() + .data_type(array.data_type) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -594,24 +603,46 @@ impl Array for DictionaryArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.keys.len() + } + + fn is_empty(&self) -> bool { + self.keys.is_empty() + } + + fn offset(&self) -> usize { + self.keys.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.keys.nulls() + } + + fn get_buffer_memory_size(&self) -> usize { + self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + + self.keys.get_buffer_memory_size() + + self.values.get_array_memory_size() } } @@ -685,10 +716,6 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary } - fn data(&self) -> &ArrayData { - &self.dictionary.data - } - fn to_data(&self) -> ArrayData { self.dictionary.to_data() } @@ -697,13 +724,37 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary.into_data() } + fn data_type(&self) -> &DataType { + self.dictionary.data_type() + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.dictionary.slice(offset, length)) } + fn len(&self) -> usize { + self.dictionary.len() + } + + fn is_empty(&self) -> bool { + self.dictionary.is_empty() + } + + fn offset(&self) -> usize { + self.dictionary.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { self.dictionary.nulls() } + + fn get_buffer_memory_size(&self) -> usize { + self.dictionary.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + self.dictionary.get_array_memory_size() + } } impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index f8d2f04dee69..08ce76c066c3 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -20,7 +20,7 @@ use crate::iterator::FixedSizeBinaryIter; use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -51,9 +51,11 @@ use std::sync::Arc; /// #[derive(Clone)] pub struct FixedSizeBinaryArray { - data: ArrayData, + data_type: DataType, // Must be DataType::FixedSizeBinary(value_length) value_data: Buffer, - length: i32, + nulls: Option, + len: usize, + value_length: i32, } impl FixedSizeBinaryArray { @@ -62,12 +64,12 @@ impl FixedSizeBinaryArray { /// Panics if index `i` is out of bounds. pub fn value(&self, i: usize) -> &[u8] { assert!( - i < self.data.len(), + i < self.len(), "Trying to access an element at index {} from a FixedSizeBinaryArray of length {}", i, self.len() ); - let offset = i + self.data.offset(); + let offset = i + self.offset(); unsafe { let pos = self.value_offset_at(offset); std::slice::from_raw_parts( @@ -81,7 +83,7 @@ impl FixedSizeBinaryArray { /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let offset = i + self.data.offset(); + let offset = i + self.offset(); let pos = self.value_offset_at(offset); std::slice::from_raw_parts( self.value_data.as_ptr().offset(pos as isize), @@ -94,7 +96,7 @@ impl FixedSizeBinaryArray { /// Note this doesn't do any bound checking, for performance reason. #[inline] pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) + self.value_offset_at(self.offset() + i) } /// Returns the length for an element. @@ -102,18 +104,30 @@ impl FixedSizeBinaryArray { /// All elements have the same length as the array is a fixed size. #[inline] pub fn value_length(&self) -> i32 { - self.length + self.value_length } /// Returns a clone of the value data buffer pub fn value_data(&self) -> Buffer { - self.data.buffers()[0].clone() + self.value_data.clone() } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced FixedSizeBinaryArray cannot exceed the existing length" + ); + + let size = self.value_length as usize; + + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)), + value_length: self.value_length, + value_data: self.value_data.slice_with_length(offset * size, len * size), + len, + } } /// Create an array from an iterable argument of sparse byte slices. @@ -364,7 +378,7 @@ impl FixedSizeBinaryArray { #[inline] fn value_offset_at(&self, i: usize) -> i32 { - self.length * i as i32 + self.value_length * i as i32 } /// constructs a new iterator @@ -380,22 +394,33 @@ impl From for FixedSizeBinaryArray { 1, "FixedSizeBinaryArray data should contain 1 buffer only (values)" ); - let value_data = data.buffers()[0].clone(); - let length = match data.data_type() { + let value_length = match data.data_type() { DataType::FixedSizeBinary(len) => *len, _ => panic!("Expected data type to be FixedSizeBinary"), }; + + let size = value_length as usize; + let value_data = + data.buffers()[0].slice_with_length(data.offset() * size, data.len() * size); + Self { - data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + len: data.len(), value_data, - length, + value_length, } } } impl From for ArrayData { fn from(array: FixedSizeBinaryArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.len) + .buffers(vec![array.value_data]) + .nulls(array.nulls); + + unsafe { builder.build_unchecked() } } } @@ -468,24 +493,48 @@ impl Array for FixedSizeBinaryArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.value_data.capacity(); + if let Some(n) = &self.nulls { + sum += n.buffer().capacity(); + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -566,9 +615,9 @@ mod tests { fixed_size_binary_array.value(1) ); assert_eq!(2, fixed_size_binary_array.len()); - assert_eq!(5, fixed_size_binary_array.value_offset(0)); + assert_eq!(0, fixed_size_binary_array.value_offset(0)); assert_eq!(5, fixed_size_binary_array.value_length()); - assert_eq!(10, fixed_size_binary_array.value_offset(1)); + assert_eq!(5, fixed_size_binary_array.value_offset(1)); } #[test] diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index a56bb017f6b0..86adafa066f0 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -19,7 +19,7 @@ use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; use arrow_buffer::buffer::NullBuffer; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -64,9 +64,11 @@ use std::sync::Arc; /// [crate::array::FixedSizeBinaryArray] #[derive(Clone)] pub struct FixedSizeListArray { - data: ArrayData, + data_type: DataType, // Must be DataType::FixedSizeList(value_length) values: ArrayRef, - length: i32, + nulls: Option, + value_length: i32, + len: usize, } impl FixedSizeListArray { @@ -91,7 +93,7 @@ impl FixedSizeListArray { /// Note this doesn't do any bound checking, for performance reason. #[inline] pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) + self.value_offset_at(i) } /// Returns the length for an element. @@ -99,18 +101,29 @@ impl FixedSizeListArray { /// All elements have the same length as the array is a fixed size. #[inline] pub const fn value_length(&self) -> i32 { - self.length + self.value_length } #[inline] const fn value_offset_at(&self, i: usize) -> i32 { - i as i32 * self.length + i as i32 * self.value_length } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced FixedSizeListArray cannot exceed the existing length" + ); + let size = self.value_length as usize; + + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset * size, len * size), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)), + value_length: self.value_length, + len, + } } /// Creates a [`FixedSizeListArray`] from an iterator of primitive values @@ -163,45 +176,35 @@ impl FixedSizeListArray { impl From for FixedSizeListArray { fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 0, - "FixedSizeListArray data should not contain a buffer for value offsets" - ); - assert_eq!( - data.child_data().len(), - 1, - "FixedSizeListArray should contain a single child array (values array)" - ); - let values = make_array(data.child_data()[0].clone()); - let length = match data.data_type() { - DataType::FixedSizeList(_, len) => { - if *len > 0 { - // check that child data is multiple of length - assert_eq!( - values.len() % *len as usize, - 0, - "FixedSizeListArray child array length should be a multiple of {len}" - ); - } - - *len - } + let value_length = match data.data_type() { + DataType::FixedSizeList(_, len) => *len, _ => { panic!("FixedSizeListArray data should contain a FixedSizeList data type") } }; + + let size = value_length as usize; + let values = make_array( + data.child_data()[0].slice(data.offset() * size, data.len() * size), + ); Self { - data, + data_type: data.data_type().clone(), values, - length, + nulls: data.nulls().cloned(), + value_length, + len: data.len(), } } } impl From for ArrayData { fn from(array: FixedSizeListArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.len) + .nulls(array.nulls) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -210,24 +213,52 @@ impl Array for FixedSizeListArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } @@ -258,7 +289,6 @@ mod tests { use super::*; use crate::cast::AsArray; use crate::types::Int32Type; - use crate::Int32Array; use arrow_buffer::{bit_util, Buffer}; use arrow_schema::Field; @@ -289,15 +319,7 @@ mod tests { assert_eq!(0, list_array.null_count()); assert_eq!(6, list_array.value_offset(2)); assert_eq!(3, list_array.value_length()); - assert_eq!( - 0, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); + assert_eq!(0, list_array.value(0).as_primitive::().value(0)); for i in 0..3 { assert!(list_array.is_valid(i)); assert!(!list_array.is_null(i)); @@ -305,26 +327,24 @@ mod tests { // Now test with a non-zero offset let list_data = ArrayData::builder(list_data_type) - .len(3) + .len(2) .offset(1) .add_child_data(value_data.clone()) .build() .unwrap(); let list_array = FixedSizeListArray::from(list_data); - assert_eq!(value_data, list_array.values().to_data()); + assert_eq!(value_data.slice(3, 6), list_array.values().to_data()); assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); + assert_eq!(2, list_array.len()); assert_eq!(0, list_array.null_count()); assert_eq!(3, list_array.value(0).as_primitive::().value(0)); - assert_eq!(6, list_array.value_offset(1)); + assert_eq!(3, list_array.value_offset(1)); assert_eq!(3, list_array.value_length()); } #[test] - #[should_panic( - expected = "FixedSizeListArray child array length should be a multiple of 3" - )] + #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -389,11 +409,10 @@ mod tests { let sliced_array = list_array.slice(1, 4); assert_eq!(4, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); assert_eq!(2, sliced_array.null_count()); for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { + if bit_util::get_bit(&null_bits, 1 + i) { assert!(sliced_array.is_valid(i)); } else { assert!(sliced_array.is_null(i)); @@ -406,12 +425,14 @@ mod tests { .downcast_ref::() .unwrap(); assert_eq!(2, sliced_list_array.value_length()); - assert_eq!(6, sliced_list_array.value_offset(2)); - assert_eq!(8, sliced_list_array.value_offset(3)); + assert_eq!(4, sliced_list_array.value_offset(2)); + assert_eq!(6, sliced_list_array.value_offset(3)); } #[test] - #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] + #[should_panic( + expected = "the offset of the new Buffer cannot exceed the existing length" + )] fn test_fixed_size_list_array_index_out_of_bound() { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index f47ea80696e7..8e6f84743f2a 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -21,7 +21,7 @@ use crate::{ iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; use num::Integer; use std::any::Any; @@ -52,7 +52,8 @@ impl OffsetSizeTrait for i64 { /// /// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]` pub struct GenericListArray { - data: ArrayData, + data_type: DataType, + nulls: Option, values: ArrayRef, value_offsets: OffsetBuffer, } @@ -60,7 +61,8 @@ pub struct GenericListArray { impl Clone for GenericListArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), values: self.values.clone(), value_offsets: self.value_offsets.clone(), } @@ -144,8 +146,12 @@ impl GenericListArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + } } /// Creates a [`GenericListArray`] from an iterator of primitive values @@ -201,7 +207,14 @@ impl From> for ArrayData { fn from(array: GenericListArray) -> Self { - array.data + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![array.value_offsets.into_inner().into_inner()]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -244,7 +257,8 @@ impl GenericListArray { let value_offsets = unsafe { get_offsets(&data) }; Ok(Self { - data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), values, value_offsets, }) @@ -256,24 +270,54 @@ impl Array for GenericListArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.value_offsets.len() - 1 + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } @@ -649,11 +693,10 @@ mod tests { let sliced_array = list_array.slice(1, 6); assert_eq!(6, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); assert_eq!(3, sliced_array.null_count()); for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { + if bit_util::get_bit(&null_bits, 1 + i) { assert!(sliced_array.is_valid(i)); } else { assert!(sliced_array.is_null(i)); @@ -713,11 +756,10 @@ mod tests { let sliced_array = list_array.slice(1, 6); assert_eq!(6, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); assert_eq!(3, sliced_array.null_count()); for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { + if bit_util::get_bit(&null_bits, 1 + i) { assert!(sliced_array.is_valid(i)); } else { assert!(sliced_array.is_null(i)); diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 1629532b8452..18b3eb3cec32 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -18,7 +18,7 @@ use crate::array::{get_offsets, print_long_array}; use crate::{make_array, Array, ArrayRef, ListArray, StringArray, StructArray}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; @@ -30,7 +30,8 @@ use std::sync::Arc; /// [StructArray] with 2 child fields. #[derive(Clone)] pub struct MapArray { - data: ArrayData, + data_type: DataType, + nulls: Option, /// The [`StructArray`] that is the direct child of this array entries: ArrayRef, /// The first child of `entries`, the "keys" of this MapArray @@ -112,8 +113,14 @@ impl MapArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + entries: self.entries.clone(), + keys: self.keys.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + } } } @@ -126,7 +133,14 @@ impl From for MapArray { impl From for ArrayData { fn from(array: MapArray) -> Self { - array.data + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![array.value_offsets.into_inner().into_inner()]) + .child_data(vec![array.entries.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -177,7 +191,8 @@ impl MapArray { let value_offsets = unsafe { get_offsets(&data) }; Ok(Self { - data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), entries, keys, values, @@ -229,34 +244,54 @@ impl Array for MapArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into_data() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.value_offsets.len() - 1 + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() } - /// Returns the total number of bytes of memory occupied by the buffers owned by this [MapArray]. fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() + let mut size = self.entries.get_buffer_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } - /// Returns the total number of bytes of memory occupied physically by this [MapArray]. fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + std::mem::size_of_val(self) + let mut size = std::mem::size_of::() + self.entries.get_array_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index fa6e970b497a..e6fd6828bac7 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -94,10 +94,6 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn as_any(&self) -> &dyn Any; - /// Returns a reference to the underlying data of this array - #[deprecated(note = "Use Array::to_data or Array::into_data")] - fn data(&self) -> &ArrayData; - /// Returns the underlying data of this array fn to_data(&self) -> ArrayData; @@ -106,13 +102,6 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Unlike [`Array::to_data`] this consumes self, allowing it avoid unnecessary clones fn into_data(self) -> ArrayData; - /// Returns a reference-counted pointer to the underlying data of this array. - #[deprecated(note = "Use Array::to_data or Array::into_data")] - #[allow(deprecated)] - fn data_ref(&self) -> &ArrayData { - self.data() - } - /// Returns a reference to the [`DataType`](arrow_schema::DataType) of this array. /// /// # Example: @@ -125,10 +114,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(*array.data_type(), DataType::Int32); /// ``` - #[allow(deprecated)] // (#3880) - fn data_type(&self) -> &DataType { - self.data_ref().data_type() - } + fn data_type(&self) -> &DataType; /// Returns a zero-copy slice of this array with the indicated offset and length. /// @@ -156,10 +142,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.len(), 5); /// ``` - #[allow(deprecated)] // (#3880) - fn len(&self) -> usize { - self.data_ref().len() - } + fn len(&self) -> usize; /// Returns whether this array is empty. /// @@ -172,10 +155,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.is_empty(), false); /// ``` - #[allow(deprecated)] // (#3880) - fn is_empty(&self) -> bool { - self.data_ref().is_empty() - } + fn is_empty(&self) -> bool; /// Returns the offset into the underlying data used by this array(-slice). /// Note that the underlying data can be shared by many arrays. @@ -184,19 +164,15 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow_array::{Array, Int32Array}; + /// use arrow_array::{Array, BooleanArray}; /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// // Make slice over the values [2, 3, 4] + /// let array = BooleanArray::from(vec![false, false, true, true]); /// let array_slice = array.slice(1, 3); /// /// assert_eq!(array.offset(), 0); /// assert_eq!(array_slice.offset(), 1); /// ``` - #[allow(deprecated)] // (#3880) - fn offset(&self) -> usize { - self.data_ref().offset() - } + fn offset(&self) -> usize; /// Returns the null buffers of this array if any fn nulls(&self) -> Option<&NullBuffer>; @@ -253,21 +229,12 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. - #[allow(deprecated)] // (#3880) - fn get_buffer_memory_size(&self) -> usize { - self.data_ref().get_buffer_memory_size() - } + fn get_buffer_memory_size(&self) -> usize; /// Returns the total number of bytes of memory occupied physically by this array. /// This value will always be greater than returned by `get_buffer_memory_size()` and /// includes the overhead of the data structures that contain the pointers to the various buffers. - #[allow(deprecated)] // (#3880) - fn get_array_memory_size(&self) -> usize { - // both data.get_array_memory_size and size_of_val(self) include ArrayData fields, - // to only count additional fields of this array subtract size_of(ArrayData) - self.data_ref().get_array_memory_size() + std::mem::size_of_val(self) - - std::mem::size_of::() - } + fn get_array_memory_size(&self) -> usize; } /// A reference-counted reference to a generic `Array`. @@ -279,11 +246,6 @@ impl Array for ArrayRef { self.as_ref().as_any() } - #[allow(deprecated)] - fn data(&self) -> &ArrayData { - self.as_ref().data() - } - fn to_data(&self) -> ArrayData { self.as_ref().to_data() } @@ -292,11 +254,6 @@ impl Array for ArrayRef { self.to_data() } - #[allow(deprecated)] - fn data_ref(&self) -> &ArrayData { - self.as_ref().data_ref() - } - fn data_type(&self) -> &DataType { self.as_ref().data_type() } @@ -347,11 +304,6 @@ impl<'a, T: Array> Array for &'a T { T::as_any(self) } - #[allow(deprecated)] - fn data(&self) -> &ArrayData { - T::data(self) - } - fn to_data(&self) -> ArrayData { T::to_data(self) } @@ -360,11 +312,6 @@ impl<'a, T: Array> Array for &'a T { self.to_data() } - #[allow(deprecated)] - fn data_ref(&self) -> &ArrayData { - T::data_ref(self) - } - fn data_type(&self) -> &DataType { T::data_type(self) } @@ -435,93 +382,80 @@ pub trait ArrayAccessor: Array { } impl PartialEq for dyn Array + '_ { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for dyn Array + '_ { - #[allow(deprecated)] fn eq(&self, other: &T) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for NullArray { - #[allow(deprecated)] fn eq(&self, other: &NullArray) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for PrimitiveArray { - #[allow(deprecated)] fn eq(&self, other: &PrimitiveArray) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for DictionaryArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for BooleanArray { - #[allow(deprecated)] fn eq(&self, other: &BooleanArray) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for GenericStringArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for GenericBinaryArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for FixedSizeBinaryArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for GenericListArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for MapArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for FixedSizeListArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for StructArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } @@ -752,7 +686,7 @@ mod tests { use super::*; use crate::cast::{as_union_array, downcast_array}; use crate::downcast_run_array; - use arrow_buffer::{Buffer, MutableBuffer}; + use arrow_buffer::MutableBuffer; use arrow_schema::{Field, Fields, UnionFields, UnionMode}; #[test] @@ -962,13 +896,9 @@ mod tests { assert_eq!(0, null_arr.get_buffer_memory_size()); assert_eq!( - std::mem::size_of::(), + std::mem::size_of::(), null_arr.get_array_memory_size() ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::(), - ); } #[test] @@ -1001,8 +931,7 @@ mod tests { // which includes the optional validity buffer // plus one buffer on the heap assert_eq!( - std::mem::size_of::>() - + std::mem::size_of::(), + std::mem::size_of::>(), empty_with_bitmap.get_array_memory_size() ); diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index b5d9247a6d7f..c7f61d91da70 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -19,7 +19,7 @@ use crate::{Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -40,7 +40,7 @@ use std::sync::Arc; /// ``` #[derive(Clone)] pub struct NullArray { - data: ArrayData, + len: usize, } impl NullArray { @@ -50,15 +50,17 @@ impl NullArray { /// other [`DataType`]. /// pub fn new(length: usize) -> Self { - let array_data = ArrayData::builder(DataType::Null).len(length); - let array_data = unsafe { array_data.build_unchecked() }; - NullArray::from(array_data) + Self { len: length } } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced BooleanBuffer cannot exceed the existing length" + ); + + Self { len } } } @@ -67,22 +69,34 @@ impl Array for NullArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &DataType::Null + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { None } @@ -104,6 +118,14 @@ impl Array for NullArray { fn null_count(&self) -> usize { self.len() } + + fn get_buffer_memory_size(&self) -> usize { + 0 + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + } } impl From for NullArray { @@ -122,13 +144,14 @@ impl From for NullArray { data.nulls().is_none(), "NullArray data should not contain a null buffer, as no buffers are required" ); - Self { data } + Self { len: data.len() } } } impl From for ArrayData { fn from(array: NullArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(DataType::Null).len(array.len); + unsafe { builder.build_unchecked() } } } @@ -158,7 +181,6 @@ mod tests { let array2 = array1.slice(8, 16); assert_eq!(array2.len(), 16); assert_eq!(array2.null_count(), 16); - assert_eq!(array2.offset(), 8); } #[test] diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 75bf85b3f2a0..fc17fb04ee95 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -29,7 +29,7 @@ use arrow_buffer::{ i256, ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer, }; use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; @@ -248,17 +248,18 @@ pub use crate::types::ArrowPrimitiveType; /// } /// ``` pub struct PrimitiveArray { - /// Underlying ArrayData - data: ArrayData, + data_type: DataType, /// Values data values: ScalarBuffer, + nulls: Option, } impl Clone for PrimitiveArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), values: self.values.clone(), + nulls: self.nulls.clone(), } } } @@ -282,15 +283,19 @@ impl PrimitiveArray { } // TODO: Don't store ArrayData inside arrays (#3880) - let data = unsafe { - ArrayData::builder(data_type) - .len(values.len()) - .nulls(nulls) - .buffers(vec![values.inner().clone()]) - .build_unchecked() - }; + // let data = unsafe { + // ArrayData::builder(data_type) + // .len(values.len()) + // .nulls(nulls) + // .buffers(vec![values.inner().clone()]) + // .build_unchecked() + // }; - Self { data, values } + Self { + data_type, + values, + nulls, + } } /// Asserts that `data_type` is compatible with `Self` @@ -306,12 +311,12 @@ impl PrimitiveArray { /// Returns the length of this array. #[inline] pub fn len(&self) -> usize { - self.data.len() + self.values.len() } /// Returns whether this array is empty. pub fn is_empty(&self) -> bool { - self.data.is_empty() + self.values.is_empty() } /// Returns the values of this array @@ -410,8 +415,11 @@ impl PrimitiveArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset, length), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + } } /// Reinterprets this array's contents as a different data type without copying @@ -436,7 +444,7 @@ impl PrimitiveArray { where K: ArrowPrimitiveType, { - let d = self.data.clone().into_builder().data_type(K::DATA_TYPE); + let d = self.to_data().into_builder().data_type(K::DATA_TYPE); // SAFETY: // Native type is the same @@ -629,14 +637,14 @@ impl PrimitiveArray { /// data buffer is not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced()); + let data = self.into_data(); + let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); - let buffer = self.data.buffers()[0] - .slice_with_length(self.data.offset() * element_len, len * element_len); + let buffer = data.buffers()[0] + .slice_with_length(data.offset() * element_len, len * element_len); - drop(self.data); - drop(self.values); + drop(data); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -686,7 +694,12 @@ impl PrimitiveArray { impl From> for ArrayData { fn from(array: PrimitiveArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.values.len()) + .nulls(array.nulls) + .buffers(vec![array.values.into_inner()]); + + unsafe { builder.build_unchecked() } } } @@ -695,24 +708,48 @@ impl Array for PrimitiveArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.values.len() + } + + fn is_empty(&self) -> bool { + self.values.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -1061,8 +1098,7 @@ impl PrimitiveArray { /// Construct a timestamp array with an optional timezone pub fn with_timezone_opt>>(&self, timezone: Option) -> Self { let array_data = unsafe { - self.data - .clone() + self.to_data() .into_builder() .data_type(DataType::Timestamp(T::UNIT, timezone.map(Into::into))) .build_unchecked() @@ -1083,7 +1119,11 @@ impl From for PrimitiveArray { let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); - Self { data, values } + Self { + data_type: data.data_type().clone(), + values, + nulls: data.nulls().cloned(), + } } } @@ -1108,12 +1148,10 @@ impl PrimitiveArray { self.validate_precision_scale(precision, scale)?; // safety: self.data is valid DataType::Decimal as checked above - let new_data_type = T::TYPE_CONSTRUCTOR(precision, scale); - let data = self.data.into_builder().data_type(new_data_type); - - // SAFETY - // Validated data above - Ok(unsafe { data.build_unchecked().into() }) + Ok(Self { + data_type: T::TYPE_CONSTRUCTOR(precision, scale), + ..self + }) } // validate that the new precision and scale are valid or not @@ -1244,7 +1282,7 @@ mod tests { fn test_primitive_array_from_vec() { let buf = Buffer::from_slice_ref([0, 1, 2, 3, 4]); let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - assert_eq!(buf, *arr.data.buffers()[0]); + assert_eq!(&buf, arr.values.inner()); assert_eq!(5, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1484,7 +1522,6 @@ mod tests { let arr2 = arr.slice(2, 5); assert_eq!(5, arr2.len()); - assert_eq!(2, arr2.offset()); assert_eq!(1, arr2.null_count()); for i in 0..arr2.len() { @@ -1497,7 +1534,6 @@ mod tests { let arr3 = arr2.slice(2, 3); assert_eq!(3, arr3.len()); - assert_eq!(4, arr3.offset()); assert_eq!(0, arr3.null_count()); let int_arr3 = arr3.as_any().downcast_ref::().unwrap(); @@ -1742,7 +1778,7 @@ mod tests { fn test_primitive_array_builder() { // Test building a primitive array with ArrayData builder and offset let buf = Buffer::from_slice_ref([0i32, 1, 2, 3, 4, 5, 6]); - let buf2 = buf.clone(); + let buf2 = buf.slice_with_length(8, 20); let data = ArrayData::builder(DataType::Int32) .len(5) .offset(2) @@ -1750,7 +1786,7 @@ mod tests { .build() .unwrap(); let arr = Int32Array::from(data); - assert_eq!(buf2, *arr.data.buffers()[0]); + assert_eq!(&buf2, arr.values.inner()); assert_eq!(5, arr.len()); assert_eq!(0, arr.null_count()); for i in 0..3 { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 0754913e9d3e..e7e71d3840bb 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -62,7 +62,7 @@ use crate::{ /// ``` pub struct RunArray { - data: ArrayData, + data_type: DataType, run_ends: RunEndBuffer, values: ArrayRef, } @@ -70,7 +70,7 @@ pub struct RunArray { impl Clone for RunArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), run_ends: self.run_ends.clone(), values: self.values.clone(), } @@ -256,8 +256,11 @@ impl RunArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + run_ends: self.run_ends.slice(offset, length), + values: self.values.clone(), + } } } @@ -282,7 +285,7 @@ impl From for RunArray { let values = make_array(data.child_data()[1].clone()); Self { - data, + data_type: data.data_type().clone(), run_ends, values, } @@ -291,7 +294,21 @@ impl From for RunArray { impl From> for ArrayData { fn from(array: RunArray) -> Self { - array.data + let len = array.run_ends.len(); + let offset = array.run_ends.offset(); + + let run_ends = ArrayDataBuilder::new(R::DATA_TYPE) + .len(array.run_ends.values().len()) + .buffers(vec![array.run_ends.into_inner().into_inner()]); + + let run_ends = unsafe { run_ends.build_unchecked() }; + + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .offset(offset) + .child_data(vec![run_ends, array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -300,25 +317,47 @@ impl Array for RunArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.run_ends.len() + } + + fn is_empty(&self) -> bool { + self.run_ends.is_empty() + } + + fn offset(&self) -> usize { + self.run_ends.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { None } + + fn get_buffer_memory_size(&self) -> usize { + self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + + self.run_ends.inner().inner().capacity() + + self.values.get_array_memory_size() + } } impl std::fmt::Debug for RunArray { @@ -497,10 +536,6 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array } - fn data(&self) -> &ArrayData { - &self.run_array.data - } - fn to_data(&self) -> ArrayData { self.run_array.to_data() } @@ -509,13 +544,37 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array.into_data() } + fn data_type(&self) -> &DataType { + self.run_array.data_type() + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.run_array.slice(offset, length)) } + fn len(&self) -> usize { + self.run_array.len() + } + + fn is_empty(&self) -> bool { + self.run_array.is_empty() + } + + fn offset(&self) -> usize { + self.run_array.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { self.run_array.nulls() } + + fn get_buffer_memory_size(&self) -> usize { + self.run_array.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + self.run_array.get_array_memory_size() + } } // Array accessor converts the index of logical array to the index of the physical array diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 1dccfc7d4ef3..fa43062b77bf 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -17,7 +17,7 @@ use crate::{make_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder}; use std::sync::Arc; use std::{any::Any, ops::Index}; @@ -74,24 +74,26 @@ use std::{any::Any, ops::Index}; /// ``` #[derive(Clone)] pub struct StructArray { - data: ArrayData, - pub(crate) boxed_fields: Vec, + len: usize, + data_type: DataType, + nulls: Option, + pub(crate) fields: Vec, } impl StructArray { /// Returns the field at `pos`. pub fn column(&self, pos: usize) -> &ArrayRef { - &self.boxed_fields[pos] + &self.fields[pos] } /// Return the number of fields in this struct array pub fn num_columns(&self) -> usize { - self.boxed_fields.len() + self.fields.len() } /// Returns the fields of the struct array pub fn columns(&self) -> &[ArrayRef] { - &self.boxed_fields + &self.fields } /// Returns child array refs of the struct array @@ -102,7 +104,7 @@ impl StructArray { /// Return field names in this struct array pub fn column_names(&self) -> Vec<&str> { - match self.data.data_type() { + match self.data_type() { DataType::Struct(fields) => fields .iter() .map(|f| f.name().as_str()) @@ -132,27 +134,48 @@ impl StructArray { } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced StructArray cannot exceed the existing length" + ); + + let fields = self.fields.iter().map(|a| a.slice(offset, len)).collect(); + + Self { + len, + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)), + fields, + } } } impl From for StructArray { fn from(data: ArrayData) -> Self { - let boxed_fields = data + let fields = data .child_data() .iter() .map(|cd| make_array(cd.clone())) .collect(); - Self { data, boxed_fields } + Self { + len: data.len(), + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + fields, + } } } impl From for ArrayData { fn from(array: StructArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.len) + .nulls(array.nulls) + .child_data(array.fields.iter().map(|x| x.to_data()).collect()); + + unsafe { builder.build_unchecked() } } } @@ -228,24 +251,53 @@ impl Array for StructArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.fields.iter().map(|a| a.get_buffer_memory_size()).sum(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = self.fields.iter().map(|a| a.get_array_memory_size()).sum(); + size += std::mem::size_of::(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } @@ -343,15 +395,11 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { impl From for StructArray { fn from(value: RecordBatch) -> Self { - // TODO: Don't store ArrayData inside arrays (#3880) - let builder = ArrayData::builder(DataType::Struct(value.schema().fields.clone())) - .child_data(value.columns().iter().map(|x| x.to_data()).collect()) - .len(value.num_rows()); - - // Safety: RecordBatch must be valid Self { - data: unsafe { builder.build_unchecked() }, - boxed_fields: value.columns().to_vec(), + len: value.num_rows(), + data_type: DataType::Struct(value.schema().fields().clone()), + nulls: None, + fields: value.columns().to_vec(), } } } @@ -607,7 +655,6 @@ mod tests { let sliced_array = struct_array.slice(2, 3); let sliced_array = sliced_array.as_any().downcast_ref::().unwrap(); assert_eq!(3, sliced_array.len()); - assert_eq!(2, sliced_array.offset()); assert_eq!(1, sliced_array.null_count()); assert!(sliced_array.is_valid(0)); assert!(sliced_array.is_null(1)); @@ -616,7 +663,6 @@ mod tests { let sliced_c0 = sliced_array.column(0); let sliced_c0 = sliced_c0.as_any().downcast_ref::().unwrap(); assert_eq!(3, sliced_c0.len()); - assert_eq!(2, sliced_c0.offset()); assert!(sliced_c0.is_null(0)); assert!(sliced_c0.is_null(1)); assert!(sliced_c0.is_valid(2)); @@ -625,7 +671,6 @@ mod tests { let sliced_c1 = sliced_array.column(1); let sliced_c1 = sliced_c1.as_any().downcast_ref::().unwrap(); assert_eq!(3, sliced_c1.len()); - assert_eq!(2, sliced_c1.offset()); assert!(sliced_c1.is_valid(0)); assert_eq!(42, sliced_c1.value(0)); assert!(sliced_c1.is_null(1)); diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 7b818f3130b7..2a516751c40d 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -18,7 +18,7 @@ use crate::{make_array, Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{Buffer, ScalarBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode}; /// Contains the `UnionArray` type. /// @@ -108,10 +108,10 @@ use std::sync::Arc; /// ``` #[derive(Clone)] pub struct UnionArray { - data: ArrayData, + data_type: DataType, type_ids: ScalarBuffer, offsets: Option>, - boxed_fields: Vec>, + fields: Vec>, } impl UnionArray { @@ -231,7 +231,7 @@ impl UnionArray { /// Panics if the `type_id` provided is less than zero or greater than the number of types /// in the `Union`. pub fn child(&self, type_id: i8) -> &ArrayRef { - let boxed = &self.boxed_fields[type_id as usize]; + let boxed = &self.fields[type_id as usize]; boxed.as_ref().expect("invalid type id") } @@ -279,7 +279,7 @@ impl UnionArray { /// Returns the names of the types in the union. pub fn type_names(&self) -> Vec<&str> { - match self.data.data_type() { + match self.data_type() { DataType::Union(fields, _) => fields .iter() .map(|(_, f)| f.name().as_str()) @@ -290,7 +290,7 @@ impl UnionArray { /// Returns whether the `UnionArray` is dense (or sparse if `false`). fn is_dense(&self) -> bool { - match self.data.data_type() { + match self.data_type() { DataType::Union(_, mode) => mode == &UnionMode::Dense, _ => unreachable!("Union array's data type is not a union!"), } @@ -298,8 +298,24 @@ impl UnionArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + let (offsets, fields) = match self.offsets.as_ref() { + Some(offsets) => (Some(offsets.slice(offset, length)), self.fields.clone()), + None => { + let fields = self + .fields + .iter() + .map(|x| x.as_ref().map(|x| x.slice(offset, length))) + .collect(); + (None, fields) + } + }; + + Self { + data_type: self.data_type.clone(), + type_ids: self.type_ids.slice(offset, length), + offsets, + fields, + } } } @@ -330,17 +346,36 @@ impl From for UnionArray { boxed_fields[field_id as usize] = Some(make_array(cd.clone())); } Self { - data, + data_type: data.data_type().clone(), type_ids, offsets, - boxed_fields, + fields: boxed_fields, } } } impl From for ArrayData { fn from(array: UnionArray) -> Self { - array.data + let len = array.len(); + let f = match &array.data_type { + DataType::Union(f, _) => f, + _ => unreachable!(), + }; + let buffers = match array.offsets { + Some(o) => vec![array.type_ids.into_inner(), o.into_inner()], + None => vec![array.type_ids.into_inner()], + }; + + let child = f + .iter() + .map(|(i, _)| array.fields[i as usize].as_ref().unwrap().to_data()) + .collect(); + + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .buffers(buffers) + .child_data(child); + unsafe { builder.build_unchecked() } } } @@ -349,22 +384,34 @@ impl Array for UnionArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.type_ids.len() + } + + fn is_empty(&self) -> bool { + self.type_ids.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { None } @@ -386,6 +433,32 @@ impl Array for UnionArray { fn null_count(&self) -> usize { 0 } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.type_ids.inner().capacity(); + if let Some(o) = self.offsets.as_ref() { + sum += o.inner().capacity() + } + self.fields + .iter() + .flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size())) + .sum::() + + sum + } + + fn get_array_memory_size(&self) -> usize { + let mut sum = self.type_ids.inner().capacity(); + if let Some(o) = self.offsets.as_ref() { + sum += o.inner().capacity() + } + std::mem::size_of::() + + self + .fields + .iter() + .flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size())) + .sum::() + + sum + } } impl std::fmt::Debug for UnionArray { diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 1350285f8b26..ee61d2da6597 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -474,7 +474,7 @@ impl From for RecordBatch { ); let row_count = value.len(); let schema = Arc::new(Schema::new(value.fields().clone())); - let columns = value.boxed_fields; + let columns = value.fields; RecordBatch { schema, @@ -614,7 +614,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) .unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 564); + assert_eq!(record_batch.get_array_memory_size(), 364); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 05b56a0e8d32..2c1dae5187fa 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4667,10 +4667,8 @@ mod tests { let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); assert_eq!(0, array.offset()); let array = array.slice(2, 3); - assert_eq!(2, array.offset()); let b = cast(&array, &DataType::UInt8).unwrap(); assert_eq!(3, b.len()); - assert_eq!(0, b.offset()); let c = b.as_any().downcast_ref::().unwrap(); assert!(!c.is_valid(0)); assert_eq!(8, c.value(1)); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 7d44d8f24030..c1fed0206fcc 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -24,11 +24,11 @@ use std::cmp::min; use std::collections::HashMap; use std::io::{BufWriter, Write}; -use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; use flatbuffers::FlatBufferBuilder; use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; +use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; use arrow_array::*; use arrow_buffer::bit_util; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; @@ -1107,94 +1107,29 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } } -/// Returns byte width for binary value_offset buffer spec. -#[inline] -fn get_value_offset_byte_width(data_type: &DataType) -> usize { - match data_type { - DataType::Binary | DataType::Utf8 => 4, - DataType::LargeBinary | DataType::LargeUtf8 => 8, - _ => unreachable!(), +/// Returns the values and offsets [`Buffer`] for a ByteArray with offset type `O` +fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) { + if data.is_empty() { + return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); } -} -/// Returns the number of total bytes in base binary arrays. -fn get_binary_buffer_len(array_data: &ArrayData) -> usize { - if array_data.is_empty() { - return 0; - } - match array_data.data_type() { - DataType::Binary => { - let array: BinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - DataType::LargeBinary => { - let array: LargeBinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - DataType::Utf8 => { - let array: StringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - DataType::LargeUtf8 => { - let array: LargeStringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - _ => unreachable!(), - } -} + let buffers = data.buffers(); + let offsets: &[O] = buffers[0].typed_data::(); + let offset_slice = &offsets[data.offset()..data.offset() + data.len() + 1]; -/// Rebase value offsets for given ArrayData to zero-based. -fn get_zero_based_value_offsets( - array_data: &ArrayData, -) -> Buffer { - match array_data.data_type() { - DataType::Binary | DataType::LargeBinary => { - let array: GenericBinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - let start_offset = offsets[0]; - - let mut builder = BufferBuilder::::new(array_data.len() + 1); - for x in offsets { - builder.append(*x - start_offset); - } - - builder.finish() - } - DataType::Utf8 | DataType::LargeUtf8 => { - let array: GenericStringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - let start_offset = offsets[0]; - - let mut builder = BufferBuilder::::new(array_data.len() + 1); - for x in offsets { - builder.append(*x - start_offset); - } + let start_offset = offset_slice.first().unwrap(); + let end_offset = offset_slice.last().unwrap(); - builder.finish() - } - _ => unreachable!(), - } -} + let offsets = match start_offset.as_usize() { + 0 => buffers[0].clone(), + _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), + }; -/// Returns the start offset of base binary array. -fn get_buffer_offset(array_data: &ArrayData) -> OffsetSize { - match array_data.data_type() { - DataType::Binary | DataType::LargeBinary => { - let array: GenericBinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - offsets[0] - } - DataType::Utf8 | DataType::LargeUtf8 => { - let array: GenericStringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - offsets[0] - } - _ => unreachable!(), - } + let values = buffers[1].slice_with_length( + start_offset.as_usize(), + end_offset.as_usize() - start_offset.as_usize(), + ); + (offsets, values) } /// Write array data to a vector of bytes @@ -1241,65 +1176,27 @@ fn write_array_data( } let data_type = array_data.data_type(); - if matches!( - data_type, - DataType::Binary | DataType::LargeBinary | DataType::Utf8 | DataType::LargeUtf8 - ) { - let offset_buffer = &array_data.buffers()[0]; - let value_offset_byte_width = get_value_offset_byte_width(data_type); - let min_length = (array_data.len() + 1) * value_offset_byte_width; - if buffer_need_truncate( - array_data.offset(), - offset_buffer, - &BufferSpec::FixedWidth { - byte_width: value_offset_byte_width, - }, - min_length, - ) { - // Rebase offsets and truncate values - let (new_offsets, byte_offset) = - if matches!(data_type, DataType::Binary | DataType::Utf8) { - ( - get_zero_based_value_offsets::(array_data), - get_buffer_offset::(array_data) as usize, - ) - } else { - ( - get_zero_based_value_offsets::(array_data), - get_buffer_offset::(array_data) as usize, - ) - }; - + if matches!(data_type, DataType::Binary | DataType::Utf8) { + let (offsets, values) = get_byte_array_buffers::(array_data); + for buffer in [offsets, values] { offset = write_buffer( - new_offsets.as_slice(), + buffer.as_slice(), buffers, arrow_data, offset, compression_codec, )?; - - let total_bytes = get_binary_buffer_len(array_data); - let value_buffer = &array_data.buffers()[1]; - let buffer_length = min(total_bytes, value_buffer.len() - byte_offset); - let buffer_slice = - &value_buffer.as_slice()[byte_offset..(byte_offset + buffer_length)]; + } + } else if matches!(data_type, DataType::LargeBinary | DataType::LargeUtf8) { + let (offsets, values) = get_byte_array_buffers::(array_data); + for buffer in [offsets, values] { offset = write_buffer( - buffer_slice, + buffer.as_slice(), buffers, arrow_data, offset, compression_codec, )?; - } else { - for buffer in array_data.buffers() { - offset = write_buffer( - buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - )?; - } } } else if DataType::is_numeric(data_type) || DataType::is_temporal(data_type) @@ -1445,20 +1342,20 @@ fn pad_to_8(len: u32) -> usize { #[cfg(test)] mod tests { - use super::*; - use std::io::Cursor; use std::io::Seek; use std::sync::Arc; - use crate::MetadataVersion; - - use crate::reader::*; use arrow_array::builder::PrimitiveRunBuilder; use arrow_array::builder::UnionBuilder; use arrow_array::types::*; use arrow_schema::DataType; + use crate::reader::*; + use crate::MetadataVersion; + + use super::*; + #[test] #[cfg(feature = "lz4")] fn test_write_empty_record_batch_lz4_compression() { diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 6039d53eaedc..aaa3423d69e5 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -208,8 +208,7 @@ mod tests { let s = s.slice(2, 3); let select = select.slice(1, 3); - let select = select.as_boolean(); - let a = nullif(&s, select).unwrap(); + let a = nullif(&s, &select).unwrap(); let r: Vec<_> = a.as_string::().iter().collect(); assert_eq!(r, vec![None, Some("a"), None]); } @@ -509,9 +508,8 @@ mod tests { .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5))) .collect(); let b = b.slice(b_start_offset, a_length); - let b = b.as_boolean(); - test_nullif(&a, b); + test_nullif(&a, &b); } } } From d7dd502bf5d7c14296c4e2c7fd79638be906f867 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 12 Apr 2023 12:16:28 +0100 Subject: [PATCH 2/4] Fix doc --- arrow-array/src/array/primitive_array.rs | 27 ++++++------------------ arrow/src/lib.rs | 2 +- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index fc17fb04ee95..3199104382a6 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -282,15 +282,6 @@ impl PrimitiveArray { assert_eq!(values.len(), n.len()); } - // TODO: Don't store ArrayData inside arrays (#3880) - // let data = unsafe { - // ArrayData::builder(data_type) - // .len(values.len()) - // .nulls(nulls) - // .buffers(vec![values.inner().clone()]) - // .build_unchecked() - // }; - Self { data_type, values, @@ -372,18 +363,12 @@ impl PrimitiveArray { /// Creates a PrimitiveArray based on an iterator of values without nulls pub fn from_iter_values>(iter: I) -> Self { let val_buf: Buffer = iter.into_iter().collect(); - let data = unsafe { - ArrayData::new_unchecked( - T::DATA_TYPE, - val_buf.len() / std::mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ) - }; - PrimitiveArray::from(data) + let len = val_buf.len() / std::mem::size_of::(); + Self { + data_type: T::DATA_TYPE, + values: ScalarBuffer::new(val_buf, 0, len), + nulls: None, + } } /// Creates a PrimitiveArray based on a constant value with `count` elements diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 8bad29bf74b7..27c905ba0cd6 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -322,7 +322,7 @@ //! Advanced users may wish to interact with the underlying buffers of an [`Array`], for example, //! for FFI or high-performance conversion from other formats. This interface is provided by //! [`ArrayData`] which stores the [`Buffer`] comprising an [`Array`], and can be accessed -//! with [`Array::data`](array::Array::data) +//! with [`Array::to_data`](array::Array::to_data) //! //! The APIs for constructing [`ArrayData`] come in safe, and unsafe variants, with the former //! performing extensive, but potentially expensive validation to ensure the buffers are well-formed. From dd7dc10a665593262c1aa0026a4e2346964b60b5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 12 Apr 2023 12:23:31 +0100 Subject: [PATCH 3/4] Fix pyarrow-integration-testing --- arrow-pyarrow-integration-testing/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index cf94b0dd40af..af400868ffa9 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -52,7 +52,7 @@ fn double(array: &PyAny, py: Python) -> PyResult { let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?; // export - array.data().to_pyarrow(py) + array.to_data().to_pyarrow(py) } /// calls a lambda function that receives and returns an array @@ -64,7 +64,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { let expected = Arc::new(Int64Array::from(vec![Some(2), None, Some(6)])) as ArrayRef; // to py - let pyarray = array.data().to_pyarrow(py)?; + let pyarray = array.to_data().to_pyarrow(py)?; let pyarray = lambda.call1((pyarray,))?; let array = make_array(ArrayData::from_pyarrow(pyarray)?); @@ -75,7 +75,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { fn make_empty_array(datatype: PyArrowType, py: Python) -> PyResult { let array = new_empty_array(&datatype.0); - array.data().to_pyarrow(py) + array.to_data().to_pyarrow(py) } /// Returns the substring @@ -90,7 +90,7 @@ fn substring( // substring let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; - Ok(array.data().to_owned().into()) + Ok(array.to_data().into()) } /// Returns the concatenate @@ -101,7 +101,7 @@ fn concatenate(array: PyArrowType, py: Python) -> PyResult // concat let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; - array.data().to_pyarrow(py) + array.to_data().to_pyarrow(py) } #[pyfunction] From d8b1c6be6f8eeda2454d94e9a09f088252ba1e94 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 12 Apr 2023 19:01:00 +0100 Subject: [PATCH 4/4] Review feedback --- arrow-array/src/array/union_array.rs | 2 ++ arrow-ipc/src/writer.rs | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 2a516751c40d..172ae082197c 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -299,7 +299,9 @@ impl UnionArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { let (offsets, fields) = match self.offsets.as_ref() { + // If dense union, slice offsets Some(offsets) => (Some(offsets.slice(offset, length)), self.fields.clone()), + // Otherwise need to slice sparse children None => { let fields = self .fields diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index c1fed0206fcc..08ddd1812bb7 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1108,6 +1108,10 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } /// Returns the values and offsets [`Buffer`] for a ByteArray with offset type `O` +/// +/// In particular, this handles re-encoding the offsets if they don't start at `0`, +/// slicing the values buffer as appropriate. This helps reduce the encoded +/// size of sliced arrays, as values that have been sliced away are not encoded fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) { if data.is_empty() { return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());