From 0c1b2d1fe61efefd63523fad6e411363aae82aab Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 17 Mar 2023 13:09:37 +0000 Subject: [PATCH 1/3] Add Array::to_data and Array::nulls (#3880) --- arrow-arith/src/aggregate.rs | 8 ++-- arrow-arith/src/arity.rs | 3 +- arrow-arith/src/boolean.rs | 24 ++++++------ arrow-array/src/array/binary_array.rs | 2 +- arrow-array/src/array/boolean_array.rs | 17 ++++++++- arrow-array/src/array/byte_array.rs | 18 ++++++++- arrow-array/src/array/dictionary_array.rs | 27 +++++++++++++ .../src/array/fixed_size_binary_array.rs | 17 ++++++++- .../src/array/fixed_size_list_array.rs | 15 ++++++++ arrow-array/src/array/list_array.rs | 16 +++++++- arrow-array/src/array/map_array.rs | 15 +++++++- arrow-array/src/array/mod.rs | 38 +++++++++++++++---- arrow-array/src/array/null_array.rs | 17 ++++++++- arrow-array/src/array/primitive_array.rs | 18 ++++++++- arrow-array/src/array/run_array.rs | 28 +++++++++++++- arrow-array/src/array/struct_array.rs | 17 +++++++-- arrow-array/src/array/union_array.rs | 14 +++++++ arrow-array/src/builder/boolean_builder.rs | 4 +- .../src/builder/fixed_size_list_builder.rs | 8 ++-- arrow-array/src/cast.rs | 2 +- arrow-array/src/record_batch.rs | 4 +- arrow-integration-test/src/lib.rs | 4 +- arrow-json/src/reader.rs | 23 ++++------- arrow-ord/src/comparison.rs | 7 ++-- arrow-row/src/lib.rs | 4 +- arrow-select/src/concat.rs | 2 +- arrow-select/src/filter.rs | 2 +- arrow-string/src/length.rs | 8 ++-- arrow-string/src/regexp.rs | 2 +- arrow-string/src/substring.rs | 8 ++-- arrow/benches/array_data_validate.rs | 2 +- arrow/examples/dynamic_types.rs | 2 +- arrow/src/ffi.rs | 6 +-- arrow/src/util/bench_util.rs | 2 +- arrow/src/util/data_gen.rs | 4 +- 35 files changed, 297 insertions(+), 91 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 7777bb0ede43..8e760da21909 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -117,7 +117,7 @@ where .map(|i| unsafe { array.value_unchecked(i) }) .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) } else { - let nulls = array.data().nulls().unwrap(); + let nulls = array.nulls().unwrap(); let iter = BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()); unsafe { let idx = iter.reduce(|acc_idx, idx| { @@ -288,7 +288,7 @@ where let data: &[T::Native] = array.values(); - match array.data().nulls() { + match array.nulls() { None => { let sum = data.iter().fold(T::default_value(), |accumulator, value| { accumulator.add_wrapping(*value) @@ -347,7 +347,7 @@ where let data: &[T::Native] = array.values(); - match array.data().nulls() { + match array.nulls() { None => { let sum = data .iter() @@ -665,7 +665,7 @@ mod simd { let mut chunk_acc = A::init_accumulator_chunk(); let mut rem_acc = A::init_accumulator_scalar(); - match array.data().nulls() { + match array.nulls() { None => { let data_chunks = data.chunks_exact(64); let remainder = data_chunks.remainder(); diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 74edd654bbcd..162b56ef1fe3 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -414,8 +414,7 @@ where let array_builder = builder .finish() - .data() - .clone() + .into_data() .into_builder() .null_bit_buffer(null_buffer) .null_count(null_count); diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 61942dc90b81..3e21c2f1b484 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -610,7 +610,7 @@ mod tests { let a = BooleanArray::from(vec![false, false, false, true, true, true]); // ensure null bitmap of a is absent - assert!(a.data().nulls().is_none()); + assert!(a.nulls().is_none()); let b = BooleanArray::from(vec![ Some(true), @@ -622,7 +622,7 @@ mod tests { ]); // ensure null bitmap of b is present - assert!(b.data().nulls().is_some()); + assert!(b.nulls().is_some()); let c = or_kleene(&a, &b).unwrap(); @@ -650,12 +650,12 @@ mod tests { ]); // ensure null bitmap of b is absent - assert!(a.data().nulls().is_some()); + assert!(a.nulls().is_some()); let b = BooleanArray::from(vec![false, false, false, true, true, true]); // ensure null bitmap of a is present - assert!(b.data().nulls().is_none()); + assert!(b.nulls().is_none()); let c = or_kleene(&a, &b).unwrap(); @@ -852,7 +852,7 @@ mod tests { let expected = BooleanArray::from(vec![false, false, false, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -865,7 +865,7 @@ mod tests { let expected = BooleanArray::from(vec![false, false, false, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -877,7 +877,7 @@ mod tests { let expected = BooleanArray::from(vec![true, true, true, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -890,7 +890,7 @@ mod tests { let expected = BooleanArray::from(vec![true, true, true, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -902,7 +902,7 @@ mod tests { let expected = BooleanArray::from(vec![false, true, false, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -933,7 +933,7 @@ mod tests { let expected = BooleanArray::from(vec![false, true, false, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -945,7 +945,7 @@ mod tests { let expected = BooleanArray::from(vec![true, false, true, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -976,6 +976,6 @@ mod tests { let expected = BooleanArray::from(vec![true, false, true, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 1a3270a70d80..b965279fb796 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -77,7 +77,7 @@ impl GenericBinaryArray { .offset(v.offset()) .add_buffer(v.data_ref().buffers()[0].clone()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) - .nulls(v.data().nulls().cloned()); + .nulls(v.nulls().cloned()); let data = unsafe { builder.build_unchecked() }; Self::from(data) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index e924824e75ea..89fdca507b00 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -18,12 +18,14 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; -use crate::{Array, ArrayAccessor}; +use crate::{Array, ArrayAccessor, ArrayRef}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// Array of bools /// @@ -265,9 +267,22 @@ impl Array for BooleanArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a> ArrayAccessor for &'a BooleanArray { diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 442e795cec52..81c5824a5e04 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -20,12 +20,13 @@ use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; -use crate::{Array, ArrayAccessor, OffsetSizeTrait}; -use arrow_buffer::buffer::OffsetBuffer; +use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// Generic struct for variable-size byte arrays /// @@ -237,9 +238,22 @@ impl Array for GenericByteArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray { diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index f9a40c6f3400..ee58a485c71c 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -23,10 +23,12 @@ use crate::{ make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, PrimitiveArray, StringArray, }; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; +use std::sync::Arc; /// /// A dictionary array where each element is a single value indexed by an integer key. @@ -590,9 +592,22 @@ impl Array for DictionaryArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl std::fmt::Debug for DictionaryArray { @@ -669,9 +684,21 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, &self.dictionary.data } + fn to_data(&self) -> ArrayData { + self.dictionary.to_data() + } + fn into_data(self) -> ArrayData { self.dictionary.into_data() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + self.dictionary.slice(offset, length) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.dictionary.nulls() + } } impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 87f1b955723d..062961a20abb 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -17,11 +17,13 @@ use crate::array::print_long_array; use crate::iterator::FixedSizeBinaryIter; -use crate::{Array, ArrayAccessor, FixedSizeListArray}; +use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; +use std::sync::Arc; /// An array where each element is a fixed-size sequence of bytes. /// @@ -462,9 +464,22 @@ impl Array for FixedSizeBinaryArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a> ArrayAccessor for &'a FixedSizeBinaryArray { diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 6e228ba3c770..7d65927cdeec 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -18,9 +18,11 @@ use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; +use arrow_buffer::buffer::NullBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// A list array where each element is a fixed-size sequence of values with the same /// type whose maximum length is represented by a i32. @@ -205,9 +207,22 @@ impl Array for FixedSizeListArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl ArrayAccessor for FixedSizeListArray { diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 178139f810e7..203b98d2fca5 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -20,12 +20,13 @@ use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; -use arrow_buffer::buffer::OffsetBuffer; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use num::Integer; use std::any::Any; +use std::sync::Arc; /// trait declaring an offset size, relevant for i32 vs i64 array types. pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { @@ -244,9 +245,22 @@ impl Array for GenericListArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListArray { diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 8c9b02921781..de9e3a87396c 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -17,7 +17,7 @@ use crate::array::{get_offsets, print_long_array}; use crate::{make_array, Array, ArrayRef, StringArray, StructArray}; -use arrow_buffer::buffer::OffsetBuffer; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; @@ -214,10 +214,23 @@ impl Array for MapArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } + /// Returns the total number of bytes of memory occupied by the buffers owned by this [MapArray]. fn get_buffer_memory_size(&self) -> usize { self.data.get_buffer_memory_size() diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index dfdaac85bf85..048f41b73a85 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer}; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; @@ -96,12 +96,19 @@ pub trait Array: std::fmt::Debug + Send + Sync { fn as_any(&self) -> &dyn Any; /// Returns a reference to the underlying data of this array. + /// + /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) fn data(&self) -> &ArrayData; + /// Returns the underlying data of this array. + fn to_data(&self) -> ArrayData; + /// Returns the underlying data of this array. fn into_data(self) -> ArrayData; /// Returns a reference-counted pointer to the underlying data of this array. + /// + /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) fn data_ref(&self) -> &ArrayData { self.data() } @@ -135,9 +142,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array_slice.as_ref(), &Int32Array::from(vec![2, 3, 4])); /// ``` - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - make_array(self.data_ref().slice(offset, length)) - } + fn slice(&self, offset: usize, length: usize) -> ArrayRef; /// Returns the length (i.e., number of elements) of this array. /// @@ -189,6 +194,9 @@ pub trait Array: std::fmt::Debug + Send + Sync { self.data_ref().offset() } + /// Returns the null buffers of this array if any + fn nulls(&self) -> Option<&NullBuffer>; + /// Returns whether the element at `index` is null. /// When using this function on a slice, the index is relative to the slice. /// @@ -203,7 +211,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.is_null(1), true); /// ``` fn is_null(&self, index: usize) -> bool { - self.data_ref().is_null(index) + self.nulls().map(|n| n.is_null(index)).unwrap_or_default() } /// Returns whether the element at `index` is not null. @@ -220,7 +228,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.is_valid(1), false); /// ``` fn is_valid(&self, index: usize) -> bool { - self.data_ref().is_valid(index) + !self.is_null(index) } /// Returns the total number of null values in this array. @@ -236,7 +244,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.null_count(), 2); /// ``` fn null_count(&self) -> usize { - self.data_ref().null_count() + self.nulls().map(|n| n.null_count()).unwrap_or_default() } /// Returns the total number of bytes of memory pointed to by this array. @@ -269,6 +277,10 @@ impl Array for ArrayRef { self.as_ref().data() } + fn to_data(&self) -> ArrayData { + self.as_ref().to_data() + } + fn into_data(self) -> ArrayData { self.data().clone() } @@ -297,6 +309,10 @@ impl Array for ArrayRef { self.as_ref().offset() } + fn nulls(&self) -> Option<&NullBuffer> { + self.as_ref().nulls() + } + fn is_null(&self, index: usize) -> bool { self.as_ref().is_null(index) } @@ -327,6 +343,10 @@ impl<'a, T: Array> Array for &'a T { T::data(self) } + fn to_data(&self) -> ArrayData { + T::to_data(self) + } + fn into_data(self) -> ArrayData { self.data().clone() } @@ -355,6 +375,10 @@ impl<'a, T: Array> Array for &'a T { T::offset(self) } + fn nulls(&self) -> Option<&NullBuffer> { + T::nulls(self) + } + fn is_null(&self, index: usize) -> bool { T::is_null(self, index) } diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 8eb8e64b0eda..fba6e41e871d 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -17,10 +17,12 @@ //! Contains the `NullArray` type. -use crate::Array; +use crate::{Array, ArrayRef}; +use arrow_buffer::buffer::NullBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// An Array where all elements are nulls /// @@ -63,10 +65,23 @@ impl Array for NullArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + None + } + /// Returns whether the element at `index` is null. /// All elements of a `NullArray` are always null. fn is_null(&self, _index: usize) -> bool { diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 408f0c4ae96a..dd1ce4f1473f 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -23,9 +23,9 @@ use crate::temporal_conversions::{ }; use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; -use crate::{types::*, ArrowNativeTypeOp}; +use crate::{types::*, ArrayRef, ArrowNativeTypeOp}; use crate::{Array, ArrayAccessor}; -use arrow_buffer::buffer::ScalarBuffer; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; use arrow_buffer::{i256, ArrowNativeType, Buffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; @@ -33,6 +33,7 @@ use arrow_schema::{ArrowError, DataType}; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; use std::any::Any; +use std::sync::Arc; /// /// # Example: Using `collect` @@ -697,9 +698,22 @@ impl Array for PrimitiveArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a PrimitiveArray { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index e50903f30f9b..2f69e5a2472a 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -16,8 +16,9 @@ // under the License. use std::any::Any; +use std::sync::Arc; -use arrow_buffer::buffer::RunEndBuffer; +use arrow_buffer::buffer::{NullBuffer, RunEndBuffer}; use arrow_buffer::ArrowNativeType; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -288,9 +289,22 @@ impl Array for RunArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + None + } } impl std::fmt::Debug for RunArray { @@ -473,9 +487,21 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { &self.run_array.data } + fn to_data(&self) -> ArrayData { + self.run_array.to_data() + } + fn into_data(self) -> ArrayData { self.run_array.into_data() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + self.run_array.slice(offset, length) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.run_array.nulls() + } } // Array accessor converts the index of logical array to the index of the physical array diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 35d4444e0117..34d9d0db5117 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -16,10 +16,11 @@ // under the License. use crate::{make_array, Array, ArrayRef}; -use arrow_buffer::buffer::buffer_bin_or; +use arrow_buffer::buffer::{buffer_bin_or, NullBuffer}; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; +use std::sync::Arc; use std::{any::Any, ops::Index}; /// A nested array type where each child (called *field*) is represented by a separate @@ -196,13 +197,21 @@ impl Array for StructArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } - /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> usize { - self.data_ref().len() + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() } } diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 867eb8d59fde..5a4d2af7ca45 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -16,12 +16,14 @@ // under the License. use crate::{make_array, Array, ArrayRef}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field, UnionMode}; /// Contains the `UnionArray` type. /// use std::any::Any; +use std::sync::Arc; /// An Array that can represent slots of varying types. /// @@ -317,10 +319,22 @@ impl Array for UnionArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + None + } + /// Union types always return non null as there is no validity buffer. /// To check validity correctly you must check the underlying vector. fn is_null(&self, _index: usize) -> bool { diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 0862b35b07e0..0002309a3d55 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -289,7 +289,7 @@ mod tests { let array = builder.finish(); assert_eq!(0, array.null_count()); - assert!(array.data().nulls().is_none()); + assert!(array.nulls().is_none()); } #[test] @@ -311,7 +311,7 @@ mod tests { assert_eq!(4, array.false_count()); assert_eq!(0, array.null_count()); - assert!(array.data().nulls().is_none()); + assert!(array.nulls().is_none()); } #[test] diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index bc4ce466ac39..f8cd5d15f852 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -157,7 +157,7 @@ where pub fn finish(&mut self) -> FixedSizeListArray { let len = self.len(); let values_arr = self.values_builder.finish(); - let values_data = values_arr.data(); + let values_data = values_arr.to_data(); assert_eq!( values_data.len(), len * self.list_len as usize, @@ -173,7 +173,7 @@ where self.list_len, )) .len(len) - .add_child_data(values_data.clone()) + .add_child_data(values_data) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data.build_unchecked() }; @@ -185,7 +185,7 @@ where pub fn finish_cloned(&self) -> FixedSizeListArray { let len = self.len(); let values_arr = self.values_builder.finish_cloned(); - let values_data = values_arr.data(); + let values_data = values_arr.to_data(); assert_eq!( values_data.len(), len * self.list_len as usize, @@ -204,7 +204,7 @@ where self.list_len, )) .len(len) - .add_child_data(values_data.clone()) + .add_child_data(values_data) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 4bae4932c5f1..81d250cafffe 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -706,7 +706,7 @@ pub fn downcast_array(array: &dyn Array) -> T where T: From, { - T::from(array.data().clone()) + T::from(array.to_data()) } #[cfg(test)] diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 20e4e19bad39..9e9f15daea4b 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -611,8 +611,8 @@ mod tests { assert_eq!(2, record_batch.num_columns()); assert_eq!(&DataType::Int32, record_batch.schema().field(0).data_type()); assert_eq!(&DataType::Utf8, record_batch.schema().field(1).data_type()); - assert_eq!(num_rows, record_batch.column(0).data().len()); - assert_eq!(num_rows, record_batch.column(1).data().len()); + assert_eq!(num_rows, record_batch.column(0).len()); + assert_eq!(num_rows, record_batch.column(1).len()); } #[test] diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 87a7edc8740b..6f9e8a4eb1aa 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -1271,8 +1271,8 @@ mod tests { ]); let struct_data = ArrayData::builder(struct_data_type) .len(3) - .add_child_data(structs_int32s.data().clone()) - .add_child_data(structs_utf8s.data().clone()) + .add_child_data(structs_int32s.into_data()) + .add_child_data(structs_utf8s.into_data()) .null_bit_buffer(Some(Buffer::from([0b00000011]))) .build() .unwrap(); diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index f4610eb345ea..5d86f9a578c2 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -1178,13 +1178,11 @@ impl Decoder { DataType::Utf8 => flatten_json_string_values(rows) .into_iter() .collect::() - .data() - .clone(), + .into_data(), DataType::LargeUtf8 => flatten_json_string_values(rows) .into_iter() .collect::() - .data() - .clone(), + .into_data(), DataType::List(field) => { let child = self .build_nested_list_array::(&flatten_json_values(rows), field)?; @@ -2411,7 +2409,7 @@ mod tests { ]); let c = ArrayDataBuilder::new(c_field.data_type().clone()) .len(7) - .add_child_data(d.data().clone()) + .add_child_data(d.to_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00111011]))) .build() .unwrap(); @@ -2426,7 +2424,7 @@ mod tests { ]); let a = ArrayDataBuilder::new(a_struct_field.data_type().clone()) .len(7) - .add_child_data(b.data().clone()) + .add_child_data(b.to_data()) .add_child_data(c.clone()) .null_bit_buffer(Some(Buffer::from(vec![0b00111111]))) .build() @@ -2452,7 +2450,7 @@ mod tests { Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7]) ); // compare list null buffers - assert_eq!(read.data().nulls(), expected.data().nulls()); + assert_eq!(read.nulls(), expected.nulls()); // build struct from list let struct_array = as_struct_array(read.values()); let expected_struct_array = as_struct_array(expected.values()); @@ -2462,10 +2460,7 @@ mod tests { assert_eq!(7, expected_struct_array.len()); assert_eq!(1, expected_struct_array.null_count()); // test struct's nulls - assert_eq!( - struct_array.data().nulls(), - expected_struct_array.data().nulls() - ); + assert_eq!(struct_array.nulls(), expected_struct_array.nulls()); // test struct's fields let read_b = struct_array.column(0); assert_eq!(b.data_ref(), read_b.data_ref()); @@ -2512,13 +2507,11 @@ mod tests { let expected_keys = StringArray::from(vec![ "long", "short", "long", "short", "hedged", "long", "short", ]) - .data() - .clone(); + .into_data(); let expected_value_array_data = StringArray::from(vec![ "$AAA", "$BBB", "$CCC", "$D", "$AAA", "$CCC", "$D", "$YYY", "$D", ]) - .data() - .clone(); + .into_data(); // Create the list that holds ["$_", "$_"] let expected_values = ArrayDataBuilder::new(value_list_type) .len(7) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index b235df036077..e0853da32e80 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -210,7 +210,7 @@ pub fn eq_bool_scalar( DataType::Boolean, len, None, - left.data().nulls().map(|b| b.inner().sliced()), + left.nulls().map(|b| b.inner().sliced()), 0, vec![values], vec![], @@ -1433,7 +1433,7 @@ where result_remainder.copy_from_slice(remainder_mask_as_bytes); } - let null_bit_buffer = left.data().nulls().map(|b| b.inner().sliced()); + let null_bit_buffer = left.nulls().map(|b| b.inner().sliced()); // null count is the same as in the input since the right side of the scalar comparison cannot be null let null_count = left.null_count(); @@ -3519,8 +3519,7 @@ mod tests { None, Some(7), ]) - .data() - .clone(); + .into_data(); let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 6, 9]); let list_data_type = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index e4b02fbf230d..2c1de68c1926 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1314,7 +1314,7 @@ unsafe fn decode_column( rows.iter_mut().for_each(|row| *row = &row[1..]); let children = converter.convert_raw(rows, validate_utf8)?; - let child_data = children.iter().map(|c| c.data().clone()).collect(); + let child_data = children.iter().map(|c| c.to_data()).collect(); let builder = ArrayDataBuilder::new(field.data_type.clone()) .len(rows.len()) .null_count(null_count) @@ -1532,7 +1532,7 @@ mod tests { // Construct dictionary with a timezone let dict = a.finish(); - let values = TimestampNanosecondArray::from(dict.values().data().clone()); + let values = TimestampNanosecondArray::from(dict.values().to_data()); let dict_with_tz = dict.with_values(&values.with_timezone("+02:00".to_string())); let d = DataType::Dictionary( Box::new(DataType::Int32), diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index e463c12a8856..7d42584514f1 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -539,7 +539,7 @@ mod tests { fn test_dictionary_concat_reuse() { let array: DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); - let copy: DictionaryArray = array.data().clone().into(); + let copy: DictionaryArray = array.to_data().into(); // dictionary is "a", "b", "c" assert_eq!( diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 1818c4fb50c4..a75acda79583 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -222,7 +222,7 @@ impl FilterBuilder { /// Create a new [`FilterBuilder`] that can be used to construct a [`FilterPredicate`] pub fn new(filter: &BooleanArray) -> Self { let filter = match filter.null_count() { - 0 => BooleanArray::from(filter.data().clone()), + 0 => filter.clone(), _ => prep_null_mask_filter(filter), }; diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index cd588fe01c6b..acef1da51aad 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -349,7 +349,7 @@ mod tests { let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } @@ -369,7 +369,7 @@ mod tests { .map(|e| e.map(|e| e as i64)) .collect::>() .into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } @@ -528,7 +528,7 @@ mod tests { let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } @@ -548,7 +548,7 @@ mod tests { .map(|e| e.map(|e| e as i64)) .collect::>() .into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index f3ba90d8a741..ec785c01e818 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -122,7 +122,7 @@ pub fn regexp_is_match_utf8_scalar( regex: &str, flag: Option<&str>, ) -> Result { - let null_bit_buffer = array.data().nulls().map(|x| x.inner().sliced()); + let null_bit_buffer = array.nulls().map(|x| x.inner().sliced()); let mut result = BooleanBufferBuilder::new(array.len()); let pattern = match flag { diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index a59a54d7e6e4..7ee33f7fc282 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -210,7 +210,7 @@ pub fn substring_by_char( GenericStringArray::::DATA_TYPE, array.len(), None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![new_offsets.finish(), vals.finish()], vec![], @@ -294,7 +294,7 @@ fn binary_substring( GenericBinaryArray::::DATA_TYPE, array.len(), None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], @@ -339,7 +339,7 @@ fn fixed_size_binary_substring( DataType::FixedSizeBinary(new_len), num_of_elements, None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![new_values.into()], vec![], @@ -418,7 +418,7 @@ fn utf8_substring( GenericStringArray::::DATA_TYPE, array.len(), None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs index 3b0fdbe63c97..68fc66a635bc 100644 --- a/arrow/benches/array_data_validate.rs +++ b/arrow/benches/array_data_validate.rs @@ -56,7 +56,7 @@ fn validate_benchmark(c: &mut Criterion) { let byte_array = BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000)); c.bench_function("byte_array_to_string_array 20000", |b| { - b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone()))) + b.iter(|| StringArray::from(BinaryArray::from(byte_array.to_data()))) }); } diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index eefbf6dcd4ff..d4aec4d38423 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -100,7 +100,7 @@ fn process(batch: &RecordBatch) { Arc::new(projected_schema), vec![ id.clone(), // NOTE: this is cloning the Arc not the array data - Arc::new(Float64Array::from(nested_c.data().clone())), + Arc::new(Float64Array::from(nested_c.to_data())), ], ) .unwrap(); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 81c32594861c..333d6425a38e 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -628,7 +628,7 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(Array::data(&original_array).clone())?; + let array = ArrowArray::try_from(Array::to_data(&original_array))?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; @@ -1122,7 +1122,7 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(map_array.data().clone())?; + let array = ArrowArray::try_from(map_array.to_data())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; @@ -1209,7 +1209,7 @@ mod tests { let union = builder.build().unwrap(); // export it - let array = ArrowArray::try_from(union.data().clone())?; + let array = ArrowArray::try_from(union.to_data())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 33552dbe3b1b..b8199031796e 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -290,7 +290,7 @@ where .len(size) .null_bit_buffer(nulls) .add_buffer(keys) - .add_child_data(values.data().clone()) + .add_child_data(values.to_data()) .build() .unwrap(); diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 5fc8e4d43c52..7ead5fa61522 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -191,7 +191,7 @@ fn create_random_list_array( // Create list's child data let child_array = create_random_array(list_field, child_len, null_density, true_density)?; - let child_data = child_array.data(); + let child_data = child_array.to_data(); // Create list's null buffers, if it is nullable let null_buffer = match field.is_nullable() { true => Some(create_random_null_buffer(size, null_density)), @@ -205,7 +205,7 @@ fn create_random_list_array( null_buffer, 0, vec![offsets], - vec![child_data.clone()], + vec![child_data], ) }; Ok(make_array(list_data)) From 1856f4cfe8c59b955186aa9edd048235c8c07304 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 17 Mar 2023 17:17:48 +0000 Subject: [PATCH 2/3] Review feedback --- arrow-array/src/array/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 136049482330..1ddcc2881863 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -94,15 +94,17 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn as_any(&self) -> &dyn Any; - /// Returns a reference to the underlying data of this array. + /// Returns a reference to the underlying data of this array /// /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) fn data(&self) -> &ArrayData; - /// Returns the underlying data of this array. + /// Returns the underlying data of this array fn to_data(&self) -> ArrayData; - /// Returns the underlying data of this array. + /// Returns the underlying data of this array + /// + /// Unlike [`Array::to_data`] this consumes self, allowing it avoid unnecessary clones fn into_data(self) -> ArrayData; /// Returns a reference-counted pointer to the underlying data of this array. From b8cb2551febfb98bd8ef6a1d76080308f203ed31 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 17 Mar 2023 17:34:04 +0000 Subject: [PATCH 3/3] Format --- arrow-array/src/array/byte_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 0ea5409cefa8..991e02501505 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -21,8 +21,8 @@ use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; -use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any;