From 2730da13187df85c064e4715032d2d7f4ab92bff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 22:05:01 +0000 Subject: [PATCH] Use BooleanBuffer in BooleanArray (#3879) (#3895) * Use BooleanBuffer in BooleanArray (#3879) * Review feedback --- arrow-array/src/array/boolean_array.rs | 33 ++++++++--------- arrow-array/src/builder/boolean_builder.rs | 2 +- arrow-array/src/builder/primitive_builder.rs | 2 +- arrow-buffer/src/buffer/boolean.rs | 39 ++++++++++++++++++-- arrow-buffer/src/buffer/null.rs | 2 +- arrow-ord/src/comparison.rs | 17 +++------ arrow-select/src/filter.rs | 17 +++------ arrow-select/src/take.rs | 2 +- 8 files changed, 66 insertions(+), 48 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index a7ed870ed5cb..c5775ad3b959 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -19,7 +19,7 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef}; -use arrow_buffer::{bit_util, Buffer, MutableBuffer, NullBuffer}; +use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; @@ -67,7 +67,7 @@ use std::sync::Arc; #[derive(Clone)] pub struct BooleanArray { data: ArrayData, - raw_values: Buffer, + values: BooleanBuffer, } impl std::fmt::Debug for BooleanArray { @@ -96,11 +96,9 @@ impl BooleanArray { BooleanBuilder::with_capacity(capacity) } - /// Returns a `Buffer` holding all the values of this array. - /// - /// Note this doesn't take the offset of this array into account. - pub fn values(&self) -> &Buffer { - &self.raw_values + /// Returns the underlying [`BooleanBuffer`] holding all the values of this array + pub fn values(&self) -> &BooleanBuffer { + &self.values } /// Returns the number of non null, true values within this array @@ -108,7 +106,7 @@ impl BooleanArray { match self.data.nulls() { Some(nulls) => { let null_chunks = nulls.inner().bit_chunks(); - let value_chunks = self.values().bit_chunks(self.offset(), self.len()); + let value_chunks = self.values().bit_chunks(); null_chunks .iter() .zip(value_chunks.iter()) @@ -119,9 +117,7 @@ impl BooleanArray { .map(|(a, b)| (a & b).count_ones() as usize) .sum() } - None => self - .values() - .count_set_bits_offset(self.offset(), self.len()), + None => self.values().count_set_bits(), } } @@ -135,8 +131,7 @@ impl BooleanArray { /// # Safety /// This doesn't check bounds, the caller must ensure that index < self.len() pub unsafe fn value_unchecked(&self, i: usize) -> bool { - let offset = i + self.offset(); - bit_util::get_bit_raw(self.raw_values.as_ptr(), offset) + self.values.value_unchecked(i) } /// Returns the boolean value at index `i`. @@ -329,8 +324,10 @@ impl From for BooleanArray { 1, "BooleanArray data should contain a single buffer only (values buffer)" ); - let raw_values = data.buffers()[0].clone(); - Self { data, raw_values } + let values = + BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + + Self { data, values } } } @@ -424,7 +421,7 @@ mod tests { fn test_boolean_array_from_vec() { let buf = Buffer::from([10_u8]); let arr = BooleanArray::from(vec![false, true, false, true]); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -439,7 +436,7 @@ mod tests { fn test_boolean_array_from_vec_option() { let buf = Buffer::from([10_u8]); let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); @@ -501,7 +498,7 @@ mod tests { .build() .unwrap(); let arr = BooleanArray::from(data); - assert_eq!(&buf2, arr.values()); + assert_eq!(&buf2, arr.values().inner()); assert_eq!(5, arr.len()); assert_eq!(2, arr.offset()); assert_eq!(0, arr.null_count()); diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 0002309a3d55..bc3b62f99234 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -240,7 +240,7 @@ mod tests { } let arr = builder.finish(); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(10, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 71671fe7db53..1c2cd908ca26 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -448,7 +448,7 @@ mod tests { } let arr = builder.finish(); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(10, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 43b74c6031af..9d5953594d5d 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -16,8 +16,8 @@ // under the License. use crate::bit_chunk_iterator::BitChunks; -use crate::{bit_util, buffer_bin_and, buffer_bin_or, Buffer}; -use std::ops::{BitAnd, BitOr}; +use crate::{bit_util, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer}; +use std::ops::{BitAnd, BitOr, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans #[derive(Debug, Clone, Eq)] @@ -77,9 +77,9 @@ impl BooleanBuffer { /// /// Panics if `i >= self.len()` #[inline] + #[deprecated(note = "use BooleanBuffer::value")] pub fn is_set(&self, i: usize) -> bool { - assert!(i < self.len); - unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), i + self.offset) } + self.value(i) } /// Returns the offset of this [`BooleanBuffer`] in bits @@ -100,6 +100,25 @@ impl BooleanBuffer { self.len == 0 } + /// Returns the boolean value at index `i`. + /// + /// # Panics + /// + /// Panics if `i >= self.len()` + pub fn value(&self, idx: usize) -> bool { + assert!(idx < self.len); + unsafe { self.value_unchecked(idx) } + } + + /// Returns the boolean value at index `i`. + /// + /// # Safety + /// This doesn't check bounds, the caller must ensure that index < self.len() + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> bool { + unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), i + self.offset) } + } + /// Returns the packed values of this [`BooleanBuffer`] not including any offset #[inline] pub fn values(&self) -> &[u8] { @@ -147,6 +166,18 @@ impl BooleanBuffer { } } +impl Not for &BooleanBuffer { + type Output = BooleanBuffer; + + fn not(self) -> Self::Output { + BooleanBuffer { + buffer: buffer_unary_not(&self.buffer, self.offset, self.len), + offset: 0, + len: self.len, + } + } +} + impl BitAnd<&BooleanBuffer> for &BooleanBuffer { type Output = BooleanBuffer; diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 2f8c864ca957..cbadb7f42dbf 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -94,7 +94,7 @@ impl NullBuffer { /// Returns `true` if the value at `idx` is not null #[inline] pub fn is_valid(&self, idx: usize) -> bool { - self.buffer.is_set(idx) + self.buffer.value(idx) } /// Returns `true` if the value at `idx` is null diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 76760f8bc4f5..eb672e769ac3 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -26,7 +26,6 @@ use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::buffer::buffer_unary_not; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; @@ -196,23 +195,19 @@ pub fn eq_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - let len = left.len(); - let left_offset = left.offset(); - - let values = if right { - left.values().bit_slice(left_offset, len) - } else { - buffer_unary_not(left.values(), left.offset(), left.len()) + let values = match right { + true => left.values().clone(), + false => !left.values(), }; let data = unsafe { ArrayData::new_unchecked( DataType::Boolean, - len, + values.len(), None, left.nulls().map(|b| b.inner().sliced()), - 0, - vec![values], + values.offset(), + vec![values.into_inner()], vec![], ) }; diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index a75acda79583..35c11970c0f6 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -24,7 +24,7 @@ use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType}; use arrow_array::*; use arrow_buffer::bit_util; -use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer}; +use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::bit_iterator::{BitIndexIterator, BitSliceIterator}; use arrow_data::transform::MutableArrayData; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -109,9 +109,7 @@ impl<'a> Iterator for IndexIterator<'a> { /// Counts the number of set bits in `filter` fn filter_count(filter: &BooleanArray) -> usize { - filter - .values() - .count_set_bits_offset(filter.offset(), filter.len()) + filter.values().count_set_bits() } /// Function that can filter arbitrary arrays @@ -154,15 +152,12 @@ pub fn build_filter(filter: &BooleanArray) -> Result { pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray { let array_data = filter.data_ref(); let nulls = array_data.nulls().unwrap(); - let mask = filter.values(); - let offset = filter.offset(); - - let new_mask = - buffer_bin_and(mask, offset, nulls.buffer(), nulls.offset(), filter.len()); + let mask = filter.values() & nulls.inner(); let array_data = ArrayData::builder(DataType::Boolean) - .len(filter.len()) - .add_buffer(new_mask); + .len(mask.len()) + .offset(mask.offset()) + .add_buffer(mask.into_inner()); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 741b05493ea4..421157bdf041 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -524,7 +524,7 @@ where IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { - let val_buf = take_bits(values.values(), values.offset(), indices)?; + let val_buf = take_bits(values.values().inner(), values.offset(), indices)?; let null_buf = match values.nulls() { Some(nulls) if nulls.null_count() > 0 => { Some(take_bits(nulls.buffer(), nulls.offset(), indices)?)