From 95be75491778931f851c1ed49a7a13a06b666d13 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Oct 2025 13:35:56 -0400 Subject: [PATCH 01/14] feat: improved ListView support Added support for ListView and LargeListView for the following operations: * `arrow_select::concat` * `arrow_select::filter_array` Signed-off-by: Andrew Duffy --- arrow-data/src/transform/mod.rs | 7 ++++ arrow-select/src/concat.rs | 65 ++++++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 5b994046e6ca..3f6b6bbd96d1 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -456,6 +456,13 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; new_buffers(data_type, *capacity) } + ( + DataType::ListView(_) | DataType::LargeListView(_), + Capacities::List(capacity, _), + ) => { + array_capacity = *capacity; + new_buffers(data_type, *capacity) + } _ => panic!("Capacities: {capacities:?} not yet supported"), }; diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 83bc5c2763d2..cf7b387195ce 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -37,7 +37,9 @@ use arrow_array::builder::{ use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, OffsetBuffer}; +use arrow_buffer::{ + ArrowNativeType, BooleanBufferBuilder, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer, +}; use arrow_data::ArrayDataBuilder; use arrow_data::transform::{Capacities, MutableArrayData}; use arrow_schema::{ArrowError, DataType, FieldRef, Fields, SchemaRef}; @@ -206,6 +208,65 @@ fn concat_lists( Ok(Arc::new(array)) } +fn concat_list_view( + arrays: &[&dyn Array], + field: &FieldRef, +) -> Result { + let mut output_len = 0; + let mut list_has_nulls = false; + + let lists = arrays + .iter() + .map(|x| x.as_list_view::()) + .inspect(|l| { + output_len += l.len(); + list_has_nulls |= l.null_count() != 0; + }) + .collect::>(); + + let lists_nulls = list_has_nulls.then(|| { + let mut nulls = BooleanBufferBuilder::new(output_len); + for l in &lists { + match l.nulls() { + Some(n) => nulls.append_buffer(n.inner()), + None => nulls.append_n(l.len(), true), + } + } + NullBuffer::new(nulls.finish()) + }); + + // If any of the lists have slices, we need to slice the values + // to ensure that the offsets are correct + let values: Vec<&dyn Array> = lists.iter().map(|l| l.values().as_ref()).collect(); + + let concatenated_values = concat(values.as_slice())?; + + let sizes: ScalarBuffer = lists.iter().flat_map(|x| x.sizes()).copied().collect(); + + let mut offsets = MutableBuffer::with_capacity(lists.iter().map(|l| l.offsets().len()).sum()); + let mut global_offset = OffsetSize::zero(); + for l in lists.iter() { + for &offset in l.offsets() { + offsets.push(offset + global_offset); + } + + // advance the offsets + global_offset += OffsetSize::from_usize(l.values().len()).unwrap(); + } + + let offsets = ScalarBuffer::from(offsets); + + let array = GenericListViewArray::try_new( + field.clone(), + offsets, + sizes, + concatenated_values, + lists_nulls, + )?; + + Ok(Arc::new(array)) +} + fn concat_primitives(arrays: &[&dyn Array]) -> Result { let mut builder = PrimitiveBuilder::::with_capacity(arrays.iter().map(|a| a.len()).sum()) .with_data_type(arrays[0].data_type().clone()); @@ -422,6 +483,8 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { } DataType::List(field) => concat_lists::(arrays, field), DataType::LargeList(field) => concat_lists::(arrays, field), + DataType::ListView(field) => concat_list_view::(arrays, field), + DataType::LargeListView(field) => concat_list_view::(arrays, field), DataType::Struct(fields) => concat_structs(arrays, fields), DataType::Utf8 => concat_bytes::(arrays), DataType::LargeUtf8 => concat_bytes::(arrays), From edc98a6beb1e7535ec29273eae7000238698a730 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Oct 2025 15:59:31 -0400 Subject: [PATCH 02/14] fix ArrayData validation for ListView There was a bug in ArrayData validation for ListView, which became apparent when you tried to call `list_view.to_data().into_builder().build().unwrap()`. The range for checking the offsets/sizes was wrong and would trivially trigger an out of bounds check. Signed-off-by: Andrew Duffy --- arrow-data/src/data.rs | 10 ++- arrow-data/src/equal/list_view.rs | 99 +++++++++++++++++++++++ arrow-data/src/equal/mod.rs | 10 +-- arrow-data/src/transform/mod.rs | 5 +- arrow-select/src/take.rs | 125 ++++++++++++++++++++++++++++++ 5 files changed, 239 insertions(+), 10 deletions(-) create mode 100644 arrow-data/src/equal/list_view.rs diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index c31ac0c6e693..91957e14f332 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -980,7 +980,15 @@ impl ArrayData { ) -> Result<(), ArrowError> { let offsets: &[T] = self.typed_buffer(0, self.len)?; let sizes: &[T] = self.typed_buffer(1, self.len)?; - for i in 0..values_length { + if offsets.len() != sizes.len() { + return Err(ArrowError::ComputeError(format!( + "ListView offsets len {} does not match sizes len {}", + offsets.len(), + sizes.len() + ))); + } + + for i in 0..sizes.len() { let size = sizes[i].to_usize().ok_or_else(|| { ArrowError::InvalidArgumentError(format!( "Error converting size[{}] ({}) to usize for {}", diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs new file mode 100644 index 000000000000..228136fefe90 --- /dev/null +++ b/arrow-data/src/equal/list_view.rs @@ -0,0 +1,99 @@ +use crate::ArrayData; +use crate::data::count_nulls; +use crate::equal::equal_values; +use arrow_buffer::ArrowNativeType; +use num_integer::Integer; + +pub(super) fn list_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_offsets = lhs.buffer::(0); + let lhs_sizes = lhs.buffer::(1); + + let rhs_offsets = rhs.buffer::(0); + let rhs_sizes = rhs.buffer::(1); + + let lhs_data = &lhs.child_data()[0]; + let rhs_data = &rhs.child_data()[0]; + + let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len); + + if lhs_null_count != rhs_null_count { + return false; + } + + if lhs_null_count == 0 { + // non-null pathway: all sizes must be equal, and all values must be equal + let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len]; + let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len]; + + if lhs_range_sizes.len() != rhs_range_sizes.len() { + return false; + } + + // Check values for equality + let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; + let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; + + for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets + .iter() + .zip(rhs_range_offsets) + .zip(lhs_sizes) + { + let lhs_offset = lhs_offset.to_usize().unwrap(); + let rhs_offset = rhs_offset.to_usize().unwrap(); + let size = size.to_usize().unwrap(); + + // Check if offsets are valid for the given range + if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) { + return false; + } + } + } else { + // Need to integrate validity check in the inner loop. + // non-null pathway: all sizes must be equal, and all values must be equal + let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len]; + let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len]; + + let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len); + let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len); + + if lhs_range_sizes.len() != rhs_range_sizes.len() { + return false; + } + + // Check values for equality, with null checking + let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; + let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; + + for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets + .iter() + .zip(rhs_range_offsets) + .zip(lhs_sizes) + .enumerate() + { + let lhs_is_null = lhs_nulls.is_null(index); + let rhs_is_null = rhs_nulls.is_null(index); + + if lhs_is_null != rhs_is_null { + return false; + } + + let lhs_offset = lhs_offset.to_usize().unwrap(); + let rhs_offset = rhs_offset.to_usize().unwrap(); + let size = size.to_usize().unwrap(); + + // Check if values match in the range + if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) { + return false; + } + } + } + + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 1c16ee2f8a14..7a310b1240df 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -30,6 +30,7 @@ mod dictionary; mod fixed_binary; mod fixed_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -41,6 +42,8 @@ mod variable_size; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used // to build the generic functions below (`equal_range` and `equal`). +use self::run::run_equal; +use crate::equal::list_view::list_view_equal; use boolean::boolean_equal; use byte_view::byte_view_equal; use dictionary::dictionary_equal; @@ -53,8 +56,6 @@ use structure::struct_equal; use union::union_equal; use variable_size::variable_sized_equal; -use self::run::run_equal; - /// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively /// for `len` slots. #[inline] @@ -104,10 +105,9 @@ fn equal_values( byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not yet implemented") - } DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::ListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 3f6b6bbd96d1..9fa74a0f3c73 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -456,10 +456,7 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; new_buffers(data_type, *capacity) } - ( - DataType::ListView(_) | DataType::LargeListView(_), - Capacities::List(capacity, _), - ) => { + (DataType::ListView(_) | DataType::LargeListView(_), Capacities::List(capacity, _)) => { array_capacity = *capacity; new_buffers(data_type, *capacity) } diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index dfe6903dc4e3..f87e099c3bea 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -218,6 +218,12 @@ fn take_impl( DataType::LargeList(_) => { Ok(Arc::new(take_list::<_, Int64Type>(values.as_list(), indices)?)) } + DataType::ListView(_) => { + Ok(Arc::new(take_list_view::<_, Int32Type>(values.as_list_view(), indices)?)) + } + DataType::LargeListView(_) => { + Ok(Arc::new(take_list_view::<_, Int64Type>(values.as_list_view(), indices)?)) + } DataType::FixedSizeList(_, length) => { let values = values .as_any() @@ -621,6 +627,59 @@ where Ok(GenericListArray::::from(list_data)) } +fn take_list_view( + values: &GenericListViewArray, + indices: &PrimitiveArray, +) -> Result, ArrowError> +where + IndexType: ArrowPrimitiveType, + OffsetType: ArrowPrimitiveType, + OffsetType::Native: OffsetSizeTrait, +{ + // Take executes only over the views. + let mut taken_offsets: Vec = Vec::with_capacity(indices.len()); + let mut taken_sizes: Vec = Vec::with_capacity(indices.len()); + + // Initialize null buffer + let num_bytes = bit_util::ceil(indices.len(), 8); + let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); + let null_slice = null_buf.as_slice_mut(); + + for i in 0..indices.len() { + if indices.is_valid(i) { + let idx = indices + .value(i) + .to_usize() + .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; + taken_offsets.push(values.value_offset(idx)); + taken_sizes.push(values.value_size(idx)); + + if !values.is_valid(idx) { + bit_util::unset_bit(null_slice, i); + } + } else { + // null pathway + bit_util::unset_bit(null_slice, i); + taken_offsets.push(OffsetType::default_value()); + taken_sizes.push(OffsetType::default_value()); + } + } + + let list_view_data = ArrayDataBuilder::new(values.data_type().clone()) + .len(indices.len()) + .null_bit_buffer(Some(null_buf.into())) + .offset(0) + .buffers(vec![taken_offsets.into(), taken_sizes.into()]) + .child_data(vec![values.values().to_data()]); + + // SAFETY: all buffers and child nodes for ListView added in constructor + let list_view_data = unsafe { list_view_data.build_unchecked() }; + + Ok(GenericListViewArray::::from( + list_view_data, + )) +} + /// `take` implementation for `FixedSizeListArray` /// /// Calculates the index and indexed offset for the inner array, @@ -1821,6 +1880,67 @@ mod tests { }}; } + macro_rules! test_take_list_view { + ($offset_type:ty, $list_view_data_type:ident, $list_view_array_type:ident) => {{ + // Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]] + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).into_data(); + // Construct offsets + let value_offsets: [$offset_type; 4] = [0, 3, 0, 6]; + let value_offsets = Buffer::from_slice_ref(&value_offsets); + let value_sizes: [$offset_type; 4] = [3, 3, 0, 2]; + let value_sizes = Buffer::from_slice_ref(&value_sizes); + // Construct a list array from the above two + let list_view_data_type = DataType::$list_view_data_type(Arc::new( + Field::new_list_field(DataType::Int32, false), + )); + let list_view_data = ArrayData::builder(list_view_data_type.clone()) + .len(4) + .add_buffers(vec![value_offsets, value_sizes]) + .child_data(vec![value_data]) + .build() + .unwrap(); + let list_view_array = $list_view_array_type::from(list_view_data); + + // index returns: [[2,3], null, [-1,-2,-1], [], [0,0,0]] + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(2), Some(0)]); + + let a = take(&list_view_array, &index, None).unwrap(); + let a: &$list_view_array_type = + a.as_any().downcast_ref::<$list_view_array_type>().unwrap(); + + // construct a value array with expected results: + // [[2,3], null, [-1,-2,-1], [], [0,0,0]] + let expected_data = Int32Array::from(vec![ + Some(2), + Some(3), + Some(-1), + Some(-2), + Some(-1), + Some(0), + Some(0), + Some(0), + ]) + .into_data(); + // construct offsets + let expected_offsets: [$offset_type; 5] = [0, 0, 2, 0, 5]; + let expected_offsets = Buffer::from_slice_ref(&expected_offsets); + let expected_sizes: [$offset_type; 5] = [2, 0, 3, 0, 3]; + let expected_sizes = Buffer::from_slice_ref(&expected_sizes); + // construct expected list view array + let expected_list_data = ArrayData::builder(list_view_data_type) + .len(5) + // null buffer remains the same as only the indices have nulls + .nulls(index.nulls().cloned()) + .buffers(vec![expected_offsets, expected_sizes]) + .child_data(vec![expected_data]) + .build() + .unwrap(); + let expected_list_array = $list_view_array_type::from(expected_list_data); + + assert_eq!(a, &expected_list_array); + }}; + } + fn do_take_fixed_size_list_test( length: ::Native, input_data: Vec>>>, @@ -1871,6 +1991,11 @@ mod tests { test_take_list_with_nulls!(i64, LargeList, LargeListArray); } + #[test] + fn test_test_take_list_view() { + test_take_list_view!(i32, ListView, ListViewArray); + } + #[test] fn test_take_fixed_size_list() { do_take_fixed_size_list_test::( From a3b821b492126324278fb2386d780c08057671f7 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Oct 2025 17:24:32 -0400 Subject: [PATCH 03/14] more fixes, add test Signed-off-by: Andrew Duffy --- arrow-data/src/equal/list_view.rs | 2 +- arrow-data/src/transform/list_view.rs | 42 ++++++++++++++ arrow-data/src/transform/mod.rs | 37 +++++++------ arrow-data/src/transform/utils.rs | 7 +++ arrow-select/src/filter.rs | 80 +++++++++++++++++++++++++++ 5 files changed, 151 insertions(+), 17 deletions(-) create mode 100644 arrow-data/src/transform/list_view.rs diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs index 228136fefe90..80be35c8f8f8 100644 --- a/arrow-data/src/equal/list_view.rs +++ b/arrow-data/src/equal/list_view.rs @@ -63,7 +63,7 @@ pub(super) fn list_view_equal( let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len); let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len); - if lhs_range_sizes.len() != rhs_range_sizes.len() { + if lhs_range_sizes.len() != rhs_range_sizes.len() || lhs_range_sizes != rhs_range_sizes { return false; } diff --git a/arrow-data/src/transform/list_view.rs b/arrow-data/src/transform/list_view.rs new file mode 100644 index 000000000000..e218c9653fc1 --- /dev/null +++ b/arrow-data/src/transform/list_view.rs @@ -0,0 +1,42 @@ +use crate::ArrayData; +use crate::transform::_MutableArrayData; +use crate::transform::utils::get_last_value_or_default; +use arrow_buffer::ArrowNativeType; +use num_integer::Integer; +use num_traits::CheckedAdd; + +pub(super) fn build_extend( + array: &ArrayData, +) -> crate::transform::Extend<'_> { + let offsets = array.buffer::(0); + let sizes = array.buffer::(1); + Box::new( + move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| { + let offset_buffer = &mut mutable.buffer1; + let sizes_buffer = &mut mutable.buffer2; + + for &offset in &offsets[start..start + len] { + offset_buffer.push(offset); + } + + // sizes + for &size in &sizes[start..start + len] { + sizes_buffer.push(size); + } + + // the beauty of views is that we don't need to copy child_data, we just splat + // the offsets and sizes. + }, + ) +} + +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { + let offset_buffer = &mut mutable.buffer1; + let sizes_buffer = &mut mutable.buffer2; + + let last_offset: T = get_last_value_or_default(offset_buffer); + let last_size: T = get_last_value_or_default(sizes_buffer); + + (0..len).for_each(|_| offset_buffer.push(last_offset)); + (0..len).for_each(|_| sizes_buffer.push(last_size)); +} diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 9fa74a0f3c73..c6052817bfb6 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -33,6 +33,7 @@ mod boolean; mod fixed_binary; mod fixed_size_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -265,10 +266,9 @@ fn build_extend(array: &ArrayData) -> Extend<'_> { DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::(array), DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"), DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } DataType::LargeList(_) => list::build_extend::(array), + DataType::ListView(_) => list_view::build_extend::(array), + DataType::LargeListView(_) => list_view::build_extend::(array), DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), DataType::Struct(_) => structure::build_extend(array), DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), @@ -313,10 +313,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::, DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } DataType::LargeList(_) => list::extend_nulls::, + DataType::ListView(_) => list_view::extend_nulls::, + DataType::LargeListView(_) => list_view::extend_nulls::, DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { DataType::UInt8 => primitive::extend_nulls::, DataType::UInt16 => primitive::extend_nulls::, @@ -450,16 +449,16 @@ impl<'a> MutableArrayData<'a> { new_buffers(data_type, *capacity) } ( - DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _), + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(_, _), Capacities::List(capacity, _), ) => { array_capacity = *capacity; new_buffers(data_type, *capacity) } - (DataType::ListView(_) | DataType::LargeListView(_), Capacities::List(capacity, _)) => { - array_capacity = *capacity; - new_buffers(data_type, *capacity) - } _ => panic!("Capacities: {capacities:?} not yet supported"), }; @@ -495,10 +494,11 @@ impl<'a> MutableArrayData<'a> { | DataType::Utf8View | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } - DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { + DataType::Map(_, _) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) => { let children = arrays .iter() .map(|array| &array.child_data()[0]) @@ -789,7 +789,12 @@ impl<'a> MutableArrayData<'a> { b.insert(0, data.buffer1.into()); b } - DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { + DataType::Utf8 + | DataType::Binary + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::ListView(_) + | DataType::LargeListView(_) => { vec![data.buffer1.into(), data.buffer2.into()] } DataType::Union(_, mode) => { diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index 979738d057fd..757c5fdf651a 100644 --- a/arrow-data/src/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -58,6 +58,13 @@ pub(super) unsafe fn get_last_offset(offset_buffer: &Mutable *unsafe { offsets.get_unchecked(offsets.len() - 1) } } +#[inline] +pub(super) fn get_last_value_or_default(offset_buffer: &MutableBuffer) -> T { + let (prefix, offsets, suffix) = unsafe { offset_buffer.as_slice().align_to::() }; + debug_assert!(prefix.is_empty() && suffix.is_empty()); + offsets.last().copied().unwrap_or_default() +} + #[cfg(test)] mod tests { use crate::transform::utils::extend_offsets; diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index dace2bab728f..85ff38a66e4d 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -346,6 +346,12 @@ fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result { Ok(Arc::new(filter_fixed_size_binary(values.as_fixed_size_binary(), predicate))) } + DataType::ListView(_) => { + Ok(Arc::new(filter_list_view::(values.as_list_view(), predicate))) + } + DataType::LargeListView(_) => { + Ok(Arc::new(filter_list_view::(values.as_list_view(), predicate))) + } DataType::RunEndEncoded(_, _) => { downcast_run_array!{ values => Ok(Arc::new(filter_run_end_array(values, predicate)?)), @@ -860,6 +866,26 @@ fn filter_sparse_union( }) } +/// `filter` implementation for list views +fn filter_list_view( + array: &GenericListViewArray, + predicate: &FilterPredicate, +) -> GenericListViewArray { + let filtered_offsets = filter_native::(array.offsets(), predicate); + let filtered_sizes = filter_native::(array.sizes(), predicate); + + // Do we need to propagate the offset from the ListView? + let list_data = ArrayDataBuilder::new(array.data_type().clone()) + .nulls(array.nulls().cloned()) + .buffers(vec![filtered_offsets, filtered_sizes]) + .child_data(vec![array.values().to_data()]) + .len(predicate.count); + + let list_data = unsafe { list_data.build_unchecked() }; + + GenericListViewArray::from(list_data) +} + #[cfg(test)] mod tests { use super::*; @@ -1370,6 +1396,60 @@ mod tests { assert_eq!(&make_array(expected), &result); } + #[test] + fn test_filter_list_view_array() { + // [[1, 2], null, [], [3,4]] + let mut list_array = ListViewBuilder::with_capacity(Int32Builder::with_capacity(6), 4); + list_array.append_value([Some(1), Some(2)]); + list_array.append_null(); + list_array.append_value([]); + list_array.append_value([Some(3), Some(4)]); + + let list_array = list_array.finish(); + let predicate = BooleanArray::from_iter([true, true, false, true]); + + // Filter result: [[1, 2], null, [3, 4]] + let filtered = filter(&list_array, &predicate) + .unwrap() + .as_list_view::() + .clone(); + + let mut expected = ListViewBuilder::with_capacity(Int32Builder::with_capacity(5), 3); + expected.append_value([Some(1), Some(2)]); + expected.append_null(); + expected.append_value([Some(3), Some(4)]); + let expected = expected.finish(); + + assert_eq!(&filtered, &expected); + } + + #[test] + fn test_filter_large_list_view_array() { + // [[1, 2], null, [], [3,4]] + let mut list_array = LargeListViewBuilder::with_capacity(Int32Builder::with_capacity(6), 4); + list_array.append_value([Some(1), Some(2)]); + list_array.append_null(); + list_array.append_value([]); + list_array.append_value([Some(3), Some(4)]); + + let list_array = list_array.finish(); + let predicate = BooleanArray::from_iter([true, true, false, true]); + + // Filter result: [[1, 2], null, [3, 4]] + let filtered = filter(&list_array, &predicate) + .unwrap() + .as_list_view::() + .clone(); + + let mut expected = LargeListViewBuilder::with_capacity(Int32Builder::with_capacity(5), 3); + expected.append_value([Some(1), Some(2)]); + expected.append_null(); + expected.append_value([Some(3), Some(4)]); + let expected = expected.finish(); + + assert_eq!(&filtered, &expected); + } + #[test] fn test_slice_iterator_bits() { let filter_values = (0..64).map(|i| i == 1).collect::>(); From eefd289132cbd1328db7cf5d7453e309362c57ec Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sat, 18 Oct 2025 14:39:07 -0400 Subject: [PATCH 04/14] test for concat Signed-off-by: Andrew Duffy --- arrow-select/src/concat.rs | 47 +++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index cf7b387195ce..17067b7040a1 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -563,7 +563,9 @@ pub fn concat_batches<'a>( #[cfg(test)] mod tests { use super::*; - use arrow_array::builder::{GenericListBuilder, StringDictionaryBuilder}; + use arrow_array::builder::{ + GenericListBuilder, Int64Builder, ListViewBuilder, StringDictionaryBuilder, + }; use arrow_schema::{Field, Schema}; use std::fmt::Debug; @@ -948,6 +950,49 @@ mod tests { assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); } + #[test] + fn test_concat_list_view_arrays() { + let list1 = vec![ + (Some(vec![Some(-1), None])), + None, + Some(vec![Some(10), Some(20)]), + ]; + let mut list1_array = ListViewBuilder::new(Int64Builder::new()); + for v in list1.iter() { + list1_array.append_option(v.clone()); + } + let list1_array = list1_array.finish(); + + let list2 = vec![ + None, + Some(vec![Some(100), None]), + Some(vec![Some(102), Some(103)]), + ]; + let mut list2_array = ListViewBuilder::new(Int64Builder::new()); + for v in list2.iter() { + list2_array.append_option(v.clone()); + } + let list2_array = list2_array.finish(); + + let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let mut list3_array = ListViewBuilder::new(Int64Builder::new()); + for v in list3.iter() { + list3_array.append_option(v.clone()); + } + let list3_array = list3_array.finish(); + + let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); + + let expected: Vec<_> = list1.into_iter().chain(list2).chain(list3).collect(); + let mut array_expected = ListViewBuilder::new(Int64Builder::new()); + for v in expected.iter() { + array_expected.append_option(v.clone()); + } + let array_expected = array_expected.finish(); + + assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); + } + #[test] fn test_concat_struct_arrays() { let field = Arc::new(Field::new("field", DataType::Int64, true)); From d7e7846afa2f24a62acc8dbc93a9255dcc825f22 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 19 Oct 2025 13:05:28 -0400 Subject: [PATCH 05/14] fix + tests for take list_view Signed-off-by: Andrew Duffy --- arrow-data/src/equal/list_view.rs | 11 +++- arrow-select/src/take.rs | 92 +++++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs index 80be35c8f8f8..d2fe364a791d 100644 --- a/arrow-data/src/equal/list_view.rs +++ b/arrow-data/src/equal/list_view.rs @@ -40,6 +40,10 @@ pub(super) fn list_view_equal( let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; + if lhs_range_offsets.len() != rhs_range_offsets.len() { + return false; + } + for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets .iter() .zip(rhs_range_offsets) @@ -63,7 +67,8 @@ pub(super) fn list_view_equal( let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len); let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len); - if lhs_range_sizes.len() != rhs_range_sizes.len() || lhs_range_sizes != rhs_range_sizes { + // Sizes can differ if values are null + if lhs_range_sizes.len() != rhs_range_sizes.len() { return false; } @@ -71,6 +76,10 @@ pub(super) fn list_view_equal( let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; + if lhs_range_offsets.len() != rhs_range_offsets.len() { + return false; + } + for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets .iter() .zip(rhs_range_offsets) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index f87e099c3bea..12a50cbba334 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -660,15 +660,14 @@ where } else { // null pathway bit_util::unset_bit(null_slice, i); - taken_offsets.push(OffsetType::default_value()); - taken_sizes.push(OffsetType::default_value()); + taken_offsets.push(taken_offsets.last().copied().unwrap_or_default()); + taken_sizes.push(taken_offsets.last().copied().unwrap_or_default()); } } let list_view_data = ArrayDataBuilder::new(values.data_type().clone()) .len(indices.len()) .null_bit_buffer(Some(null_buf.into())) - .offset(0) .buffers(vec![taken_offsets.into(), taken_sizes.into()]) .child_data(vec![values.values().to_data()]); @@ -1039,6 +1038,7 @@ mod tests { use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano}; use arrow_data::ArrayData; use arrow_schema::{Field, Fields, TimeUnit, UnionFields}; + use num_traits::ToPrimitive; fn test_take_decimal_arrays( data: Vec>, @@ -1941,6 +1941,47 @@ mod tests { }}; } + fn test_take_list_view_generic( + values: Vec>>>, + take_indices: Vec>, + expected: Vec>>>, + ) { + let mut list_view_array = + GenericListViewBuilder::::new(PrimitiveBuilder::::new()); + + for value in values { + list_view_array.append_option(value); + } + let list_view_array = list_view_array.finish(); + + let mut indices = UInt64Builder::new(); + for idx in take_indices { + indices.append_option(idx.map(|i| i.to_u64().unwrap())); + } + let indices = indices.finish(); + + let taken = take(&list_view_array, &indices, None) + .unwrap() + .as_list_view() + .clone(); + + let mut expected_array = + GenericListViewBuilder::::new(PrimitiveBuilder::::new()); + for value in expected { + expected_array.append_option(value); + } + let expected_array = expected_array.finish(); + + assert_eq!(taken, expected_array); + } + + macro_rules! list_view_test_case { + (values: $values:expr, indices: $indices:expr, expected: $expected: expr) => {{ + test_take_list_view_generic::($values, $indices, $expected); + test_take_list_view_generic::($values, $indices, $expected); + }}; + } + fn do_take_fixed_size_list_test( length: ::Native, input_data: Vec>>>, @@ -1992,8 +2033,49 @@ mod tests { } #[test] - fn test_test_take_list_view() { - test_take_list_view!(i32, ListView, ListViewArray); + fn test_test_take_list_view_reversed() { + // Take reversed indices + list_view_test_case! { + values: vec![ + Some(vec![Some(1), None, Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ], + indices: vec![Some(2), Some(1), Some(0)], + expected: vec![ + Some(vec![Some(7), Some(8), None]), + None, + Some(vec![Some(1), None, Some(3)]), + ] + }; + } + + #[test] + fn test_take_list_view_null_indices() { + // Take with null indices + list_view_test_case! { + values: vec![ + Some(vec![Some(1), None, Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ], + indices: vec![None, Some(0), None], + expected: vec![None, Some(vec![Some(1), None, Some(3)]), None] + }; + } + + #[test] + fn test_take_list_view_null_values() { + // Take at null values + list_view_test_case! { + values: vec![ + Some(vec![Some(1), None, Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ], + indices: vec![Some(1), Some(1), Some(1), None, None], + expected: vec![None; 5] + }; } #[test] From b7519075986c0b26b49e20acad4b45342ce5a936 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 20 Oct 2025 11:36:16 -0400 Subject: [PATCH 06/14] remove unused test macro Signed-off-by: Andrew Duffy --- arrow-select/src/take.rs | 61 ---------------------------------------- 1 file changed, 61 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 12a50cbba334..06ca990f3c5b 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1880,67 +1880,6 @@ mod tests { }}; } - macro_rules! test_take_list_view { - ($offset_type:ty, $list_view_data_type:ident, $list_view_array_type:ident) => {{ - // Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]] - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).into_data(); - // Construct offsets - let value_offsets: [$offset_type; 4] = [0, 3, 0, 6]; - let value_offsets = Buffer::from_slice_ref(&value_offsets); - let value_sizes: [$offset_type; 4] = [3, 3, 0, 2]; - let value_sizes = Buffer::from_slice_ref(&value_sizes); - // Construct a list array from the above two - let list_view_data_type = DataType::$list_view_data_type(Arc::new( - Field::new_list_field(DataType::Int32, false), - )); - let list_view_data = ArrayData::builder(list_view_data_type.clone()) - .len(4) - .add_buffers(vec![value_offsets, value_sizes]) - .child_data(vec![value_data]) - .build() - .unwrap(); - let list_view_array = $list_view_array_type::from(list_view_data); - - // index returns: [[2,3], null, [-1,-2,-1], [], [0,0,0]] - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(2), Some(0)]); - - let a = take(&list_view_array, &index, None).unwrap(); - let a: &$list_view_array_type = - a.as_any().downcast_ref::<$list_view_array_type>().unwrap(); - - // construct a value array with expected results: - // [[2,3], null, [-1,-2,-1], [], [0,0,0]] - let expected_data = Int32Array::from(vec![ - Some(2), - Some(3), - Some(-1), - Some(-2), - Some(-1), - Some(0), - Some(0), - Some(0), - ]) - .into_data(); - // construct offsets - let expected_offsets: [$offset_type; 5] = [0, 0, 2, 0, 5]; - let expected_offsets = Buffer::from_slice_ref(&expected_offsets); - let expected_sizes: [$offset_type; 5] = [2, 0, 3, 0, 3]; - let expected_sizes = Buffer::from_slice_ref(&expected_sizes); - // construct expected list view array - let expected_list_data = ArrayData::builder(list_view_data_type) - .len(5) - // null buffer remains the same as only the indices have nulls - .nulls(index.nulls().cloned()) - .buffers(vec![expected_offsets, expected_sizes]) - .child_data(vec![expected_data]) - .build() - .unwrap(); - let expected_list_array = $list_view_array_type::from(expected_list_data); - - assert_eq!(a, &expected_list_array); - }}; - } - fn test_take_list_view_generic( values: Vec>>>, take_indices: Vec>, From 124b4370808d946e08dd9d1ee139a7041cb03f54 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 20 Oct 2025 11:37:53 -0400 Subject: [PATCH 07/14] fix license headers Signed-off-by: Andrew Duffy --- arrow-data/src/equal/list_view.rs | 17 +++++++++++++++++ arrow-data/src/transform/list_view.rs | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs index d2fe364a791d..b0e7f3456c66 100644 --- a/arrow-data/src/equal/list_view.rs +++ b/arrow-data/src/equal/list_view.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::ArrayData; use crate::data::count_nulls; use crate::equal::equal_values; diff --git a/arrow-data/src/transform/list_view.rs b/arrow-data/src/transform/list_view.rs index e218c9653fc1..2081bcec8089 100644 --- a/arrow-data/src/transform/list_view.rs +++ b/arrow-data/src/transform/list_view.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::ArrayData; use crate::transform::_MutableArrayData; use crate::transform::utils::get_last_value_or_default; From 278f297b307d7c9aa074f4afab02f5fbf324d5c6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 20 Oct 2025 17:10:53 -0400 Subject: [PATCH 08/14] remove old comment Signed-off-by: Andrew Duffy --- arrow-select/src/concat.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 17067b7040a1..d6c358868b5d 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -235,8 +235,6 @@ fn concat_list_view( NullBuffer::new(nulls.finish()) }); - // If any of the lists have slices, we need to slice the values - // to ensure that the offsets are correct let values: Vec<&dyn Array> = lists.iter().map(|l| l.values().as_ref()).collect(); let concatenated_values = concat(values.as_slice())?; From c22b4a789a577c4f3796834ae93472a7aeb82766 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 23 Oct 2025 12:14:04 -0400 Subject: [PATCH 09/14] address comments Signed-off-by: Andrew Duffy --- arrow-data/src/equal/list_view.rs | 4 ++ arrow-data/src/transform/list_view.rs | 9 ++-- arrow-data/src/transform/utils.rs | 7 ---- arrow-select/src/concat.rs | 56 +++++++++++++++++++++++++ arrow-select/src/filter.rs | 59 +++++++++++++++++---------- 5 files changed, 101 insertions(+), 34 deletions(-) diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs index b0e7f3456c66..f35622ae3a82 100644 --- a/arrow-data/src/equal/list_view.rs +++ b/arrow-data/src/equal/list_view.rs @@ -53,6 +53,10 @@ pub(super) fn list_view_equal( return false; } + if lhs_range_sizes != rhs_range_sizes { + return false; + } + // Check values for equality let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; diff --git a/arrow-data/src/transform/list_view.rs b/arrow-data/src/transform/list_view.rs index 2081bcec8089..9b66a6a6abb1 100644 --- a/arrow-data/src/transform/list_view.rs +++ b/arrow-data/src/transform/list_view.rs @@ -17,7 +17,6 @@ use crate::ArrayData; use crate::transform::_MutableArrayData; -use crate::transform::utils::get_last_value_or_default; use arrow_buffer::ArrowNativeType; use num_integer::Integer; use num_traits::CheckedAdd; @@ -51,9 +50,7 @@ pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, let offset_buffer = &mut mutable.buffer1; let sizes_buffer = &mut mutable.buffer2; - let last_offset: T = get_last_value_or_default(offset_buffer); - let last_size: T = get_last_value_or_default(sizes_buffer); - - (0..len).for_each(|_| offset_buffer.push(last_offset)); - (0..len).for_each(|_| sizes_buffer.push(last_size)); + // We push 0 as a placeholder for NULL values in both the offsets and sizes + (0..len).for_each(|_| offset_buffer.push(T::default())); + (0..len).for_each(|_| sizes_buffer.push(T::default())); } diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index 757c5fdf651a..979738d057fd 100644 --- a/arrow-data/src/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -58,13 +58,6 @@ pub(super) unsafe fn get_last_offset(offset_buffer: &Mutable *unsafe { offsets.get_unchecked(offsets.len() - 1) } } -#[inline] -pub(super) fn get_last_value_or_default(offset_buffer: &MutableBuffer) -> T { - let (prefix, offsets, suffix) = unsafe { offset_buffer.as_slice().align_to::() }; - debug_assert!(prefix.is_empty() && suffix.is_empty()); - offsets.last().copied().unwrap_or_default() -} - #[cfg(test)] mod tests { use crate::transform::utils::extend_offsets; diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index d6c358868b5d..548611323ee2 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -991,6 +991,62 @@ mod tests { assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); } + #[test] + fn test_concat_sliced_list_view_arrays() { + let list1 = vec![ + Some(vec![Some(-1), None]), + None, + Some(vec![Some(10), Some(20)]), + ]; + let mut list1_array = ListViewBuilder::new(Int64Builder::new()); + for v in list1.iter() { + list1_array.append_option(v.clone()); + } + let list1_array = list1_array.finish(); + + let list2 = vec![ + None, + Some(vec![Some(100), None]), + Some(vec![Some(102), Some(103)]), + ]; + let mut list2_array = ListViewBuilder::new(Int64Builder::new()); + for v in list2.iter() { + list2_array.append_option(v.clone()); + } + let list2_array = list2_array.finish(); + + let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let mut list3_array = ListViewBuilder::new(Int64Builder::new()); + for v in list3.iter() { + list3_array.append_option(v.clone()); + } + let list3_array = list3_array.finish(); + + // Concat sliced arrays. + // ListView slicing will slice the offset/sizes but preserve the original values child. + let array_result = concat(&[ + &list1_array.slice(1, 2), + &list2_array.slice(1, 2), + &list3_array.slice(0, 1), + ]) + .unwrap(); + + let expected: Vec<_> = vec![ + None, + Some(vec![Some(10), Some(20)]), + Some(vec![Some(100), None]), + Some(vec![Some(102), Some(103)]), + Some(vec![Some(1000), Some(1001)]), + ]; + let mut array_expected = ListViewBuilder::new(Int64Builder::new()); + for v in expected.iter() { + array_expected.append_option(v.clone()); + } + let array_expected = array_expected.finish(); + + assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); + } + #[test] fn test_concat_struct_arrays() { let field = Arc::new(Field::new("field", DataType::Int64, true)); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 85ff38a66e4d..d1ae0aa48ab2 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -874,9 +874,17 @@ fn filter_list_view( let filtered_offsets = filter_native::(array.offsets(), predicate); let filtered_sizes = filter_native::(array.sizes(), predicate); - // Do we need to propagate the offset from the ListView? + // Filter the nulls + let nulls = if let Some((null_count, nulls)) = filter_null_mask(array.nulls(), predicate) { + let buffer = BooleanBuffer::new(nulls, 0, predicate.count); + + Some(unsafe { NullBuffer::new_unchecked(buffer, null_count) }) + } else { + None + }; + let list_data = ArrayDataBuilder::new(array.data_type().clone()) - .nulls(array.nulls().cloned()) + .nulls(nulls) .buffers(vec![filtered_offsets, filtered_sizes]) .child_data(vec![array.values().to_data()]) .len(predicate.count); @@ -1396,60 +1404,69 @@ mod tests { assert_eq!(&make_array(expected), &result); } - #[test] - fn test_filter_list_view_array() { + fn test_case_filter_list_view() { // [[1, 2], null, [], [3,4]] - let mut list_array = ListViewBuilder::with_capacity(Int32Builder::with_capacity(6), 4); + let mut list_array = GenericListViewBuilder::::new(Int32Builder::new()); list_array.append_value([Some(1), Some(2)]); list_array.append_null(); list_array.append_value([]); list_array.append_value([Some(3), Some(4)]); let list_array = list_array.finish(); - let predicate = BooleanArray::from_iter([true, true, false, true]); + let predicate = BooleanArray::from_iter([true, false, true, false]); - // Filter result: [[1, 2], null, [3, 4]] + // Filter result: [[1, 2], []] let filtered = filter(&list_array, &predicate) .unwrap() - .as_list_view::() + .as_list_view::() .clone(); - let mut expected = ListViewBuilder::with_capacity(Int32Builder::with_capacity(5), 3); + let mut expected = + GenericListViewBuilder::::with_capacity(Int32Builder::with_capacity(5), 3); expected.append_value([Some(1), Some(2)]); - expected.append_null(); - expected.append_value([Some(3), Some(4)]); + expected.append_value([]); let expected = expected.finish(); assert_eq!(&filtered, &expected); } - #[test] - fn test_filter_large_list_view_array() { + fn test_case_filter_sliced_list_view() { // [[1, 2], null, [], [3,4]] - let mut list_array = LargeListViewBuilder::with_capacity(Int32Builder::with_capacity(6), 4); + let mut list_array = + GenericListViewBuilder::::with_capacity(Int32Builder::with_capacity(6), 4); list_array.append_value([Some(1), Some(2)]); list_array.append_null(); list_array.append_value([]); list_array.append_value([Some(3), Some(4)]); let list_array = list_array.finish(); - let predicate = BooleanArray::from_iter([true, true, false, true]); - // Filter result: [[1, 2], null, [3, 4]] - let filtered = filter(&list_array, &predicate) + // Sliced: [null, [], [3, 4]] + let sliced = list_array.slice(1, 3); + let predicate = BooleanArray::from_iter([false, false, true]); + + // Filter result: [[1, 2], []] + let filtered = filter(&sliced, &predicate) .unwrap() - .as_list_view::() + .as_list_view::() .clone(); - let mut expected = LargeListViewBuilder::with_capacity(Int32Builder::with_capacity(5), 3); - expected.append_value([Some(1), Some(2)]); - expected.append_null(); + let mut expected = GenericListViewBuilder::::new(Int32Builder::new()); expected.append_value([Some(3), Some(4)]); let expected = expected.finish(); assert_eq!(&filtered, &expected); } + #[test] + fn test_filter_list_view_array() { + test_case_filter_list_view::(); + test_case_filter_list_view::(); + + test_case_filter_sliced_list_view::(); + test_case_filter_sliced_list_view::(); + } + #[test] fn test_slice_iterator_bits() { let filter_values = (0..64).map(|i| i == 1).collect::>(); From 722f34e1d0a229d1fa8002c134a82fb931abd97d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 23 Oct 2025 12:29:36 -0400 Subject: [PATCH 10/14] more tests, more clippy Signed-off-by: Andrew Duffy --- arrow-select/src/concat.rs | 34 +++++++++++++++--------------- arrow-select/src/take.rs | 42 +++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 548611323ee2..3bfdd31ccf2d 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -831,7 +831,7 @@ mod tests { #[test] fn test_concat_primitive_list_arrays() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), Some(-1), Some(2), None, None]), Some(vec![]), None, @@ -839,14 +839,14 @@ mod tests { ]; let list1_array = ListArray::from_iter_primitive::(list1.clone()); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), ]; let list2_array = ListArray::from_iter_primitive::(list2.clone()); - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let list3 = [Some(vec![Some(1000), Some(1001)])]; let list3_array = ListArray::from_iter_primitive::(list3.clone()); let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); @@ -859,7 +859,7 @@ mod tests { #[test] fn test_concat_primitive_list_arrays_slices() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), Some(-1), Some(2), None, None]), Some(vec![]), // In slice None, // In slice @@ -869,7 +869,7 @@ mod tests { let list1_array = list1_array.slice(1, 2); let list1_values = list1.into_iter().skip(1).take(2); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), @@ -888,7 +888,7 @@ mod tests { #[test] fn test_concat_primitive_list_arrays_sliced_lengths() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), Some(-1), Some(2), None, None]), // In slice Some(vec![]), // In slice None, // In slice @@ -898,7 +898,7 @@ mod tests { let list1_array = list1_array.slice(0, 3); // no offset, but not all values let list1_values = list1.into_iter().take(3); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), @@ -919,7 +919,7 @@ mod tests { #[test] fn test_concat_primitive_fixed_size_list_arrays() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), None]), None, Some(vec![Some(10), Some(20)]), @@ -927,7 +927,7 @@ mod tests { let list1_array = FixedSizeListArray::from_iter_primitive::(list1.clone(), 2); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None]), Some(vec![Some(102), Some(103)]), @@ -935,7 +935,7 @@ mod tests { let list2_array = FixedSizeListArray::from_iter_primitive::(list2.clone(), 2); - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let list3 = [Some(vec![Some(1000), Some(1001)])]; let list3_array = FixedSizeListArray::from_iter_primitive::(list3.clone(), 2); @@ -950,8 +950,8 @@ mod tests { #[test] fn test_concat_list_view_arrays() { - let list1 = vec![ - (Some(vec![Some(-1), None])), + let list1 = [ + Some(vec![Some(-1), None]), None, Some(vec![Some(10), Some(20)]), ]; @@ -961,7 +961,7 @@ mod tests { } let list1_array = list1_array.finish(); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None]), Some(vec![Some(102), Some(103)]), @@ -972,7 +972,7 @@ mod tests { } let list2_array = list2_array.finish(); - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let list3 = [Some(vec![Some(1000), Some(1001)])]; let mut list3_array = ListViewBuilder::new(Int64Builder::new()); for v in list3.iter() { list3_array.append_option(v.clone()); @@ -993,7 +993,7 @@ mod tests { #[test] fn test_concat_sliced_list_view_arrays() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), None]), None, Some(vec![Some(10), Some(20)]), @@ -1004,7 +1004,7 @@ mod tests { } let list1_array = list1_array.finish(); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None]), Some(vec![Some(102), Some(103)]), @@ -1015,7 +1015,7 @@ mod tests { } let list2_array = list2_array.finish(); - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let list3 = [Some(vec![Some(1000), Some(1001)])]; let mut list3_array = ListViewBuilder::new(Int64Builder::new()); for v in list3.iter() { list3_array.append_option(v.clone()); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 06ca990f3c5b..cfc16f7e34ae 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1880,11 +1880,14 @@ mod tests { }}; } - fn test_take_list_view_generic( + fn test_take_list_view_generic( values: Vec>>>, take_indices: Vec>, expected: Vec>>>, - ) { + mapper: F, + ) where + F: Fn(GenericListViewArray) -> GenericListViewArray, + { let mut list_view_array = GenericListViewBuilder::::new(PrimitiveBuilder::::new()); @@ -1892,6 +1895,7 @@ mod tests { list_view_array.append_option(value); } let list_view_array = list_view_array.finish(); + let list_view_array = mapper(list_view_array); let mut indices = UInt64Builder::new(); for idx in take_indices { @@ -1916,8 +1920,12 @@ mod tests { macro_rules! list_view_test_case { (values: $values:expr, indices: $indices:expr, expected: $expected: expr) => {{ - test_take_list_view_generic::($values, $indices, $expected); - test_take_list_view_generic::($values, $indices, $expected); + test_take_list_view_generic::($values, $indices, $expected, |x| x); + test_take_list_view_generic::($values, $indices, $expected, |x| x); + }}; + (values: $values:expr, transform: $fn:expr, indices: $indices:expr, expected: $expected: expr) => {{ + test_take_list_view_generic::($values, $indices, $expected, $fn); + test_take_list_view_generic::($values, $indices, $expected, $fn); }}; } @@ -1986,7 +1994,7 @@ mod tests { None, Some(vec![Some(1), None, Some(3)]), ] - }; + } } #[test] @@ -2000,7 +2008,7 @@ mod tests { ], indices: vec![None, Some(0), None], expected: vec![None, Some(vec![Some(1), None, Some(3)]), None] - }; + } } #[test] @@ -2014,7 +2022,27 @@ mod tests { ], indices: vec![Some(1), Some(1), Some(1), None, None], expected: vec![None; 5] - }; + } + } + + #[test] + fn test_take_list_view_sliced() { + // Take null indices/values, with slicing. + list_view_test_case! { + values: vec![ + Some(vec![Some(1)]), + None, + None, + Some(vec![Some(2), Some(3)]), + Some(vec![Some(4), Some(5)]), + None, + ], + transform: |l| l.slice(2, 4), + indices: vec![Some(0), Some(3), None, Some(1), Some(2)], + expected: vec![ + None, None, None, Some(vec![Some(2), Some(3)]), Some(vec![Some(4), Some(5)]) + ] + } } #[test] From f3f80649596315383bdc053cce5a83068a3661f6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 23 Oct 2025 13:36:53 -0400 Subject: [PATCH 11/14] reduce Signed-off-by: Andrew Duffy --- arrow-select/src/take.rs | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index cfc16f7e34ae..eec4ffa14e72 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -636,38 +636,13 @@ where OffsetType: ArrowPrimitiveType, OffsetType::Native: OffsetSizeTrait, { - // Take executes only over the views. - let mut taken_offsets: Vec = Vec::with_capacity(indices.len()); - let mut taken_sizes: Vec = Vec::with_capacity(indices.len()); - - // Initialize null buffer - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = null_buf.as_slice_mut(); - - for i in 0..indices.len() { - if indices.is_valid(i) { - let idx = indices - .value(i) - .to_usize() - .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; - taken_offsets.push(values.value_offset(idx)); - taken_sizes.push(values.value_size(idx)); - - if !values.is_valid(idx) { - bit_util::unset_bit(null_slice, i); - } - } else { - // null pathway - bit_util::unset_bit(null_slice, i); - taken_offsets.push(taken_offsets.last().copied().unwrap_or_default()); - taken_sizes.push(taken_offsets.last().copied().unwrap_or_default()); - } - } + let taken_offsets = take_native(values.offsets(), indices); + let taken_sizes = take_native(values.sizes(), indices); + let nulls = take_nulls(values.nulls(), indices); let list_view_data = ArrayDataBuilder::new(values.data_type().clone()) .len(indices.len()) - .null_bit_buffer(Some(null_buf.into())) + .nulls(nulls) .buffers(vec![taken_offsets.into(), taken_sizes.into()]) .child_data(vec![values.values().to_data()]); From d0da6c59482632690ff08dfd0c29d1f9de33c19f Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 23 Oct 2025 14:32:51 -0400 Subject: [PATCH 12/14] Add test for equality kernel Signed-off-by: Andrew Duffy --- arrow/tests/array_equal.rs | 80 +++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 7fc8b0be7a3d..0ff00676f32c 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -22,11 +22,17 @@ use arrow::array::{ StringDictionaryBuilder, StructArray, UnionBuilder, make_array, }; use arrow::datatypes::{Int16Type, Int32Type}; -use arrow_array::builder::{StringBuilder, StringViewBuilder, StructBuilder}; -use arrow_array::{DictionaryArray, FixedSizeListArray, StringViewArray}; +use arrow_array::builder::{ + GenericListViewBuilder, StringBuilder, StringViewBuilder, StructBuilder, +}; +use arrow_array::cast::AsArray; +use arrow_array::{ + DictionaryArray, FixedSizeListArray, GenericListViewArray, PrimitiveArray, StringViewArray, +}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, Field, Fields}; +use arrow_select::take::take; use std::sync::Arc; #[test] @@ -756,6 +762,76 @@ fn test_fixed_list_offsets() { test_equal(&a_slice, &b_slice, true); } +fn create_list_view_array< + O: OffsetSizeTrait, + U: IntoIterator>, + T: IntoIterator>, +>( + data: T, +) -> GenericListViewArray { + let mut builder = GenericListViewBuilder::::new(Int32Builder::new()); + for d in data { + if let Some(v) = d { + builder.append_value(v); + } else { + builder.append_null(); + } + } + + builder.finish() +} + +fn test_test_list_view_array() { + let a = create_list_view_array::([ + None, + Some(vec![Some(1), None, Some(2)]), + Some(vec![Some(3), Some(4), Some(5), None]), + ]); + let b = create_list_view_array::([ + None, + Some(vec![Some(1), None, Some(2)]), + Some(vec![Some(3), Some(4), Some(5), None]), + ]); + + test_equal(&a, &b, true); + + // Simple non-matching arrays by reordering + let b = create_list_view_array::([ + Some(vec![Some(3), Some(4), Some(5), None]), + Some(vec![Some(1), None, Some(2)]), + ]); + test_equal(&a, &b, false); + + // reorder using take yields equal values + let indices: PrimitiveArray = vec![None, Some(1), Some(0)].into(); + let b = take(&b, &indices, None) + .unwrap() + .as_list_view::() + .clone(); + + test_equal(&a, &b, true); + + // Slicing one side yields unequal again + let a = a.slice(1, 2); + + test_equal(&a, &b, false); + + // Slicing the other to match makes them equal again + let b = b.slice(1, 2); + + test_equal(&a, &b, true); +} + +#[test] +fn test_list_view_array() { + test_test_list_view_array::(); +} + +#[test] +fn test_large_list_view_array() { + test_test_list_view_array::(); +} + #[test] fn test_struct_equal() { let strings: ArrayRef = Arc::new(StringArray::from(vec![ From 5ac42a82fd954cd147f5573d9652dfbcc4a7c7c7 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 24 Oct 2025 09:22:32 -0400 Subject: [PATCH 13/14] use lhs_range_sizes to account for lhs_start/rhs_start Signed-off-by: Andrew Duffy --- arrow-data/src/equal/list_view.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs index f35622ae3a82..c7cb31db9099 100644 --- a/arrow-data/src/equal/list_view.rs +++ b/arrow-data/src/equal/list_view.rs @@ -68,7 +68,7 @@ pub(super) fn list_view_equal( for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets .iter() .zip(rhs_range_offsets) - .zip(lhs_sizes) + .zip(lhs_range_sizes) { let lhs_offset = lhs_offset.to_usize().unwrap(); let rhs_offset = rhs_offset.to_usize().unwrap(); @@ -104,7 +104,7 @@ pub(super) fn list_view_equal( for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets .iter() .zip(rhs_range_offsets) - .zip(lhs_sizes) + .zip(lhs_range_sizes) .enumerate() { let lhs_is_null = lhs_nulls.is_null(index); From 3cf2342b6772e61148f53c307b4524debafc7f58 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 24 Oct 2025 10:03:34 -0400 Subject: [PATCH 14/14] add test for equal_ranges kernel over ListView Signed-off-by: Andrew Duffy --- arrow/tests/array_equal.rs | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 0ff00676f32c..381054a25df5 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -822,6 +822,45 @@ fn test_test_list_view_array() { test_equal(&a, &b, true); } +// Special test for List>. +// This tests the equal_ranges kernel +fn test_sliced_list_of_list_view() { + // First list view is created using the builder, with elements not deduplicated. + let mut a = ListBuilder::new(GenericListViewBuilder::::new(Int32Builder::new())); + + a.append_value([Some(vec![Some(1), Some(2), Some(3)]), Some(vec![])]); + a.append_null(); + a.append_value([ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(6)]), + ]); + + let a = a.finish(); + // a = [[[1,2,3], []], null, [[4, null], [5], null, [6]]] + + // First list view is created using the builder, with elements not deduplicated. + let mut b = ListBuilder::new(GenericListViewBuilder::::new(Int32Builder::new())); + + // Add an extra row that we will slice off, adjust the List offsets + b.append_value([Some(vec![Some(0), Some(0), Some(0)])]); + b.append_value([Some(vec![Some(1), Some(2), Some(3)]), Some(vec![])]); + b.append_null(); + b.append_value([ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(6)]), + ]); + + let b = b.finish(); + // b = [[[0, 0, 0]], [[1,2,3], []], null, [[4, null], [5], null, [6]]] + let b = b.slice(1, 3); + // b = [[[1,2,3], []], null, [[4, null], [5], null, [6]]] but the outer ListArray + // has an offset + + test_equal(&a, &b, true); +} + #[test] fn test_list_view_array() { test_test_list_view_array::(); @@ -832,6 +871,16 @@ fn test_large_list_view_array() { test_test_list_view_array::(); } +#[test] +fn test_nested_list_view_array() { + test_sliced_list_of_list_view::(); +} + +#[test] +fn test_nested_large_list_view_array() { + test_sliced_list_of_list_view::(); +} + #[test] fn test_struct_equal() { let strings: ArrayRef = Arc::new(StringArray::from(vec![