Skip to content

Commit 5e32cc6

Browse files
authored
Support more operations on ListView (#8645)
# Which issue does this PR close? Part of #5375 Vortex was encountering some issues after we switched our preferred List type to `ListView`, the first thing we noticed was that `arrow_select::filter_array` would fail on ListView (and LargeListView, though we don't use that). This PR addresses some missing select kernel implementations for ListView and LargeListView. This also fixes an existing bug in the ArrayData validation for ListView arrays that would trigger an out of bounds index panic. # Are these changes tested? - [x] filter_array - [x] concat - [x] take # Are there any user-facing changes? ListView/LargeListView can now be used with the `take`, `concat` and `filter_array` kernels You can now use the `PartialEq` to compare ListView arrays. --------- Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent 29cd685 commit 5e32cc6

File tree

9 files changed

+767
-32
lines changed

9 files changed

+767
-32
lines changed

arrow-data/src/data.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,15 @@ impl ArrayData {
980980
) -> Result<(), ArrowError> {
981981
let offsets: &[T] = self.typed_buffer(0, self.len)?;
982982
let sizes: &[T] = self.typed_buffer(1, self.len)?;
983-
for i in 0..values_length {
983+
if offsets.len() != sizes.len() {
984+
return Err(ArrowError::ComputeError(format!(
985+
"ListView offsets len {} does not match sizes len {}",
986+
offsets.len(),
987+
sizes.len()
988+
)));
989+
}
990+
991+
for i in 0..sizes.len() {
984992
let size = sizes[i].to_usize().ok_or_else(|| {
985993
ArrowError::InvalidArgumentError(format!(
986994
"Error converting size[{}] ({}) to usize for {}",

arrow-data/src/equal/list_view.rs

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::ArrayData;
19+
use crate::data::count_nulls;
20+
use crate::equal::equal_values;
21+
use arrow_buffer::ArrowNativeType;
22+
use num_integer::Integer;
23+
24+
pub(super) fn list_view_equal<T: ArrowNativeType + Integer>(
25+
lhs: &ArrayData,
26+
rhs: &ArrayData,
27+
lhs_start: usize,
28+
rhs_start: usize,
29+
len: usize,
30+
) -> bool {
31+
let lhs_offsets = lhs.buffer::<T>(0);
32+
let lhs_sizes = lhs.buffer::<T>(1);
33+
34+
let rhs_offsets = rhs.buffer::<T>(0);
35+
let rhs_sizes = rhs.buffer::<T>(1);
36+
37+
let lhs_data = &lhs.child_data()[0];
38+
let rhs_data = &rhs.child_data()[0];
39+
40+
let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len);
41+
let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len);
42+
43+
if lhs_null_count != rhs_null_count {
44+
return false;
45+
}
46+
47+
if lhs_null_count == 0 {
48+
// non-null pathway: all sizes must be equal, and all values must be equal
49+
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
50+
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
51+
52+
if lhs_range_sizes.len() != rhs_range_sizes.len() {
53+
return false;
54+
}
55+
56+
if lhs_range_sizes != rhs_range_sizes {
57+
return false;
58+
}
59+
60+
// Check values for equality
61+
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
62+
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];
63+
64+
if lhs_range_offsets.len() != rhs_range_offsets.len() {
65+
return false;
66+
}
67+
68+
for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets
69+
.iter()
70+
.zip(rhs_range_offsets)
71+
.zip(lhs_range_sizes)
72+
{
73+
let lhs_offset = lhs_offset.to_usize().unwrap();
74+
let rhs_offset = rhs_offset.to_usize().unwrap();
75+
let size = size.to_usize().unwrap();
76+
77+
// Check if offsets are valid for the given range
78+
if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
79+
return false;
80+
}
81+
}
82+
} else {
83+
// Need to integrate validity check in the inner loop.
84+
// non-null pathway: all sizes must be equal, and all values must be equal
85+
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
86+
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
87+
88+
let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len);
89+
let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len);
90+
91+
// Sizes can differ if values are null
92+
if lhs_range_sizes.len() != rhs_range_sizes.len() {
93+
return false;
94+
}
95+
96+
// Check values for equality, with null checking
97+
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
98+
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];
99+
100+
if lhs_range_offsets.len() != rhs_range_offsets.len() {
101+
return false;
102+
}
103+
104+
for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets
105+
.iter()
106+
.zip(rhs_range_offsets)
107+
.zip(lhs_range_sizes)
108+
.enumerate()
109+
{
110+
let lhs_is_null = lhs_nulls.is_null(index);
111+
let rhs_is_null = rhs_nulls.is_null(index);
112+
113+
if lhs_is_null != rhs_is_null {
114+
return false;
115+
}
116+
117+
let lhs_offset = lhs_offset.to_usize().unwrap();
118+
let rhs_offset = rhs_offset.to_usize().unwrap();
119+
let size = size.to_usize().unwrap();
120+
121+
// Check if values match in the range
122+
if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
123+
return false;
124+
}
125+
}
126+
}
127+
128+
true
129+
}

arrow-data/src/equal/mod.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ mod dictionary;
3030
mod fixed_binary;
3131
mod fixed_list;
3232
mod list;
33+
mod list_view;
3334
mod null;
3435
mod primitive;
3536
mod run;
@@ -41,6 +42,8 @@ mod variable_size;
4142
// these methods assume the same type, len and null count.
4243
// For this reason, they are not exposed and are instead used
4344
// to build the generic functions below (`equal_range` and `equal`).
45+
use self::run::run_equal;
46+
use crate::equal::list_view::list_view_equal;
4447
use boolean::boolean_equal;
4548
use byte_view::byte_view_equal;
4649
use dictionary::dictionary_equal;
@@ -53,8 +56,6 @@ use structure::struct_equal;
5356
use union::union_equal;
5457
use variable_size::variable_sized_equal;
5558

56-
use self::run::run_equal;
57-
5859
/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively
5960
/// for `len` slots.
6061
#[inline]
@@ -104,10 +105,9 @@ fn equal_values(
104105
byte_view_equal(lhs, rhs, lhs_start, rhs_start, len)
105106
}
106107
DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
107-
DataType::ListView(_) | DataType::LargeListView(_) => {
108-
unimplemented!("ListView/LargeListView not yet implemented")
109-
}
110108
DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
109+
DataType::ListView(_) => list_view_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
110+
DataType::LargeListView(_) => list_view_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
111111
DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len),
112112
DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len),
113113
DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len),
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::ArrayData;
19+
use crate::transform::_MutableArrayData;
20+
use arrow_buffer::ArrowNativeType;
21+
use num_integer::Integer;
22+
use num_traits::CheckedAdd;
23+
24+
pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
25+
array: &ArrayData,
26+
) -> crate::transform::Extend<'_> {
27+
let offsets = array.buffer::<T>(0);
28+
let sizes = array.buffer::<T>(1);
29+
Box::new(
30+
move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| {
31+
let offset_buffer = &mut mutable.buffer1;
32+
let sizes_buffer = &mut mutable.buffer2;
33+
34+
for &offset in &offsets[start..start + len] {
35+
offset_buffer.push(offset);
36+
}
37+
38+
// sizes
39+
for &size in &sizes[start..start + len] {
40+
sizes_buffer.push(size);
41+
}
42+
43+
// the beauty of views is that we don't need to copy child_data, we just splat
44+
// the offsets and sizes.
45+
},
46+
)
47+
}
48+
49+
pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut _MutableArrayData, len: usize) {
50+
let offset_buffer = &mut mutable.buffer1;
51+
let sizes_buffer = &mut mutable.buffer2;
52+
53+
// We push 0 as a placeholder for NULL values in both the offsets and sizes
54+
(0..len).for_each(|_| offset_buffer.push(T::default()));
55+
(0..len).for_each(|_| sizes_buffer.push(T::default()));
56+
}

arrow-data/src/transform/mod.rs

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ mod boolean;
3333
mod fixed_binary;
3434
mod fixed_size_list;
3535
mod list;
36+
mod list_view;
3637
mod null;
3738
mod primitive;
3839
mod run;
@@ -265,10 +266,9 @@ fn build_extend(array: &ArrayData) -> Extend<'_> {
265266
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
266267
DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
267268
DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
268-
DataType::ListView(_) | DataType::LargeListView(_) => {
269-
unimplemented!("ListView/LargeListView not implemented")
270-
}
271269
DataType::LargeList(_) => list::build_extend::<i64>(array),
270+
DataType::ListView(_) => list_view::build_extend::<i32>(array),
271+
DataType::LargeListView(_) => list_view::build_extend::<i64>(array),
272272
DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
273273
DataType::Struct(_) => structure::build_extend(array),
274274
DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
@@ -313,10 +313,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
313313
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
314314
DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
315315
DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
316-
DataType::ListView(_) | DataType::LargeListView(_) => {
317-
unimplemented!("ListView/LargeListView not implemented")
318-
}
319316
DataType::LargeList(_) => list::extend_nulls::<i64>,
317+
DataType::ListView(_) => list_view::extend_nulls::<i32>,
318+
DataType::LargeListView(_) => list_view::extend_nulls::<i64>,
320319
DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
321320
DataType::UInt8 => primitive::extend_nulls::<u8>,
322321
DataType::UInt16 => primitive::extend_nulls::<u16>,
@@ -450,7 +449,11 @@ impl<'a> MutableArrayData<'a> {
450449
new_buffers(data_type, *capacity)
451450
}
452451
(
453-
DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _),
452+
DataType::List(_)
453+
| DataType::LargeList(_)
454+
| DataType::ListView(_)
455+
| DataType::LargeListView(_)
456+
| DataType::FixedSizeList(_, _),
454457
Capacities::List(capacity, _),
455458
) => {
456459
array_capacity = *capacity;
@@ -491,10 +494,11 @@ impl<'a> MutableArrayData<'a> {
491494
| DataType::Utf8View
492495
| DataType::Interval(_)
493496
| DataType::FixedSizeBinary(_) => vec![],
494-
DataType::ListView(_) | DataType::LargeListView(_) => {
495-
unimplemented!("ListView/LargeListView not implemented")
496-
}
497-
DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
497+
DataType::Map(_, _)
498+
| DataType::List(_)
499+
| DataType::LargeList(_)
500+
| DataType::ListView(_)
501+
| DataType::LargeListView(_) => {
498502
let children = arrays
499503
.iter()
500504
.map(|array| &array.child_data()[0])
@@ -785,7 +789,12 @@ impl<'a> MutableArrayData<'a> {
785789
b.insert(0, data.buffer1.into());
786790
b
787791
}
788-
DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => {
792+
DataType::Utf8
793+
| DataType::Binary
794+
| DataType::LargeUtf8
795+
| DataType::LargeBinary
796+
| DataType::ListView(_)
797+
| DataType::LargeListView(_) => {
789798
vec![data.buffer1.into(), data.buffer2.into()]
790799
}
791800
DataType::Union(_, mode) => {

0 commit comments

Comments
 (0)