Skip to content
10 changes: 9 additions & 1 deletion arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,15 @@ impl ArrayData {
) -> Result<(), ArrowError> {
let offsets: &[T] = self.typed_buffer(0, self.len)?;
let sizes: &[T] = self.typed_buffer(1, self.len)?;
for i in 0..values_length {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a bug before. You can verify this by construction a list_view_array and then doing list_view_array.to_data().into_builder().build().unwrap() and it will panic, because values_length is the length of the inner values not of the list itself.

if offsets.len() != sizes.len() {
return Err(ArrowError::ComputeError(format!(
"ListView offsets len {} does not match sizes len {}",
offsets.len(),
sizes.len()
)));
}

for i in 0..sizes.len() {
let size = sizes[i].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Error converting size[{}] ({}) to usize for {}",
Expand Down
129 changes: 129 additions & 0 deletions arrow-data/src/equal/list_view.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::ArrayData;
use crate::data::count_nulls;
use crate::equal::equal_values;
use arrow_buffer::ArrowNativeType;
use num_integer::Integer;

pub(super) fn list_view_equal<T: ArrowNativeType + Integer>(
lhs: &ArrayData,
rhs: &ArrayData,
lhs_start: usize,
rhs_start: usize,
len: usize,
) -> bool {
let lhs_offsets = lhs.buffer::<T>(0);
let lhs_sizes = lhs.buffer::<T>(1);

let rhs_offsets = rhs.buffer::<T>(0);
let rhs_sizes = rhs.buffer::<T>(1);

let lhs_data = &lhs.child_data()[0];
let rhs_data = &rhs.child_data()[0];

let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len);
let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len);

if lhs_null_count != rhs_null_count {
return false;
}

if lhs_null_count == 0 {
// non-null pathway: all sizes must be equal, and all values must be equal
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];

if lhs_range_sizes.len() != rhs_range_sizes.len() {
return false;
}

if lhs_range_sizes != rhs_range_sizes {
return false;
}

// Check values for equality
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];

if lhs_range_offsets.len() != rhs_range_offsets.len() {
return false;
}

for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets
.iter()
.zip(rhs_range_offsets)
.zip(lhs_range_sizes)
{
let lhs_offset = lhs_offset.to_usize().unwrap();
let rhs_offset = rhs_offset.to_usize().unwrap();
let size = size.to_usize().unwrap();

// Check if offsets are valid for the given range
if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
return false;
}
}
} else {
// Need to integrate validity check in the inner loop.
// non-null pathway: all sizes must be equal, and all values must be equal
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];

let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len);
let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len);

// Sizes can differ if values are null
if lhs_range_sizes.len() != rhs_range_sizes.len() {
return false;
}

// Check values for equality, with null checking
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];

if lhs_range_offsets.len() != rhs_range_offsets.len() {
return false;
}

for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets
.iter()
.zip(rhs_range_offsets)
.zip(lhs_range_sizes)
.enumerate()
{
let lhs_is_null = lhs_nulls.is_null(index);
let rhs_is_null = rhs_nulls.is_null(index);

if lhs_is_null != rhs_is_null {
return false;
}

let lhs_offset = lhs_offset.to_usize().unwrap();
let rhs_offset = rhs_offset.to_usize().unwrap();
let size = size.to_usize().unwrap();

// Check if values match in the range
if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
return false;
}
}
}

true
}
10 changes: 5 additions & 5 deletions arrow-data/src/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mod dictionary;
mod fixed_binary;
mod fixed_list;
mod list;
mod list_view;
mod null;
mod primitive;
mod run;
Expand All @@ -41,6 +42,8 @@ mod variable_size;
// these methods assume the same type, len and null count.
// For this reason, they are not exposed and are instead used
// to build the generic functions below (`equal_range` and `equal`).
use self::run::run_equal;
use crate::equal::list_view::list_view_equal;
use boolean::boolean_equal;
use byte_view::byte_view_equal;
use dictionary::dictionary_equal;
Expand All @@ -53,8 +56,6 @@ use structure::struct_equal;
use union::union_equal;
use variable_size::variable_sized_equal;

use self::run::run_equal;

/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively
/// for `len` slots.
#[inline]
Expand Down Expand Up @@ -104,10 +105,9 @@ fn equal_values(
byte_view_equal(lhs, rhs, lhs_start, rhs_start, len)
}
DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not yet implemented")
}
DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
DataType::ListView(_) => list_view_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
DataType::LargeListView(_) => list_view_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len),
DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len),
DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len),
Expand Down
56 changes: 56 additions & 0 deletions arrow-data/src/transform/list_view.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::ArrayData;
use crate::transform::_MutableArrayData;
use arrow_buffer::ArrowNativeType;
use num_integer::Integer;
use num_traits::CheckedAdd;

pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
array: &ArrayData,
) -> crate::transform::Extend<'_> {
let offsets = array.buffer::<T>(0);
let sizes = array.buffer::<T>(1);
Box::new(
move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| {
let offset_buffer = &mut mutable.buffer1;
let sizes_buffer = &mut mutable.buffer2;

for &offset in &offsets[start..start + len] {
offset_buffer.push(offset);
}

// sizes
for &size in &sizes[start..start + len] {
sizes_buffer.push(size);
}

// the beauty of views is that we don't need to copy child_data, we just splat
// the offsets and sizes.
},
)
}

pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut _MutableArrayData, len: usize) {
let offset_buffer = &mut mutable.buffer1;
let sizes_buffer = &mut mutable.buffer2;

// We push 0 as a placeholder for NULL values in both the offsets and sizes
(0..len).for_each(|_| offset_buffer.push(T::default()));
(0..len).for_each(|_| sizes_buffer.push(T::default()));
}
33 changes: 21 additions & 12 deletions arrow-data/src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ mod boolean;
mod fixed_binary;
mod fixed_size_list;
mod list;
mod list_view;
mod null;
mod primitive;
mod run;
Expand Down Expand Up @@ -265,10 +266,9 @@ fn build_extend(array: &ArrayData) -> Extend<'_> {
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::LargeList(_) => list::build_extend::<i64>(array),
DataType::ListView(_) => list_view::build_extend::<i32>(array),
DataType::LargeListView(_) => list_view::build_extend::<i64>(array),
DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
DataType::Struct(_) => structure::build_extend(array),
DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
Expand Down Expand Up @@ -313,10 +313,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::LargeList(_) => list::extend_nulls::<i64>,
DataType::ListView(_) => list_view::extend_nulls::<i32>,
DataType::LargeListView(_) => list_view::extend_nulls::<i64>,
DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
DataType::UInt8 => primitive::extend_nulls::<u8>,
DataType::UInt16 => primitive::extend_nulls::<u16>,
Expand Down Expand Up @@ -450,7 +449,11 @@ impl<'a> MutableArrayData<'a> {
new_buffers(data_type, *capacity)
}
(
DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _),
DataType::List(_)
| DataType::LargeList(_)
| DataType::ListView(_)
| DataType::LargeListView(_)
| DataType::FixedSizeList(_, _),
Capacities::List(capacity, _),
) => {
array_capacity = *capacity;
Expand Down Expand Up @@ -491,10 +494,11 @@ impl<'a> MutableArrayData<'a> {
| DataType::Utf8View
| DataType::Interval(_)
| DataType::FixedSizeBinary(_) => vec![],
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
DataType::Map(_, _)
| DataType::List(_)
| DataType::LargeList(_)
| DataType::ListView(_)
| DataType::LargeListView(_) => {
let children = arrays
.iter()
.map(|array| &array.child_data()[0])
Expand Down Expand Up @@ -785,7 +789,12 @@ impl<'a> MutableArrayData<'a> {
b.insert(0, data.buffer1.into());
b
}
DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => {
DataType::Utf8
| DataType::Binary
| DataType::LargeUtf8
| DataType::LargeBinary
| DataType::ListView(_)
| DataType::LargeListView(_) => {
vec![data.buffer1.into(), data.buffer2.into()]
}
DataType::Union(_, mode) => {
Expand Down
Loading
Loading