Skip to content

Commit 70b24d1

Browse files
authored
Merge branch 'main' into 8708-remove-arrow-ord-in-arrow-cast
2 parents c72af36 + 62df32e commit 70b24d1

File tree

27 files changed

+1993
-224
lines changed

27 files changed

+1993
-224
lines changed

arrow-buffer/src/util/bit_iterator.rs

Lines changed: 516 additions & 0 deletions
Large diffs are not rendered by default.

arrow-cast/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,4 @@ harness = false
7474
[[bench]]
7575
name = "parse_decimal"
7676
harness = false
77+

arrow-cast/src/cast/decimal.rs

Lines changed: 287 additions & 74 deletions
Large diffs are not rendered by default.

arrow-cast/src/cast/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ use arrow_schema::*;
6969
use arrow_select::take::take;
7070
use num_traits::{NumCast, ToPrimitive, cast::AsPrimitive};
7171

72-
pub use decimal::DecimalCast;
72+
pub use decimal::{DecimalCast, rescale_decimal};
7373

7474
/// CastOptions provides a way to override the default cast behaviors
7575
#[derive(Debug, Clone, PartialEq, Eq, Hash)]

arrow-data/src/data.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,15 @@ impl ArrayData {
980980
) -> Result<(), ArrowError> {
981981
let offsets: &[T] = self.typed_buffer(0, self.len)?;
982982
let sizes: &[T] = self.typed_buffer(1, self.len)?;
983-
for i in 0..values_length {
983+
if offsets.len() != sizes.len() {
984+
return Err(ArrowError::ComputeError(format!(
985+
"ListView offsets len {} does not match sizes len {}",
986+
offsets.len(),
987+
sizes.len()
988+
)));
989+
}
990+
991+
for i in 0..sizes.len() {
984992
let size = sizes[i].to_usize().ok_or_else(|| {
985993
ArrowError::InvalidArgumentError(format!(
986994
"Error converting size[{}] ({}) to usize for {}",

arrow-data/src/equal/list_view.rs

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::ArrayData;
19+
use crate::data::count_nulls;
20+
use crate::equal::equal_values;
21+
use arrow_buffer::ArrowNativeType;
22+
use num_integer::Integer;
23+
24+
pub(super) fn list_view_equal<T: ArrowNativeType + Integer>(
25+
lhs: &ArrayData,
26+
rhs: &ArrayData,
27+
lhs_start: usize,
28+
rhs_start: usize,
29+
len: usize,
30+
) -> bool {
31+
let lhs_offsets = lhs.buffer::<T>(0);
32+
let lhs_sizes = lhs.buffer::<T>(1);
33+
34+
let rhs_offsets = rhs.buffer::<T>(0);
35+
let rhs_sizes = rhs.buffer::<T>(1);
36+
37+
let lhs_data = &lhs.child_data()[0];
38+
let rhs_data = &rhs.child_data()[0];
39+
40+
let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len);
41+
let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len);
42+
43+
if lhs_null_count != rhs_null_count {
44+
return false;
45+
}
46+
47+
if lhs_null_count == 0 {
48+
// non-null pathway: all sizes must be equal, and all values must be equal
49+
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
50+
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
51+
52+
if lhs_range_sizes.len() != rhs_range_sizes.len() {
53+
return false;
54+
}
55+
56+
if lhs_range_sizes != rhs_range_sizes {
57+
return false;
58+
}
59+
60+
// Check values for equality
61+
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
62+
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];
63+
64+
if lhs_range_offsets.len() != rhs_range_offsets.len() {
65+
return false;
66+
}
67+
68+
for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets
69+
.iter()
70+
.zip(rhs_range_offsets)
71+
.zip(lhs_range_sizes)
72+
{
73+
let lhs_offset = lhs_offset.to_usize().unwrap();
74+
let rhs_offset = rhs_offset.to_usize().unwrap();
75+
let size = size.to_usize().unwrap();
76+
77+
// Check if offsets are valid for the given range
78+
if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
79+
return false;
80+
}
81+
}
82+
} else {
83+
// Need to integrate validity check in the inner loop.
84+
// non-null pathway: all sizes must be equal, and all values must be equal
85+
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
86+
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
87+
88+
let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len);
89+
let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len);
90+
91+
// Sizes can differ if values are null
92+
if lhs_range_sizes.len() != rhs_range_sizes.len() {
93+
return false;
94+
}
95+
96+
// Check values for equality, with null checking
97+
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
98+
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];
99+
100+
if lhs_range_offsets.len() != rhs_range_offsets.len() {
101+
return false;
102+
}
103+
104+
for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets
105+
.iter()
106+
.zip(rhs_range_offsets)
107+
.zip(lhs_range_sizes)
108+
.enumerate()
109+
{
110+
let lhs_is_null = lhs_nulls.is_null(index);
111+
let rhs_is_null = rhs_nulls.is_null(index);
112+
113+
if lhs_is_null != rhs_is_null {
114+
return false;
115+
}
116+
117+
let lhs_offset = lhs_offset.to_usize().unwrap();
118+
let rhs_offset = rhs_offset.to_usize().unwrap();
119+
let size = size.to_usize().unwrap();
120+
121+
// Check if values match in the range
122+
if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
123+
return false;
124+
}
125+
}
126+
}
127+
128+
true
129+
}

arrow-data/src/equal/mod.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ mod dictionary;
3030
mod fixed_binary;
3131
mod fixed_list;
3232
mod list;
33+
mod list_view;
3334
mod null;
3435
mod primitive;
3536
mod run;
@@ -41,6 +42,8 @@ mod variable_size;
4142
// these methods assume the same type, len and null count.
4243
// For this reason, they are not exposed and are instead used
4344
// to build the generic functions below (`equal_range` and `equal`).
45+
use self::run::run_equal;
46+
use crate::equal::list_view::list_view_equal;
4447
use boolean::boolean_equal;
4548
use byte_view::byte_view_equal;
4649
use dictionary::dictionary_equal;
@@ -53,8 +56,6 @@ use structure::struct_equal;
5356
use union::union_equal;
5457
use variable_size::variable_sized_equal;
5558

56-
use self::run::run_equal;
57-
5859
/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively
5960
/// for `len` slots.
6061
#[inline]
@@ -104,10 +105,9 @@ fn equal_values(
104105
byte_view_equal(lhs, rhs, lhs_start, rhs_start, len)
105106
}
106107
DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
107-
DataType::ListView(_) | DataType::LargeListView(_) => {
108-
unimplemented!("ListView/LargeListView not yet implemented")
109-
}
110108
DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
109+
DataType::ListView(_) => list_view_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
110+
DataType::LargeListView(_) => list_view_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
111111
DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len),
112112
DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len),
113113
DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len),
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::ArrayData;
19+
use crate::transform::_MutableArrayData;
20+
use arrow_buffer::ArrowNativeType;
21+
use num_integer::Integer;
22+
use num_traits::CheckedAdd;
23+
24+
pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
25+
array: &ArrayData,
26+
) -> crate::transform::Extend<'_> {
27+
let offsets = array.buffer::<T>(0);
28+
let sizes = array.buffer::<T>(1);
29+
Box::new(
30+
move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| {
31+
let offset_buffer = &mut mutable.buffer1;
32+
let sizes_buffer = &mut mutable.buffer2;
33+
34+
for &offset in &offsets[start..start + len] {
35+
offset_buffer.push(offset);
36+
}
37+
38+
// sizes
39+
for &size in &sizes[start..start + len] {
40+
sizes_buffer.push(size);
41+
}
42+
43+
// the beauty of views is that we don't need to copy child_data, we just splat
44+
// the offsets and sizes.
45+
},
46+
)
47+
}
48+
49+
pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut _MutableArrayData, len: usize) {
50+
let offset_buffer = &mut mutable.buffer1;
51+
let sizes_buffer = &mut mutable.buffer2;
52+
53+
// We push 0 as a placeholder for NULL values in both the offsets and sizes
54+
(0..len).for_each(|_| offset_buffer.push(T::default()));
55+
(0..len).for_each(|_| sizes_buffer.push(T::default()));
56+
}

arrow-data/src/transform/mod.rs

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ mod boolean;
3333
mod fixed_binary;
3434
mod fixed_size_list;
3535
mod list;
36+
mod list_view;
3637
mod null;
3738
mod primitive;
3839
mod run;
@@ -265,10 +266,9 @@ fn build_extend(array: &ArrayData) -> Extend<'_> {
265266
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
266267
DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
267268
DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
268-
DataType::ListView(_) | DataType::LargeListView(_) => {
269-
unimplemented!("ListView/LargeListView not implemented")
270-
}
271269
DataType::LargeList(_) => list::build_extend::<i64>(array),
270+
DataType::ListView(_) => list_view::build_extend::<i32>(array),
271+
DataType::LargeListView(_) => list_view::build_extend::<i64>(array),
272272
DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
273273
DataType::Struct(_) => structure::build_extend(array),
274274
DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
@@ -313,10 +313,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
313313
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
314314
DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
315315
DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
316-
DataType::ListView(_) | DataType::LargeListView(_) => {
317-
unimplemented!("ListView/LargeListView not implemented")
318-
}
319316
DataType::LargeList(_) => list::extend_nulls::<i64>,
317+
DataType::ListView(_) => list_view::extend_nulls::<i32>,
318+
DataType::LargeListView(_) => list_view::extend_nulls::<i64>,
320319
DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
321320
DataType::UInt8 => primitive::extend_nulls::<u8>,
322321
DataType::UInt16 => primitive::extend_nulls::<u16>,
@@ -450,7 +449,11 @@ impl<'a> MutableArrayData<'a> {
450449
new_buffers(data_type, *capacity)
451450
}
452451
(
453-
DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _),
452+
DataType::List(_)
453+
| DataType::LargeList(_)
454+
| DataType::ListView(_)
455+
| DataType::LargeListView(_)
456+
| DataType::FixedSizeList(_, _),
454457
Capacities::List(capacity, _),
455458
) => {
456459
array_capacity = *capacity;
@@ -491,10 +494,11 @@ impl<'a> MutableArrayData<'a> {
491494
| DataType::Utf8View
492495
| DataType::Interval(_)
493496
| DataType::FixedSizeBinary(_) => vec![],
494-
DataType::ListView(_) | DataType::LargeListView(_) => {
495-
unimplemented!("ListView/LargeListView not implemented")
496-
}
497-
DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
497+
DataType::Map(_, _)
498+
| DataType::List(_)
499+
| DataType::LargeList(_)
500+
| DataType::ListView(_)
501+
| DataType::LargeListView(_) => {
498502
let children = arrays
499503
.iter()
500504
.map(|array| &array.child_data()[0])
@@ -785,7 +789,12 @@ impl<'a> MutableArrayData<'a> {
785789
b.insert(0, data.buffer1.into());
786790
b
787791
}
788-
DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => {
792+
DataType::Utf8
793+
| DataType::Binary
794+
| DataType::LargeUtf8
795+
| DataType::LargeBinary
796+
| DataType::ListView(_)
797+
| DataType::LargeListView(_) => {
789798
vec![data.buffer1.into(), data.buffer2.into()]
790799
}
791800
DataType::Union(_, mode) => {

0 commit comments

Comments
 (0)