-
Notifications
You must be signed in to change notification settings - Fork 811
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
arrow-ord: `lt` and `eq` for nested list
#5408
Changes from 2 commits
119e23d
76d960f
1cd6c7c
4dfa79b
ac7e789
b86c096
934528e
f8ba661
51c2c1e
bf93c86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,10 +31,12 @@ use arrow_array::{ | |
}; | ||
use arrow_buffer::bit_util::ceil; | ||
use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; | ||
use arrow_schema::ArrowError; | ||
use arrow_schema::{ArrowError, DataType}; | ||
use arrow_select::take::take; | ||
use std::ops::Not; | ||
|
||
use crate::ord::{build_compare, DynComparator}; | ||
|
||
#[derive(Debug, Copy, Clone)] | ||
enum Op { | ||
Equal, | ||
|
@@ -166,6 +168,122 @@ pub fn not_distinct(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, Ar | |
compare_op(Op::NotDistinct, lhs, rhs) | ||
} | ||
|
||
fn process_nested( | ||
l: &dyn Array, | ||
r: &dyn Array, | ||
op: Op, | ||
l_t: &DataType, | ||
r_t: &DataType, | ||
len: usize, | ||
) -> Result<Option<BooleanArray>, ArrowError> { | ||
use arrow_schema::DataType::*; | ||
if let (List(_), List(_)) = (l_t, r_t) { | ||
// Process nested data types | ||
match op { | ||
Op::Less => { | ||
let l = l.as_list::<i32>(); | ||
let r = r.as_list::<i32>(); | ||
let mut values = BooleanArray::builder(len); | ||
for i in 0..l.len() { | ||
let l = l.value(i); | ||
let r = r.value(i); | ||
let l_t = l.data_type(); | ||
let r_t = r.data_type(); | ||
let l_len = l.len(); | ||
let r_len = r.len(); | ||
let min_len = std::cmp::min(l_len, r_len); | ||
|
||
if !l_t.is_nested() && !r_t.is_nested() { | ||
let cmp = build_compare(&l, &r)?; | ||
|
||
fn post_process(len: usize, cmp: DynComparator, r_is_longer: bool) -> bool { | ||
for j in 0..len { | ||
let ord = cmp(j, j); | ||
if ord == std::cmp::Ordering::Less { | ||
return true; | ||
} | ||
if ord == std::cmp::Ordering::Greater { | ||
return false; | ||
} | ||
} | ||
r_is_longer | ||
} | ||
values.append_value(post_process(min_len, cmp, r_len > l_len)); | ||
} else { | ||
// Since `compare_op` does not support inconsistent lengths, we compare the | ||
// prefix with `compare_op` only, and compare the left if the prefix is equal | ||
let l = l.slice(0, min_len); | ||
let r = r.slice(0, min_len); | ||
|
||
let lt_res = lt(&l, &r)?; | ||
let eq_res = eq(&l, &r)?; | ||
|
||
fn post_process( | ||
lt: &BooleanArray, | ||
eq: &BooleanArray, | ||
r_is_longer: bool, | ||
) -> bool { | ||
for j in 0..lt.len() { | ||
if lt.value(j) { | ||
return true; | ||
} | ||
if !eq.value(j) { | ||
return false; | ||
} | ||
} | ||
r_is_longer | ||
} | ||
|
||
values.append_value(post_process(<_res, &eq_res, r_len > l_len)); | ||
} | ||
} | ||
|
||
let values = values.finish(); | ||
Ok(Some(values)) | ||
} | ||
Op::Equal => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This duplication could be eliminated by extracting out a function that maps the Ordering result of DynComparator to the boolean result |
||
let l = l.as_list::<i32>(); | ||
let r = r.as_list::<i32>(); | ||
let mut values = BooleanArray::builder(len); | ||
for i in 0..l.len() { | ||
let l = l.value(i); | ||
let r = r.value(i); | ||
let l_len = l.len(); | ||
let r_len = r.len(); | ||
if l_len != r_len { | ||
values.append_value(false); | ||
continue; | ||
} | ||
|
||
let eq_res = eq(&l, &r)?; | ||
fn post_process(eq: &BooleanArray) -> bool { | ||
for j in 0..eq.len() { | ||
if !eq.value(j) { | ||
return false; | ||
} | ||
} | ||
true | ||
} | ||
|
||
values.append_value(post_process(&eq_res)); | ||
} | ||
|
||
let values = values.finish(); | ||
Ok(Some(values)) | ||
} | ||
_ => Err(ArrowError::NotYetImplemented(format!( | ||
"Comparison for {op} is NYI" | ||
))), | ||
} | ||
} else if l_t.is_nested() { | ||
Err(ArrowError::NotYetImplemented(format!( | ||
"Comparison for {l_t} is NYI" | ||
))) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// Perform `op` on the provided `Datum` | ||
#[inline(never)] | ||
fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, ArrowError> { | ||
|
@@ -198,12 +316,16 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, | |
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r); | ||
let r_t = r.data_type(); | ||
|
||
if l_t != r_t || l_t.is_nested() { | ||
if l_t != r_t { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This now allows hitting `unreachable` in the below code block.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I return an error in `process_nested`. |
||
return Err(ArrowError::InvalidArgumentError(format!( | ||
"Invalid comparison operation: {l_t} {op} {r_t}" | ||
))); | ||
} | ||
|
||
if let Some(values) = process_nested(l, r, op, l_t, r_t, len)? { | ||
return Ok(values); | ||
} | ||
|
||
// Defer computation as may not be necessary | ||
let values = || -> BooleanBuffer { | ||
let d = downcast_primitive_array! { | ||
|
@@ -544,7 +666,11 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray { | |
mod tests { | ||
use std::sync::Arc; | ||
|
||
use arrow_array::{DictionaryArray, Int32Array, Scalar, StringArray}; | ||
use arrow_array::{ | ||
types::Int32Type, ArrayRef, DictionaryArray, Int32Array, ListArray, Scalar, StringArray, | ||
}; | ||
use arrow_buffer::OffsetBuffer; | ||
use arrow_schema::Field; | ||
|
||
use super::*; | ||
|
||
|
@@ -702,4 +828,216 @@ mod tests { | |
|
||
neq(&col.slice(0, col.len() - 1), &col.slice(1, col.len() - 1)).unwrap(); | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be good to see some tests of
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. sure |
||
#[test]
fn test_list_lt() {
    /// Builds a three-row list array: `first`, a null row, then `last`.
    fn rows(first: &[i32], last: &[i32]) -> ListArray {
        let some = |xs: &[i32]| Some(xs.iter().copied().map(Some).collect::<Vec<_>>());
        ListArray::from_iter_primitive::<Int32Type, _, _>(vec![some(first), None, some(last)])
    }

    // The left-hand side is identical in every scenario.
    let lhs = rows(&[0, 1, 2], &[3, 4, 5]);

    // (right-hand side, expected `lhs < rhs` per row)
    let cases = [
        // First row differs at index 0; last row of rhs is a strict prefix.
        (rows(&[1, 1, 2], &[3, 4]), vec![true, false, false]),
        // First row equal; last row of rhs is a strict prefix.
        (rows(&[0, 1, 2], &[3, 4]), vec![false, false, false]),
        // First row equal; last row of lhs is a strict prefix of rhs.
        (rows(&[0, 1, 2], &[3, 4, 5, 7]), vec![false, false, true]),
    ];

    for (rhs, expected) in cases {
        let res = lt(&lhs, &rhs).unwrap();
        assert_eq!(res, BooleanArray::from(expected));
    }
}
|
||
fn array_into_list_array(arr: ArrayRef) -> ListArray { | ||
let offsets = OffsetBuffer::from_lengths([arr.len()]); | ||
ListArray::new( | ||
Arc::new(Field::new("item", arr.data_type().to_owned(), true)), | ||
offsets, | ||
arr, | ||
None, | ||
) | ||
} | ||
|
||
#[test]
fn test_nested_list_lt() {
    /// Builds the inner list array (`first`, null, `last`) and wraps it into
    /// a single-row outer list via `array_into_list_array`.
    fn wrapped(first: &[i32], last: &[i32]) -> ListArray {
        let some = |xs: &[i32]| Some(xs.iter().copied().map(Some).collect::<Vec<_>>());
        let inner =
            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![some(first), None, some(last)]);
        array_into_list_array(Arc::new(inner))
    }

    // (lhs, rhs, expected `lhs < rhs` for the single outer row)
    let cases = [
        (
            wrapped(&[0, 1, 2], &[3, 4, 5]),
            wrapped(&[1, 1, 2], &[3, 4]),
            true,
        ),
        (
            wrapped(&[1, 1, 2], &[3, 4, 5]),
            wrapped(&[1, 1, 2], &[3, 4]),
            false,
        ),
        (
            wrapped(&[0, 1, 2], &[3, 4, 5]),
            wrapped(&[0, 1, 2], &[3, 4, 5, 7]),
            true,
        ),
    ];

    for (lhs, rhs, expected) in cases {
        let res = lt(&lhs, &rhs).unwrap();
        assert_eq!(res, BooleanArray::from(vec![expected]));
    }
}
|
||
#[test]
fn test_list_eq() {
    /// Builds a three-row list array: `first`, a null row, then `last`.
    fn rows(first: &[i32], last: &[i32]) -> ListArray {
        let some = |xs: &[i32]| Some(xs.iter().copied().map(Some).collect::<Vec<_>>());
        ListArray::from_iter_primitive::<Int32Type, _, _>(vec![some(first), None, some(last)])
    }

    // The left-hand side is identical in every scenario.
    let lhs = rows(&[0, 1, 2], &[3, 4, 5]);

    // (right-hand side, expected `lhs == rhs` per row)
    let cases = [
        // First row differs; last row has a different length.
        (rows(&[1, 1, 2], &[3, 4]), vec![false, true, false]),
        // First row equal; last row of rhs is shorter.
        (rows(&[0, 1, 2], &[3, 4]), vec![true, true, false]),
        // First row equal; last row of rhs is longer.
        (rows(&[0, 1, 2], &[3, 4, 5, 7]), vec![true, true, false]),
    ];

    for (rhs, expected) in cases {
        let res = eq(&lhs, &rhs).unwrap();
        assert_eq!(res, BooleanArray::from(expected));
    }
}
|
||
#[test]
fn test_nested_list_eq() {
    /// Builds the inner list array (`first`, null, `last`) and wraps it into
    /// a single-row outer list via `array_into_list_array`.
    fn wrapped(first: &[i32], last: &[i32]) -> ListArray {
        let some = |xs: &[i32]| Some(xs.iter().copied().map(Some).collect::<Vec<_>>());
        let inner =
            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![some(first), None, some(last)]);
        array_into_list_array(Arc::new(inner))
    }

    // (lhs, rhs, expected `lhs == rhs` for the single outer row)
    let cases = [
        (
            wrapped(&[0, 1, 2], &[3, 4, 5]),
            wrapped(&[1, 1, 2], &[3, 4]),
            false,
        ),
        (
            wrapped(&[1, 1, 2], &[3, 4, 5]),
            wrapped(&[1, 1, 2], &[3, 4]),
            false,
        ),
        (
            wrapped(&[0, 1, 2], &[3, 4, 5]),
            wrapped(&[0, 1, 2], &[3, 4, 5, 7]),
            false,
        ),
        // Only fully identical nested content compares equal.
        (
            wrapped(&[0, 1, 2], &[3, 4, 5]),
            wrapped(&[0, 1, 2], &[3, 4, 5]),
            true,
        ),
    ];

    for (lhs, rhs, expected) in cases {
        let res = eq(&lhs, &rhs).unwrap();
        assert_eq!(res, BooleanArray::from(vec![expected]));
    }
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better to construct the DynComparator once for `l.values()` and `r.values()`, and then use the offsets to drive the comparison?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm unsure about using `l.values()` and offsets for the comparison. Instead, I find looping through the index with `.value(i)` much clearer and more straightforward.