Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

arrow-ord: lt and eq for nested list #5408

Closed
wants to merge 10 commits into from
Closed
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
344 changes: 341 additions & 3 deletions arrow-ord/src/cmp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@ use arrow_array::{
};
use arrow_buffer::bit_util::ceil;
use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer};
use arrow_schema::ArrowError;
use arrow_schema::{ArrowError, DataType};
use arrow_select::take::take;
use std::ops::Not;

use crate::ord::{build_compare, DynComparator};

#[derive(Debug, Copy, Clone)]
enum Op {
Equal,
Expand Down Expand Up @@ -166,6 +168,122 @@ pub fn not_distinct(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, Ar
compare_op(Op::NotDistinct, lhs, rhs)
}

fn process_nested(
l: &dyn Array,
r: &dyn Array,
op: Op,
l_t: &DataType,
r_t: &DataType,
len: usize,
) -> Result<Option<BooleanArray>, ArrowError> {
use arrow_schema::DataType::*;
if let (List(_), List(_)) = (l_t, r_t) {
// Process nested data types
match op {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better to construct the DynComparator once for l.values() and r.values() and then use the offsets to drive the comparison?

Copy link
Contributor Author

@jayzhan211 jayzhan211 Feb 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm unsure about using l.values() and the offsets for the comparison. I find looping over the row index with .value(i) much clearer and more straightforward.

Op::Less => {
let l = l.as_list::<i32>();
let r = r.as_list::<i32>();
let mut values = BooleanArray::builder(len);
for i in 0..l.len() {
let l = l.value(i);
let r = r.value(i);
let l_t = l.data_type();
let r_t = r.data_type();
let l_len = l.len();
let r_len = r.len();
let min_len = std::cmp::min(l_len, r_len);

if !l_t.is_nested() && !r_t.is_nested() {
let cmp = build_compare(&l, &r)?;

fn post_process(len: usize, cmp: DynComparator, r_is_longer: bool) -> bool {
for j in 0..len {
let ord = cmp(j, j);
if ord == std::cmp::Ordering::Less {
return true;
}
if ord == std::cmp::Ordering::Greater {
return false;
}
}
r_is_longer
}
values.append_value(post_process(min_len, cmp, r_len > l_len));
} else {
// Since `compare_op` does not support inconsistent lengths, we compare only the
// common prefix with it, and fall back to comparing the lengths if the prefix is equal
let l = l.slice(0, min_len);
let r = r.slice(0, min_len);

let lt_res = lt(&l, &r)?;
let eq_res = eq(&l, &r)?;

fn post_process(
lt: &BooleanArray,
eq: &BooleanArray,
r_is_longer: bool,
) -> bool {
for j in 0..lt.len() {
if lt.value(j) {
return true;
}
if !eq.value(j) {
return false;
}
}
r_is_longer
}

values.append_value(post_process(&lt_res, &eq_res, r_len > l_len));
}
}

let values = values.finish();
Ok(Some(values))
}
Op::Equal => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This duplication could be eliminated by extracting out a function that maps the Ordering result of DynComparator to the boolean result

let l = l.as_list::<i32>();
let r = r.as_list::<i32>();
let mut values = BooleanArray::builder(len);
for i in 0..l.len() {
let l = l.value(i);
let r = r.value(i);
let l_len = l.len();
let r_len = r.len();
if l_len != r_len {
values.append_value(false);
continue;
}

let eq_res = eq(&l, &r)?;
fn post_process(eq: &BooleanArray) -> bool {
for j in 0..eq.len() {
if !eq.value(j) {
return false;
}
}
true
}

values.append_value(post_process(&eq_res));
}

let values = values.finish();
Ok(Some(values))
}
_ => Err(ArrowError::NotYetImplemented(format!(
"Comparison for {op} is NYI"
))),
}
} else if l_t.is_nested() {
Err(ArrowError::NotYetImplemented(format!(
"Comparison for {l_t} is NYI"
)))
} else {
Ok(None)
}
}

/// Perform `op` on the provided `Datum`
#[inline(never)]
fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, ArrowError> {
Expand Down Expand Up @@ -198,12 +316,16 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray,
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
let r_t = r.data_type();

if l_t != r_t || l_t.is_nested() {
if l_t != r_t {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now makes it possible to hit the `unreachable!` in the code block below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I return an error in process_nested, so nested types that are not yet implemented never reach that code.

return Err(ArrowError::InvalidArgumentError(format!(
"Invalid comparison operation: {l_t} {op} {r_t}"
)));
}

if let Some(values) = process_nested(l, r, op, l_t, r_t, len)? {
return Ok(values);
}

// Defer computation as may not be necessary
let values = || -> BooleanBuffer {
let d = downcast_primitive_array! {
Expand Down Expand Up @@ -544,7 +666,11 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray {
mod tests {
use std::sync::Arc;

use arrow_array::{DictionaryArray, Int32Array, Scalar, StringArray};
use arrow_array::{
types::Int32Type, ArrayRef, DictionaryArray, Int32Array, ListArray, Scalar, StringArray,
};
use arrow_buffer::OffsetBuffer;
use arrow_schema::Field;

use super::*;

Expand Down Expand Up @@ -702,4 +828,216 @@ mod tests {

neq(&col.slice(0, col.len() - 1), &col.slice(1, col.len() - 1)).unwrap();
}

Copy link
Contributor

@tustvold tustvold Feb 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to see some tests of

  • Scalar arguments
  • Nulls masking non-empty slices
  • DictionaryArray of ListArray (returning an error would be perfectly valid for this)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

#[test]
fn test_list_lt() {
let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let res = lt(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![true, false, false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let res = lt(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false, false, false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5), Some(7)]),
]);
let res = lt(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false, false, true]));
}

fn array_into_list_array(arr: ArrayRef) -> ListArray {
let offsets = OffsetBuffer::from_lengths([arr.len()]);
ListArray::new(
Arc::new(Field::new("item", arr.data_type().to_owned(), true)),
offsets,
arr,
None,
)
}

#[test]
fn test_nested_list_lt() {
let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = lt(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![true]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = lt(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5), Some(7)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = lt(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![true]));
}

#[test]
fn test_list_eq() {
let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false, true, false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![true, true, false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5), Some(7)]),
]);
let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![true, true, false]));
}

#[test]
fn test_nested_list_eq() {
let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5), Some(7)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![false]));

let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l1 = array_into_list_array(Arc::new(l1));
let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4), Some(5)]),
]);
let l2 = array_into_list_array(Arc::new(l2));

let res = eq(&l1, &l2).unwrap();
assert_eq!(res, BooleanArray::from(vec![true]));
}
}
Loading