Skip to content

Commit 6c3e588

Browse files
authored
fix: zip now treats nulls as false in provided mask regardless of the underlying bit value (#8711)
# Which issue does this PR close? - closes #8721 # Rationale for this change mask `nulls` should be treated as `false` (even if the underlying values are not 0) as described in the docs for zip # What changes are included in this PR? used `prep_null_mask_filter` before iterating over the mask, added tests for both scalar and non scalar (to prepare for #8653) # Are these changes tested? Yes # Are there any user-facing changes? Kinda
1 parent 62df32e commit 6c3e588

File tree

2 files changed

+130
-3
lines changed

2 files changed

+130
-3
lines changed

arrow-select/src/filter.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,13 @@ pub struct SlicesIterator<'a>(BitSliceIterator<'a>);
5858
impl<'a> SlicesIterator<'a> {
5959
/// Creates a new iterator from a [BooleanArray]
6060
pub fn new(filter: &'a BooleanArray) -> Self {
61-
Self(filter.values().set_slices())
61+
filter.values().into()
62+
}
63+
}
64+
65+
impl<'a> From<&'a BooleanBuffer> for SlicesIterator<'a> {
66+
fn from(filter: &'a BooleanBuffer) -> Self {
67+
Self(filter.set_slices())
6268
}
6369
}
6470

arrow-select/src/zip.rs

Lines changed: 123 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717

1818
//! [`zip`]: Combine values from two arrays based on boolean mask
1919
20-
use crate::filter::SlicesIterator;
20+
use crate::filter::{SlicesIterator, prep_null_mask_filter};
2121
use arrow_array::*;
22+
use arrow_buffer::BooleanBuffer;
2223
use arrow_data::transform::MutableArrayData;
2324
use arrow_schema::ArrowError;
2425

@@ -127,7 +128,8 @@ pub fn zip(
127128
// keep track of how much is filled
128129
let mut filled = 0;
129130

130-
SlicesIterator::new(mask).for_each(|(start, end)| {
131+
let mask = maybe_prep_null_mask_filter(mask);
132+
SlicesIterator::from(&mask).for_each(|(start, end)| {
131133
// the gap needs to be filled with falsy values
132134
if start > filled {
133135
if falsy_is_scalar {
@@ -166,9 +168,22 @@ pub fn zip(
166168
Ok(make_array(data))
167169
}
168170

171+
fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer {
172+
// Nulls are treated as false
173+
if predicate.null_count() == 0 {
174+
predicate.values().clone()
175+
} else {
176+
let cleaned = prep_null_mask_filter(predicate);
177+
let (boolean_buffer, _) = cleaned.into_parts();
178+
boolean_buffer
179+
}
180+
}
181+
169182
#[cfg(test)]
170183
mod test {
171184
use super::*;
185+
use arrow_array::cast::AsArray;
186+
use arrow_buffer::{BooleanBuffer, NullBuffer};
172187

173188
#[test]
174189
fn test_zip_kernel_one() {
@@ -279,4 +294,110 @@ mod test {
279294
let expected = Int32Array::from(vec![None, None, Some(42), Some(42), None]);
280295
assert_eq!(actual, &expected);
281296
}
297+
298+
#[test]
299+
fn test_zip_primitive_array_with_nulls_is_mask_should_be_treated_as_false() {
300+
let truthy = Int32Array::from_iter_values(vec![1, 2, 3, 4, 5, 6]);
301+
let falsy = Int32Array::from_iter_values(vec![7, 8, 9, 10, 11, 12]);
302+
303+
let mask = {
304+
let booleans = BooleanBuffer::from(vec![true, true, false, true, false, false]);
305+
let nulls = NullBuffer::from(vec![
306+
true, true, true,
307+
false, // null treated as false even though in the original mask it was true
308+
true, true,
309+
]);
310+
BooleanArray::new(booleans, Some(nulls))
311+
};
312+
let out = zip(&mask, &truthy, &falsy).unwrap();
313+
let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
314+
let expected = Int32Array::from(vec![
315+
Some(1),
316+
Some(2),
317+
Some(9),
318+
Some(10), // true in mask but null
319+
Some(11),
320+
Some(12),
321+
]);
322+
assert_eq!(actual, &expected);
323+
}
324+
325+
#[test]
326+
fn test_zip_kernel_primitive_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
327+
{
328+
let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1));
329+
let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1));
330+
331+
let mask = {
332+
let booleans = BooleanBuffer::from(vec![true, true, false, true, false, false]);
333+
let nulls = NullBuffer::from(vec![
334+
true, true, true,
335+
false, // null treated as false even though in the original mask it was true
336+
true, true,
337+
]);
338+
BooleanArray::new(booleans, Some(nulls))
339+
};
340+
let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap();
341+
let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
342+
let expected = Int32Array::from(vec![
343+
Some(42),
344+
Some(42),
345+
Some(123),
346+
Some(123), // true in mask but null
347+
Some(123),
348+
Some(123),
349+
]);
350+
assert_eq!(actual, &expected);
351+
}
352+
353+
#[test]
354+
fn test_zip_string_array_with_nulls_is_mask_should_be_treated_as_false() {
355+
let truthy = StringArray::from_iter_values(vec!["1", "2", "3", "4", "5", "6"]);
356+
let falsy = StringArray::from_iter_values(vec!["7", "8", "9", "10", "11", "12"]);
357+
358+
let mask = {
359+
let booleans = BooleanBuffer::from(vec![true, true, false, true, false, false]);
360+
let nulls = NullBuffer::from(vec![
361+
true, true, true,
362+
false, // null treated as false even though in the original mask it was true
363+
true, true,
364+
]);
365+
BooleanArray::new(booleans, Some(nulls))
366+
};
367+
let out = zip(&mask, &truthy, &falsy).unwrap();
368+
let actual = out.as_string::<i32>();
369+
let expected = StringArray::from_iter_values(vec![
370+
"1", "2", "9", "10", // true in mask but null
371+
"11", "12",
372+
]);
373+
assert_eq!(actual, &expected);
374+
}
375+
376+
#[test]
377+
fn test_zip_kernel_large_string_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
378+
{
379+
let scalar_truthy = Scalar::new(LargeStringArray::from_iter_values(["test"]));
380+
let scalar_falsy = Scalar::new(LargeStringArray::from_iter_values(["something else"]));
381+
382+
let mask = {
383+
let booleans = BooleanBuffer::from(vec![true, true, false, true, false, false]);
384+
let nulls = NullBuffer::from(vec![
385+
true, true, true,
386+
false, // null treated as false even though in the original mask it was true
387+
true, true,
388+
]);
389+
BooleanArray::new(booleans, Some(nulls))
390+
};
391+
let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap();
392+
let actual = out.as_any().downcast_ref::<LargeStringArray>().unwrap();
393+
let expected = LargeStringArray::from_iter(vec![
394+
Some("test"),
395+
Some("test"),
396+
Some("something else"),
397+
Some("something else"), // true in mask but null
398+
Some("something else"),
399+
Some("something else"),
400+
]);
401+
assert_eq!(actual, &expected);
402+
}
282403
}

0 commit comments

Comments
 (0)