From f2b811dfd274df8562c40050a3777c5e0552563e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 16 May 2021 10:00:27 +0200 Subject: [PATCH 1/3] fix invalid null handling in filter --- arrow/src/compute/kernels/filter.rs | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index 68feb0a546e2..5b76536e5887 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -17,10 +17,12 @@ //! Defines miscellaneous array kernels. +use crate::datatypes::DataType; use crate::error::Result; use crate::record_batch::RecordBatch; use crate::{array::*, util::bit_chunk_iterator::BitChunkIterator}; use std::iter::Enumerate; +use std::ops::BitAnd; /// Function that can filter arbitrary arrays pub type Filter<'a> = Box ArrayData + 'a>; @@ -221,6 +223,23 @@ pub fn build_filter(filter: &BooleanArray) -> Result { /// # } /// ``` pub fn filter(array: &Array, filter: &BooleanArray) -> Result { + if filter.null_count() > 0 { + // this greatly simplifies subsequent filtering code + // now we only have a boolean mask to deal with + let array_data = filter.data_ref(); + let null_bitmap = array_data.null_buffer().unwrap(); + let mask = filter.values(); + let new_mask = mask.bitand(&null_bitmap)?; + + let array_data = ArrayData::builder(DataType::Boolean) + .len(filter.len()) + .add_buffer(new_mask) + .build(); + let filter = BooleanArray::from(array_data); + // fully syntax, because we have an argument with the same name + return crate::compute::kernels::filter::filter(array, &filter); + } + let iter = SlicesIterator::new(filter); let mut mutable = @@ -249,6 +268,7 @@ pub fn filter_record_batch( #[cfg(test)] mod tests { use super::*; + use crate::datatypes::Int64Type; use crate::{ buffer::Buffer, datatypes::{DataType, Field}, @@ -581,4 +601,27 @@ mod tests { assert_eq!(chunks, vec![(1, 62), (63, 124), (125, 130)]); assert_eq!(filter_count, 61 + 61 + 5); } + + #[test] + fn test_null_mask() -> Result<()> { + use crate::compute::kernels::comparison; + let a: PrimitiveArray = + PrimitiveArray::from(vec![Some(1), Some(2), None]); + let mask0 = comparison::eq(&a, &a)?; + let out0 = filter(&a, &mask0)?; + let out_arr0 = out0 + .as_any() + .downcast_ref::>() + .unwrap(); + + let mask1 = BooleanArray::from(vec![Some(true), Some(true), None]); + let out1 = filter(&a, &mask1)?; + let out_arr1 = out1 + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(mask0, mask1); + assert_eq!(out_arr0, out_arr1); + Ok(()) + } } From b51d484889d31cbfc9d6d13be10be1490247b1c4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 16 May 2021 16:17:17 +0200 Subject: [PATCH 2/3] take offset into account --- arrow/src/compute/kernels/filter.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index 5b76536e5887..b450d76d7d21 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -17,12 +17,12 @@ //! Defines miscellaneous array kernels. +use crate::buffer::buffer_bin_and; use crate::datatypes::DataType; use crate::error::Result; use crate::record_batch::RecordBatch; use crate::{array::*, util::bit_chunk_iterator::BitChunkIterator}; use std::iter::Enumerate; -use std::ops::BitAnd; /// Function that can filter arbitrary arrays pub type Filter<'a> = Box ArrayData + 'a>; @@ -229,14 +229,16 @@ pub fn filter(array: &Array, filter: &BooleanArray) -> Result { let array_data = filter.data_ref(); let null_bitmap = array_data.null_buffer().unwrap(); let mask = filter.values(); - let new_mask = mask.bitand(&null_bitmap)?; + let offset = filter.offset(); + + let new_mask = buffer_bin_and(mask, offset, null_bitmap, offset, filter.len()); let array_data = ArrayData::builder(DataType::Boolean) .len(filter.len()) .add_buffer(new_mask) .build(); let filter = BooleanArray::from(array_data); - // fully syntax, because we have an argument with the same name + // fully qualified syntax, because we have an argument with the same name return crate::compute::kernels::filter::filter(array, &filter); } From 5fc668f3496deda87bfd0f716430bab4bde3f292 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 18 May 2021 11:20:27 +0200 Subject: [PATCH 3/3] remove incorrect UB warning --- arrow/src/compute/kernels/filter.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index b450d76d7d21..4da07b89edde 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -206,8 +206,7 @@ pub fn build_filter(filter: &BooleanArray) -> Result { } /// Filters an [Array], returning elements matching the filter (i.e. where the values are true). -/// WARNING: the nulls of `filter` are ignored and the value on its slot is considered. -/// Therefore, it is considered undefined behavior to pass `filter` with null values. +/// /// # Example /// ```rust /// # use arrow::array::{Int32Array, BooleanArray};