Skip to content

Commit ba22a21

Browse files
authored
Fix "Incorrect Behavior of Collecting a filtered iterator to a BooleanArray" (#8543)
# Which issue does this PR close? - Closes #8505 . # Rationale for this change Fix the bug and align `BooleanArray::from_iter` to `PrimitiveArray::from_iter` In `BooleanArray::from_iter`: Collecting to a `Vec` and then using `from_trusted_len_iter` was almost double as fast as using `BooleanBufferBuilder` on my machine. # What changes are included in this PR? - Use builders in `BooleanArray::from_iter` to fix the wrong behavior - Introduce `BooleanArray::from_trusted_len_iter` for a more performant version (The old version of `BooleanArray::from_iter`, just with unsafe flavor of `bit_util::set_bit_raw`) - Add `BooleanAdapter`, inspired by `NativeAdapter` from the `PrimitiveArray`. This allows also doing `BooleanArray::from_iter([true, false].into_iter())`. # Are these changes tested? - New test to cover the initial bug - New test to cover `BooleanArray::from_trusted_len_iter` directly (old `BooleanArray::from_iter` also cover it indirectly) - New test to document that you can directly collect `[false, true, ...]` (no `Option`) # Are there any user-facing changes? - `BooleanArray::from_iter` has a "slight" performance regression that users could observe. - Allow directly collecting bools to a `BooleanArray` - `BooleanArray::from_trusted_len_iter`
1 parent da98297 commit ba22a21

File tree

3 files changed

+135
-12
lines changed

3 files changed

+135
-12
lines changed

arrow-array/src/array/boolean_array.rs

Lines changed: 121 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -436,11 +436,84 @@ impl<'a> BooleanArray {
436436
}
437437
}
438438

439-
impl<Ptr: std::borrow::Borrow<Option<bool>>> FromIterator<Ptr> for BooleanArray {
439+
/// An optional boolean value
440+
///
441+
/// This struct is used as an adapter when creating `BooleanArray` from an iterator.
442+
/// `FromIterator` for `BooleanArray` takes an iterator where the elements can be `into`
443+
/// this struct. So once implementing `From` or `Into` trait for a type, an iterator of
444+
/// the type can be collected to `BooleanArray`.
445+
///
446+
/// See also [NativeAdapter](crate::array::NativeAdapter).
447+
#[derive(Debug)]
448+
struct BooleanAdapter {
449+
/// Corresponding Rust native type if available
450+
pub native: Option<bool>,
451+
}
452+
453+
impl From<bool> for BooleanAdapter {
454+
fn from(value: bool) -> Self {
455+
BooleanAdapter {
456+
native: Some(value),
457+
}
458+
}
459+
}
460+
461+
impl From<&bool> for BooleanAdapter {
462+
fn from(value: &bool) -> Self {
463+
BooleanAdapter {
464+
native: Some(*value),
465+
}
466+
}
467+
}
468+
469+
impl From<Option<bool>> for BooleanAdapter {
470+
fn from(value: Option<bool>) -> Self {
471+
BooleanAdapter { native: value }
472+
}
473+
}
474+
475+
impl From<&Option<bool>> for BooleanAdapter {
476+
fn from(value: &Option<bool>) -> Self {
477+
BooleanAdapter { native: *value }
478+
}
479+
}
480+
481+
impl<Ptr: Into<BooleanAdapter>> FromIterator<Ptr> for BooleanArray {
440482
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
441483
let iter = iter.into_iter();
442-
let (_, data_len) = iter.size_hint();
443-
let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
484+
let capacity = match iter.size_hint() {
485+
(lower, Some(upper)) if lower == upper => lower,
486+
_ => 0,
487+
};
488+
let mut builder = BooleanBuilder::with_capacity(capacity);
489+
builder.extend(iter.map(|item| item.into().native));
490+
builder.finish()
491+
}
492+
}
493+
494+
impl BooleanArray {
495+
/// Creates a [`BooleanArray`] from an iterator of trusted length.
496+
///
497+
/// # Safety
498+
///
499+
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
500+
/// I.e. that `size_hint().1` correctly reports its length. Note that this is a stronger
501+
/// guarantee that `ExactSizeIterator` provides which could still report a wrong length.
502+
///
503+
/// # Panics
504+
///
505+
/// Panics if the iterator does not report an upper bound on `size_hint()`.
506+
#[inline]
507+
#[allow(
508+
private_bounds,
509+
reason = "We will expose BooleanAdapter if there is a need"
510+
)]
511+
pub unsafe fn from_trusted_len_iter<I, P>(iter: I) -> Self
512+
where
513+
P: Into<BooleanAdapter>,
514+
I: ExactSizeIterator<Item = P>,
515+
{
516+
let data_len = iter.len();
444517

445518
let num_bytes = bit_util::ceil(data_len, 8);
446519
let mut null_builder = MutableBuffer::from_len_zeroed(num_bytes);
@@ -450,10 +523,14 @@ impl<Ptr: std::borrow::Borrow<Option<bool>>> FromIterator<Ptr> for BooleanArray
450523

451524
let null_slice = null_builder.as_slice_mut();
452525
iter.enumerate().for_each(|(i, item)| {
453-
if let Some(a) = item.borrow() {
454-
bit_util::set_bit(null_slice, i);
455-
if *a {
456-
bit_util::set_bit(data, i);
526+
if let Some(a) = item.into().native {
527+
unsafe {
528+
// SAFETY: There will be enough space in the buffers due to the trusted len size
529+
// hint
530+
bit_util::set_bit_raw(null_slice.as_mut_ptr(), i);
531+
if a {
532+
bit_util::set_bit_raw(data.as_mut_ptr(), i);
533+
}
457534
}
458535
}
459536
});
@@ -599,6 +676,20 @@ mod tests {
599676
}
600677
}
601678

679+
#[test]
680+
fn test_boolean_array_from_non_nullable_iter() {
681+
let v = vec![true, false, true];
682+
let arr = v.into_iter().collect::<BooleanArray>();
683+
assert_eq!(3, arr.len());
684+
assert_eq!(0, arr.offset());
685+
assert_eq!(0, arr.null_count());
686+
assert!(arr.nulls().is_none());
687+
688+
assert!(arr.value(0));
689+
assert!(!arr.value(1));
690+
assert!(arr.value(2));
691+
}
692+
602693
#[test]
603694
fn test_boolean_array_from_nullable_iter() {
604695
let v = vec![Some(true), None, Some(false), None];
@@ -617,6 +708,29 @@ mod tests {
617708
assert!(!arr.value(2));
618709
}
619710

711+
#[test]
712+
fn test_boolean_array_from_nullable_trusted_len_iter() {
713+
// Should exhibit the same behavior as `from_iter`, which is tested above.
714+
let v = vec![Some(true), None, Some(false), None];
715+
let expected = v.clone().into_iter().collect::<BooleanArray>();
716+
let actual = unsafe {
717+
// SAFETY: `v` has trusted length
718+
BooleanArray::from_trusted_len_iter(v.into_iter())
719+
};
720+
assert_eq!(expected, actual);
721+
}
722+
723+
#[test]
724+
fn test_boolean_array_from_iter_with_larger_upper_bound() {
725+
// See https://github.com/apache/arrow-rs/issues/8505
726+
// This returns an upper size hint of 4
727+
let iterator = vec![Some(true), None, Some(false), None]
728+
.into_iter()
729+
.filter(Option::is_some);
730+
let arr = iterator.collect::<BooleanArray>();
731+
assert_eq!(2, arr.len());
732+
}
733+
620734
#[test]
621735
fn test_boolean_array_builder() {
622736
// Test building a boolean array with ArrayData builder and offset

arrow-array/src/builder/boolean_builder.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,9 +234,12 @@ impl ArrayBuilder for BooleanBuilder {
234234
impl Extend<Option<bool>> for BooleanBuilder {
235235
#[inline]
236236
fn extend<T: IntoIterator<Item = Option<bool>>>(&mut self, iter: T) {
237-
for v in iter {
238-
self.append_option(v)
239-
}
237+
let buffered = iter.into_iter().collect::<Vec<_>>();
238+
let array = unsafe {
239+
// SAFETY: std::vec::IntoIter implements TrustedLen
240+
BooleanArray::from_trusted_len_iter(buffered.into_iter())
241+
};
242+
self.append_array(&array)
240243
}
241244
}
242245

arrow/benches/array_from.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,12 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
extern crate arrow;
1819
#[macro_use]
1920
extern crate criterion;
2021

2122
use criterion::Criterion;
2223

23-
extern crate arrow;
24-
2524
use arrow::array::*;
2625
use arrow_buffer::i256;
2726
use rand::Rng;
@@ -236,6 +235,13 @@ fn from_iter_benchmark(c: &mut Criterion) {
236235
let values = gen_option_vector(true, ITER_LEN);
237236
b.iter(|| hint::black_box(BooleanArray::from_iter(values.iter())));
238237
});
238+
c.bench_function("BooleanArray::from_trusted_len_iter", |b| {
239+
let values = gen_option_vector(true, ITER_LEN);
240+
b.iter(|| unsafe {
241+
// SAFETY: values.iter() is a TrustedLenIterator
242+
hint::black_box(BooleanArray::from_trusted_len_iter(values.iter()))
243+
});
244+
});
239245
}
240246

241247
criterion_group!(

0 commit comments

Comments
 (0)