Skip to content

Commit

Permalink
ARROW-9523 [Rust] Improve filter kernel performance
Browse files Browse the repository at this point in the history
The filter kernel located here https://github.com/apache/arrow/blob/master/rust/arrow/src/compute/kernels/filter.rs

currently has the following performance:

filter old u8 low selectivity time: [1.7782 ms 1.7790 ms 1.7801 ms]
filter old u8 high selectivity time: [815.58 us 816.58 us 817.57 us]
filter old u8 w NULLs low selectivity time: [1.8131 ms 1.8231 ms 1.8336 ms]
filter old u8 w NULLs high selectivity time: [817.41 us 820.01 us 823.05 us]

I have been working on a new implementation which performs between approximately 17 and 550 times faster depending mostly on filter selectivity. Here are the benchmark results:

filter u8 low selectivity                  time:   [107.26 us 108.24 us 109.58 us]
filter u8 high selectivity                 time:   [4.7854 us 4.8050 us 4.8276 us]
filter context u8 low selectivity          time:   [102.59 us 102.93 us 103.38 us]
filter context u8 high selectivity         time:   [1.4709 us 1.4760 us 1.4823 us]
filter context u8 w NULLs low selectivity  time:   [130.48 us 131.00 us 131.65 us]
filter context u8 w NULLs high selectivity time:   [2.0520 us 2.0818 us 2.1137 us]
filter context f32 low selectivity         time:   [117.26 us 118.58 us 120.13 us]
filter context f32 high selectivity        time:   [1.7895 us 1.7919 us 1.7942 us]

This new implementation is based on a few key ideas:

(1) if the data array being filtered doesn't have a null bitmap, no time should be wasted to copy or create a null bitmap in the resulting filtered data array - this is implemented using the CopyNullBit trait which has a no-op implementation and an actual implementation

(2) when the filter is highly selective, e.g. only a small number of values from the data array are selected, the filter implementation should efficiently skip entire batches of 0s in the filter array - this is implemented by transmuting the filter array to u64 which allows to quickly check and skip entire batches of 64 bits 

(3) when an entire record batch is filtered, any computation which only depends on the filter array is done once and then shared for filtering all the data arrays in the record batch - this is implemented using the FilterContext struct

This pull request also implements support for filtering dictionary arrays. 

@paddyhoran @andygrove

Closes #7798 from yordan-pavlov/improve_filter_kernel_perf

Lead-authored-by: Yordan Pavlov <yordan.pavlov@outlook.com>
Co-authored-by: Yordan Pavlov <64363766+yordan-pavlov@users.noreply.github.com>
Signed-off-by: Andy Grove <andygrove@nvidia.com>
  • Loading branch information
2 people authored and andygrove committed Aug 6, 2020
1 parent b64d7f8 commit 457ea62
Show file tree
Hide file tree
Showing 6 changed files with 758 additions and 113 deletions.
4 changes: 4 additions & 0 deletions rust/arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ harness = false
name = "comparison_kernels"
harness = false

[[bench]]
name = "filter_kernels"
harness = false

[[bench]]
name = "take_kernels"
harness = false
Expand Down
152 changes: 152 additions & 0 deletions rust/arrow/benches/filter_kernels.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::*;
use arrow::compute::{filter, FilterContext};
use arrow::datatypes::ArrowNumericType;
use criterion::{criterion_group, criterion_main, Criterion};

fn create_primitive_array<T, F>(size: usize, value_fn: F) -> PrimitiveArray<T>
where
T: ArrowNumericType,
F: Fn(usize) -> T::Native,
{
let mut builder = PrimitiveArray::<T>::builder(size);
for i in 0..size {
builder.append_value(value_fn(i)).unwrap();
}
builder.finish()
}

fn create_u8_array_with_nulls(size: usize) -> UInt8Array {
let mut builder = UInt8Builder::new(size);
for i in 0..size {
if i % 2 == 0 {
builder.append_value(1).unwrap();
} else {
builder.append_null().unwrap();
}
}
builder.finish()
}

fn create_bool_array<F>(size: usize, value_fn: F) -> BooleanArray
where
F: Fn(usize) -> bool,
{
let mut builder = BooleanBuilder::new(size);
for i in 0..size {
builder.append_value(value_fn(i)).unwrap();
}
builder.finish()
}

fn bench_filter_u8(data_array: &UInt8Array, filter_array: &BooleanArray) {
filter(
criterion::black_box(data_array),
criterion::black_box(filter_array),
)
.unwrap();
}

// fn bench_filter_f32(data_array: &Float32Array, filter_array: &BooleanArray) {
// filter(criterion::black_box(data_array), criterion::black_box(filter_array)).unwrap();
// }

fn bench_filter_context_u8(data_array: &UInt8Array, filter_context: &FilterContext) {
filter_context
.filter(criterion::black_box(data_array))
.unwrap();
}

fn bench_filter_context_f32(data_array: &Float32Array, filter_context: &FilterContext) {
filter_context
.filter(criterion::black_box(data_array))
.unwrap();
}

fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let filter_array = create_bool_array(size, |i| match i % 2 {
0 => true,
_ => false,
});
let sparse_filter_array = create_bool_array(size, |i| match i % 8000 {
0 => true,
_ => false,
});
let dense_filter_array = create_bool_array(size, |i| match i % 8000 {
0 => false,
_ => true,
});

let filter_context = FilterContext::new(&filter_array).unwrap();
let sparse_filter_context = FilterContext::new(&sparse_filter_array).unwrap();
let dense_filter_context = FilterContext::new(&dense_filter_array).unwrap();

let data_array = create_primitive_array(size, |i| match i % 2 {
0 => 1,
_ => 0,
});
c.bench_function("filter u8 low selectivity", |b| {
b.iter(|| bench_filter_u8(&data_array, &filter_array))
});
c.bench_function("filter u8 high selectivity", |b| {
b.iter(|| bench_filter_u8(&data_array, &sparse_filter_array))
});
c.bench_function("filter u8 very low selectivity", |b| {
b.iter(|| bench_filter_u8(&data_array, &dense_filter_array))
});

c.bench_function("filter context u8 low selectivity", |b| {
b.iter(|| bench_filter_context_u8(&data_array, &filter_context))
});
c.bench_function("filter context u8 high selectivity", |b| {
b.iter(|| bench_filter_context_u8(&data_array, &sparse_filter_context))
});
c.bench_function("filter context u8 very low selectivity", |b| {
b.iter(|| bench_filter_context_u8(&data_array, &dense_filter_context))
});

let data_array = create_u8_array_with_nulls(size);
c.bench_function("filter context u8 w NULLs low selectivity", |b| {
b.iter(|| bench_filter_context_u8(&data_array, &filter_context))
});
c.bench_function("filter context u8 w NULLs high selectivity", |b| {
b.iter(|| bench_filter_context_u8(&data_array, &sparse_filter_context))
});
c.bench_function("filter context u8 w NULLs very low selectivity", |b| {
b.iter(|| bench_filter_context_u8(&data_array, &dense_filter_context))
});

let data_array = create_primitive_array(size, |i| match i % 2 {
0 => 1.0,
_ => 0.0,
});
c.bench_function("filter context f32 low selectivity", |b| {
b.iter(|| bench_filter_context_f32(&data_array, &filter_context))
});
c.bench_function("filter context f32 high selectivity", |b| {
b.iter(|| bench_filter_context_f32(&data_array, &sparse_filter_context))
});
c.bench_function("filter context f32 very low selectivity", |b| {
b.iter(|| bench_filter_context_f32(&data_array, &dense_filter_context))
});
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
1 change: 1 addition & 0 deletions rust/arrow/src/array/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ impl ArrayData {
}

/// Returns the offset of this array
#[inline]
pub fn offset(&self) -> usize {
self.offset
}
Expand Down
2 changes: 2 additions & 0 deletions rust/arrow/src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -545,10 +545,12 @@ impl MutableBuffer {
///
/// Note that this should be used cautiously, and the returned pointer should not be
/// stored anywhere, to avoid dangling pointers.
#[inline]
pub fn raw_data(&self) -> *const u8 {
self.data
}

#[inline]
pub fn raw_data_mut(&mut self) -> *mut u8 {
self.data
}
Expand Down
Loading

0 comments on commit 457ea62

Please sign in to comment.