Skip to content

Commit

Permalink
Add a "contains" fast-path to like_utf8_scalar (jorgecarleitao#1582)
Browse files Browse the repository at this point in the history
  • Loading branch information
RyanMarcus authored Oct 22, 2023
1 parent 9a26422 commit 346c866
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 1 deletion.
10 changes: 9 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ odbc-api = { version = "0.36", optional = true }
# Faster hashing
ahash = "0.8"

# For `LIKE` matching "contains" fast-path
memchr = { version = "2.6", optional = true }

# Support conversion to/from arrow-rs
arrow-buffer = { version = ">=40", optional = true }
arrow-schema = { version = ">=40", optional = true }
Expand Down Expand Up @@ -237,7 +240,7 @@ compute_filter = []
compute_hash = ["multiversion"]
compute_if_then_else = []
compute_length = []
compute_like = ["regex", "regex-syntax"]
compute_like = ["regex", "regex-syntax", "dep:memchr"]
compute_limit = []
compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
Expand Down Expand Up @@ -394,3 +397,8 @@ harness = false
[[bench]]
name = "assign_ops"
harness = false

[[bench]]
name = "like_kernels"
harness = false

22 changes: 22 additions & 0 deletions benches/like_kernels.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
use arrow2::util::bench_util::create_string_array;
use criterion::{criterion_group, criterion_main, Criterion};

use arrow2::array::*;
use arrow2::compute::like::like_utf8_scalar;

/// Runs the scalar `LIKE` kernel once over `array` with `pattern`.
///
/// The result is routed through `criterion::black_box` so the optimizer
/// cannot discard the kernel call. Panics if the kernel returns an error.
fn bench_like(array: &Utf8Array<i32>, pattern: &str) {
    let matches = like_utf8_scalar(array, pattern).unwrap();
    criterion::black_box(matches);
}

/// Registers one `LIKE` benchmark per exponent in `16..21`.
///
/// For each exponent `size_log2`, a string array is generated with a size
/// parameter of `2^size_log2` and the "contains" pattern `%abba%` is matched
/// against it.
fn add_benchmark(c: &mut Criterion) {
    for size_log2 in 16..21_u32 {
        // The label advertises 2^size_log2, so 2 must be the base; the previous
        // `size_log2.pow(2)` squared the exponent and produced 256..=400 instead.
        let size = 2_usize.pow(size_log2);
        // NOTE(review): argument meanings of `create_string_array` are not
        // visible here; the size is passed in the same position as before.
        let array = create_string_array::<i32>(100, size, 0.0, 0);
        c.bench_function(&format!("LIKE length = 2^{size_log2}"), |b| {
            b.iter(|| bench_like(&array, "%abba%"))
        });
    }
}

// Criterion wiring: collect `add_benchmark` into the `benches` group and pass
// that group to `criterion_main!`. The bench target is declared with
// `harness = false` in Cargo.toml.
criterion_group!(benches, add_benchmark);
criterion_main!(benches);
11 changes: 11 additions & 0 deletions src/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,17 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
// fast path, can use ends_with
let ends_with = &rhs[1..];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
} else if rhs.starts_with('%')
&& rhs.ends_with('%')
&& !rhs.ends_with("\\%")
&& !rhs[1..rhs.len() - 1].contains(is_like_pattern)
{
let needle = &rhs[1..rhs.len() - 1];
let finder = memchr::memmem::Finder::new(needle);
Bitmap::from_trusted_len_iter(
lhs.values_iter()
.map(|x| op(finder.find(x.as_bytes()).is_some())),
)
} else {
let re_pattern = replace_pattern(rhs);
let re = Regex::new(&format!("^{re_pattern}$")).map_err(|e| {
Expand Down
4 changes: 4 additions & 0 deletions tests/it/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ fn test_like_utf8_scalar() -> Result<()> {
let result = like_utf8_scalar(&array, "A\\_row").unwrap();
assert_eq!(result, BooleanArray::from_slice([true, false]));

let array = Utf8Array::<i32>::from_slice(["Arrow", "Arrow", "row your", "boat"]);
let result = like_utf8_scalar(&array, "%row%").unwrap();
assert_eq!(result, BooleanArray::from_slice([true, true, true, false]));

Ok(())
}

Expand Down

0 comments on commit 346c866

Please sign in to comment.