From fd095ca08b2490f4b6ef819a403ae2af25a11fab Mon Sep 17 00:00:00 2001 From: taichong Date: Wed, 9 Nov 2022 17:55:42 +0800 Subject: [PATCH] optimize stringsearch like --- Cargo.lock | 1 + src/query/functions/Cargo.toml | 1 + .../scalars/comparisons/comparison_like.rs | 45 ++++++++++++++----- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0615e048f669..c566a18e92c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1444,6 +1444,7 @@ dependencies = [ "hex", "itertools", "md-5", + "memchr", "naive-cityhash", "num", "num-format", diff --git a/src/query/functions/Cargo.toml b/src/query/functions/Cargo.toml index f6d649905139..82968875c2e3 100644 --- a/src/query/functions/Cargo.toml +++ b/src/query/functions/Cargo.toml @@ -33,6 +33,7 @@ h3ron = "0.15.1" hex = "0.4.3" itertools = "0.10.5" md-5 = "0.10.5" +memchr = "2.5.0" naive-cityhash = "0.2.0" num = "0.4.0" num-format = "0.4.0" diff --git a/src/query/functions/src/scalars/comparisons/comparison_like.rs b/src/query/functions/src/scalars/comparisons/comparison_like.rs index 0239ade671bc..c74a0e9176b6 100644 --- a/src/query/functions/src/scalars/comparisons/comparison_like.rs +++ b/src/query/functions/src/scalars/comparisons/comparison_like.rs @@ -15,6 +15,7 @@ use std::collections::HashMap; use common_datavalues::prelude::*; +use memchr::memmem; use regex::bytes::Regex as BytesRegex; use super::comparison::StringSearchCreator; @@ -84,25 +85,45 @@ impl StringSearchImpl for StringSearchLike { let re_pattern = like_pattern_to_regex(pattern); let re = BytesRegex::new(&re_pattern) .expect("Unable to build regex from LIKE pattern: {}"); - BooleanColumn::from_iterator(lhs.scalar_iter().map(|x| { - if !is_empty { - let lhs_str = std::str::from_utf8(x) - .expect("Unable to convert lhs value to string: {}"); - let contain = lhs_str.find(sub_strings[0]); - if contain.is_none() { - op(false) - } else { - op(re.is_match(x)) - } + if std::intrinsics::unlikely(is_empty) { + BooleanColumn::from_iterator(lhs.scalar_iter().map(|x| op(re.is_match(x)))) + } else { + let sub_string = sub_strings[0].as_bytes(); + // This impl like position function + if sub_strings.len() == 1 { + BooleanColumn::from_iterator(lhs.scalar_iter().map(|x| { + let contain = search_sub_str(x, sub_string); + if contain.is_none() { + op(false) + } else { + op(true) + } + })) } else { - op(re.is_match(x)) + BooleanColumn::from_iterator(lhs.scalar_iter().map(|x| { + let contain = memmem::find(x, sub_string).is_none(); + if contain { + op(false) + } else { + op(re.is_match(x)) + } + })) } - })) + } } } } } +#[inline] +fn search_sub_str(str: &[u8], substr: &[u8]) -> Option { + if substr.len() <= str.len() { + str.windows(substr.len()).position(|w| w == substr) + } else { + None + } +} + #[inline] fn is_like_pattern_escape(c: u8) -> bool { c == b'%' || c == b'_' || c == b'\\'