-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add experimental translation of 'LIKE' patterns to regex
- Loading branch information
Showing
5 changed files
with
199 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
use criterion::{black_box, criterion_group, criterion_main, Criterion}; | ||
use partiql_rewriter::experimental::{like_to_re_pattern, similar_to_re_pattern}; | ||
use rand::distributions::Alphanumeric; | ||
use rand::{Rng, SeedableRng}; | ||
use regex::Regex; | ||
use std::time::Duration; | ||
|
||
/// Small LIKE pattern used by the "simple" benchmarks: two `_` wildcards
/// around text that consists of regex metacharacters (`.*?`).
const LIKE_SIMPLE: &str = "foo_.*?_bar";
|
||
fn like_8k() -> String { | ||
let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(987654321); | ||
let mut like_8k = String::with_capacity(8002); | ||
like_8k += "%"; | ||
for ch in rng.sample_iter(&Alphanumeric).take(8000) { | ||
like_8k.push(ch as char); | ||
} | ||
like_8k += "%"; | ||
like_8k | ||
} | ||
|
||
fn like(c: &mut Criterion) { | ||
c.bench_function("like-simple-translate", |b| { | ||
b.iter(|| like_to_re_pattern(black_box(LIKE_SIMPLE), '\\')) | ||
}); | ||
|
||
let pat = like_to_re_pattern(black_box(LIKE_SIMPLE), '\\'); | ||
let re = Regex::new(&pat).unwrap(); | ||
c.bench_function("like-simple-match", |b| { | ||
b.iter(|| re.is_match("foos.*?%bar")) | ||
}); | ||
|
||
let like_8k = like_8k(); | ||
c.bench_function("like-8k-translate", |b| { | ||
b.iter(|| like_to_re_pattern(black_box(&like_8k), '\\')) | ||
}); | ||
let pat = like_to_re_pattern(black_box(&like_8k), '\\'); | ||
let re = Regex::new(&pat).unwrap(); | ||
|
||
c.bench_function("like-8k-match", |b| b.iter(|| re.is_match(&like_8k))); | ||
} | ||
|
||
criterion_group! { | ||
name = like_compile; | ||
config = Criterion::default(); | ||
targets = like | ||
} | ||
|
||
criterion_main!(like_compile); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,8 @@ | ||
mod pattern_match; | ||
/// Re-exports every public item of the private `pattern_match` module under
/// the `experimental` path.
pub mod experimental { | ||
pub use super::pattern_match::*; | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
#[test] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
use regex::{Error, Regex}; | ||
|
||
// TODO docs | ||
|
||
// TODO consider how to use the appropriate mechanisms to prevent exhaustion of | ||
// resources by query-written regexes | ||
// See https://docs.rs/regex/latest/regex/#untrusted-input | ||
|
||
// TODO I believe this should be resilient ReDoS, as this should never build | ||
// an 'Evil Regex' as defined by | ||
// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS | ||
pub fn like_to_re_pattern(like_expr: &str, escape: char) -> String { | ||
to_re_pattern(like_expr, escape, regex_syntax::is_meta_character) | ||
} | ||
|
||
// TODO SIMILAR probably needs to be better thought through for preventing ReDoS | ||
// A query writer would be able to build an 'Evil Regex' as defined by | ||
// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS | ||
fn similar_to_re_pattern(similar_expr: &str, escape: char) -> String { | ||
to_re_pattern(similar_expr, escape, is_similar_meta_character) | ||
} | ||
|
||
#[inline] | ||
fn is_similar_meta_character(c: char) -> bool { | ||
match c { | ||
// pass these through to be interpreted as regex meta characters | ||
'|' | '*' | '+' | '?' | '{' | '}' | '(' | ')' | '[' | ']' => false, | ||
// everything else, defer | ||
_ => regex_syntax::is_meta_character(c), | ||
} | ||
} | ||
|
||
#[inline] | ||
fn to_re_pattern<F>(expr: &str, escape: char, is_meta_character: F) -> String | ||
where | ||
F: Fn(char) -> bool, | ||
{ | ||
let mut pattern = String::from("^"); | ||
write_re_pattern(expr, escape, is_meta_character, &mut pattern); | ||
pattern += "$"; | ||
pattern | ||
} | ||
|
||
/// Appends the regex translation of `like_expr` to `buf` (no anchors added).
///
/// * An unescaped `escape_ch` emits nothing and makes the next character
///   literal. If it is the last character of the input, it is dropped.
/// * An unescaped `%` emits `.*?`; a run of consecutive `%` emits it once.
/// * An unescaped `_` emits `.`.
/// * Any other character (including escaped ones) is emitted as-is, preceded
///   by a backslash when `is_meta_character` returns `true` for it.
#[inline]
fn write_re_pattern<F>(like_expr: &str, escape_ch: char, is_meta_character: F, buf: &mut String)
where
    F: Fn(char) -> bool,
{
    buf.reserve(like_expr.len() + 6);

    let mut pending_escape = false;
    let mut after_percent = false;

    for ch in like_expr.chars() {
        // Both flags describe the previous character only: read and clear them.
        let prev_was_percent = std::mem::take(&mut after_percent);
        let literal = std::mem::take(&mut pending_escape);

        // The escape check comes first, so an escape char of `%` or `_` wins
        // over the wildcard meaning.
        if !literal && ch == escape_ch {
            pending_escape = true;
        } else if !literal && ch == '%' {
            if !prev_was_percent {
                buf.push_str(".*?");
            }
            after_percent = true;
        } else if !literal && ch == '_' {
            buf.push('.');
        } else {
            if is_meta_character(ch) {
                buf.push('\\'); // regex-escape the next character
            }
            buf.push(ch);
        }
    }
}
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use regex::Regex;

    /// Checks the exact regex text produced for LIKE patterns: wildcard
    /// translation, collapsing of `%` runs, escaping, and custom escape chars.
    #[test]
    fn like() {
        assert_eq!(like_to_re_pattern("foo", '\\'), r#"^foo$"#);
        assert_eq!(like_to_re_pattern("%foo", '\\'), r#"^.*?foo$"#);
        assert_eq!(like_to_re_pattern("foo%", '\\'), r#"^foo.*?$"#);
        assert_eq!(like_to_re_pattern("foo%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar%", '\\'),
            r#"^.*?foo.*?bar.*?$"#
        );
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar\\%baz%", '\\'),
            r#"^.*?foo.*?bar%baz.*?$"#
        );
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar*%baz%", '*'),
            r#"^.*?foo.*?bar%baz.*?$"#
        );
        assert_eq!(like_to_re_pattern("_foo", '\\'), r#"^.foo$"#);
        assert_eq!(like_to_re_pattern("foo_", '\\'), r#"^foo.$"#);
        assert_eq!(like_to_re_pattern("foo_bar", '\\'), r#"^foo.bar$"#);
        assert_eq!(like_to_re_pattern("foo__bar", '\\'), r#"^foo..bar$"#);
        assert_eq!(
            like_to_re_pattern("foo_.*?_bar", '\\'),
            r#"^foo.\.\*\?.bar$"#
        );
    }

    /// The translated LIKE pattern compiles and matches as a whole-string match.
    #[test]
    fn like_match() {
        let pat = like_to_re_pattern("foo_.*?_bar", '\\');
        let re = Regex::new(&pat).unwrap();

        assert!(re.is_match("foos.*?qbar"));
        // Anchored at the end: trailing text must not match.
        assert!(!re.is_match("foos.*?qbarx"));
    }

    /// Checks the exact regex text produced for SIMILAR patterns: grouping and
    /// alternation are passed through unescaped.
    #[test]
    fn similar() {
        assert_eq!(similar_to_re_pattern("(b|c)%", '\\'), r#"^(b|c).*?$"#);
        assert_eq!(similar_to_re_pattern("%(b|d)%", '\\'), r#"^.*?(b|d).*?$"#);
    }

    /// The translated SIMILAR pattern compiles and its alternation is live.
    #[test]
    fn similar_match() {
        // Must start with `b` or `c`.
        let pat = similar_to_re_pattern("(b|c)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(re.is_match("b"));
        assert!(re.is_match("cab"));
        assert!(!re.is_match("abc"));

        // Must contain `b` or `d` anywhere.
        let pat = similar_to_re_pattern("%(b|d)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(re.is_match("abc"));
        assert!(!re.is_match("xyz"));
    }
}