From d1392fa540a56baf4966791d7a97e3ef23ba14f3 Mon Sep 17 00:00:00 2001
From: Josh Pschorr
Date: Mon, 11 Apr 2022 12:33:58 -0700
Subject: [PATCH] Add experimental translation of 'LIKE' patterns to regex

---
 partiql-parser/benches/bench_parse.rs         |   2 +-
 partiql-rewriter/Cargo.toml                   |  11 ++
 .../benches/bench_pattern_match.rs            |  48 ++++++
 partiql-rewriter/src/lib.rs                   |   5 +
 partiql-rewriter/src/pattern_match.rs         | 168 ++++++++++++++++++
 5 files changed, 233 insertions(+), 1 deletion(-)
 create mode 100644 partiql-rewriter/benches/bench_pattern_match.rs
 create mode 100644 partiql-rewriter/src/pattern_match.rs

diff --git a/partiql-parser/benches/bench_parse.rs b/partiql-parser/benches/bench_parse.rs
index 4cb8afb3..a2ce670c 100644
--- a/partiql-parser/benches/bench_parse.rs
+++ b/partiql-parser/benches/bench_parse.rs
@@ -31,7 +31,7 @@ fn parse_bench(c: &mut Criterion) {
 
 criterion_group! {
     name = parse;
-    config = Criterion::default().measurement_time(Duration::new(10, 0));
+    config = Criterion::default();
     targets = parse_bench
 }
 
diff --git a/partiql-rewriter/Cargo.toml b/partiql-rewriter/Cargo.toml
index fb44ea41..617a858b 100644
--- a/partiql-rewriter/Cargo.toml
+++ b/partiql-rewriter/Cargo.toml
@@ -20,3 +20,14 @@ version = "0.0.0"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+regex = "1.5.5"
+regex-syntax = "0.6.25"
+
+[dev-dependencies]
+rand = "0.8.5"
+rand_chacha = "0.3.1"
+criterion = "0.3"
+
+[[bench]]
+name = "bench_pattern_match"
+harness = false
diff --git a/partiql-rewriter/benches/bench_pattern_match.rs b/partiql-rewriter/benches/bench_pattern_match.rs
new file mode 100644
index 00000000..adaaf4b7
--- /dev/null
+++ b/partiql-rewriter/benches/bench_pattern_match.rs
@@ -0,0 +1,48 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use partiql_rewriter::experimental::{like_to_re_pattern, similar_to_re_pattern};
+use rand::distributions::Alphanumeric;
+use rand::{Rng, SeedableRng};
+use regex::Regex;
+use std::time::Duration;
+
+const LIKE_SIMPLE: &str = r#"foo_.*?_bar"#;
+
+fn like_8k() -> String {
+    let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(987654321);
+    let mut like_8k = String::with_capacity(8002);
+    like_8k += "%";
+    for ch in rng.sample_iter(&Alphanumeric).take(8000) {
+        like_8k.push(ch as char);
+    }
+    like_8k += "%";
+    like_8k
+}
+
+fn like(c: &mut Criterion) {
+    c.bench_function("like-simple-translate", |b| {
+        b.iter(|| like_to_re_pattern(black_box(LIKE_SIMPLE), '\\'))
+    });
+
+    let pat = like_to_re_pattern(black_box(LIKE_SIMPLE), '\\');
+    let re = Regex::new(&pat).unwrap();
+    c.bench_function("like-simple-match", |b| {
+        b.iter(|| re.is_match("foos.*?%bar"))
+    });
+
+    let like_8k = like_8k();
+    c.bench_function("like-8k-translate", |b| {
+        b.iter(|| like_to_re_pattern(black_box(&like_8k), '\\'))
+    });
+    let pat = like_to_re_pattern(black_box(&like_8k), '\\');
+    let re = Regex::new(&pat).unwrap();
+
+    c.bench_function("like-8k-match", |b| b.iter(|| re.is_match(&like_8k)));
+}
+
+criterion_group! {
+    name = like_compile;
+    config = Criterion::default();
+    targets = like
+}
+
+criterion_main!(like_compile);
diff --git a/partiql-rewriter/src/lib.rs b/partiql-rewriter/src/lib.rs
index 31e1bb20..c2968473 100644
--- a/partiql-rewriter/src/lib.rs
+++ b/partiql-rewriter/src/lib.rs
@@ -1,3 +1,8 @@
+mod pattern_match;
+pub mod experimental {
+    pub use super::pattern_match::*;
+}
+
 #[cfg(test)]
 mod tests {
     #[test]
diff --git a/partiql-rewriter/src/pattern_match.rs b/partiql-rewriter/src/pattern_match.rs
new file mode 100644
index 00000000..faadf4f9
--- /dev/null
+++ b/partiql-rewriter/src/pattern_match.rs
@@ -0,0 +1,168 @@
+use regex::{Error, Regex};
+
+// TODO consider how to use the appropriate mechanisms to prevent exhaustion of
+// resources by query-written regexes
+// See https://docs.rs/regex/latest/regex/#untrusted-input
+
+// TODO I believe this should be resilient to ReDoS, as this should never build
+// an 'Evil Regex' as defined by
+// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+/// Translates a SQL `LIKE` pattern into an anchored regular expression string.
+///
+/// * `%` becomes `.*?`; a run of consecutive `%` produces a single `.*?`.
+/// * `_` becomes `.`.
+/// * The character following `escape` is taken literally.
+/// * Regex meta characters are backslash-escaped so they match themselves.
+///
+/// A trailing `escape` with no character after it is silently dropped.
+///
+/// NOTE: in the `regex` crate `.` does not match `\n` by default, so callers
+/// that need `%`/`_` to span newlines should build the regex with
+/// `RegexBuilder::dot_matches_new_line(true)`.
+pub fn like_to_re_pattern(like_expr: &str, escape: char) -> String {
+    to_re_pattern(like_expr, escape, regex_syntax::is_meta_character)
+}
+
+// TODO SIMILAR probably needs to be better thought through for preventing ReDoS
+// A query writer would be able to build an 'Evil Regex' as defined by
+// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+/// Translates a SQL `SIMILAR TO` pattern into an anchored regular expression
+/// string.
+///
+/// Works like [`like_to_re_pattern`], except that unescaped `|`, `*`, `+`,
+/// `?`, `{`, `}`, `(`, `)`, `[` and `]` are passed through to the regex
+/// unchanged. Preceding one of them with `escape` makes it a literal.
+pub fn similar_to_re_pattern(similar_expr: &str, escape: char) -> String {
+    to_re_pattern(similar_expr, escape, is_similar_meta_character)
+}
+
+/// Returns `true` when an unescaped `c` in a `SIMILAR TO` pattern must be
+/// backslash-escaped in the regex to match itself.
+#[inline]
+fn is_similar_meta_character(c: char) -> bool {
+    match c {
+        // pass these through to be interpreted as regex meta characters
+        '|' | '*' | '+' | '?' | '{' | '}' | '(' | ')' | '[' | ']' => false,
+        // everything else, defer
+        _ => regex_syntax::is_meta_character(c),
+    }
+}
+
+/// Translates `expr` and wraps the result in `^` and `$` anchors.
+#[inline]
+fn to_re_pattern<F>(expr: &str, escape: char, is_meta_character: F) -> String
+where
+    F: Fn(char) -> bool,
+{
+    let mut pattern = String::from("^");
+    write_re_pattern(expr, escape, is_meta_character, &mut pattern);
+    pattern += "$";
+    pattern
+}
+
+/// Appends the regex translation of `like_expr` to `buf`.
+///
+/// `is_meta_character` decides whether an *unescaped* character needs a
+/// backslash in the output. A character preceded by `escape_ch` is always a
+/// literal, so it is checked against the full regex meta character set.
+#[inline]
+fn write_re_pattern<F>(like_expr: &str, escape_ch: char, is_meta_character: F, buf: &mut String)
+where
+    F: Fn(char) -> bool,
+{
+    buf.reserve(like_expr.len() + 6);
+    let mut escaped = false;
+    let mut wildcard = false;
+
+    for ch in like_expr.chars() {
+        // both flags describe the previous character only: read and reset
+        let is_any = std::mem::replace(&mut wildcard, false);
+        let is_escaped = std::mem::replace(&mut escaped, false);
+        match (ch, is_escaped) {
+            (_, false) if ch == escape_ch => escaped = true,
+            ('%', false) => {
+                // collapse a run of `%` into a single `.*?`
+                if !is_any {
+                    buf.push_str(".*?")
+                }
+                wildcard = true;
+            }
+            ('_', false) => buf.push('.'),
+            _ => {
+                let needs_escape = if is_escaped {
+                    regex_syntax::is_meta_character(ch)
+                } else {
+                    is_meta_character(ch)
+                };
+                if needs_escape {
+                    buf.push('\\'); // regex-escape the next character
+                }
+                buf.push(ch);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use regex::Regex;
+    use std::collections::{BTreeSet, HashSet};
+
+    #[test]
+    fn like() {
+        assert_eq!(like_to_re_pattern("foo", '\\'), r#"^foo$"#);
+        assert_eq!(like_to_re_pattern("%foo", '\\'), r#"^.*?foo$"#);
+        assert_eq!(like_to_re_pattern("foo%", '\\'), r#"^foo.*?$"#);
+        assert_eq!(like_to_re_pattern("foo%bar", '\\'), r#"^foo.*?bar$"#);
+        assert_eq!(like_to_re_pattern("foo%%bar", '\\'), r#"^foo.*?bar$"#);
+        assert_eq!(like_to_re_pattern("foo%%%bar", '\\'), r#"^foo.*?bar$"#);
+        assert_eq!(like_to_re_pattern("foo%%%%bar", '\\'), r#"^foo.*?bar$"#);
+        assert_eq!(
+            like_to_re_pattern("%foo%%%%bar%", '\\'),
+            r#"^.*?foo.*?bar.*?$"#
+        );
+        assert_eq!(
+            like_to_re_pattern("%foo%%%%bar\\%baz%", '\\'),
+            r#"^.*?foo.*?bar%baz.*?$"#
+        );
+        assert_eq!(
+            like_to_re_pattern("%foo%%%%bar*%baz%", '*'),
+            r#"^.*?foo.*?bar%baz.*?$"#
+        );
+        assert_eq!(like_to_re_pattern("_foo", '\\'), r#"^.foo$"#);
+        assert_eq!(like_to_re_pattern("foo_", '\\'), r#"^foo.$"#);
+        assert_eq!(like_to_re_pattern("foo_bar", '\\'), r#"^foo.bar$"#);
+        assert_eq!(like_to_re_pattern("foo__bar", '\\'), r#"^foo..bar$"#);
+        assert_eq!(
+            like_to_re_pattern("foo_.*?_bar", '\\'),
+            r#"^foo.\.\*\?.bar$"#
+        );
+    }
+
+    #[test]
+    fn like_match() {
+        let pat = like_to_re_pattern("foo_.*?_bar", '\\');
+        let re = Regex::new(&pat).unwrap();
+
+        assert!(re.is_match("foos.*?qbar"));
+    }
+
+    #[test]
+    fn similar() {
+        assert_eq!(similar_to_re_pattern("(b|c)%", '\\'), r#"^(b|c).*?$"#);
+        assert_eq!(similar_to_re_pattern("%(b|d)%", '\\'), r#"^.*?(b|d).*?$"#);
+        assert_eq!(similar_to_re_pattern("\\(b\\|c\\)%", '\\'), r#"^\(b\|c\).*?$"#);
+    }
+
+    #[test]
+    fn similar_match() {
+        let pat = similar_to_re_pattern("(b|c)%", '\\');
+        let re = Regex::new(&pat).unwrap();
+        assert!(!re.is_match("abc"));
+
+        let pat = similar_to_re_pattern("%(b|d)%", '\\');
+        let re = Regex::new(&pat).unwrap();
+        assert!(re.is_match("abc"));
+    }
+}