Skip to content

Commit

Permalink
Add experimental translation of 'LIKE' patterns to regex
Browse files Browse the repository at this point in the history
  • Loading branch information
jpschorr committed Apr 21, 2022
1 parent 61ae03c commit d1392fa
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 1 deletion.
2 changes: 1 addition & 1 deletion partiql-parser/benches/bench_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ fn parse_bench(c: &mut Criterion) {

criterion_group! {
name = parse;
config = Criterion::default().measurement_time(Duration::new(10, 0));
config = Criterion::default();
targets = parse_bench
}

Expand Down
9 changes: 9 additions & 0 deletions partiql-rewriter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,12 @@ version = "0.0.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
regex = "1.5.5"
regex-syntax = "0.6.25"

# These crates are only used by the benchmarks under `benches/`, so they are
# dev-dependencies and are not pulled in by downstream users of the library.
[dev-dependencies]
rand = "0.8.5"
rand_chacha = "0.3.1"
criterion = "0.3"

[[bench]]
name = "bench_pattern_match"
harness = false
48 changes: 48 additions & 0 deletions partiql-rewriter/benches/bench_pattern_match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use partiql_rewriter::experimental::{like_to_re_pattern, similar_to_re_pattern};
use rand::distributions::Alphanumeric;
use rand::{Rng, SeedableRng};
use regex::Regex;
use std::time::Duration;

// Short LIKE pattern used by the "like-simple" benchmarks: two `_` wildcards
// surrounding the literal text `.*?`, whose characters are regex meta
// characters and therefore have to be escaped during translation.
const LIKE_SIMPLE: &str = r#"foo_.*?_bar"#;

/// Builds a large LIKE pattern for the "like-8k" benchmarks: 8000 alphanumeric
/// characters wrapped in a leading and a trailing `%` wildcard.
///
/// The generator is seeded with a fixed value, so every call yields the same
/// pattern and benchmark runs stay comparable.
fn like_8k() -> String {
    let rng = rand_chacha::ChaCha20Rng::seed_from_u64(987654321);

    // 8000 generated characters plus the two surrounding wildcards.
    let mut pattern = String::with_capacity(8002);
    pattern.push('%');
    pattern.extend(rng.sample_iter(&Alphanumeric).take(8000).map(char::from));
    pattern.push('%');
    pattern
}

/// Benchmarks LIKE-to-regex translation and matching with the resulting regex,
/// once for the short `LIKE_SIMPLE` pattern and once for the 8k pattern.
fn like(c: &mut Criterion) {
    const ESCAPE: char = '\\';

    // Translation cost of the short pattern.
    c.bench_function("like-simple-translate", |bencher| {
        bencher.iter(|| like_to_re_pattern(black_box(LIKE_SIMPLE), ESCAPE))
    });

    // Matching cost of the short pattern; the regex is compiled once, outside
    // of the measured closure.
    let simple_re = Regex::new(&like_to_re_pattern(black_box(LIKE_SIMPLE), ESCAPE)).unwrap();
    c.bench_function("like-simple-match", |bencher| {
        bencher.iter(|| simple_re.is_match("foos.*?%bar"))
    });

    // Translation cost of the 8k pattern.
    let big_like = like_8k();
    c.bench_function("like-8k-translate", |bencher| {
        bencher.iter(|| like_to_re_pattern(black_box(&big_like), ESCAPE))
    });

    // Matching cost of the 8k pattern, matched against the pattern text itself.
    let big_re = Regex::new(&like_to_re_pattern(black_box(&big_like), ESCAPE)).unwrap();
    c.bench_function("like-8k-match", |bencher| {
        bencher.iter(|| big_re.is_match(&big_like))
    });
}

// Registers the `like` benchmark function in a Criterion group named
// `like_compile`, using Criterion's default configuration.
criterion_group! {
name = like_compile;
config = Criterion::default();
targets = like
}

// Generates the benchmark entry point that runs the `like_compile` group
// (the bench target is declared with `harness = false` in Cargo.toml).
criterion_main!(like_compile);
5 changes: 5 additions & 0 deletions partiql-rewriter/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
// Private implementation module; its public items are only reachable through
// the `experimental` re-export below.
mod pattern_match;
/// Experimental APIs: re-exports every public item of the private
/// `pattern_match` module.
pub mod experimental {
pub use super::pattern_match::*;
}

#[cfg(test)]
mod tests {
#[test]
Expand Down
136 changes: 136 additions & 0 deletions partiql-rewriter/src/pattern_match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
use regex::{Error, Regex};

// TODO docs

// TODO consider how to use the appropriate mechanisms to prevent exhaustion of
// resources by query-written regexes
// See https://docs.rs/regex/latest/regex/#untrusted-input

// TODO I believe this should be resilient to ReDoS, as this should never build
// an 'Evil Regex' as defined by
// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
/// Translates a SQL `LIKE` pattern into an anchored regex pattern string.
///
/// `%` becomes `.*?` (runs of consecutive `%` collapse into one), `_` becomes
/// `.`, and every regex meta character found in `like_expr` is backslash-escaped
/// so that it matches literally. A character preceded by `escape` is never
/// treated as a LIKE wildcard. The result is wrapped in `^` ... `$`.
pub fn like_to_re_pattern(like_expr: &str, escape: char) -> String {
    // LIKE has no meta characters besides `%` and `_`, so everything the regex
    // syntax considers special has to be escaped.
    let escape_all_meta = regex_syntax::is_meta_character;
    to_re_pattern(like_expr, escape, escape_all_meta)
}

// TODO SIMILAR probably needs to be better thought through for preventing ReDoS
// A query writer would be able to build an 'Evil Regex' as defined by
// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
/// Translates a SQL `SIMILAR TO` pattern into an anchored regex pattern string.
///
/// `%` becomes `.*?` (runs of consecutive `%` collapse into one) and `_`
/// becomes `.`. The characters `|`, `*`, `+`, `?`, `{`, `}`, `(`, `)`, `[` and
/// `]` are passed through unchanged so the regex engine interprets them as
/// meta characters; every other regex meta character is backslash-escaped.
/// A character preceded by `escape` is never treated as a `%`/`_` wildcard.
/// The result is wrapped in `^` ... `$`.
///
/// Public so that it is re-exported through `experimental` alongside
/// `like_to_re_pattern`; the glob re-export only covers public items, and the
/// benchmarks import this function from there.
pub fn similar_to_re_pattern(similar_expr: &str, escape: char) -> String {
    to_re_pattern(similar_expr, escape, is_similar_meta_character)
}

/// Returns `true` when `c` must be backslash-escaped while translating a
/// `SIMILAR TO` pattern.
///
/// The characters `|`, `*`, `+`, `?`, `{`, `}`, `(`, `)`, `[` and `]` are never
/// escaped, so they reach the regex engine as meta characters; for any other
/// character the decision is deferred to `regex_syntax::is_meta_character`.
#[inline]
fn is_similar_meta_character(c: char) -> bool {
    // Passed through verbatim to be interpreted as regex meta characters.
    if matches!(c, '|' | '*' | '+' | '?' | '{' | '}' | '(' | ')' | '[' | ']') {
        return false;
    }
    regex_syntax::is_meta_character(c)
}

/// Translates `expr` into a regex pattern string anchored with `^` and `$`.
///
/// `escape` is the pattern's escape character and `is_meta_character` decides
/// which characters get backslash-escaped in the output; the translation of
/// the pattern body is delegated to `write_re_pattern`.
#[inline]
fn to_re_pattern<F>(expr: &str, escape: char, is_meta_character: F) -> String
where
    F: Fn(char) -> bool,
{
    let mut anchored = String::new();
    anchored.push('^');
    write_re_pattern(expr, escape, is_meta_character, &mut anchored);
    anchored.push('$');
    anchored
}

/// Appends the regex translation of `like_expr` to `buf` (without anchors).
///
/// Translation rules, applied character by character:
/// - an unescaped `escape_ch` emits nothing and marks the next character as
///   escaped; an `escape_ch` at the very end of the pattern is dropped;
/// - an unescaped `%` emits `.*?`, unless the previous character was also an
///   unescaped `%`, in which case nothing is emitted;
/// - an unescaped `_` emits `.`;
/// - any other character (including escaped ones) is emitted as-is, preceded
///   by a backslash when `is_meta_character` returns `true` for it.
///
/// NOTE(review): an escaped character is regex-escaped only if
/// `is_meta_character` says so; with a predicate that lets some meta characters
/// pass through, an escaped occurrence of them is still emitted unescaped --
/// confirm this is intended.
/// NOTE(review): regex `.` does not match `\n` unless the `s` flag is set --
/// confirm whether `%` and `_` are meant to match newlines.
#[inline]
fn write_re_pattern<F>(like_expr: &str, escape_ch: char, is_meta_character: F, buf: &mut String)
where
    F: Fn(char) -> bool,
{
    buf.reserve(like_expr.len() + 6);

    // State carried over from the previous character only.
    let mut prev_was_escape = false;
    let mut prev_was_percent = false;

    for ch in like_expr.chars() {
        // Consume the carried-over state; both flags apply to exactly one character.
        let after_escape = prev_was_escape;
        let after_percent = prev_was_percent;
        prev_was_escape = false;
        prev_was_percent = false;

        if !after_escape {
            // The escape check comes first so that an escape character equal to
            // `%` or `_` takes precedence over the wildcard meaning.
            if ch == escape_ch {
                prev_was_escape = true;
                continue;
            }
            if ch == '%' {
                if !after_percent {
                    buf.push_str(".*?");
                }
                prev_was_percent = true;
                continue;
            }
            if ch == '_' {
                buf.push('.');
                continue;
            }
        }

        // Literal character: regex-escape it when required.
        if is_meta_character(ch) {
            buf.push('\\');
        }
        buf.push(ch);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use regex::Regex;

    // Translation of LIKE patterns: wildcards, collapsing of repeated `%`,
    // escape handling and escaping of regex meta characters.
    #[test]
    fn like() {
        assert_eq!(like_to_re_pattern("foo", '\\'), r#"^foo$"#);
        assert_eq!(like_to_re_pattern("%foo", '\\'), r#"^.*?foo$"#);
        assert_eq!(like_to_re_pattern("foo%", '\\'), r#"^foo.*?$"#);
        assert_eq!(like_to_re_pattern("foo%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar%", '\\'),
            r#"^.*?foo.*?bar.*?$"#
        );
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar\\%baz%", '\\'),
            r#"^.*?foo.*?bar%baz.*?$"#
        );
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar*%baz%", '*'),
            r#"^.*?foo.*?bar%baz.*?$"#
        );
        assert_eq!(like_to_re_pattern("_foo", '\\'), r#"^.foo$"#);
        assert_eq!(like_to_re_pattern("foo_", '\\'), r#"^foo.$"#);
        assert_eq!(like_to_re_pattern("foo_bar", '\\'), r#"^foo.bar$"#);
        assert_eq!(like_to_re_pattern("foo__bar", '\\'), r#"^foo..bar$"#);
        assert_eq!(
            like_to_re_pattern("foo_.*?_bar", '\\'),
            r#"^foo.\.\*\?.bar$"#
        );
    }

    #[test]
    fn like_match() {
        let pat = like_to_re_pattern("foo_.*?_bar", '\\');
        let re = Regex::new(&pat).unwrap();

        assert!(re.is_match("foos.*?qbar"));
    }

    // LIKE treats the SIMILAR meta characters as literal text.
    #[test]
    fn like_escapes_similar_meta() {
        let pat = like_to_re_pattern("(b|c)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(!re.is_match("abc"));
        assert!(re.is_match("(b|c)x"));

        let pat = like_to_re_pattern("%(b|d)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(!re.is_match("abc"));
    }

    #[test]
    fn similar() {
        assert_eq!(similar_to_re_pattern("(b|c)%", '\\'), r#"^(b|c).*?$"#);
        assert_eq!(similar_to_re_pattern("%(b|d)%", '\\'), r#"^.*?(b|d).*?$"#);
    }

    // Exercises the SIMILAR translation (the alternation is live regex syntax).
    #[test]
    fn similar_match() {
        // Must start with `b` or `c`.
        let pat = similar_to_re_pattern("(b|c)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(!re.is_match("abc"));
        assert!(re.is_match("bcd"));

        // Must contain `b` or `d` somewhere.
        let pat = similar_to_re_pattern("%(b|d)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(re.is_match("abc"));
        assert!(!re.is_match("xyz"));
    }
}

0 comments on commit d1392fa

Please sign in to comment.