Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add experimental translation of 'LIKE' patterns to regex #98

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion partiql-parser/benches/bench_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ fn parse_bench(c: &mut Criterion) {

criterion_group! {
name = parse;
config = Criterion::default().measurement_time(Duration::new(10, 0));
config = Criterion::default();
targets = parse_bench
}

Expand Down
9 changes: 9 additions & 0 deletions partiql-rewriter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,12 @@ version = "0.0.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
regex = "1.5.5"
regex-syntax = "0.6.25"

# Only the benchmark under `benches/` uses these crates, so they are declared
# as dev-dependencies and do not become dependencies of the library itself.
[dev-dependencies]
rand = "0.8.5"
rand_chacha = "0.3.1"
criterion = "0.3"

[[bench]]
name = "bench_pattern_match"
harness = false
48 changes: 48 additions & 0 deletions partiql-rewriter/benches/bench_pattern_match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use partiql_rewriter::experimental::{like_to_re_pattern, similar_to_re_pattern};
use rand::distributions::Alphanumeric;
use rand::{Rng, SeedableRng};
use regex::Regex;
use std::time::Duration;

/// Short `LIKE` pattern shared by the "like-simple-*" benchmarks: two `_`
/// single-character wildcards surrounding the literal text `.*?`.
const LIKE_SIMPLE: &str = r#"foo_.*?_bar"#;

/// Builds a large `LIKE` pattern: `%`, then 8000 alphanumeric characters drawn
/// from a ChaCha20 generator with a fixed seed, then a closing `%`.
fn like_8k() -> String {
    let rng = rand_chacha::ChaCha20Rng::seed_from_u64(987654321);
    let body = rng.sample_iter(&Alphanumeric).take(8000).map(char::from);

    // 8000 sampled characters plus the two surrounding wildcards.
    let mut pattern = String::with_capacity(8002);
    pattern.push('%');
    pattern.extend(body);
    pattern.push('%');
    pattern
}

/// Registers the `LIKE` benchmarks: pattern translation and regex matching,
/// each for the short `LIKE_SIMPLE` pattern and for the pattern from `like_8k`.
///
/// For the "match" benchmarks the regex is compiled once, outside the timed
/// closure, so only `is_match` is measured.
fn like(c: &mut Criterion) {
    c.bench_function("like-simple-translate", |bencher| {
        bencher.iter(|| like_to_re_pattern(black_box(LIKE_SIMPLE), '\\'))
    });

    let simple_re = Regex::new(&like_to_re_pattern(black_box(LIKE_SIMPLE), '\\')).unwrap();
    c.bench_function("like-simple-match", |bencher| {
        bencher.iter(|| simple_re.is_match("foos.*?%bar"))
    });

    let big_like = like_8k();
    c.bench_function("like-8k-translate", |bencher| {
        bencher.iter(|| like_to_re_pattern(black_box(&big_like), '\\'))
    });

    // The haystack is the pattern text itself.
    let big_re = Regex::new(&like_to_re_pattern(black_box(&big_like), '\\')).unwrap();
    c.bench_function("like-8k-match", |bencher| {
        bencher.iter(|| big_re.is_match(&big_like))
    });
}

// Declares the `like_compile` benchmark group: default Criterion
// configuration, with `like` as its only target.
criterion_group! {
name = like_compile;
config = Criterion::default();
targets = like
}

// Generates the benchmark binary's entry point for the group above
// (the Cargo manifest sets `harness = false` for this bench).
criterion_main!(like_compile);
5 changes: 5 additions & 0 deletions partiql-rewriter/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
// Pattern-translation helpers; the module itself is private and is reachable
// from outside the crate only through `experimental`.
mod pattern_match;
/// Experimental API surface: re-exports every public item of the
/// `pattern_match` module.
pub mod experimental {
pub use super::pattern_match::*;
}

#[cfg(test)]
mod tests {
#[test]
Expand Down
136 changes: 136 additions & 0 deletions partiql-rewriter/src/pattern_match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
use regex::{Error, Regex};

// TODO docs

// TODO consider how to use the appropriate mechanisms to prevent exhaustion of
// resources by query-written regexes
// See https://docs.rs/regex/latest/regex/#untrusted-input

// TODO I believe this should be resilient to ReDoS, as this should never build
// an 'Evil Regex' as defined by
// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
// NOTE(review): `_` and `%` are emitted as `.` and `.*?`; whether those match
// a newline depends on the flags the caller compiles the pattern with —
// confirm against the intended LIKE semantics.
/// Translates a SQL `LIKE` pattern into an anchored regular-expression string.
///
/// Delegates to `to_re_pattern`, using `regex_syntax::is_meta_character` to
/// decide which literal characters are backslash-escaped in the output.
/// `escape` is the pattern's escape character: the character that follows it
/// is emitted as a literal rather than being treated as a `%`/`_` wildcard.
///
/// For example, `like_to_re_pattern("foo%bar", '\\')` returns `^foo.*?bar$`.
pub fn like_to_re_pattern(like_expr: &str, escape: char) -> String {
to_re_pattern(like_expr, escape, regex_syntax::is_meta_character)
}

// TODO SIMILAR probably needs to be better thought through for preventing ReDoS
// A query writer would be able to build an 'Evil Regex' as defined by
// https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
// NOTE(review): the escaping predicate is also applied to characters that
// follow the escape character, so an escaped operator such as `\(` is still
// emitted as a bare `(` instead of a literal — verify and fix before relying
// on escapes in SIMILAR patterns.
/// Translates a SQL `SIMILAR TO` pattern into an anchored regular-expression
/// string.
///
/// Works like [`like_to_re_pattern`], except that the characters
/// `| * + ? { } ( ) [ ]` are not escaped and therefore reach the regex engine
/// as operators. `escape` is the pattern's escape character.
///
/// This function is `pub` so that the `experimental` glob re-export exposes it
/// alongside [`like_to_re_pattern`]; a private function is not re-exported and
/// cannot be imported by the benchmarks.
///
/// For example, `similar_to_re_pattern("(b|c)%", '\\')` returns `^(b|c).*?$`.
pub fn similar_to_re_pattern(similar_expr: &str, escape: char) -> String {
    to_re_pattern(similar_expr, escape, is_similar_meta_character)
}

/// Reports whether `c` must be backslash-escaped when it is copied into a
/// regex built from a `SIMILAR TO` pattern.
///
/// The operator characters listed in `PASS_THROUGH` are never escaped; every
/// other character is classified by `regex_syntax::is_meta_character`.
#[inline]
fn is_similar_meta_character(c: char) -> bool {
    // Operators that are handed to the regex engine untouched.
    const PASS_THROUGH: [char; 10] = ['|', '*', '+', '?', '{', '}', '(', ')', '[', ']'];

    if PASS_THROUGH.contains(&c) {
        return false;
    }
    regex_syntax::is_meta_character(c)
}

/// Translates `expr` with `write_re_pattern` and wraps the result in `^` … `$`
/// so the resulting regex has to match the whole input.
///
/// `escape` and `is_meta_character` are forwarded unchanged.
#[inline]
fn to_re_pattern<F>(expr: &str, escape: char, is_meta_character: F) -> String
where
    F: Fn(char) -> bool,
{
    let mut anchored = String::new();
    anchored.push('^');
    write_re_pattern(expr, escape, is_meta_character, &mut anchored);
    anchored.push('$');
    anchored
}

/// Appends the regex translation of `like_expr` to `buf`; no anchors are added.
///
/// Each character is handled as follows:
/// * an unescaped `escape_ch` emits nothing and makes the next character a
///   literal (this check comes first, so it also wins when `escape_ch` is `%`
///   or `_`);
/// * an unescaped `%` emits `.*?`, and a run of consecutive `%` emits it once;
/// * an unescaped `_` emits `.`;
/// * anything else is copied, preceded by a backslash when
///   `is_meta_character` returns `true` for it.
///
/// An escape character at the very end of the pattern has nothing to escape
/// and is silently dropped.
// NOTE(review): `is_meta_character` is consulted for escaped characters too,
// so a predicate that lets an operator through unescaped also lets its
// escaped form through.
#[inline]
fn write_re_pattern<F>(like_expr: &str, escape_ch: char, is_meta_character: F, buf: &mut String)
where
    F: Fn(char) -> bool,
{
    buf.reserve(like_expr.len() + 6);

    // Both flags describe the previous character and are valid for exactly
    // one iteration.
    let mut pending_escape = false;
    let mut after_percent = false;

    for ch in like_expr.chars() {
        let literal = pending_escape;
        let collapse = after_percent;
        pending_escape = false;
        after_percent = false;

        if !literal {
            if ch == escape_ch {
                pending_escape = true;
                continue;
            }
            if ch == '%' {
                if !collapse {
                    buf.push_str(".*?");
                }
                after_percent = true;
                continue;
            }
            if ch == '_' {
                buf.push('.');
                continue;
            }
        }

        if is_meta_character(ch) {
            // regex-escape the character that follows
            buf.push('\\');
        }
        buf.push(ch);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use regex::Regex;

    /// Exact translations of `LIKE` patterns: wildcards, collapsing of
    /// repeated `%`, custom escape characters and regex meta characters.
    #[test]
    fn like() {
        assert_eq!(like_to_re_pattern("foo", '\\'), r#"^foo$"#);
        assert_eq!(like_to_re_pattern("%foo", '\\'), r#"^.*?foo$"#);
        assert_eq!(like_to_re_pattern("foo%", '\\'), r#"^foo.*?$"#);
        assert_eq!(like_to_re_pattern("foo%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(like_to_re_pattern("foo%%%%bar", '\\'), r#"^foo.*?bar$"#);
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar%", '\\'),
            r#"^.*?foo.*?bar.*?$"#
        );
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar\\%baz%", '\\'),
            r#"^.*?foo.*?bar%baz.*?$"#
        );
        assert_eq!(
            like_to_re_pattern("%foo%%%%bar*%baz%", '*'),
            r#"^.*?foo.*?bar%baz.*?$"#
        );
        assert_eq!(like_to_re_pattern("_foo", '\\'), r#"^.foo$"#);
        assert_eq!(like_to_re_pattern("foo_", '\\'), r#"^foo.$"#);
        assert_eq!(like_to_re_pattern("foo_bar", '\\'), r#"^foo.bar$"#);
        assert_eq!(like_to_re_pattern("foo__bar", '\\'), r#"^foo..bar$"#);
        assert_eq!(
            like_to_re_pattern("foo_.*?_bar", '\\'),
            r#"^foo.\.\*\?.bar$"#
        );
    }

    /// A translated `LIKE` pattern compiles and matches as expected.
    #[test]
    fn like_match() {
        let pat = like_to_re_pattern("foo_.*?_bar", '\\');
        let re = Regex::new(&pat).unwrap();

        assert!(re.is_match("foos.*?qbar"));
    }

    /// Exact translations of `SIMILAR TO` patterns: operators pass through.
    #[test]
    fn similar() {
        assert_eq!(similar_to_re_pattern("(b|c)%", '\\'), r#"^(b|c).*?$"#);
        assert_eq!(similar_to_re_pattern("%(b|d)%", '\\'), r#"^.*?(b|d).*?$"#);
    }

    /// Translated `SIMILAR TO` patterns compile and honour alternation.
    #[test]
    fn similar_match() {
        // Must start with `b` or `c`.
        let pat = similar_to_re_pattern("(b|c)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(re.is_match("bcd"));
        assert!(re.is_match("c"));
        assert!(!re.is_match("abc"));

        // Must contain `b` or `d` anywhere.
        let pat = similar_to_re_pattern("%(b|d)%", '\\');
        let re = Regex::new(&pat).unwrap();
        assert!(re.is_match("abc"));
        assert!(!re.is_match("xyz"));
    }
}