Skip to content

Commit

Permalink
feat: add pattern for simplifying exprs like str ~ '^foo$' (#6369)
Browse files Browse the repository at this point in the history
* feat: add pattern for simplifying exprs like `str ~ '^foo$'`

* test: add additional tests
  • Loading branch information
Christopher M. Wolff authored May 17, 2023
1 parent 9f808f4 commit 3e3e9b5
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 2 deletions.
34 changes: 34 additions & 0 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2434,6 +2434,27 @@ mod tests {
// single word
assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));

// regular expressions that match an exact literal
assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit("")));
assert_change(
regex_not_match(col("c1"), lit("^$")),
col("c1").not_eq(lit("")),
);
assert_change(
regex_match(col("c1"), lit("^foo$")),
col("c1").eq(lit("foo")),
);
assert_change(
regex_not_match(col("c1"), lit("^foo$")),
col("c1").not_eq(lit("foo")),
);
assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
assert_no_change(regex_match(col("c1"), lit("^")));
assert_no_change(regex_match(col("c1"), lit("$")));
assert_no_change(regex_match(col("c1"), lit("$^")));
assert_no_change(regex_match(col("c1"), lit("$foo^")));

// OR-chain
assert_change(
regex_match(col("c1"), lit("foo|bar|baz")),
Expand All @@ -2453,6 +2474,19 @@ mod tests {
.and(not_like(col("c1"), "%bar%"))
.and(not_like(col("c1"), "%baz%")),
);
// both anchored expressions (translated to equality) and unanchored
assert_change(
regex_match(col("c1"), lit("foo|^x$|baz")),
like(col("c1"), "%foo%")
.or(col("c1").eq(lit("x")))
.or(like(col("c1"), "%baz%")),
);
assert_change(
regex_not_match(col("c1"), lit("foo|^bar$|baz")),
not_like(col("c1"), "%foo%")
.and(col("c1").not_eq(lit("bar")))
.and(not_like(col("c1"), "%baz%")),
);
// Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
}
Expand Down
59 changes: 57 additions & 2 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
// under the License.

use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Like, Operator};
use regex_syntax::hir::{Hir, HirKind, Literal};
use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator};
use regex_syntax::hir::{Hir, HirKind, Literal, Look};

/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
// NOTE(review): anchored alternatives such as `^x$` appear to be rewritten to
// equality comparisons rather than `LIKE` — confirm this limit applies to them too.
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
Expand Down Expand Up @@ -95,6 +95,15 @@ impl OperatorMode {
Expr::Like(like)
}
}

/// Builds a binary comparison of `left` against `right`.
///
/// The operator is `NotEq` when this mode is negated (`self.not`) and `Eq`
/// otherwise.
fn expr_matches_literal(&self, left: Box<Expr>, right: Box<Expr>) -> Expr {
    let comparison = match self.not {
        true => Operator::NotEq,
        false => Operator::Eq,
    };
    Expr::BinaryExpr(BinaryExpr {
        left,
        op: comparison,
        right,
    })
}
}

fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
Expand Down Expand Up @@ -130,6 +139,46 @@ fn is_safe_for_like(c: char) -> bool {
(c != '%') && (c != '_')
}

/// Reports whether the elements of a `Concat` pattern have one of these shapes:
/// - `[Look::Start, Look::End]`
/// - `[Look::Start, Literal(_), Look::End]`
fn is_anchored_literal(v: &[Hir]) -> bool {
    // Destructure by shape; any other length can never qualify.
    let (start, middle, end) = match v {
        [start, end] => (start, None, end),
        [start, middle, end] => (start, Some(middle), end),
        _ => return false,
    };

    // The sequence must open with a start anchor and close with an end anchor.
    let anchored = matches!(start.kind(), HirKind::Look(Look::Start))
        && matches!(end.kind(), HirKind::Look(Look::End));

    // When present, the single inner element has to be a plain literal.
    anchored && middle.map_or(true, |h| matches!(h.kind(), HirKind::Literal(_)))
}

/// Produces the string literal expression for a `Concat` pattern, on the
/// assumption that [`is_anchored_literal`] returned true for it.
///
/// A two-element slice yields the empty string. A three-element slice yields
/// the text of its middle literal, or `None` if the middle element is not a
/// literal or `str_from_literal` cannot convert it. Any other length yields
/// `None`.
fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
    match v {
        // Two elements: nothing sits between the anchors.
        [_, _] => Some(lit("")),
        [_, middle, _] => match middle.kind() {
            HirKind::Literal(literal) => str_from_literal(literal).map(lit),
            _ => None,
        },
        _ => None,
    }
}

fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
println!("Considering hir kind: mode {mode:?} hir: {hir:?}");
match hir.kind() {
Expand All @@ -140,6 +189,12 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
let s = str_from_literal(l)?;
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
}
HirKind::Concat(inner) if is_anchored_literal(inner) => {
let right = anchored_literal_to_expr(inner)?;
return Some(
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)),
);
}
HirKind::Concat(inner) => {
if let Some(pattern) = collect_concat_to_like_string(inner) {
return Some(mode.expr(Box::new(left.clone()), pattern));
Expand Down

0 comments on commit 3e3e9b5

Please sign in to comment.