From 02d401bb8f612902fe3f6dcf2d6dc2cc8e1b4f87 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 8 Nov 2024 14:45:01 +0100 Subject: [PATCH] Backport: Fix LIKE with escapes Fix LIKE processing for patterns containing escapes - the starts_with / ends_with optimization did not correctly check for escapes when checking whether the rest of the pattern is literal - the pattern-to-regexp compiler incorrectly processed \ followed by a character other than % or _. In PostgreSQL, the pattern '\x' matches a single 'x'. There are two tests - like_escape_many was generated using PostgreSQL with the code attached below for verification - like_escape consists of hand-picked test cases that are more interesting. The lower cardinality of the hand-picked test cases allows for exercising all scalar/array vs scalar/array combinations. The script below isn't the simplest possible, because it also attempted to generate more test cases by adding padding. Hence e.g. is_like_without_dangling_escape. Since the script is attached for reference, it is attached as-is. 
```python import psycopg2 data = r""" \ \\ \\\ \\\\ a \a \\a % \% \\% %% \%% \\%% _ \_ \\_ __ \__ \\__ abc a_c a\bc a\_c %abc \%abc a\\_c% """.split('\n') data = list(dict.fromkeys(data)) conn = psycopg2.connect(host='localhost', port=5432, user='postgres', password='mysecretpassword') conn.set_session(autocommit=True) cursor = conn.cursor() for r in data: try: # PostgreSQL verifies dandling escape only sometimes cursor.execute(f"SELECT %s LIKE %s", (r, r)) is_like, = cursor.fetchone() has_dandling_escape = False pg_pattern = r except Exception as e: if 'LIKE pattern must not end with escape character' not in str(e): raise e has_dandling_escape = True pg_pattern = r + '\\' for l in data: # print() # print(' '.join(str(v) for v in (l, r, has_dandling_escape, postgres_pattern))) cursor.execute(f"SELECT %s LIKE %s", (l, pg_pattern)) is_like, = cursor.fetchone() assert type(is_like) is bool if not is_like and has_dandling_escape: pattern_without_escaped_dandling_escape = pg_pattern[:-2] cursor.execute(f"SELECT %s LIKE %s", (l, pattern_without_escaped_dandling_escape)) is_like_without_dangling_escape, = cursor.fetchone() assert type(is_like_without_dangling_escape) is bool else: is_like_without_dangling_escape = False assert '"' not in l assert '"' not in r print('(r"%s", r"%s", %s),' % ( l, r, str(is_like).lower(), # str(has_dandling_escape).lower(), # str(is_like_without_dangling_escape).lower(), )) ``` --- arrow-string/src/like.rs | 1058 +++++++++++++++++++++++++++++++++ arrow-string/src/predicate.rs | 125 ++-- 2 files changed, 1123 insertions(+), 60 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 49831092ffcd..1edd1dd3b1c0 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -415,6 +415,7 @@ legacy_kernels!( mod tests { use super::*; use arrow_array::types::Int8Type; + use std::iter::zip; /// Applying `op(left, right)`, both sides are arrays /// The macro tests four types of array implementations: @@ -1702,4 
+1703,1061 @@ mod tests { assert_eq!(r.null_count(), 1); assert!(r.is_null(0)); } + + #[test] + fn like_escape() { + // (value, pattern, expected) + let test_cases = vec![ + // Empty pattern + (r"", r"", true), + (r"\", r"", false), + // Sole (dangling) escape (some engines consider this invalid pattern) + (r"", r"\", false), + (r"\", r"\", true), + (r"\\", r"\", false), + (r"a", r"\", false), + (r"\a", r"\", false), + (r"\\a", r"\", false), + // Sole escape + (r"", r"\\", false), + (r"\", r"\\", true), + (r"\\", r"\\", false), + (r"a", r"\\", false), + (r"\a", r"\\", false), + (r"\\a", r"\\", false), + // Sole escape and dangling escape + (r"", r"\\\", false), + (r"\", r"\\\", false), + (r"\\", r"\\\", true), + (r"\\\", r"\\\", false), + (r"\\\\", r"\\\", false), + (r"a", r"\\\", false), + (r"\a", r"\\\", false), + (r"\\a", r"\\\", false), + // Sole two escapes + (r"", r"\\\\", false), + (r"\", r"\\\\", false), + (r"\\", r"\\\\", true), + (r"\\\", r"\\\\", false), + (r"\\\\", r"\\\\", false), + (r"\\\\\", r"\\\\", false), + (r"a", r"\\\\", false), + (r"\a", r"\\\\", false), + (r"\\a", r"\\\\", false), + // Escaped non-wildcard + (r"", r"\a", false), + (r"\", r"\a", false), + (r"\\", r"\a", false), + (r"a", r"\a", true), + (r"\a", r"\a", false), + (r"\\a", r"\a", false), + // Escaped _ wildcard + (r"", r"\_", false), + (r"\", r"\_", false), + (r"\\", r"\_", false), + (r"a", r"\_", false), + (r"_", r"\_", true), + (r"%", r"\_", false), + (r"\a", r"\_", false), + (r"\\a", r"\_", false), + (r"\_", r"\_", false), + (r"\\_", r"\_", false), + // Escaped % wildcard + (r"", r"\%", false), + (r"\", r"\%", false), + (r"\\", r"\%", false), + (r"a", r"\%", false), + (r"_", r"\%", false), + (r"%", r"\%", true), + (r"\a", r"\%", false), + (r"\\a", r"\%", false), + (r"\%", r"\%", false), + (r"\\%", r"\%", false), + // Escape and non-wildcard + (r"", r"\\a", false), + (r"\", r"\\a", false), + (r"\\", r"\\a", false), + (r"a", r"\\a", false), + (r"\a", r"\\a", true), + (r"\\a", 
r"\\a", false), + (r"\\\a", r"\\a", false), + // Escape and _ wildcard + (r"", r"\\_", false), + (r"\", r"\\_", false), + (r"\\", r"\\_", true), + (r"a", r"\\_", false), + (r"_", r"\\_", false), + (r"%", r"\\_", false), + (r"\a", r"\\_", true), + (r"\\a", r"\\_", false), + (r"\_", r"\\_", true), + (r"\\_", r"\\_", false), + (r"\\\_", r"\\_", false), + // Escape and % wildcard + (r"", r"\\%", false), + (r"\", r"\\%", true), + (r"\\", r"\\%", true), + (r"a", r"\\%", false), + (r"ab", r"\\%", false), + (r"a%", r"\\%", false), + (r"_", r"\\%", false), + (r"%", r"\\%", false), + (r"\a", r"\\%", true), + (r"\\a", r"\\%", true), + (r"\%", r"\\%", true), + (r"\\%", r"\\%", true), + (r"\\\%", r"\\%", true), + // %... pattern with dangling wildcard + (r"\", r"%\", true), + (r"\\", r"%\", true), + (r"%\", r"%\", true), + (r"%\\", r"%\", true), + (r"abc\", r"%\", true), + (r"abc", r"%\", false), + // %... pattern with wildcard + (r"\", r"%\\", true), + (r"\\", r"%\\", true), + (r"%\\", r"%\\", true), + (r"%\\\", r"%\\", true), + (r"abc\", r"%\\", true), + (r"abc", r"%\\", false), + // %... pattern including escaped non-wildcard + (r"ac", r"%a\c", true), + (r"xyzac", r"%a\c", true), + (r"abc", r"%a\c", false), + (r"a\c", r"%a\c", false), + (r"%a\c", r"%a\c", false), + // %... 
pattern including escape + (r"\", r"%a\\c", false), + (r"\\", r"%a\\c", false), + (r"ac", r"%a\\c", false), + (r"a\c", r"%a\\c", true), + (r"a\\c", r"%a\\c", false), + (r"abc", r"%a\\c", false), + (r"xyza\c", r"%a\\c", true), + (r"xyza\\c", r"%a\\c", false), + (r"%a\\c", r"%a\\c", false), + // ...% pattern with wildcard + (r"\", r"\\%", true), + (r"\\", r"\\%", true), + (r"\\%", r"\\%", true), + (r"\\\%", r"\\%", true), + (r"\abc", r"\\%", true), + (r"a", r"\\%", false), + (r"abc", r"\\%", false), + // ...% pattern including escaped non-wildcard + (r"ac", r"a\c%", true), + (r"acxyz", r"a\c%", true), + (r"abc", r"a\c%", false), + (r"a\c", r"a\c%", false), + (r"a\c%", r"a\c%", false), + (r"a\\c%", r"a\c%", false), + // ...% pattern including escape + (r"ac", r"a\\c%", false), + (r"a\c", r"a\\c%", true), + (r"a\cxyz", r"a\\c%", true), + (r"a\\c", r"a\\c%", false), + (r"a\\cxyz", r"a\\c%", false), + (r"abc", r"a\\c%", false), + (r"abcxyz", r"a\\c%", false), + (r"a\\c%", r"a\\c%", false), + // %...% pattern including escaped non-wildcard + (r"ac", r"%a\c%", true), + (r"xyzacxyz", r"%a\c%", true), + (r"abc", r"%a\c%", false), + (r"a\c", r"%a\c%", false), + (r"xyza\cxyz", r"%a\c%", false), + (r"%a\c%", r"%a\c%", false), + (r"%a\\c%", r"%a\c%", false), + // %...% pattern including escape + (r"ac", r"%a\\c%", false), + (r"a\c", r"%a\\c%", true), + (r"xyza\cxyz", r"%a\\c%", true), + (r"a\\c", r"%a\\c%", false), + (r"xyza\\cxyz", r"%a\\c%", false), + (r"abc", r"%a\\c%", false), + (r"xyzabcxyz", r"%a\\c%", false), + (r"%a\\c%", r"%a\\c%", false), + // Odd (7) backslashes and % wildcard + (r"\\%", r"\\\\\\\%", false), + (r"\\\", r"\\\\\\\%", false), + (r"\\\%", r"\\\\\\\%", true), + (r"\\\\", r"\\\\\\\%", false), + (r"\\\\%", r"\\\\\\\%", false), + (r"\\\\\\\%", r"\\\\\\\%", false), + // Odd (7) backslashes and _ wildcard + (r"\\\", r"\\\\\\\_", false), + (r"\\\\", r"\\\\\\\_", false), + (r"\\\_", r"\\\\\\\_", true), + (r"\\\\", r"\\\\\\\_", false), + (r"\\\a", r"\\\\\\\_", 
false), + (r"\\\\_", r"\\\\\\\_", false), + (r"\\\\\\\_", r"\\\\\\\_", false), + // Even (8) backslashes and % wildcard + (r"\\\", r"\\\\\\\\%", false), + (r"\\\\", r"\\\\\\\\%", true), + (r"\\\\\", r"\\\\\\\\%", true), + (r"\\\\xyz", r"\\\\\\\\%", true), + (r"\\\\\\\\%", r"\\\\\\\\%", true), + // Even (8) backslashes and _ wildcard + (r"\\\", r"\\\\\\\\_", false), + (r"\\\\", r"\\\\\\\\_", false), + (r"\\\\\", r"\\\\\\\\_", true), + (r"\\\\a", r"\\\\\\\\_", true), + (r"\\\\\a", r"\\\\\\\\_", false), + (r"\\\\ab", r"\\\\\\\\_", false), + (r"\\\\\\\\_", r"\\\\\\\\_", false), + ]; + + for (value, pattern, expected) in test_cases { + let unexpected = BooleanArray::from(vec![!expected]); + let expected = BooleanArray::from(vec![expected]); + + for string_type in [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View] { + for ((value_datum, value_type), (pattern_datum, pattern_type)) in zip( + make_datums(value, &string_type), + make_datums(pattern, &string_type), + ) { + let value_datum = value_datum.as_ref(); + let pattern_datum = pattern_datum.as_ref(); + assert_eq!( + like(value_datum, pattern_datum).unwrap(), + expected, + "{value_type:?} «{value}» like {pattern_type:?} «{pattern}»" + ); + assert_eq!( + ilike(value_datum, pattern_datum).unwrap(), + expected, + "{value_type:?} «{value}» ilike {pattern_type:?} «{pattern}»" + ); + assert_eq!( + nlike(value_datum, pattern_datum).unwrap(), + unexpected, + "{value_type:?} «{value}» nlike {pattern_type:?} «{pattern}»" + ); + assert_eq!( + nilike(value_datum, pattern_datum).unwrap(), + unexpected, + "{value_type:?} «{value}» nilike {pattern_type:?} «{pattern}»" + ); + } + } + } + } + + #[test] + fn like_escape_many() { + // (value, pattern, expected) + let test_cases = vec![ + (r"", r"", true), + (r"\", r"", false), + (r"\\", r"", false), + (r"\\\", r"", false), + (r"\\\\", r"", false), + (r"a", r"", false), + (r"\a", r"", false), + (r"\\a", r"", false), + (r"%", r"", false), + (r"\%", r"", false), + (r"\\%", r"", 
false), + (r"%%", r"", false), + (r"\%%", r"", false), + (r"\\%%", r"", false), + (r"_", r"", false), + (r"\_", r"", false), + (r"\\_", r"", false), + (r"__", r"", false), + (r"\__", r"", false), + (r"\\__", r"", false), + (r"abc", r"", false), + (r"a_c", r"", false), + (r"a\bc", r"", false), + (r"a\_c", r"", false), + (r"%abc", r"", false), + (r"\%abc", r"", false), + (r"a\\_c%", r"", false), + (r"", r"\", false), + (r"\", r"\", true), + (r"\\", r"\", false), + (r"\\\", r"\", false), + (r"\\\\", r"\", false), + (r"a", r"\", false), + (r"\a", r"\", false), + (r"\\a", r"\", false), + (r"%", r"\", false), + (r"\%", r"\", false), + (r"\\%", r"\", false), + (r"%%", r"\", false), + (r"\%%", r"\", false), + (r"\\%%", r"\", false), + (r"_", r"\", false), + (r"\_", r"\", false), + (r"\\_", r"\", false), + (r"__", r"\", false), + (r"\__", r"\", false), + (r"\\__", r"\", false), + (r"abc", r"\", false), + (r"a_c", r"\", false), + (r"a\bc", r"\", false), + (r"a\_c", r"\", false), + (r"%abc", r"\", false), + (r"\%abc", r"\", false), + (r"a\\_c%", r"\", false), + (r"", r"\\", false), + (r"\", r"\\", true), + (r"\\", r"\\", false), + (r"\\\", r"\\", false), + (r"\\\\", r"\\", false), + (r"a", r"\\", false), + (r"\a", r"\\", false), + (r"\\a", r"\\", false), + (r"%", r"\\", false), + (r"\%", r"\\", false), + (r"\\%", r"\\", false), + (r"%%", r"\\", false), + (r"\%%", r"\\", false), + (r"\\%%", r"\\", false), + (r"_", r"\\", false), + (r"\_", r"\\", false), + (r"\\_", r"\\", false), + (r"__", r"\\", false), + (r"\__", r"\\", false), + (r"\\__", r"\\", false), + (r"abc", r"\\", false), + (r"a_c", r"\\", false), + (r"a\bc", r"\\", false), + (r"a\_c", r"\\", false), + (r"%abc", r"\\", false), + (r"\%abc", r"\\", false), + (r"a\\_c%", r"\\", false), + (r"", r"\\\", false), + (r"\", r"\\\", false), + (r"\\", r"\\\", true), + (r"\\\", r"\\\", false), + (r"\\\\", r"\\\", false), + (r"a", r"\\\", false), + (r"\a", r"\\\", false), + (r"\\a", r"\\\", false), + (r"%", r"\\\", false), + 
(r"\%", r"\\\", false), + (r"\\%", r"\\\", false), + (r"%%", r"\\\", false), + (r"\%%", r"\\\", false), + (r"\\%%", r"\\\", false), + (r"_", r"\\\", false), + (r"\_", r"\\\", false), + (r"\\_", r"\\\", false), + (r"__", r"\\\", false), + (r"\__", r"\\\", false), + (r"\\__", r"\\\", false), + (r"abc", r"\\\", false), + (r"a_c", r"\\\", false), + (r"a\bc", r"\\\", false), + (r"a\_c", r"\\\", false), + (r"%abc", r"\\\", false), + (r"\%abc", r"\\\", false), + (r"a\\_c%", r"\\\", false), + (r"", r"\\\\", false), + (r"\", r"\\\\", false), + (r"\\", r"\\\\", true), + (r"\\\", r"\\\\", false), + (r"\\\\", r"\\\\", false), + (r"a", r"\\\\", false), + (r"\a", r"\\\\", false), + (r"\\a", r"\\\\", false), + (r"%", r"\\\\", false), + (r"\%", r"\\\\", false), + (r"\\%", r"\\\\", false), + (r"%%", r"\\\\", false), + (r"\%%", r"\\\\", false), + (r"\\%%", r"\\\\", false), + (r"_", r"\\\\", false), + (r"\_", r"\\\\", false), + (r"\\_", r"\\\\", false), + (r"__", r"\\\\", false), + (r"\__", r"\\\\", false), + (r"\\__", r"\\\\", false), + (r"abc", r"\\\\", false), + (r"a_c", r"\\\\", false), + (r"a\bc", r"\\\\", false), + (r"a\_c", r"\\\\", false), + (r"%abc", r"\\\\", false), + (r"\%abc", r"\\\\", false), + (r"a\\_c%", r"\\\\", false), + (r"", r"a", false), + (r"\", r"a", false), + (r"\\", r"a", false), + (r"\\\", r"a", false), + (r"\\\\", r"a", false), + (r"a", r"a", true), + (r"\a", r"a", false), + (r"\\a", r"a", false), + (r"%", r"a", false), + (r"\%", r"a", false), + (r"\\%", r"a", false), + (r"%%", r"a", false), + (r"\%%", r"a", false), + (r"\\%%", r"a", false), + (r"_", r"a", false), + (r"\_", r"a", false), + (r"\\_", r"a", false), + (r"__", r"a", false), + (r"\__", r"a", false), + (r"\\__", r"a", false), + (r"abc", r"a", false), + (r"a_c", r"a", false), + (r"a\bc", r"a", false), + (r"a\_c", r"a", false), + (r"%abc", r"a", false), + (r"\%abc", r"a", false), + (r"a\\_c%", r"a", false), + (r"", r"\a", false), + (r"\", r"\a", false), + (r"\\", r"\a", false), + (r"\\\", r"\a", 
false), + (r"\\\\", r"\a", false), + (r"a", r"\a", true), + (r"\a", r"\a", false), + (r"\\a", r"\a", false), + (r"%", r"\a", false), + (r"\%", r"\a", false), + (r"\\%", r"\a", false), + (r"%%", r"\a", false), + (r"\%%", r"\a", false), + (r"\\%%", r"\a", false), + (r"_", r"\a", false), + (r"\_", r"\a", false), + (r"\\_", r"\a", false), + (r"__", r"\a", false), + (r"\__", r"\a", false), + (r"\\__", r"\a", false), + (r"abc", r"\a", false), + (r"a_c", r"\a", false), + (r"a\bc", r"\a", false), + (r"a\_c", r"\a", false), + (r"%abc", r"\a", false), + (r"\%abc", r"\a", false), + (r"a\\_c%", r"\a", false), + (r"", r"\\a", false), + (r"\", r"\\a", false), + (r"\\", r"\\a", false), + (r"\\\", r"\\a", false), + (r"\\\\", r"\\a", false), + (r"a", r"\\a", false), + (r"\a", r"\\a", true), + (r"\\a", r"\\a", false), + (r"%", r"\\a", false), + (r"\%", r"\\a", false), + (r"\\%", r"\\a", false), + (r"%%", r"\\a", false), + (r"\%%", r"\\a", false), + (r"\\%%", r"\\a", false), + (r"_", r"\\a", false), + (r"\_", r"\\a", false), + (r"\\_", r"\\a", false), + (r"__", r"\\a", false), + (r"\__", r"\\a", false), + (r"\\__", r"\\a", false), + (r"abc", r"\\a", false), + (r"a_c", r"\\a", false), + (r"a\bc", r"\\a", false), + (r"a\_c", r"\\a", false), + (r"%abc", r"\\a", false), + (r"\%abc", r"\\a", false), + (r"a\\_c%", r"\\a", false), + (r"", r"%", true), + (r"\", r"%", true), + (r"\\", r"%", true), + (r"\\\", r"%", true), + (r"\\\\", r"%", true), + (r"a", r"%", true), + (r"\a", r"%", true), + (r"\\a", r"%", true), + (r"%", r"%", true), + (r"\%", r"%", true), + (r"\\%", r"%", true), + (r"%%", r"%", true), + (r"\%%", r"%", true), + (r"\\%%", r"%", true), + (r"_", r"%", true), + (r"\_", r"%", true), + (r"\\_", r"%", true), + (r"__", r"%", true), + (r"\__", r"%", true), + (r"\\__", r"%", true), + (r"abc", r"%", true), + (r"a_c", r"%", true), + (r"a\bc", r"%", true), + (r"a\_c", r"%", true), + (r"%abc", r"%", true), + (r"\%abc", r"%", true), + (r"a\\_c%", r"%", true), + (r"", r"\%", false), + 
(r"\", r"\%", false), + (r"\\", r"\%", false), + (r"\\\", r"\%", false), + (r"\\\\", r"\%", false), + (r"a", r"\%", false), + (r"\a", r"\%", false), + (r"\\a", r"\%", false), + (r"%", r"\%", true), + (r"\%", r"\%", false), + (r"\\%", r"\%", false), + (r"%%", r"\%", false), + (r"\%%", r"\%", false), + (r"\\%%", r"\%", false), + (r"_", r"\%", false), + (r"\_", r"\%", false), + (r"\\_", r"\%", false), + (r"__", r"\%", false), + (r"\__", r"\%", false), + (r"\\__", r"\%", false), + (r"abc", r"\%", false), + (r"a_c", r"\%", false), + (r"a\bc", r"\%", false), + (r"a\_c", r"\%", false), + (r"%abc", r"\%", false), + (r"\%abc", r"\%", false), + (r"a\\_c%", r"\%", false), + (r"", r"\\%", false), + (r"\", r"\\%", true), + (r"\\", r"\\%", true), + (r"\\\", r"\\%", true), + (r"\\\\", r"\\%", true), + (r"a", r"\\%", false), + (r"\a", r"\\%", true), + (r"\\a", r"\\%", true), + (r"%", r"\\%", false), + (r"\%", r"\\%", true), + (r"\\%", r"\\%", true), + (r"%%", r"\\%", false), + (r"\%%", r"\\%", true), + (r"\\%%", r"\\%", true), + (r"_", r"\\%", false), + (r"\_", r"\\%", true), + (r"\\_", r"\\%", true), + (r"__", r"\\%", false), + (r"\__", r"\\%", true), + (r"\\__", r"\\%", true), + (r"abc", r"\\%", false), + (r"a_c", r"\\%", false), + (r"a\bc", r"\\%", false), + (r"a\_c", r"\\%", false), + (r"%abc", r"\\%", false), + (r"\%abc", r"\\%", true), + (r"a\\_c%", r"\\%", false), + (r"", r"%%", true), + (r"\", r"%%", true), + (r"\\", r"%%", true), + (r"\\\", r"%%", true), + (r"\\\\", r"%%", true), + (r"a", r"%%", true), + (r"\a", r"%%", true), + (r"\\a", r"%%", true), + (r"%", r"%%", true), + (r"\%", r"%%", true), + (r"\\%", r"%%", true), + (r"%%", r"%%", true), + (r"\%%", r"%%", true), + (r"\\%%", r"%%", true), + (r"_", r"%%", true), + (r"\_", r"%%", true), + (r"\\_", r"%%", true), + (r"__", r"%%", true), + (r"\__", r"%%", true), + (r"\\__", r"%%", true), + (r"abc", r"%%", true), + (r"a_c", r"%%", true), + (r"a\bc", r"%%", true), + (r"a\_c", r"%%", true), + (r"%abc", r"%%", true), + 
(r"\%abc", r"%%", true), + (r"a\\_c%", r"%%", true), + (r"", r"\%%", false), + (r"\", r"\%%", false), + (r"\\", r"\%%", false), + (r"\\\", r"\%%", false), + (r"\\\\", r"\%%", false), + (r"a", r"\%%", false), + (r"\a", r"\%%", false), + (r"\\a", r"\%%", false), + (r"%", r"\%%", true), + (r"\%", r"\%%", false), + (r"\\%", r"\%%", false), + (r"%%", r"\%%", true), + (r"\%%", r"\%%", false), + (r"\\%%", r"\%%", false), + (r"_", r"\%%", false), + (r"\_", r"\%%", false), + (r"\\_", r"\%%", false), + (r"__", r"\%%", false), + (r"\__", r"\%%", false), + (r"\\__", r"\%%", false), + (r"abc", r"\%%", false), + (r"a_c", r"\%%", false), + (r"a\bc", r"\%%", false), + (r"a\_c", r"\%%", false), + (r"%abc", r"\%%", true), + (r"\%abc", r"\%%", false), + (r"a\\_c%", r"\%%", false), + (r"", r"\\%%", false), + (r"\", r"\\%%", true), + (r"\\", r"\\%%", true), + (r"\\\", r"\\%%", true), + (r"\\\\", r"\\%%", true), + (r"a", r"\\%%", false), + (r"\a", r"\\%%", true), + (r"\\a", r"\\%%", true), + (r"%", r"\\%%", false), + (r"\%", r"\\%%", true), + (r"\\%", r"\\%%", true), + (r"%%", r"\\%%", false), + (r"\%%", r"\\%%", true), + (r"\\%%", r"\\%%", true), + (r"_", r"\\%%", false), + (r"\_", r"\\%%", true), + (r"\\_", r"\\%%", true), + (r"__", r"\\%%", false), + (r"\__", r"\\%%", true), + (r"\\__", r"\\%%", true), + (r"abc", r"\\%%", false), + (r"a_c", r"\\%%", false), + (r"a\bc", r"\\%%", false), + (r"a\_c", r"\\%%", false), + (r"%abc", r"\\%%", false), + (r"\%abc", r"\\%%", true), + (r"a\\_c%", r"\\%%", false), + (r"", r"_", false), + (r"\", r"_", true), + (r"\\", r"_", false), + (r"\\\", r"_", false), + (r"\\\\", r"_", false), + (r"a", r"_", true), + (r"\a", r"_", false), + (r"\\a", r"_", false), + (r"%", r"_", true), + (r"\%", r"_", false), + (r"\\%", r"_", false), + (r"%%", r"_", false), + (r"\%%", r"_", false), + (r"\\%%", r"_", false), + (r"_", r"_", true), + (r"\_", r"_", false), + (r"\\_", r"_", false), + (r"__", r"_", false), + (r"\__", r"_", false), + (r"\\__", r"_", false), + 
(r"abc", r"_", false), + (r"a_c", r"_", false), + (r"a\bc", r"_", false), + (r"a\_c", r"_", false), + (r"%abc", r"_", false), + (r"\%abc", r"_", false), + (r"a\\_c%", r"_", false), + (r"", r"\_", false), + (r"\", r"\_", false), + (r"\\", r"\_", false), + (r"\\\", r"\_", false), + (r"\\\\", r"\_", false), + (r"a", r"\_", false), + (r"\a", r"\_", false), + (r"\\a", r"\_", false), + (r"%", r"\_", false), + (r"\%", r"\_", false), + (r"\\%", r"\_", false), + (r"%%", r"\_", false), + (r"\%%", r"\_", false), + (r"\\%%", r"\_", false), + (r"_", r"\_", true), + (r"\_", r"\_", false), + (r"\\_", r"\_", false), + (r"__", r"\_", false), + (r"\__", r"\_", false), + (r"\\__", r"\_", false), + (r"abc", r"\_", false), + (r"a_c", r"\_", false), + (r"a\bc", r"\_", false), + (r"a\_c", r"\_", false), + (r"%abc", r"\_", false), + (r"\%abc", r"\_", false), + (r"a\\_c%", r"\_", false), + (r"", r"\\_", false), + (r"\", r"\\_", false), + (r"\\", r"\\_", true), + (r"\\\", r"\\_", false), + (r"\\\\", r"\\_", false), + (r"a", r"\\_", false), + (r"\a", r"\\_", true), + (r"\\a", r"\\_", false), + (r"%", r"\\_", false), + (r"\%", r"\\_", true), + (r"\\%", r"\\_", false), + (r"%%", r"\\_", false), + (r"\%%", r"\\_", false), + (r"\\%%", r"\\_", false), + (r"_", r"\\_", false), + (r"\_", r"\\_", true), + (r"\\_", r"\\_", false), + (r"__", r"\\_", false), + (r"\__", r"\\_", false), + (r"\\__", r"\\_", false), + (r"abc", r"\\_", false), + (r"a_c", r"\\_", false), + (r"a\bc", r"\\_", false), + (r"a\_c", r"\\_", false), + (r"%abc", r"\\_", false), + (r"\%abc", r"\\_", false), + (r"a\\_c%", r"\\_", false), + (r"", r"__", false), + (r"\", r"__", false), + (r"\\", r"__", true), + (r"\\\", r"__", false), + (r"\\\\", r"__", false), + (r"a", r"__", false), + (r"\a", r"__", true), + (r"\\a", r"__", false), + (r"%", r"__", false), + (r"\%", r"__", true), + (r"\\%", r"__", false), + (r"%%", r"__", true), + (r"\%%", r"__", false), + (r"\\%%", r"__", false), + (r"_", r"__", false), + (r"\_", r"__", true), + 
(r"\\_", r"__", false), + (r"__", r"__", true), + (r"\__", r"__", false), + (r"\\__", r"__", false), + (r"abc", r"__", false), + (r"a_c", r"__", false), + (r"a\bc", r"__", false), + (r"a\_c", r"__", false), + (r"%abc", r"__", false), + (r"\%abc", r"__", false), + (r"a\\_c%", r"__", false), + (r"", r"\__", false), + (r"\", r"\__", false), + (r"\\", r"\__", false), + (r"\\\", r"\__", false), + (r"\\\\", r"\__", false), + (r"a", r"\__", false), + (r"\a", r"\__", false), + (r"\\a", r"\__", false), + (r"%", r"\__", false), + (r"\%", r"\__", false), + (r"\\%", r"\__", false), + (r"%%", r"\__", false), + (r"\%%", r"\__", false), + (r"\\%%", r"\__", false), + (r"_", r"\__", false), + (r"\_", r"\__", false), + (r"\\_", r"\__", false), + (r"__", r"\__", true), + (r"\__", r"\__", false), + (r"\\__", r"\__", false), + (r"abc", r"\__", false), + (r"a_c", r"\__", false), + (r"a\bc", r"\__", false), + (r"a\_c", r"\__", false), + (r"%abc", r"\__", false), + (r"\%abc", r"\__", false), + (r"a\\_c%", r"\__", false), + (r"", r"\\__", false), + (r"\", r"\\__", false), + (r"\\", r"\\__", false), + (r"\\\", r"\\__", true), + (r"\\\\", r"\\__", false), + (r"a", r"\\__", false), + (r"\a", r"\\__", false), + (r"\\a", r"\\__", true), + (r"%", r"\\__", false), + (r"\%", r"\\__", false), + (r"\\%", r"\\__", true), + (r"%%", r"\\__", false), + (r"\%%", r"\\__", true), + (r"\\%%", r"\\__", false), + (r"_", r"\\__", false), + (r"\_", r"\\__", false), + (r"\\_", r"\\__", true), + (r"__", r"\\__", false), + (r"\__", r"\\__", true), + (r"\\__", r"\\__", false), + (r"abc", r"\\__", false), + (r"a_c", r"\\__", false), + (r"a\bc", r"\\__", false), + (r"a\_c", r"\\__", false), + (r"%abc", r"\\__", false), + (r"\%abc", r"\\__", false), + (r"a\\_c%", r"\\__", false), + (r"", r"abc", false), + (r"\", r"abc", false), + (r"\\", r"abc", false), + (r"\\\", r"abc", false), + (r"\\\\", r"abc", false), + (r"a", r"abc", false), + (r"\a", r"abc", false), + (r"\\a", r"abc", false), + (r"%", r"abc", false), + (r"\%", 
r"abc", false), + (r"\\%", r"abc", false), + (r"%%", r"abc", false), + (r"\%%", r"abc", false), + (r"\\%%", r"abc", false), + (r"_", r"abc", false), + (r"\_", r"abc", false), + (r"\\_", r"abc", false), + (r"__", r"abc", false), + (r"\__", r"abc", false), + (r"\\__", r"abc", false), + (r"abc", r"abc", true), + (r"a_c", r"abc", false), + (r"a\bc", r"abc", false), + (r"a\_c", r"abc", false), + (r"%abc", r"abc", false), + (r"\%abc", r"abc", false), + (r"a\\_c%", r"abc", false), + (r"", r"a_c", false), + (r"\", r"a_c", false), + (r"\\", r"a_c", false), + (r"\\\", r"a_c", false), + (r"\\\\", r"a_c", false), + (r"a", r"a_c", false), + (r"\a", r"a_c", false), + (r"\\a", r"a_c", false), + (r"%", r"a_c", false), + (r"\%", r"a_c", false), + (r"\\%", r"a_c", false), + (r"%%", r"a_c", false), + (r"\%%", r"a_c", false), + (r"\\%%", r"a_c", false), + (r"_", r"a_c", false), + (r"\_", r"a_c", false), + (r"\\_", r"a_c", false), + (r"__", r"a_c", false), + (r"\__", r"a_c", false), + (r"\\__", r"a_c", false), + (r"abc", r"a_c", true), + (r"a_c", r"a_c", true), + (r"a\bc", r"a_c", false), + (r"a\_c", r"a_c", false), + (r"%abc", r"a_c", false), + (r"\%abc", r"a_c", false), + (r"a\\_c%", r"a_c", false), + (r"", r"a\bc", false), + (r"\", r"a\bc", false), + (r"\\", r"a\bc", false), + (r"\\\", r"a\bc", false), + (r"\\\\", r"a\bc", false), + (r"a", r"a\bc", false), + (r"\a", r"a\bc", false), + (r"\\a", r"a\bc", false), + (r"%", r"a\bc", false), + (r"\%", r"a\bc", false), + (r"\\%", r"a\bc", false), + (r"%%", r"a\bc", false), + (r"\%%", r"a\bc", false), + (r"\\%%", r"a\bc", false), + (r"_", r"a\bc", false), + (r"\_", r"a\bc", false), + (r"\\_", r"a\bc", false), + (r"__", r"a\bc", false), + (r"\__", r"a\bc", false), + (r"\\__", r"a\bc", false), + (r"abc", r"a\bc", true), + (r"a_c", r"a\bc", false), + (r"a\bc", r"a\bc", false), + (r"a\_c", r"a\bc", false), + (r"%abc", r"a\bc", false), + (r"\%abc", r"a\bc", false), + (r"a\\_c%", r"a\bc", false), + (r"", r"a\_c", false), + (r"\", r"a\_c", false), 
+ (r"\\", r"a\_c", false), + (r"\\\", r"a\_c", false), + (r"\\\\", r"a\_c", false), + (r"a", r"a\_c", false), + (r"\a", r"a\_c", false), + (r"\\a", r"a\_c", false), + (r"%", r"a\_c", false), + (r"\%", r"a\_c", false), + (r"\\%", r"a\_c", false), + (r"%%", r"a\_c", false), + (r"\%%", r"a\_c", false), + (r"\\%%", r"a\_c", false), + (r"_", r"a\_c", false), + (r"\_", r"a\_c", false), + (r"\\_", r"a\_c", false), + (r"__", r"a\_c", false), + (r"\__", r"a\_c", false), + (r"\\__", r"a\_c", false), + (r"abc", r"a\_c", false), + (r"a_c", r"a\_c", true), + (r"a\bc", r"a\_c", false), + (r"a\_c", r"a\_c", false), + (r"%abc", r"a\_c", false), + (r"\%abc", r"a\_c", false), + (r"a\\_c%", r"a\_c", false), + (r"", r"%abc", false), + (r"\", r"%abc", false), + (r"\\", r"%abc", false), + (r"\\\", r"%abc", false), + (r"\\\\", r"%abc", false), + (r"a", r"%abc", false), + (r"\a", r"%abc", false), + (r"\\a", r"%abc", false), + (r"%", r"%abc", false), + (r"\%", r"%abc", false), + (r"\\%", r"%abc", false), + (r"%%", r"%abc", false), + (r"\%%", r"%abc", false), + (r"\\%%", r"%abc", false), + (r"_", r"%abc", false), + (r"\_", r"%abc", false), + (r"\\_", r"%abc", false), + (r"__", r"%abc", false), + (r"\__", r"%abc", false), + (r"\\__", r"%abc", false), + (r"abc", r"%abc", true), + (r"a_c", r"%abc", false), + (r"a\bc", r"%abc", false), + (r"a\_c", r"%abc", false), + (r"%abc", r"%abc", true), + (r"\%abc", r"%abc", true), + (r"a\\_c%", r"%abc", false), + (r"", r"\%abc", false), + (r"\", r"\%abc", false), + (r"\\", r"\%abc", false), + (r"\\\", r"\%abc", false), + (r"\\\\", r"\%abc", false), + (r"a", r"\%abc", false), + (r"\a", r"\%abc", false), + (r"\\a", r"\%abc", false), + (r"%", r"\%abc", false), + (r"\%", r"\%abc", false), + (r"\\%", r"\%abc", false), + (r"%%", r"\%abc", false), + (r"\%%", r"\%abc", false), + (r"\\%%", r"\%abc", false), + (r"_", r"\%abc", false), + (r"\_", r"\%abc", false), + (r"\\_", r"\%abc", false), + (r"__", r"\%abc", false), + (r"\__", r"\%abc", false), + (r"\\__", 
r"\%abc", false), + (r"abc", r"\%abc", false), + (r"a_c", r"\%abc", false), + (r"a\bc", r"\%abc", false), + (r"a\_c", r"\%abc", false), + (r"%abc", r"\%abc", true), + (r"\%abc", r"\%abc", false), + (r"a\\_c%", r"\%abc", false), + (r"", r"a\\_c%", false), + (r"\", r"a\\_c%", false), + (r"\\", r"a\\_c%", false), + (r"\\\", r"a\\_c%", false), + (r"\\\\", r"a\\_c%", false), + (r"a", r"a\\_c%", false), + (r"\a", r"a\\_c%", false), + (r"\\a", r"a\\_c%", false), + (r"%", r"a\\_c%", false), + (r"\%", r"a\\_c%", false), + (r"\\%", r"a\\_c%", false), + (r"%%", r"a\\_c%", false), + (r"\%%", r"a\\_c%", false), + (r"\\%%", r"a\\_c%", false), + (r"_", r"a\\_c%", false), + (r"\_", r"a\\_c%", false), + (r"\\_", r"a\\_c%", false), + (r"__", r"a\\_c%", false), + (r"\__", r"a\\_c%", false), + (r"\\__", r"a\\_c%", false), + (r"abc", r"a\\_c%", false), + (r"a_c", r"a\\_c%", false), + (r"a\bc", r"a\\_c%", true), + (r"a\_c", r"a\\_c%", true), + (r"%abc", r"a\\_c%", false), + (r"\%abc", r"a\\_c%", false), + (r"a\\_c%", r"a\\_c%", false), + ]; + + let values = test_cases + .iter() + .map(|(value, _, _)| *value) + .collect::>(); + let patterns = test_cases + .iter() + .map(|(_, pattern, _)| *pattern) + .collect::>(); + let expected = BooleanArray::from( + test_cases + .iter() + .map(|(_, _, expected)| *expected) + .collect::>(), + ); + let unexpected = BooleanArray::from( + test_cases + .iter() + .map(|(_, _, expected)| !*expected) + .collect::>(), + ); + + for string_type in [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View] { + let values = make_array(values.iter(), &string_type); + let patterns = make_array(patterns.iter(), &string_type); + let (values, patterns) = (values.as_ref(), patterns.as_ref()); + + assert_eq!(like(&values, &patterns).unwrap(), expected,); + assert_eq!(ilike(&values, &patterns).unwrap(), expected,); + assert_eq!(nlike(&values, &patterns).unwrap(), unexpected,); + assert_eq!(nilike(&values, &patterns).unwrap(), unexpected,); + } + } + + fn make_datums( + 
value: impl AsRef, + data_type: &DataType, + ) -> Vec<(Box, DatumType)> { + match data_type { + DataType::Utf8 => { + let array = StringArray::from_iter_values([value]); + vec![ + (Box::new(array.clone()), DatumType::Array), + (Box::new(Scalar::new(array)), DatumType::Scalar), + ] + } + DataType::LargeUtf8 => { + let array = LargeStringArray::from_iter_values([value]); + vec![ + (Box::new(array.clone()), DatumType::Array), + (Box::new(Scalar::new(array)), DatumType::Scalar), + ] + } + DataType::Utf8View => { + let array = StringViewArray::from_iter_values([value]); + vec![ + (Box::new(array.clone()), DatumType::Array), + (Box::new(Scalar::new(array)), DatumType::Scalar), + ] + } + _ => unimplemented!(), + } + } + + fn make_array( + values: impl IntoIterator>, + data_type: &DataType, + ) -> Box { + match data_type { + DataType::Utf8 => Box::new(StringArray::from_iter_values(values)), + DataType::LargeUtf8 => Box::new(LargeStringArray::from_iter_values(values)), + DataType::Utf8View => Box::new(StringViewArray::from_iter_values(values)), + _ => unimplemented!(), + } + } + + #[derive(Debug)] + enum DatumType { + Array, + Scalar, + } } diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 01e3710a6d0a..0a8433dcf0f8 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -17,7 +17,7 @@ use arrow_array::{ArrayAccessor, BooleanArray}; use arrow_schema::ArrowError; -use memchr::memchr2; +use memchr::memchr3; use regex::{Regex, RegexBuilder}; /// A string based predicate @@ -42,16 +42,12 @@ impl<'a> Predicate<'a> { pub fn like(pattern: &'a str) -> Result { if !contains_like_pattern(pattern) { Ok(Self::Eq(pattern)) - } else if pattern.ends_with('%') - && !pattern.ends_with("\\%") - && !contains_like_pattern(&pattern[..pattern.len() - 1]) - { + } else if pattern.ends_with('%') && !contains_like_pattern(&pattern[..pattern.len() - 1]) { Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) } else if pattern.starts_with('%') && 
!contains_like_pattern(&pattern[1..]) { Ok(Self::EndsWith(&pattern[1..])) } else if pattern.starts_with('%') && pattern.ends_with('%') - && !pattern.ends_with("\\%") && !contains_like_pattern(&pattern[1..pattern.len() - 1]) { Ok(Self::Contains(&pattern[1..pattern.len() - 1])) @@ -145,34 +141,50 @@ fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool { /// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` fn regex_like(pattern: &str, case_insensitive: bool) -> Result { let mut result = String::with_capacity(pattern.len() * 2); - result.push('^'); let mut chars_iter = pattern.chars().peekable(); + match chars_iter.peek() { + // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` + Some('%') => { + chars_iter.next(); + } + _ => result.push('^'), + }; while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); + match c { + '\\' => { + match chars_iter.peek() { + Some(&next) => { + if regex_syntax::is_meta_character(next) { + result.push('\\'); + } + result.push(next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + None => { + // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. 
Snowflake treats it as a literal backslash + result.push('\\'); + result.push('\\'); + } } - _ => { - result.push('\\'); + } + '%' => result.push_str(".*"), + '_' => result.push('.'), + c => { + if regex_syntax::is_meta_character(c) { result.push('\\'); } + result.push(c); } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); } } - result.push('$'); + // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex + if result.ends_with(".*") { + result.pop(); + result.pop(); + } else { + result.push('$'); + } RegexBuilder::new(&result) .case_insensitive(case_insensitive) .dot_matches_new_line(true) @@ -184,12 +196,8 @@ fn regex_like(pattern: &str, case_insensitive: bool) -> Result bool { - c == '%' || c == '_' -} - fn contains_like_pattern(pattern: &str) -> bool { - memchr2(b'%', b'_', pattern.as_bytes()).is_some() + memchr3(b'%', b'_', b'\\', pattern.as_bytes()).is_some() } #[cfg(test)] @@ -197,34 +205,31 @@ mod tests { use super::*; #[test] - fn test_replace_like_wildcards() { - let a_eq = "_%"; - let expected = "^..*$"; - let r = regex_like(a_eq, false).unwrap(); - assert_eq!(r.to_string(), expected); - } - - #[test] - fn test_replace_like_wildcards_leave_like_meta_chars() { - let a_eq = "\\%\\_"; - let expected = "^%_$"; - let r = regex_like(a_eq, false).unwrap(); - assert_eq!(r.to_string(), expected); - } - - #[test] - fn test_replace_like_wildcards_with_multiple_escape_chars() { - let a_eq = "\\\\%"; - let expected = "^\\\\%$"; - let r = regex_like(a_eq, false).unwrap(); - assert_eq!(r.to_string(), expected); - } - - #[test] - fn test_replace_like_wildcards_escape_regex_meta_char() { - let a_eq = "."; - let expected = "^\\.$"; - let r = regex_like(a_eq, false).unwrap(); - assert_eq!(r.to_string(), expected); + fn test_regex_like() { + let test_cases = [ + // %..% + 
(r"%foobar%", r"foobar"), + // ..%.. + (r"foo%bar", r"^foo.*bar$"), + // .._.. + (r"foo_bar", r"^foo.bar$"), + // escaped wildcards + (r"\%\_", r"^%_$"), + // escaped non-wildcard + (r"\a", r"^a$"), + // escaped escape and wildcard + (r"\\%", r"^\\"), + // escaped escape and non-wildcard + (r"\\a", r"^\\a$"), + // regex meta character + (r".", r"^\.$"), + (r"$", r"^\$$"), + (r"\\", r"^\\$"), + ]; + + for (like_pattern, expected_regexp) in test_cases { + let r = regex_like(like_pattern, false).unwrap(); + assert_eq!(r.to_string(), expected_regexp); + } } }