From 5173e1e7bb32ed8c319835d10c37526c6a4456e2 Mon Sep 17 00:00:00 2001
From: Christina Sioula
Date: Mon, 18 Dec 2023 09:47:42 +0100
Subject: [PATCH] use RE2::QuoteMeta to escape special chars

---
 .../scalar/string/functions.json              |  2 +-
 .../scalar/string/regexp_escape.cpp           | 29 +++++-----------
 .../scalar/string_functions.hpp               |  2 +-
 test/sql/function/string/regex_escape.test    | 34 +++++++++++--------
 4 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/src/core_functions/scalar/string/functions.json b/src/core_functions/scalar/string/functions.json
index 1817726f6a81..aa2f3078f844 100644
--- a/src/core_functions/scalar/string/functions.json
+++ b/src/core_functions/scalar/string/functions.json
@@ -308,7 +308,7 @@
     {
         "name": "regexp_escape",
         "parameters": "string",
-        "description": "Return the input string with all the regular expression metacharacters escaped",
+        "description": "Escapes all potentially meaningful regexp characters in the input string",
         "example": "regexp_escape('https://duckdb.org')",
         "struct": "RegexpEscapeFun",
         "type": "scalar_function"
diff --git a/src/core_functions/scalar/string/regexp_escape.cpp b/src/core_functions/scalar/string/regexp_escape.cpp
index b9692df706dd..32517c9c82b7 100644
--- a/src/core_functions/scalar/string/regexp_escape.cpp
+++ b/src/core_functions/scalar/string/regexp_escape.cpp
@@ -1,31 +1,18 @@
 #include "duckdb/core_functions/scalar/string_functions.hpp"
-#include "duckdb/common/string_util.hpp"
+#include "re2/re2.h"
 
 namespace duckdb {
 
-static string_t escape(const string_t &str, vector<char> &escaped_pattern) {
-	auto input_str = str.GetData();
-	auto size_str = str.GetSize();
-
-	escaped_pattern.clear(); // reuse the buffer
-	// note: reserving double the size to account for escaping ('\\') as an average case
-	// to have half of the characters in the input string are special
-	escaped_pattern.reserve(2 * size_str);
-	string special_chars = "()[]{}?*+-|^$\\.&~#";
-	for (idx_t i = 0; i < size_str; ++i) {
-		char ch = input_str[i];
-		if (special_chars.find(ch) != std::string::npos) {
-			escaped_pattern.push_back('\\'); // escape the special character
-		}
-		escaped_pattern.push_back(ch);
+struct EscapeOperator {
+	template <class INPUT_TYPE, class RESULT_TYPE>
+	static RESULT_TYPE Operation(INPUT_TYPE &input, Vector &result) {
+		auto escaped_pattern = RE2::QuoteMeta(input.GetString());
+		return StringVector::AddString(result, escaped_pattern);
 	}
-	return string_t(escaped_pattern.data(), escaped_pattern.size());
-}
+};
 
 static void RegexpEscapeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
-	vector<char> escaped_pattern;
-	auto input_str = args.GetValue(0, 0);
-	result.Reference(escape(input_str.ToString(), escaped_pattern));
+	UnaryExecutor::ExecuteString<string_t, string_t, EscapeOperator>(args.data[0], result, args.size());
 }
 
 ScalarFunction RegexpEscapeFun::GetFunction() {
diff --git a/src/include/duckdb/core_functions/scalar/string_functions.hpp b/src/include/duckdb/core_functions/scalar/string_functions.hpp
index 51c92bc904cb..216473058d8a 100644
--- a/src/include/duckdb/core_functions/scalar/string_functions.hpp
+++ b/src/include/duckdb/core_functions/scalar/string_functions.hpp
@@ -483,7 +483,7 @@ struct ToBaseFun {
 struct RegexpEscapeFun {
 	static constexpr const char *Name = "regexp_escape";
 	static constexpr const char *Parameters = "string";
-	static constexpr const char *Description = "Return the input string with all the regular expression metacharacters escaped";
+	static constexpr const char *Description = "Escapes all potentially meaningful regexp characters in the input string";
 	static constexpr const char *Example = "regexp_escape('https://duckdb.org')";
 
 	static ScalarFunction GetFunction();
diff --git a/test/sql/function/string/regex_escape.test b/test/sql/function/string/regex_escape.test
index 8e23af4d7d37..81653a6889eb 100644
--- a/test/sql/function/string/regex_escape.test
+++ b/test/sql/function/string/regex_escape.test
@@ -9,9 +9,9 @@ PRAGMA enable_verification
 query T
 SELECT regexp_escape('https://duckdb.org');
 ----
-https://duckdb\.org
+https\:\/\/duckdb\.org
 
-# no special chars in the input
+# no special chars
 query T
 SELECT regexp_escape('abc');
 ----
@@ -23,11 +23,6 @@ SELECT regexp_escape('a.b');
 ----
 a\.b
 
-query T
-SELECT regexp_escape('()[]{}*+?|^$');
-----
-\(\)\[\]\{\}\*\+\?\|\^\$
-
 query T
 SELECT regexp_escape('a.b[c]*');
 ----
@@ -35,15 +30,15 @@ a\.b\[c\]\*
 
 # empty string
 query T
-SELECT regexp_escape('""');
+SELECT regexp_escape('\n');
 ----
-""
+\\n
 
 # whitespaces
 query T
 SELECT regexp_escape('a b c');
 ----
-a b c
+a\ b\ c
 
 query T
 SELECT regexp_escape('line1\nline2');
@@ -52,15 +47,15 @@ line1\\nline2
 
 # case sensitive
 query T
-SELECT regexp_escape('CaseSensitive');
+SELECT regexp_escape('CaseSensitivE');
 ----
-CaseSensitive
+CaseSensitivE
 
 # unicode character
 query T
 SELECT regexp_escape('@');
 ----
-@
+\@
 
 # backslashes
 query T
@@ -68,8 +63,19 @@ SELECT regexp_escape('path\to\wonderland');
 ----
 path\\to\\wonderland
 
-# escape all special characters
+# more special characters
 query T
 SELECT regexp_escape('$()*+.?[\]^{|}-');
 ----
 \$\(\)\*\+\.\?\[\\\]\^\{\|\}\-
+
+query I
+CREATE TABLE tbl (a VARCHAR);
+INSERT INTO tbl VALUES('cde');
+INSERT INTO tbl VALUES('-cd+e');
+INSERT INTO tbl VALUES('[^!$');
+SELECT regexp_escape(a) FROM tbl;
+----
+cde
+\-cd\+e
+\[\^\!\$