Skip to content

Commit

Permalink
use RE2::QuoteMeta to escape special chars
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisiou committed Dec 19, 2023
1 parent bb5e3ea commit 5173e1e
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/core_functions/scalar/string/functions.json
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@
{
"name": "regexp_escape",
"parameters": "string",
"description": "Return the input string with all the regular expression metacharacters escaped",
"description": "Escapes all potentially meaningful regexp characters in the input string",
"example": "regexp_escape('https://duckdb.org')",
"struct": "RegexpEscapeFun",
"type": "scalar_function"
Expand Down
29 changes: 8 additions & 21 deletions src/core_functions/scalar/string/regexp_escape.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,18 @@
#include "duckdb/core_functions/scalar/string_functions.hpp"
#include "duckdb/common/string_util.hpp"
#include "re2/re2.h"

namespace duckdb {

static string_t escape(const string_t &str, vector<char> &escaped_pattern) {
auto input_str = str.GetData();
auto size_str = str.GetSize();

escaped_pattern.clear(); // reuse the buffer
// note: reserving double the size to account for escaping ('\\') as an average case
// to have half of the characters in the input string are special
escaped_pattern.reserve(2 * size_str);
string special_chars = "()[]{}?*+-|^$\\.&~#";
for (idx_t i = 0; i < size_str; ++i) {
char ch = input_str[i];
if (special_chars.find(ch) != std::string::npos) {
escaped_pattern.push_back('\\'); // escape the special character
}
escaped_pattern.push_back(ch);
struct EscapeOperator {
template <class INPUT_TYPE, class RESULT_TYPE>
static RESULT_TYPE Operation(INPUT_TYPE &input, Vector &result) {
auto escaped_pattern = RE2::QuoteMeta(input.GetString());
return StringVector::AddString(result, escaped_pattern);
}
return string_t(escaped_pattern.data(), escaped_pattern.size());
}
};

static void RegexpEscapeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
vector<char> escaped_pattern;
auto input_str = args.GetValue(0, 0);
result.Reference(escape(input_str.ToString(), escaped_pattern));
UnaryExecutor::ExecuteString<string_t, string_t, EscapeOperator>(args.data[0], result, args.size());
}

ScalarFunction RegexpEscapeFun::GetFunction() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ struct ToBaseFun {
struct RegexpEscapeFun {
static constexpr const char *Name = "regexp_escape";
static constexpr const char *Parameters = "string";
static constexpr const char *Description = "Return the input string with all the regular expression metacharacters escaped";
static constexpr const char *Description = "Escapes all potentially meaningful regexp characters in the input string";
static constexpr const char *Example = "regexp_escape('https://duckdb.org')";

static ScalarFunction GetFunction();
Expand Down
34 changes: 20 additions & 14 deletions test/sql/function/string/regex_escape.test
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ PRAGMA enable_verification
query T
SELECT regexp_escape('https://duckdb.org');
----
https://duckdb\.org
https\:\/\/duckdb\.org

# no special chars in the input
# no special chars
query T
SELECT regexp_escape('abc');
----
Expand All @@ -23,27 +23,22 @@ SELECT regexp_escape('a.b');
----
a\.b

query T
SELECT regexp_escape('()[]{}*+?|^$');
----
\(\)\[\]\{\}\*\+\?\|\^\$

query T
SELECT regexp_escape('a.b[c]*');
----
a\.b\[c\]\*

# literal backslash sequence (the input is the two characters '\' and 'n', not an empty string)
query T
SELECT regexp_escape('""');
SELECT regexp_escape('\n');
----
""
\\n

# whitespaces
query T
SELECT regexp_escape('a b c');
----
a b c
a\ b\ c

query T
SELECT regexp_escape('line1\nline2');
Expand All @@ -52,24 +47,35 @@ line1\\nline2

# case sensitive
query T
SELECT regexp_escape('CaseSensitive');
SELECT regexp_escape('CaseSensitivE');
----
CaseSensitive
CaseSensitivE

# non-alphanumeric ASCII character
query T
SELECT regexp_escape('@');
----
@
\@

# backslashes
query T
SELECT regexp_escape('path\to\wonderland');
----
path\\to\\wonderland

# escape all special characters
# more special characters
query T
SELECT regexp_escape('$()*+.?[\]^{|}-');
----
\$\(\)\*\+\.\?\[\\\]\^\{\|\}\-

query I
CREATE TABLE tbl (a VARCHAR);
INSERT INTO tbl VALUES('cde');
INSERT INTO tbl VALUES('-cd+e');
INSERT INTO tbl VALUES('[^!$');
SELECT regexp_escape(a) FROM tbl;
----
cde
\-cd\+e
\[\^\!\$

0 comments on commit 5173e1e

Please sign in to comment.