Skip to content

Commit

Permalink
apacheGH-40968: [C++][Gandiva] add RE2::Options set_dot_nl(true) for …
Browse files Browse the repository at this point in the history
…Like function (apache#40970)

### Rationale for this change

Gandiva function "LIKE" does not always work correctly when the string contains \n.
String value:
`[function_name: "Space1.protect"\nargs: "passenger_count"\ncolumn_name: "passenger_count" ]`
Pattern '%Space1%' nor '%Space1.%' do not match.

### What changes are included in this PR?

added flag set_dot_nl(true) to LikeHolder

### Are these changes tested?

add unit tests.

### Are there any user-facing changes?
Yes

**This PR includes breaking changes to public APIs.**

* GitHub Issue: apache#40968

Lead-authored-by: Ivan Chesnov <ivan.chesnov@dremio.com>
Co-authored-by: Ivan Chesnov <xxxlaykxxx@gmail.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
xxlaykxx authored and tolleybot committed May 2, 2024
1 parent 7b029b9 commit 81580b2
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 13 deletions.
14 changes: 9 additions & 5 deletions cpp/src/gandiva/regex_functions_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,14 @@ Result<std::shared_ptr<LikeHolder>> LikeHolder::Make(const FunctionNode& node) {
"'like' function requires a string literal as the second parameter"));

RE2::Options regex_op;
regex_op.set_dot_nl(true); // set dotall mode for the regex.
if (node.descriptor()->name() == "ilike") {
regex_op.set_case_sensitive(false); // set case-insensitive for ilike function.

return Make(std::get<std::string>(literal->holder()), regex_op);
}
if (node.children().size() == 2) {
return Make(std::get<std::string>(literal->holder()));
return Make(std::get<std::string>(literal->holder()), regex_op);
} else {
auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get());
ARROW_RETURN_IF(
Expand All @@ -118,15 +119,17 @@ Result<std::shared_ptr<LikeHolder>> LikeHolder::Make(const FunctionNode& node) {
Status::Invalid(
"'like' function requires a string literal as the third parameter"));
return Make(std::get<std::string>(literal->holder()),
std::get<std::string>(escape_char->holder()));
std::get<std::string>(escape_char->holder()), regex_op);
}
}

Result<std::shared_ptr<LikeHolder>> LikeHolder::Make(const std::string& sql_pattern) {
std::string pcre_pattern;
ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));

auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
RE2::Options regex_op;
regex_op.set_dot_nl(true); // set dotall mode for the regex.
auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));
Expand All @@ -135,7 +138,8 @@ Result<std::shared_ptr<LikeHolder>> LikeHolder::Make(const std::string& sql_patt
}

Result<std::shared_ptr<LikeHolder>> LikeHolder::Make(const std::string& sql_pattern,
const std::string& escape_char) {
const std::string& escape_char,
RE2::Options regex_op) {
ARROW_RETURN_IF(escape_char.length() > 1,
Status::Invalid("The length of escape char ", escape_char,
" in 'like' function is greater than 1"));
Expand All @@ -147,7 +151,7 @@ Result<std::shared_ptr<LikeHolder>> LikeHolder::Make(const std::string& sql_patt
ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
}

auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/gandiva/regex_functions_holder.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
static Result<std::shared_ptr<LikeHolder>> Make(const std::string& sql_pattern);

static Result<std::shared_ptr<LikeHolder>> Make(const std::string& sql_pattern,
const std::string& escape_char);
const std::string& escape_char,
RE2::Options regex_op);

static Result<std::shared_ptr<LikeHolder>> Make(const std::string& sql_pattern,
RE2::Options regex_op);
Expand Down
33 changes: 26 additions & 7 deletions cpp/src/gandiva/regex_functions_holder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ namespace gandiva {
class TestLikeHolder : public ::testing::Test {
public:
RE2::Options regex_op;
void SetUp() { regex_op.set_dot_nl(true); }

FunctionNode BuildLike(std::string pattern) {
auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
auto pattern_node =
Expand Down Expand Up @@ -77,6 +79,14 @@ TEST_F(TestLikeHolder, TestPcreSpecial) {
EXPECT_FALSE(like("xxabc"));
}

TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) {
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%Space1.%", regex_op));

auto& like = *like_holder;
EXPECT_TRUE(
like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]"));
}

TEST_F(TestLikeHolder, TestRegexEscape) {
std::string res;
ARROW_EXPECT_OK(RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res));
Expand All @@ -91,14 +101,22 @@ TEST_F(TestLikeHolder, TestDot) {
EXPECT_FALSE(like("abcd"));
}

TEST_F(TestLikeHolder, TestMatchWithNewLine) {
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%abc%", regex_op));

auto& like = *like_holder;
EXPECT_TRUE(like("abc\nd"));
}

TEST_F(TestLikeHolder, TestMatchSubString) {
EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\"));
EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\", regex_op));

auto& like = *like_holder;
EXPECT_TRUE(like("abc"));
EXPECT_FALSE(like("xxabdc"));

EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\"));
EXPECT_OK_AND_ASSIGN(like_holder,
LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op));

auto& like_reserved_char = *like_holder;
EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d"));
Expand Down Expand Up @@ -173,7 +191,7 @@ TEST_F(TestLikeHolder, TestOptimise) {
}

TEST_F(TestLikeHolder, TestMatchOneEscape) {
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\"));
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\", regex_op));

auto& like = *like_holder;

Expand All @@ -187,7 +205,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) {
}

TEST_F(TestLikeHolder, TestMatchManyEscape) {
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\"));
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\", regex_op));

auto& like = *like_holder;

Expand All @@ -201,7 +219,8 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) {
}

TEST_F(TestLikeHolder, TestMatchEscape) {
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\"));
EXPECT_OK_AND_ASSIGN(auto const like_holder,
LikeHolder::Make("ab\\\\", "\\", regex_op));

auto& like = *like_holder;

Expand All @@ -211,7 +230,7 @@ TEST_F(TestLikeHolder, TestMatchEscape) {
}

TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", ""));
EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "", regex_op));

auto& like = *like_holder;

Expand All @@ -223,7 +242,7 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
}

TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\").status());
ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\", regex_op).status());
}

class TestILikeHolder : public ::testing::Test {
Expand Down

0 comments on commit 81580b2

Please sign in to comment.