Skip to content

Commit

Permalink
Add substr short-circuit for regexp_like
Browse files Browse the repository at this point in the history
  • Loading branch information
wjones127 committed Dec 6, 2020
1 parent 42a8736 commit 960406e
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 2 deletions.
6 changes: 6 additions & 0 deletions cpp/src/gandiva/regexp_matches_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace gandiva {

// Describes a pattern of the form "^<text>" optionally followed by ".*", where
// <text> is one or more word or whitespace characters (captured in group 1).
RE2 RegexpMatchesHolder::starts_with_regex_(R"(\^([\w\s]+)(\.\*)?)");
// Describes a pattern of the form "<text>$" optionally preceded by ".*", where
// <text> is one or more word or whitespace characters (captured in group 2).
RE2 RegexpMatchesHolder::ends_with_regex_(R"((\.\*)?([\w\s]+)\$)");
// Describes a pattern consisting solely of word or whitespace characters, i.e.
// plain text with no anchors or other regex operators.
// NOTE(review): `*` (unlike the `+` used above) also accepts the empty
// pattern -- confirm that rewriting an empty pattern is intended.
RE2 RegexpMatchesHolder::is_substr_regex_(R"((\w|\s)*)");

// Short-circuit pattern matches for the three common sub cases :
// - starts_with, ends_with and is_substr.
Expand All @@ -45,6 +46,11 @@ const FunctionNode RegexpMatchesHolder::TryOptimize(const FunctionNode& node) {
std::make_shared<LiteralNode>(literal_type, LiteralHolder(substr), false);
return FunctionNode("ends_with", {node.children().at(0), suffix_node},
node.return_type());
} else if (RE2::FullMatch(pattern, is_substr_regex_)) {
auto substr_node =
std::make_shared<LiteralNode>(literal_type, LiteralHolder(pattern), false);
return FunctionNode("is_substr", {node.children().at(0), substr_node},
node.return_type());
}
}

Expand Down
1 change: 1 addition & 0 deletions cpp/src/gandiva/regexp_matches_holder.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class GANDIVA_EXPORT RegexpMatchesHolder : public LikeHolder {

static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with
static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with
static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr
};
} // namespace gandiva

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/gandiva/regexp_matches_holder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@ TEST_F(TestRegexpMatchesHolder, TestOptimise) {
EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)");

// optimise for 'is_substr'
fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("xyz"));
EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) xyz)");

// no optimisation for others.
fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^xyz$"));
EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches");
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/gandiva/sql_like_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ const FunctionNode SQLLikeHolder::TryOptimize(const FunctionNode& node) {
auto suffix = pattern.substr(2); // skip .*
auto suffix_node =
std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), false);
return FunctionNode("ends_with", {node.children().at(0), suffix_node},
node.return_type());
return FunctionNode("ends_with", {node.children().at(0), suffix_node},
node.return_type());
} else if (RE2::FullMatch(pattern, is_substr_regex_)) {
auto substr =
pattern.substr(2, pattern.length() - 4); // trim starting and ending .*
Expand Down

0 comments on commit 960406e

Please sign in to comment.