diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 1c09d28f5e036..365b0337e14e9 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -53,7 +53,10 @@ Status ExprDecomposer::Visit(const FieldNode& node) { // time. const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) { if (node.descriptor()->name() == "like") { - return LikeHolder::TryOptimize(node); + return SQLLikeHolder::TryOptimize(node); + } else if (node.descriptor()->name() == "regexp_matches" || + node.descriptor()->name() == "regexp_like") { + return RegexpMatchesHolder::TryOptimize(node); } else { return node; } diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index ced1538915dd5..f87b89072f01e 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -23,7 +23,6 @@ #include #include "arrow/status.h" - #include "gandiva/function_holder.h" #include "gandiva/like_holder.h" #include "gandiva/node.h" @@ -62,8 +61,10 @@ class FunctionHolderRegistry { private: static map_type& makers() { static map_type maker_map = { - {"like", LAMBDA_MAKER(LikeHolder)}, - {"ilike", LAMBDA_MAKER(LikeHolder)}, + {"like", LAMBDA_MAKER(SQLLikeHolder)}, + {"ilike", LAMBDA_MAKER(SQLLikeHolder)}, + {"regexp_matches", LAMBDA_MAKER(RegexpMatchesHolder)}, + {"regexp_like", LAMBDA_MAKER(RegexpMatchesHolder)}, {"to_date", LAMBDA_MAKER(ToDateHolder)}, {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, {"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 3ea426c85f489..8b2741559356f 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -167,6 +167,10 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_ilike_utf8_utf8", NativeFunction::kNeedsFunctionHolder), + NativeFunction("regexp_matches", {"regexp_like"}, DataTypeVector{utf8(), utf8()}, + boolean(), kResultNullIfNull, "gdv_fn_regexp_matches_utf8_utf8", + NativeFunction::kNeedsFunctionHolder), + NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 2cac036abd577..c6070bb296d5c 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -44,21 +44,25 @@ extern "C" { bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len) { - gandiva::LikeHolder* holder = reinterpret_cast(ptr); + gandiva::RegexpMatchesHolder* holder = + reinterpret_cast(ptr); return (*holder)(std::string(data, data_len)); } bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len, const char* escape_char, int escape_char_len) { - gandiva::LikeHolder* holder = reinterpret_cast(ptr); - return (*holder)(std::string(data, data_len)); + return gdv_fn_like_utf8_utf8(ptr, data, data_len, pattern, pattern_len); } bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len) { - gandiva::LikeHolder* holder = reinterpret_cast(ptr); - return (*holder)(std::string(data, data_len)); + return gdv_fn_like_utf8_utf8(ptr, data, data_len, pattern, pattern_len); +} + +bool gdv_fn_regexp_matches_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len) { + return gdv_fn_like_utf8_utf8(ptr, data, data_len, pattern, pattern_len); } const char* gdv_fn_regexp_replace_utf8_utf8( @@ -911,6 +915,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(gdv_fn_ilike_utf8_utf8)); + // gdv_fn_regexp_matches_utf8_utf8 + args = {types->i64_type(), // int64_t ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int data_len + types->i8_ptr_type(), // const char* pattern + types->i32_type()}; // int pattern_len + + engine->AddGlobalMappingForFunc( + "gdv_fn_regexp_matches_utf8_utf8", types->i1_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_regexp_matches_utf8_utf8)); + // gdv_fn_regexp_replace_utf8_utf8 args = {types->i64_type(), // int64_t ptr types->i64_type(), // int64_t holder_ptr diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 670ac94df1b89..997e104220e4b 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -65,6 +65,9 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len); +bool gdv_fn_regexp_matches_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index af9ac67d66ac4..a2a250d4aa0bd 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -18,41 +18,37 @@ #include "gandiva/like_holder.h" #include + #include "gandiva/node.h" -#include "gandiva/regex_util.h" namespace gandiva { -RE2 LikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)"); -RE2 LikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)"); -RE2 LikeHolder::is_substr_regex_(R"(\.\*(\w|\s)*\.\*)"); +RE2 RegexpMatchesHolder::starts_with_regex_(R"(\^([\w\s]+)(\.\*)?)"); +RE2 RegexpMatchesHolder::ends_with_regex_(R"((\.\*)?([\w\s]+)\$)"); +RE2 RegexpMatchesHolder::is_substr_regex_(R"((\w|\s)*)"); -// Short-circuit pattern matches for the following common sub cases : -// - starts_with, ends_with and is_substr -const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { - std::shared_ptr holder; +// Short-circuit pattern matches for the three common sub cases : +// - starts_with, ends_with, and contains. +const FunctionNode RegexpMatchesHolder::TryOptimize(const FunctionNode& node) { + std::shared_ptr holder; auto status = Make(node, &holder); if (status.ok()) { std::string& pattern = holder->pattern_; auto literal_type = node.children().at(1)->return_type(); - - if (RE2::FullMatch(pattern, starts_with_regex_)) { - auto prefix = pattern.substr(0, pattern.length() - 2); // trim .* + std::string substr; + if (RE2::FullMatch(pattern, starts_with_regex_, &substr)) { auto prefix_node = - std::make_shared(literal_type, LiteralHolder(prefix), false); + std::make_shared(literal_type, LiteralHolder(substr), false); return FunctionNode("starts_with", {node.children().at(0), prefix_node}, node.return_type()); - } else if (RE2::FullMatch(pattern, ends_with_regex_)) { - auto suffix = pattern.substr(2); // skip .* + } else if (RE2::FullMatch(pattern, ends_with_regex_, (void*)NULL, &substr)) { auto suffix_node = - std::make_shared(literal_type, LiteralHolder(suffix), false); + std::make_shared(literal_type, LiteralHolder(substr), false); return FunctionNode("ends_with", {node.children().at(0), suffix_node}, node.return_type()); } else if (RE2::FullMatch(pattern, is_substr_regex_)) { - auto substr = - pattern.substr(2, pattern.length() - 4); // trim starting and ending .* auto substr_node = - std::make_shared(literal_type, LiteralHolder(substr), false); + std::make_shared(literal_type, LiteralHolder(pattern), false); return FunctionNode("is_substr", {node.children().at(0), substr_node}, node.return_type()); } @@ -62,14 +58,114 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { return node; } +const FunctionNode SQLLikeHolder::TryOptimize(const FunctionNode& node) { + if (node.descriptor()->name() == "ilike") { + // Optimizations don't work for case-insensitive matching + return node; + } + + std::string pcre_pattern; + auto pattern_result = GetPattern(node); + if (!pattern_result.ok()) { + return node; + } else { + pcre_pattern = pattern_result.ValueOrDie(); + } + + auto literal_type = node.children().at(1)->return_type(); + auto pcre_node = + std::make_shared(literal_type, LiteralHolder(pcre_pattern), false); + auto new_node = FunctionNode("regexp_matches", {node.children().at(0), pcre_node}, + node.return_type()); + + auto optimized_node = RegexpMatchesHolder::TryOptimize(new_node); + + if (optimized_node.descriptor()->name() != "regexp_matches") { + return optimized_node; + } else { + return node; + } +} + static bool IsArrowStringLiteral(arrow::Type::type type) { return type == arrow::Type::STRING || type == arrow::Type::BINARY; } -Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { - ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3, - Status::Invalid("'like' function requires two or three parameters")); +Result RegexpMatchesHolder::GetPattern(const FunctionNode& node) { + auto literal = dynamic_cast(node.children().at(1).get()); + auto pattern = arrow::util::get(literal->holder()); + return pattern; +} + +Result SQLLikeHolder::GetPattern(const FunctionNode& node) { + std::string sql_pattern; + ARROW_ASSIGN_OR_RAISE(sql_pattern, GetSQLPattern(node)); + + std::string escape_char; + ARROW_ASSIGN_OR_RAISE(escape_char, GetEscapeChar(node)); + + return GetPattern(sql_pattern, escape_char); +} + +Result SQLLikeHolder::GetPattern(const std::string& sql_pattern, + const std::string& escape_char) { + std::string pcre_pattern; + if (escape_char.length() == 1) { + ARROW_RETURN_NOT_OK( + RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern)); + } else { + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + } + return pcre_pattern; +} + +Status RegexpMatchesHolder::Make(const std::string& pcre_pattern, + std::shared_ptr* holder) { + auto lholder = + std::shared_ptr(new RegexpMatchesHolder(pcre_pattern)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} + +Status RegexpMatchesHolder::Make(const std::string& pcre_pattern, + std::shared_ptr* holder, + RE2::Options regex_ops) { + auto lholder = std::shared_ptr( + new RegexpMatchesHolder(pcre_pattern, regex_ops)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} + +Status RegexpMatchesHolder::Make(const FunctionNode& node, + std::shared_ptr* holder) { + // Add regexp_matches validation + ARROW_RETURN_IF(node.children().size() != 2, + Status::Invalid("'regexp_matches' function requires two parameters")); + + auto literal = dynamic_cast(node.children().at(1).get()); + ARROW_RETURN_IF( + literal == nullptr, + Status::Invalid( + "'regexp_matches' function requires a literal as the second parameter")); + + auto literal_type = literal->return_type()->id(); + ARROW_RETURN_IF( + !IsArrowStringLiteral(literal_type), + Status::Invalid( + "'regexp_matches' function requires a string literal as the second parameter")); + + ARROW_ASSIGN_OR_RAISE(std::string pattern, GetPattern(node)); + return Make(pattern, holder); +} + +Result SQLLikeHolder::GetSQLPattern(const FunctionNode& node) { auto literal = dynamic_cast(node.children().at(1).get()); ARROW_RETURN_IF( literal == nullptr, @@ -81,76 +177,78 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h Status::Invalid( "'like' function requires a string literal as the second parameter")); - RE2::Options regex_op; - if (node.descriptor()->name() == "ilike") { - regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. + return arrow::util::get(literal->holder()); +} - return Make(arrow::util::get(literal->holder()), holder, regex_op); - } - if (node.children().size() == 2) { - return Make(arrow::util::get(literal->holder()), holder); - } else { - auto escape_char = dynamic_cast(node.children().at(2).get()); +Result SQLLikeHolder::GetEscapeChar(const FunctionNode& node) { + std::string escape_char = ""; + if (node.children().size() == 3) { + auto escape_node = dynamic_cast(node.children().at(2).get()); ARROW_RETURN_IF( - escape_char == nullptr, + escape_node == nullptr, Status::Invalid("'like' function requires a literal as the third parameter")); - auto escape_char_type = escape_char->return_type()->id(); + auto escape_char_type = escape_node->return_type()->id(); ARROW_RETURN_IF( !IsArrowStringLiteral(escape_char_type), Status::Invalid( "'like' function requires a string literal as the third parameter")); - return Make(arrow::util::get(literal->holder()), - arrow::util::get(escape_char->holder()), holder); + escape_char = arrow::util::get(escape_node->holder()); } + return escape_char; } -Status LikeHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder) { - std::string pcre_pattern; - ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); +Status SQLLikeHolder::Make(const FunctionNode& node, + std::shared_ptr* holder) { + ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3, + Status::Invalid("'like' function requires two or three parameters")); - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - ARROW_RETURN_IF(!lholder->regex_.ok(), - Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + std::string sql_pattern; + ARROW_ASSIGN_OR_RAISE(sql_pattern, GetSQLPattern(node)); - *holder = lholder; - return Status::OK(); -} + std::string escape_char; + ARROW_ASSIGN_OR_RAISE(escape_char, GetEscapeChar(node)); -Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, - std::shared_ptr* holder) { - ARROW_RETURN_IF(escape_char.length() > 1, - Status::Invalid("The length of escape char ", escape_char, - " in 'like' function is greater than 1")); - std::string pcre_pattern; - if (escape_char.length() == 1) { - ARROW_RETURN_NOT_OK( - RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern)); - } else { - ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + RE2::Options regex_op; + if (node.descriptor()->name() == "ilike") { + regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. } - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - ARROW_RETURN_IF(!lholder->regex_.ok(), - Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + return Make(sql_pattern, escape_char, holder, regex_op); +} - *holder = lholder; - return Status::OK(); +Status SQLLikeHolder::Make(const std::string& sql_pattern, + std::shared_ptr* holder) { + RE2::Options regex_op; + return Make(sql_pattern, "", holder, regex_op); } -Status LikeHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder, RE2::Options regex_op) { - std::string pcre_pattern; - ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); +Status SQLLikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, + std::shared_ptr* holder) { + RE2::Options regex_op; + return Make(sql_pattern, escape_char, holder, regex_op); +} - std::shared_ptr lholder; - lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); +Status SQLLikeHolder::Make(const std::string& sql_pattern, + std::shared_ptr* holder, + RE2::Options regex_op) { + return Make(sql_pattern, "", holder, regex_op); +} - ARROW_RETURN_IF(!lholder->regex_.ok(), - Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); +Status SQLLikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, + std::shared_ptr* holder, + RE2::Options regex_op) { + ARROW_RETURN_IF(escape_char.length() > 1, + Status::Invalid("The length of escape char ", escape_char, + " in 'like' function is greater than 1")); + std::string pcre_pattern; + ARROW_ASSIGN_OR_RAISE(pcre_pattern, GetPattern(sql_pattern, escape_char)); - *holder = lholder; + std::shared_ptr base_holder; + ARROW_RETURN_NOT_OK(RegexpMatchesHolder::Make(pcre_pattern, &base_holder, regex_op)); + + *holder = std::static_pointer_cast(base_holder); return Status::OK(); } + } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index 73e58017de19f..97ffbc5abc72d 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -15,46 +15,49 @@ // specific language governing permissions and limitations // under the License. -#pragma once +#ifndef GANDIVA_REGEXP_MATCHES_HOLDER_H +#define GANDIVA_REGEXP_MATCHES_HOLDER_H + +#include #include #include -#include - #include "arrow/status.h" - #include "gandiva/function_holder.h" #include "gandiva/node.h" +#include "gandiva/regex_util.h" #include "gandiva/visibility.h" namespace gandiva { -/// Function Holder for SQL 'like' -class GANDIVA_EXPORT LikeHolder : public FunctionHolder { +class GANDIVA_EXPORT RegexpMatchesHolder : public FunctionHolder { public: - ~LikeHolder() override = default; + static Status Make(const FunctionNode& node, + std::shared_ptr* holder); - static Status Make(const FunctionNode& node, std::shared_ptr* holder); + static Status Make(const std::string& pcre_pattern, + std::shared_ptr* holder); - static Status Make(const std::string& sql_pattern, std::shared_ptr* holder); + static Status Make(const std::string& pcre_pattern, + std::shared_ptr* holder, + RE2::Options regex_ops); - static Status Make(const std::string& sql_pattern, const std::string& escape_char, - std::shared_ptr* holder); - - static Status Make(const std::string& sql_pattern, std::shared_ptr* holder, - RE2::Options regex_op); - - // Try and optimise a function node with a "like" pattern. + /// Try and optimise a function node with a "regexp_matches" pattern. static const FunctionNode TryOptimize(const FunctionNode& node); - /// Return true if the data matches the pattern. - bool operator()(const std::string& data) { return RE2::FullMatch(data, regex_); } + /// Return true if there is a match in the data. + bool operator()(const std::string& data) { return RE2::PartialMatch(data, regex_); } + + protected: + static Status ValidateArguments(const FunctionNode& node); + static Result GetPattern(const FunctionNode& node); private: - explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {} + explicit RegexpMatchesHolder(const std::string& pattern) + : pattern_(pattern), regex_(pattern) {} - LikeHolder(const std::string& pattern, RE2::Options regex_op) + RegexpMatchesHolder(const std::string& pattern, RE2::Options regex_op) : pattern_(pattern), regex_(pattern, regex_op) {} std::string pattern_; // posix pattern string, to help debugging @@ -65,4 +68,33 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr }; +class GANDIVA_EXPORT SQLLikeHolder : public RegexpMatchesHolder { + public: + static Status Make(const FunctionNode& node, std::shared_ptr* holder); + + static Status Make(const std::string& sql_pattern, + std::shared_ptr* holder); + + static Status Make(const std::string& sql_pattern, const std::string& escape_char, + std::shared_ptr* holder); + + static Status Make(const std::string& sql_pattern, + std::shared_ptr* holder, RE2::Options regex_ops); + + static Status Make(const std::string& sql_pattern, const std::string& escape_char, + std::shared_ptr* holder, RE2::Options regex_ops); + + /// Try and optimise a function node with a "like" pattern. + static const FunctionNode TryOptimize(const FunctionNode& node); + + protected: + static Result GetPattern(const FunctionNode& node); + static Result GetPattern(const std::string& sql_pattern, + const std::string& escape_char); + static Result GetSQLPattern(const FunctionNode& node); + static Result GetEscapeChar(const FunctionNode& node); +}; + } // namespace gandiva + +#endif // GANDIVA_REGEXP_MATCHES_HOLDER_H diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index a52533a113836..42a72f8075aaa 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -16,16 +16,186 @@ // under the License. #include "gandiva/like_holder.h" -#include "gandiva/regex_util.h" + +#include #include #include -#include +#include "gandiva/regex_util.h" namespace gandiva { -class TestLikeHolder : public ::testing::Test { +class TestRegexpMatchesHolder : public ::testing::Test { + public: + FunctionNode BuildRegexpMatches(std::string pattern) { + auto field = std::make_shared(arrow::field("in", arrow::utf8())); + auto pattern_node = + std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); + return FunctionNode("regexp_matches", {field, pattern_node}, arrow::boolean()); + } +}; + +TEST_F(TestRegexpMatchesHolder, TestString) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("ab", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("ab")); + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("abcd")); + EXPECT_TRUE(regexp_matches("cab")); + + EXPECT_FALSE(regexp_matches("a")); +} + +TEST_F(TestRegexpMatchesHolder, TestDotStar) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("a.*b", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("ab")); + EXPECT_TRUE(regexp_matches("adeb")); + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("cabc")); + EXPECT_TRUE(regexp_matches("caebf")); + + EXPECT_FALSE(regexp_matches("ba")); + EXPECT_FALSE(regexp_matches("a")); +} + +TEST_F(TestRegexpMatchesHolder, TestDot) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("ab.", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("abd")); + EXPECT_TRUE(regexp_matches("abcd")); + EXPECT_TRUE(regexp_matches("dabc")); + + EXPECT_FALSE(regexp_matches("a")); + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestAnchors) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("^ab.*c$", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("abdc")); + EXPECT_TRUE(regexp_matches("abc")); + + EXPECT_FALSE(regexp_matches("abcd")); + EXPECT_FALSE(regexp_matches("dabc")); +} + +TEST_F(TestRegexpMatchesHolder, TestIgnoreCase) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("(?i)ab", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("daBc")); + EXPECT_TRUE(regexp_matches("CAB")); + + EXPECT_FALSE(regexp_matches("ba")); +} + +TEST_F(TestRegexpMatchesHolder, TestCharacterClass) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("[ab]c", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("acd")); + EXPECT_TRUE(regexp_matches("ebc")); + EXPECT_TRUE(regexp_matches("abc")); + + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestEscapeCharacter) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("\\.\\*", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches(".*")); + + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestNonAsciiMatches) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make(".*çåå†.*", ®exp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("açåå†b")); + + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestOptimise) { + // optimise for 'starts_with' + auto fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^abc")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) abc)"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^abc.*")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) abc)"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^ab cd")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) ab cd)"); + + // optimise for 'ends_with' + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("xyz$")); + EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); + EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches(".*xyz$")); + EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); + EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); + + // optimise for 'is_substr' + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("xyz")); + EXPECT_EQ(fnode.descriptor()->name(), "is_substr"); + EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) xyz)"); + + // no optimisation for others. + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^xyz$")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^xy.*z")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^.*")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("x.yz$")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^[xyz]")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); +} + +class TestSQLLikeHolder : public ::testing::Test { public: RE2::Options regex_op; FunctionNode BuildLike(std::string pattern) { @@ -46,13 +216,13 @@ class TestLikeHolder : public ::testing::Test { } }; -TEST_F(TestLikeHolder, TestMatchAny) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchAny) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make("ab%", &like_holder, regex_op); + auto status = SQLLikeHolder::Make("ab%", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_TRUE(like("ab")); EXPECT_TRUE(like("abc")); EXPECT_TRUE(like("abcd")); @@ -61,13 +231,13 @@ TEST_F(TestLikeHolder, TestMatchAny) { EXPECT_FALSE(like("cab")); } -TEST_F(TestLikeHolder, TestMatchOne) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchOne) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make("ab_", &like_holder, regex_op); + auto status = SQLLikeHolder::Make("ab_", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_TRUE(like("abc")); EXPECT_TRUE(like("abd")); @@ -76,78 +246,101 @@ TEST_F(TestLikeHolder, TestMatchOne) { EXPECT_FALSE(like("dabc")); } -TEST_F(TestLikeHolder, TestPcreSpecial) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestPcreSpecial) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op); + auto status = SQLLikeHolder::Make(".*ab_", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_TRUE(like(".*abc")); // . and * aren't special in sql regex EXPECT_FALSE(like("xxabc")); } -TEST_F(TestLikeHolder, TestRegexEscape) { +TEST_F(TestSQLLikeHolder, TestRegexEscape) { std::string res; auto status = RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res); EXPECT_TRUE(status.ok()) << status.message(); - EXPECT_EQ(res, "%hello_abc.def#"); + EXPECT_EQ(res, "^%hello_abc.def#$"); +} + +TEST_F(TestSQLLikeHolder, Test) { + std::vector> cases = { + {"test12", "^test12$", '\\'}, {"_test_test_", "^.test.test.$", '\\'}, + {"%test%test%", "test.*test", '\\'}, {"\\%test.%", "^%test\\.", '\\'}, + {"f%test.%", "^%test\\.", 'f'}, {"$25.00", "^\\$25\\.00$", '\\'}, + {"\\test", "^\\\\test$", '#'}}; + + for (auto&& test_case : cases) { + std::string pattern_like, pattern_pcre; + char escape_char; + std::tie(pattern_like, pattern_pcre, escape_char) = test_case; + + std::string res; + auto status = RegexUtil::SqlLikePatternToPcre(pattern_like, escape_char, res); + EXPECT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(res, pattern_pcre); + } } -TEST_F(TestLikeHolder, TestDot) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestDot) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make("abc.", &like_holder, regex_op); + auto status = SQLLikeHolder::Make("abc.", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_FALSE(like("abcd")); } -TEST_F(TestLikeHolder, TestOptimise) { +TEST_F(TestSQLLikeHolder, TestOptimise) { // optimise for 'starts_with' - auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%")); + auto fnode = SQLLikeHolder::TryOptimize(BuildLike("xy 123z%")); EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) xy 123z)"); // optimise for 'ends_with' - fnode = LikeHolder::TryOptimize(BuildLike("%xyz")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("%xyz")); EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); // optimise for 'is_substr' - fnode = LikeHolder::TryOptimize(BuildLike("%abc%")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("%abc%")); EXPECT_EQ(fnode.descriptor()->name(), "is_substr"); EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) abc)"); // no optimisation for others. - fnode = LikeHolder::TryOptimize(BuildLike("xyz_")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("xyz_")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("_xyz")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("_xyz")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("_xyz_")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("_xyz_")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("%xyz_")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("%xyz_")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("x_yz%")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("x_yz%")); EXPECT_EQ(fnode.descriptor()->name(), "like"); // no optimisation for escaped pattern. - fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\')); + fnode = SQLLikeHolder::TryOptimize(BuildLike("\\%xyz", '\\')); EXPECT_EQ(fnode.descriptor()->name(), "like"); EXPECT_EQ(fnode.ToString(), "bool like((string) in, (const string) \\%xyz, (const int8) \\)"); + + fnode = SQLLikeHolder::TryOptimize(BuildLike("x_yz%")); + EXPECT_EQ(fnode.descriptor()->name(), "like"); } -TEST_F(TestLikeHolder, TestMatchOneEscape) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchOneEscape) { + std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", "\\", &like_holder); + auto status = SQLLikeHolder::Make("ab\\_", "\\", &like_holder); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -161,10 +354,10 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { EXPECT_FALSE(like("dabc")); } -TEST_F(TestLikeHolder, TestMatchManyEscape) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchManyEscape) { + std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\%", "\\", &like_holder); + auto status = SQLLikeHolder::Make("ab\\%", "\\", &like_holder); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -178,10 +371,10 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { EXPECT_FALSE(like("dabc")); } -TEST_F(TestLikeHolder, TestMatchEscape) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchEscape) { + std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder); + auto status = SQLLikeHolder::Make("ab\\\\", "\\", &like_holder); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -191,10 +384,10 @@ TEST_F(TestLikeHolder, TestMatchEscape) { EXPECT_FALSE(like("abc")); } -TEST_F(TestLikeHolder, TestEmptyEscapeChar) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestEmptyEscapeChar) { + std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", "", &like_holder); + auto status = SQLLikeHolder::Make("ab\\_", "", &like_holder); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -206,10 +399,10 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { EXPECT_FALSE(like("ab__")); } -TEST_F(TestLikeHolder, TestMultipleEscapeChar) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMultipleEscapeChar) { + std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder); + auto status = SQLLikeHolder::Make("ab\\_", "\\\\", &like_holder); EXPECT_EQ(status.ok(), false) << status.message(); } class TestILikeHolder : public ::testing::Test { @@ -224,10 +417,10 @@ class TestILikeHolder : public ::testing::Test { }; TEST_F(TestILikeHolder, TestMatchAny) { - std::shared_ptr like_holder; + std::shared_ptr like_holder; regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make("ab%", &like_holder, regex_op); + auto status = SQLLikeHolder::Make("ab%", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -240,10 +433,10 @@ TEST_F(TestILikeHolder, TestMatchAny) { } TEST_F(TestILikeHolder, TestMatchOne) { - std::shared_ptr like_holder; + std::shared_ptr like_holder; regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make("Ab_", &like_holder, regex_op); + auto status = SQLLikeHolder::Make("Ab_", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -256,10 +449,10 @@ TEST_F(TestILikeHolder, TestMatchOne) { } TEST_F(TestILikeHolder, TestPcreSpecial) { - std::shared_ptr like_holder; + std::shared_ptr like_holder; regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op); + auto status = SQLLikeHolder::Make(".*aB_", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -268,14 +461,21 @@ TEST_F(TestILikeHolder, TestPcreSpecial) { } TEST_F(TestILikeHolder, TestDot) { - std::shared_ptr like_holder; + std::shared_ptr like_holder; regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make("aBc.", &like_holder, regex_op); + auto status = SQLLikeHolder::Make("aBc.", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; EXPECT_FALSE(like("abcd")); } +TEST_F(TestILikeHolder, TestOptimise) { + // no optimise for ilike + auto fnode = SQLLikeHolder::TryOptimize(BuildILike("%abc%")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + EXPECT_EQ(fnode.ToString(), "bool ilike((string) in, (const string) %abc%)"); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/regex_util.cc b/cpp/src/gandiva/regex_util.cc index abdd579d1f5e4..dbe0d63645415 100644 --- a/cpp/src/gandiva/regex_util.cc +++ b/cpp/src/gandiva/regex_util.cc @@ -30,8 +30,15 @@ Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char esca for (size_t idx = 0; idx < sql_pattern.size(); ++idx) { auto cur = sql_pattern.at(idx); + if (idx == 0 && cur != '%') { + pcre_pattern += '^'; + } else if (idx == 0 && cur == '%') { + continue; + } + // Escape any char that is special for pcre regex - if (pcre_regex_specials_.find(cur) != pcre_regex_specials_.end()) { + if (pcre_regex_specials_.find(cur) != pcre_regex_specials_.end() && + cur != escape_char) { pcre_pattern += "\\"; } @@ -43,7 +50,11 @@ Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char esca Status::Invalid("Unexpected escape char at the end of pattern ", sql_pattern)); cur = sql_pattern.at(idx); - if (cur == '_' || cur == '%' || cur == escape_char) { + + if (cur == '\\' && escape_char == '\\') { + // Backslash still needs to be escaped in pcre + pcre_pattern += "\\\\"; + } else if (cur == '_' || cur == '%' || cur == escape_char) { pcre_pattern += cur; } else { return Status::Invalid("Invalid escape sequence in pattern ", sql_pattern, @@ -51,11 +62,17 @@ Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char esca } } else if (cur == '_') { pcre_pattern += '.'; + } else if (cur == '%' && idx == sql_pattern.size() - 1) { + continue; } else if (cur == '%') { pcre_pattern += ".*"; } else { pcre_pattern += cur; } + + if (idx == sql_pattern.size() - 1 && cur != '%') { + pcre_pattern += '$'; + } } return Status::OK(); } diff --git a/cpp/src/gandiva/regex_util.h b/cpp/src/gandiva/regex_util.h index cf0002b8cdf20..894d20ff92a80 100644 --- a/cpp/src/gandiva/regex_util.h +++ b/cpp/src/gandiva/regex_util.h @@ -29,7 +29,7 @@ namespace gandiva { /// \brief Utility class for converting sql patterns to pcre patterns. class GANDIVA_EXPORT RegexUtil { public: - // Convert an sql pattern to a pcre pattern + // Convert an sql pattern to a pcre pattern for use with PartialMatch static Status SqlLikePatternToPcre(const std::string& like_pattern, char escape_char, std::string& pcre_pattern); diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index e19d6712d5785..a18f5c750c3d3 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -16,9 +16,9 @@ // under the License. #include + #include "arrow/memory_pool.h" #include "arrow/status.h" - #include "gandiva/projector.h" #include "gandiva/tests/test_util.h" #include "gandiva/tree_expr_builder.h" @@ -246,8 +246,9 @@ TEST_F(TestUtf8, TestLikeWithEscape) { // Create a row-batch with some sample data int num_records = 4; - auto array_a = MakeArrowArrayUtf8( - {"park", "spa%rkle", "bright spa%rk and fire", "spark"}, {true, true, true, true}); + auto array_a = + MakeArrowArrayUtf8({"park", "spa%rkle", "bright spa%rk and fire", "spa\\rk"}, + {true, true, true, true}); // expected output auto exp = MakeArrowArrayBool({false, true, true, false}, {true, true, true, true});