From 76e889829868d5c5f5b49843b8b1174cf8f24e6e Mon Sep 17 00:00:00 2001 From: crystrix Date: Sun, 14 Mar 2021 19:49:27 +0800 Subject: [PATCH 1/4] ARROW-11960: [Gandiva][C++]Support escape in LIKE --- cpp/src/gandiva/function_registry_string.cc | 4 ++ cpp/src/gandiva/gdv_function_stubs.cc | 19 ++++++++ cpp/src/gandiva/gdv_function_stubs.h | 3 ++ cpp/src/gandiva/like_holder.cc | 36 +++++++++++++-- cpp/src/gandiva/like_holder.h | 3 ++ cpp/src/gandiva/like_holder_test.cc | 49 +++++++++++++++++++++ cpp/src/gandiva/tests/utf8_test.cc | 43 ++++++++++++++++++ 7 files changed, 153 insertions(+), 4 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index cbc70066306df..a5aaeabc77b0c 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -124,6 +124,10 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), + NativeFunction("like", {}, DataTypeVector{utf8(), utf8(), int8()}, boolean(), + kResultNullIfNull, "gdv_fn_like_utf8_utf8_int8", + NativeFunction::kNeedsFunctionHolder), + NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index a890775edad95..36afb21b077a5 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -45,6 +45,13 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } +bool gdv_fn_like_utf8_utf8_int8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len, + int8_t escape_char) { + gandiva::LikeHolder* holder = reinterpret_cast(ptr); + return (*holder)(std::string(data, data_len)); +} + double gdv_fn_random(int64_t ptr) { gandiva::RandomGeneratorHolder* holder = 
reinterpret_cast(ptr); @@ -732,6 +739,18 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(gdv_fn_like_utf8_utf8)); + // gdv_fn_like_utf8_utf8_int8 + args = {types->i64_type(), // int64_t ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int data_len + types->i8_ptr_type(), // const char* pattern + types->i32_type(), // int pattern_len + types->i8_type()}; // int8_t escape_char + + engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8_int8", + types->i1_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_like_utf8_utf8_int8)); + // gdv_fn_to_date_utf8_utf8 args = {types->i64_type(), // int64_t execution_context types->i64_type(), // int64_t holder_ptr diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 847772b17a4bc..1b0ab8e181666 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -46,6 +46,9 @@ using gdv_day_time_interval = int64_t; bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len); +bool gdv_fn_like_utf8_utf8_int8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len, int8_t escape_char); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 688a4ffa13067..04f660f586ac0 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -67,8 +67,8 @@ static bool IsArrowStringLiteral(arrow::Type::type type) { } Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { - ARROW_RETURN_IF(node.children().size() != 2, - Status::Invalid("'like' function requires two parameters")); + ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3, + 
Status::Invalid("'like' function requires two or three parameters")); auto literal = dynamic_cast(node.children().at(1).get()); ARROW_RETURN_IF( @@ -80,8 +80,22 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h !IsArrowStringLiteral(literal_type), Status::Invalid( "'like' function requires a string literal as the second parameter")); - - return Make(arrow::util::get(literal->holder()), holder); + if (node.children().size() == 2) { + return Make(arrow::util::get(literal->holder()), holder); + } else { + auto escape_char = dynamic_cast(node.children().at(2).get()); + ARROW_RETURN_IF( + escape_char == nullptr, + Status::Invalid("'like' function requires a literal as the third parameter")); + + auto escape_char_type = escape_char->return_type()->id(); + ARROW_RETURN_IF( + escape_char_type != arrow::Type::INT8, + Status::Invalid( + "'like' function requires a int8 literal as the third parameter")); + return Make(arrow::util::get(literal->holder()), + arrow::util::get(escape_char->holder()), holder); + } } Status LikeHolder::Make(const std::string& sql_pattern, @@ -97,4 +111,18 @@ Status LikeHolder::Make(const std::string& sql_pattern, return Status::OK(); } +Status LikeHolder::Make(const std::string& sql_pattern, char escape_char, + std::shared_ptr* holder) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK( + RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char, pcre_pattern)); + + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index 82c9e3b29a602..328ad9e3a1c54 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -39,6 +39,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Status Make(const std::string& sql_pattern, 
std::shared_ptr* holder); + static Status Make(const std::string& sql_pattern, char escape_char, + std::shared_ptr* holder); + // Try and optimise a function node with a "like" pattern. static const FunctionNode TryOptimize(const FunctionNode& node); diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index ce6697e72d681..9408c7a1b3285 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -33,6 +33,16 @@ class TestLikeHolder : public ::testing::Test { std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); return FunctionNode("like", {field, pattern_node}, arrow::boolean()); } + + FunctionNode BuildLike(std::string pattern, char escape_char) { + auto field = std::make_shared(arrow::field("in", arrow::utf8())); + auto pattern_node = + std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); + auto escape_char_node = std::make_shared( + arrow::int8(), LiteralHolder((int8_t)escape_char), false); + return FunctionNode("like", {field, pattern_node, escape_char_node}, + arrow::boolean()); + } }; TEST_F(TestLikeHolder, TestMatchAny) { @@ -125,6 +135,45 @@ TEST_F(TestLikeHolder, TestOptimise) { fnode = LikeHolder::TryOptimize(BuildLike("x_yz%")); EXPECT_EQ(fnode.descriptor()->name(), "like"); + + // no optimisation for escaped pattern. 
+ fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\')); + EXPECT_EQ(fnode.descriptor()->name(), "like"); + EXPECT_EQ(fnode.ToString(), + "bool like((string) in, (const string) \\%xyz, (const int8) \\)"); +} + +TEST_F(TestLikeHolder, TestMatchOneEscape) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("ab\\_", '\\', &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + + EXPECT_TRUE(like("ab_")); + + EXPECT_FALSE(like("abc")); + EXPECT_FALSE(like("abd")); + EXPECT_FALSE(like("a")); + EXPECT_FALSE(like("abcd")); + EXPECT_FALSE(like("dabc")); } +TEST_F(TestLikeHolder, TestMatchManyEscape) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("ab\\%", '\\', &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + + EXPECT_TRUE(like("ab_")); + + EXPECT_FALSE(like("abc")); + EXPECT_FALSE(like("abd")); + EXPECT_FALSE(like("a")); + EXPECT_FALSE(like("abcd")); + EXPECT_FALSE(like("dabc")); +} } // namespace gandiva diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 29ce81f4942e8..08de2762961c1 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -221,6 +221,49 @@ TEST_F(TestUtf8, TestLike) { EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); } +TEST_F(TestUtf8, TestLikeWithEscape) { + // schema for input fields + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + + // output fields + auto res = field("res", boolean()); + + // build expressions. 
+ // like(a, literal(s), '\') + + auto node_a = TreeExprBuilder::MakeField(field_a); + auto literal_s = TreeExprBuilder::MakeStringLiteral("%pa\\%rk%"); + auto escape_char = TreeExprBuilder::MakeLiteral((int8_t)'\\'); + auto is_like = + TreeExprBuilder::MakeFunction("like", {node_a, literal_s, escape_char}, boolean()); + auto expr = TreeExprBuilder::MakeExpression(is_like, res); + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = MakeArrowArrayUtf8( + {"park", "spa%rkle", "bright spa%rk and fire", "spark"}, {true, true, true, true}); + + // expected output + auto exp = MakeArrowArrayBool({false, true, true, false}, {true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + TEST_F(TestUtf8, TestBeginsEnds) { // schema for input fields auto field_a = field("a", utf8()); From 3565fcad764e0f6fb5a1d24ea9262c5cc05a73bc Mon Sep 17 00:00:00 2001 From: crystrix Date: Mon, 15 Mar 2021 00:13:26 +0800 Subject: [PATCH 2/4] Update failure case of like escape --- cpp/src/gandiva/like_holder_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 9408c7a1b3285..e1ea0c23a9d16 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -168,7 +168,7 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { auto& like = *like_holder; - EXPECT_TRUE(like("ab_")); + EXPECT_TRUE(like("ab%"));
EXPECT_FALSE(like("abc")); EXPECT_FALSE(like("abd")); From bbe982ca90fa4b924073c55b997ab132af41d1e3 Mon Sep 17 00:00:00 2001 From: crystrix Date: Fri, 2 Apr 2021 11:39:40 +0800 Subject: [PATCH 3/4] Use utf8 as the type of escape char --- cpp/src/gandiva/function_registry_string.cc | 4 +-- cpp/src/gandiva/gdv_function_stubs.cc | 13 +++---- cpp/src/gandiva/gdv_function_stubs.h | 5 +-- cpp/src/gandiva/like_holder.cc | 19 ++++++---- cpp/src/gandiva/like_holder.h | 2 +- cpp/src/gandiva/like_holder_test.cc | 39 +++++++++++++++++++-- 6 files changed, 63 insertions(+), 19 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index a5aaeabc77b0c..35ef2dfcb3431 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -124,8 +124,8 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), - NativeFunction("like", {}, DataTypeVector{utf8(), utf8(), int8()}, boolean(), - kResultNullIfNull, "gdv_fn_like_utf8_utf8_int8", + NativeFunction("like", {}, DataTypeVector{utf8(), utf8(), utf8()}, boolean(), + kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8", NativeFunction::kNeedsFunctionHolder), NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 36afb21b077a5..26b8654fb7edf 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -45,9 +45,9 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } -bool gdv_fn_like_utf8_utf8_int8(int64_t ptr, const char* data, int data_len, +bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len, - int8_t escape_char) { + const char* escape_char, int escape_char_len) { gandiva::LikeHolder* holder 
= reinterpret_cast(ptr); return (*holder)(std::string(data, data_len)); } @@ -739,17 +739,18 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(gdv_fn_like_utf8_utf8)); - // gdv_fn_like_utf8_utf8_int8 + // gdv_fn_like_utf8_utf8_utf8 args = {types->i64_type(), // int64_t ptr types->i8_ptr_type(), // const char* data types->i32_type(), // int data_len types->i8_ptr_type(), // const char* pattern types->i32_type(), // int pattern_len - types->i8_type()}; // int8_t escape_char + types->i8_ptr_type(), // const char* escape_char + types->i32_type()}; // int escape_char_len - engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8_int8", + engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8_utf8", types->i1_type() /*return_type*/, args, - reinterpret_cast(gdv_fn_like_utf8_utf8_int8)); + reinterpret_cast(gdv_fn_like_utf8_utf8_utf8)); // gdv_fn_to_date_utf8_utf8 args = {types->i64_type(), // int64_t execution_context diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 1b0ab8e181666..d4a127dd1cfe6 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -46,8 +46,9 @@ using gdv_day_time_interval = int64_t; bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len); -bool gdv_fn_like_utf8_utf8_int8(int64_t ptr, const char* data, int data_len, - const char* pattern, int pattern_len, int8_t escape_char); +bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len, + const char* escape_char, int escape_char_len); int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 04f660f586ac0..5a3510e36528b 100644 --- a/cpp/src/gandiva/like_holder.cc +++ 
b/cpp/src/gandiva/like_holder.cc @@ -90,11 +90,11 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h auto escape_char_type = escape_char->return_type()->id(); ARROW_RETURN_IF( - escape_char_type != arrow::Type::INT8, + !IsArrowStringLiteral(escape_char_type), Status::Invalid( - "'like' function requires a int8 literal as the third parameter")); + "'like' function requires a string literal as the third parameter")); return Make(arrow::util::get(literal->holder()), - arrow::util::get(escape_char->holder()), holder); + arrow::util::get(escape_char->holder()), holder); } } @@ -111,11 +111,18 @@ Status LikeHolder::Make(const std::string& sql_pattern, return Status::OK(); } -Status LikeHolder::Make(const std::string& sql_pattern, char escape_char, +Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, std::shared_ptr* holder) { + ARROW_RETURN_IF(escape_char.length() > 1, + Status::Invalid("The length of escape char ", escape_char, + " in 'like' function is greater than 1")); std::string pcre_pattern; - ARROW_RETURN_NOT_OK( - RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char, pcre_pattern)); + if (escape_char.length() == 1) { + ARROW_RETURN_NOT_OK( + RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern)); + } else { + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + } auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); ARROW_RETURN_IF(!lholder->regex_.ok(), diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index 328ad9e3a1c54..c7982e9143748 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -39,7 +39,7 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Status Make(const std::string& sql_pattern, std::shared_ptr* holder); - static Status Make(const std::string& sql_pattern, char escape_char, + static Status Make(const std::string& sql_pattern, const std::string& 
escape_char, std::shared_ptr* holder); // Try and optimise a function node with a "like" pattern. diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index e1ea0c23a9d16..18e585fc502d4 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -146,7 +146,7 @@ TEST_F(TestLikeHolder, TestOptimise) { TEST_F(TestLikeHolder, TestMatchOneEscape) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", '\\', &like_holder); + auto status = LikeHolder::Make("ab\\_", "\\", &like_holder); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -163,7 +163,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { TEST_F(TestLikeHolder, TestMatchManyEscape) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\%", '\\', &like_holder); + auto status = LikeHolder::Make("ab\\%", "\\", &like_holder); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -176,4 +176,39 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { EXPECT_FALSE(like("abcd")); EXPECT_FALSE(like("dabc")); } + +TEST_F(TestLikeHolder, TestMatchEscape) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + + EXPECT_TRUE(like("ab\\")); + + EXPECT_FALSE(like("abc")); +} + +TEST_F(TestLikeHolder, TestEmptyEscapeChar) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("ab\\_", "", &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + + EXPECT_TRUE(like("ab\\c")); + EXPECT_TRUE(like("ab\\_")); + + EXPECT_FALSE(like("ab\\_d")); + EXPECT_FALSE(like("ab__")); +} + +TEST_F(TestLikeHolder, TestMultipleEscapeChar) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder); + EXPECT_EQ(status.ok(), false) << status.message(); +} } // namespace gandiva From 
e99fa9f65102eb61fb50d094f4b94cde1024b62d Mon Sep 17 00:00:00 2001 From: crystrix Date: Fri, 2 Apr 2021 12:32:11 +0800 Subject: [PATCH 4/4] Fix a test case failure --- cpp/src/gandiva/tests/utf8_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 08de2762961c1..01e62a59379e0 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -234,7 +234,7 @@ TEST_F(TestUtf8, TestLikeWithEscape) { auto node_a = TreeExprBuilder::MakeField(field_a); auto literal_s = TreeExprBuilder::MakeStringLiteral("%pa\\%rk%"); - auto escape_char = TreeExprBuilder::MakeLiteral((int8_t)'\\'); + auto escape_char = TreeExprBuilder::MakeStringLiteral("\\"); auto is_like = TreeExprBuilder::MakeFunction("like", {node_a, literal_s, escape_char}, boolean()); auto expr = TreeExprBuilder::MakeExpression(is_like, res);