From 924fbec4825f7a3bc1a588e11abf2afb712b6199 Mon Sep 17 00:00:00 2001 From: Francis <455954986@qq.com> Date: Tue, 16 Apr 2024 00:09:05 +0800 Subject: [PATCH] GH-41121: [C++] Fix: left anti join filter empty rows. (#41122) ### Rationale for this change Since the left anti filter implementation is based on the left semi filter, and an assertion error occurs when the left semi filter rows are empty, this problem should be fixed. ### What changes are included in this PR? swiss_join.cc and hash_join_node_test.cc ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #41121 Lead-authored-by: light-city <455954986@qq.com> Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/acero/hash_join_node_test.cc | 23 ++++++++++++++++++++++ cpp/src/arrow/acero/swiss_join.cc | 5 +++++ 2 files changed, 28 insertions(+) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 63969d9a3ed4b..9c3dbc176ff4f 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -2036,6 +2036,29 @@ TEST(HashJoin, ResidualFilter) { [3, 4, "alpha", 4, 16, "alpha"]])")}); } +TEST(HashJoin, FilterEmptyRows) { + // Regression test for GH-41121. + BatchesWithSchema input_left; + input_left.batches = { + ExecBatchFromJSON({int32(), utf8(), int32()}, R"([[2, "Jarry", 28]])")}; + input_left.schema = + schema({field("id", int32()), field("name", utf8()), field("age", int32())}); + + BatchesWithSchema input_right; + input_right.batches = {ExecBatchFromJSON( + {int32(), int32(), utf8()}, + R"([[2, 10, "Jack"], [3, 12, "Mark"], [4, 15, "Tom"], [1, 10, "Jack"]])")}; + input_right.schema = + schema({field("id", int32()), field("stu_id", int32()), field("subject", utf8())}); + + const ResidualFilterCaseRunner runner{std::move(input_left), std::move(input_right)}; + + Expression filter = greater(field_ref("age"), literal(25)); + + runner.Run(JoinType::LEFT_ANTI, {"id"}, {"stu_id"}, std::move(filter), + {ExecBatchFromJSON({int32(), utf8(), int32()}, R"([[2, "Jarry", 28]])")}); +} + TEST(HashJoin, TrivialResidualFilter) { Expression always_true = equal(call("add", {field_ref("l1"), field_ref("r1")}), literal(2)); // 1 + 1 == 2 diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 61c8bfe95414e..542e943c4a82b 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2167,6 +2167,11 @@ Status JoinResidualFilter::FilterOneBatch(const ExecBatch& keypayload_batch, ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null); *num_passing_rows = 0; + + if (num_batch_rows == 0) { + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(Datum mask, EvalFilter(keypayload_batch, num_batch_rows, batch_row_ids, key_ids_maybe_null, payload_ids_maybe_null));