From b7ae7a07c7caaf69ad188c1905407dba5fb34811 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Mon, 25 Dec 2023 09:07:38 +0800 Subject: [PATCH] [fix](join) incorrect result of left semi/anti join with empty build side (#28898) --- /* NOTE(review): In this stored copy the patch is collapsed onto three physical lines and template parameter lists appear to have been stripped (for example the bare template keyword before prepare_build and before _process_null_aware_left_anti_join_for_empty_build_side), so it presumably cannot be applied with git am or git apply as-is; recover the original from commit b7ae7a07c7ca if it must be applied. Summary of the change as visible here: prepare_build now records _empty_build_side = (num_elem <= 1), since per the in-code comment the first build row is not a real build-side row; the probe function whose signature ends with mark_column returns early for NULL_AWARE_LEFT_ANTI_JOIN when that flag is set and delegates to the new helper, which emits each remaining probe row up to max_batch_size, writes build index 0 only for mark joins, and fills the mark column with 1 when is_mark_join && !with_other_conjuncts. The subject mentions semi/anti join, but the only code path changed here is NULL_AWARE_LEFT_ANTI_JOIN; the semi case is covered by a regression query only. The groovy hunk also removes the two trailing drop-table statements, so the tables are no longer dropped at the end of the suite (confirm this is intended). The test tags anti_emtpy_right and semi_emtpy_right misspell empty; they match between the .out and .groovy files, so any rename must touch both. */ be/src/vec/common/hash_table/hash_map.h | 35 +++++++++++++++++++ .../test_null_aware_left_anti_join.out | 7 ++++ .../test_null_aware_left_anti_join.groovy | 18 +++++++--- 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h index 6efbdbb3e94ed0..cb2809492aebaf 100644 --- a/be/src/vec/common/hash_table/hash_map.h +++ b/be/src/vec/common/hash_table/hash_map.h @@ -226,6 +226,9 @@ class JoinHashMapTable : public HashMapTable template void prepare_build(size_t num_elem, int batch_size, bool has_null_key) { _has_null_key = has_null_key; + + // the first row in build side is not really from build side table + _empty_build_side = num_elem <= 1; max_batch_size = batch_size; bucket_size = calc_bucket_size(num_elem + 1); first.resize(bucket_size + 1); @@ -262,6 +265,14 @@ class JoinHashMapTable : public HashMapTable uint32_t* __restrict probe_idxs, bool& probe_visited, uint32_t* __restrict build_idxs, doris::vectorized::ColumnFilterHelper* mark_column) { + if constexpr (JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + if (_empty_build_side) { + return _process_null_aware_left_anti_join_for_empty_build_side< + JoinOpType, with_other_conjuncts, is_mark_join>( + probe_idx, probe_rows, probe_idxs, build_idxs, mark_column); + } + } + if constexpr (is_mark_join) { return _find_batch_mark( keys, build_idx_map, probe_idx, probe_rows, probe_idxs, build_idxs, @@ -367,6 +378,29 @@ class JoinHashMapTable : public HashMapTable return std::tuple {probe_idx, 0U, matched_cnt}; } + template + auto _process_null_aware_left_anti_join_for_empty_build_side( + int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs, + uint32_t* __restrict build_idxs, 
doris::vectorized::ColumnFilterHelper* mark_column) { + static_assert(JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN); + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + probe_idxs[matched_cnt] = probe_idx++; + if constexpr (is_mark_join) { + build_idxs[matched_cnt] = 0; + } + ++matched_cnt; + } + + if constexpr (is_mark_join && !with_other_conjuncts) { + mark_column->resize_fill(matched_cnt, 1); + } + + return std::tuple {probe_idx, 0U, matched_cnt}; + } + auto _find_batch_right_semi_anti(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, int probe_idx, int probe_rows) { @@ -532,6 +566,7 @@ class JoinHashMapTable : public HashMapTable Cell cell; doris::vectorized::Arena* pool; bool _has_null_key = false; + bool _empty_build_side = true; }; template , diff --git a/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out b/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out index d33e4e2947f228..09d7d231709999 100644 --- a/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out +++ b/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out @@ -9,3 +9,10 @@ -- !select -- +-- !anti_emtpy_right -- +\N +1 +3 + +-- !semi_emtpy_right -- + diff --git a/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy b/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy index f732b6bda585ea..6083290b2e5cf0 100644 --- a/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy +++ b/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy @@ -60,11 +60,21 @@ suite("test_null_aware_left_anti_join") { sql """ set parallel_pipeline_task_num=2; """ qt_select """ select ${tableName2}.k1 from ${tableName2} where k1 not in (select ${tableName1}.k1 from ${tableName1}) order by ${tableName2}.k1; """ - sql """ - drop table if exists 
${tableName2}; + // In left anti join, if right side is empty, all rows(null included) of left should be output. + qt_anti_emtpy_right """ + select + * + from ${tableName1} t1 where k1 not in ( + select k1 from ${tableName2} t2 where t2.k1 > 2 + ) order by 1; """ - sql """ - drop table if exists ${tableName1}; + // In left semi join, if right side is empty, no row should be output. + qt_semi_emtpy_right """ + select + * + from ${tableName1} t1 where k1 in ( + select k1 from ${tableName2} t2 where t2.k1 > 2 + ) order by 1; """ }