From 7eb79604296fd9775c5bd8a07b09ec28b5e3da30 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Mon, 19 May 2025 14:42:15 +0800 Subject: [PATCH 1/4] [fix](join)Consider mark join when computing right_col_idx(#50720) (#50727) --- .../pipeline/exec/hashjoin_probe_operator.cpp | 31 ++- .../pipeline/exec/hashjoin_probe_operator.h | 6 + .../exec/join/process_hash_table_probe.h | 10 +- .../exec/join/process_hash_table_probe_impl.h | 142 ++++++++---- .../vec/common/hash_table/join_hash_table.h | 44 ++-- .../query_p0/join/mark_join/mark_join.out | 47 ++++ .../query_p0/join/mark_join/mark_join.groovy | 214 ++++++++++++++++++ 7 files changed, 423 insertions(+), 71 deletions(-) diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index c969f60fa727c1..82e4b732193e13 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -73,6 +73,7 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) { RETURN_IF_ERROR(JoinProbeLocalState::open(state)); auto& p = _parent->cast(); + _right_col_idx = p._right_col_idx; std::visit( [&](auto&& join_op_variants, auto have_other_join_conjunct) { using JoinOpType = std::decay_t; @@ -511,7 +512,7 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) const bool probe_dispose_null = _match_all_probe || _build_unique || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN || - _join_op == TJoinOp::LEFT_SEMI_JOIN; + _join_op == TJoinOp::LEFT_SEMI_JOIN || _is_mark_join; const std::vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; std::vector probe_not_ignore_null(eq_join_conjuncts.size()); size_t conjuncts_index = 0; @@ -642,14 +643,34 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { } } - const int right_col_idx = - (_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : _left_table_data_types.size(); + _right_col_idx = (_is_right_semi_anti && !_have_other_join_conjunct && + (!_is_mark_join || _mark_join_conjuncts.empty())) + ? 0 + : _left_table_data_types.size(); size_t idx = 0; for (const auto* slot : slots_to_check) { auto data_type = slot->get_data_type_ptr(); - const auto slot_on_left = idx < right_col_idx; + const auto slot_on_left = idx < _right_col_idx; + + if (slot_on_left) { + if (idx >= _left_table_data_types.size()) { + return Status::InternalError( + "Join node(id={}, OP={}) intermediate slot({}, #{})'s on left table " + "idx out bound of _left_table_data_types: {} vs {}", + _node_id, _join_op, slot->col_name(), slot->id(), idx, + _left_table_data_types.size()); + } + } else if (idx - _right_col_idx >= _right_table_data_types.size()) { + return Status::InternalError( + "Join node(id={}, OP={}) intermediate slot({}, #{})'s on right table " + "idx out bound of _right_table_data_types: {} vs {}(idx = {}, _right_col_idx = " + "{})", + _node_id, _join_op, slot->col_name(), slot->id(), idx - _right_col_idx, + _right_table_data_types.size(), idx, _right_col_idx); + } + auto target_data_type = slot_on_left ? _left_table_data_types[idx] - : _right_table_data_types[idx - right_col_idx]; + : _right_table_data_types[idx - _right_col_idx]; ++idx; if (data_type->equals(*target_data_type)) { continue; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index 66d709e6541ad8..ee0bb78ee4f768 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -116,6 +116,9 @@ class HashJoinProbeLocalState final std::unique_ptr _process_hashtable_ctx_variants = std::make_unique(); + // Index of column(slot) from right table in the `_intermediate_row_desc`. + size_t _right_col_idx; + RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr; @@ -185,6 +188,9 @@ class HashJoinProbeOperatorX final : public JoinProbeOperatorX _right_output_slot_flags; std::vector _right_table_column_names; const std::vector _partition_exprs; + + // Index of column(slot) from right table in the `_intermediate_row_desc`. + size_t _right_col_idx; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 7a5c34fb845937..4e1a22a9fab517 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -23,6 +23,7 @@ #include "vec/columns/column.h" #include "vec/columns/columns_number.h" #include "vec/common/arena.h" +#include "vec/common/custom_allocator.h" namespace doris { namespace vectorized { @@ -137,8 +138,15 @@ struct ProcessHashTableProbe { RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; - int _right_col_idx; + // See `HashJoinProbeOperatorX::_right_col_idx` + const int _right_col_idx; + int _right_col_len; + + // For right semi with mark join conjunct, we need to store the mark join flags + // in the hash table. + // -1 means null, 0 means false, 1 means true + DorisVector mark_join_flags; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 1f9d127549ad70..079c1f5ff24e82 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -56,9 +56,7 @@ ProcessHashTableProbe::ProcessHashTableProbe(HashJoinProbeLocalState _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), _finish_probe_phase_timer(parent->_finish_probe_phase_timer), - _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) - ? 0 - : _parent->left_table_data_types().size()), + _right_col_idx(parent->_right_col_idx), _right_col_len(_parent->right_table_data_types().size()) {} template @@ -66,11 +64,10 @@ void ProcessHashTableProbe::build_side_output_column( vectorized::MutableColumns& mcol, const std::vector& output_slot_flags, int size, bool have_other_join_conjunct, bool is_mark_join) { SCOPED_TIMER(_build_side_output_timer); - constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || - JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || - JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::LEFT_SEMI_JOIN; + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN; constexpr auto probe_all = JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; @@ -209,7 +206,7 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - (is_mark_join && JoinOpType != doris::TJoinOp::RIGHT_SEMI_JOIN))); + (is_mark_join))); } auto& mcol = mutable_block.mutable_columns(); @@ -268,8 +265,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c build_side_output_column(mcol, *_right_output_slot_flags, current_offset, with_other_conjuncts, is_mark_join); - if constexpr (with_other_conjuncts || (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && - JoinOpType != TJoinOp::RIGHT_ANTI_JOIN)) { + if (with_other_conjuncts || !_parent->_mark_join_conjuncts.empty() || + (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != TJoinOp::RIGHT_ANTI_JOIN)) { auto check_all_match_one = [](const std::vector& vecs, uint32_t probe_idx, int size) { if (!size || vecs[0] != probe_idx || vecs[size - 1] != probe_idx + size - 1) { @@ -291,7 +288,7 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c output_block->swap(mutable_block.to_block()); - if constexpr (is_mark_join && JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) { + if constexpr (is_mark_join) { return do_mark_join_conjuncts( output_block, hash_table_ctx.hash_table->get_bucket_size()); } else if constexpr (with_other_conjuncts) { @@ -363,21 +360,31 @@ template template Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Block* output_block, size_t hash_table_bucket_size) { - DCHECK(JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::LEFT_SEMI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN); + if (JoinOpType != TJoinOp::LEFT_ANTI_JOIN && JoinOpType != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && + JoinOpType != TJoinOp::LEFT_SEMI_JOIN && JoinOpType != TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN && + JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != TJoinOp::RIGHT_ANTI_JOIN) { + return Status::InternalError("join type {} is not supported", JoinOpType); + } constexpr bool is_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::RIGHT_ANTI_JOIN; constexpr bool is_null_aware_join = JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; + constexpr bool is_right_half_join = + JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || JoinOpType == TJoinOp::RIGHT_ANTI_JOIN; const auto row_count = output_block->rows(); if (!row_count) { return Status::OK(); } + if constexpr (is_right_half_join) { + if (mark_join_flags.empty() && _build_block != nullptr) { + mark_join_flags.resize(_build_block->rows(), 0); + } + } + auto mark_column_mutable = output_block->get_by_position(_parent->_mark_column_id).column->assume_mutable(); auto& mark_column = assert_cast(*mark_column_mutable); @@ -458,37 +465,70 @@ Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Blo const bool should_be_null_if_build_side_has_null = *_has_null_in_build_side && is_null_aware_join && !with_other_conjuncts; for (size_t i = 0; i != row_count; ++i) { - bool not_matched_before = _parent->_last_probe_match != _probe_indexs[i]; - if (_build_indexs[i] == 0) { - bool has_null_mark_value = _parent->_last_probe_null_mark == _probe_indexs[i]; - if (not_matched_before) { - filter_map[i] = true; - mark_null_map[i] = has_null_mark_value || should_be_null_if_build_side_has_null; - mark_filter_data[i] = false; + if constexpr (is_right_half_join) { + const auto& build_index = _build_indexs[i]; + if (build_index == 0) { + continue; + } + + if (mark_join_flags[build_index] == 1) { + continue; + } + + if (mark_null_map[i]) { + mark_join_flags[build_index] = -1; + } else if (mark_filter_data[i]) { + mark_join_flags[build_index] = 1; } } else { - if (mark_null_map[i]) { // is null - _parent->_last_probe_null_mark = _probe_indexs[i]; - } else { - if (mark_filter_data[i] && not_matched_before) { - _parent->_last_probe_match = _probe_indexs[i]; + bool not_matched_before = _parent->_last_probe_match != _probe_indexs[i]; + if (_build_indexs[i] == 0) { + bool has_null_mark_value = _parent->_last_probe_null_mark == _probe_indexs[i]; + if (not_matched_before) { filter_map[i] = true; + mark_null_map[i] = has_null_mark_value || should_be_null_if_build_side_has_null; + mark_filter_data[i] = false; + } + } else { + if (mark_null_map[i]) { // is null + _parent->_last_probe_null_mark = _probe_indexs[i]; + } else { + if (mark_filter_data[i] && not_matched_before) { + _parent->_last_probe_match = _probe_indexs[i]; + filter_map[i] = true; + } } } } } - if constexpr (is_anti_join) { - // flip the mark column - for (size_t i = 0; i != row_count; ++i) { - mark_filter_data[i] ^= 1; // not null/ null + if constexpr (is_right_half_join) { + if constexpr (is_anti_join) { + // flip the mark column + for (size_t i = 0; i != row_count; ++i) { + if (mark_join_flags[i] == -1) { + // -1 means null. + continue; + } + + mark_join_flags[i] ^= 1; + } + } + // For right semi/anti join, no rows will be output in probe phase. + output_block->swap(vectorized::Block()); + return Status::OK(); + } else { + if constexpr (is_anti_join) { + // flip the mark column + for (size_t i = 0; i != row_count; ++i) { + mark_filter_data[i] ^= 1; // not null/ null + } } - } - auto result_column_id = output_block->columns(); - output_block->insert( - {std::move(filter_column), std::make_shared(), ""}); - return vectorized::Block::filter_block(output_block, result_column_id, result_column_id); + auto result_column_id = output_block->columns(); + output_block->insert({std::move(filter_column), std::make_shared(), ""}); + return vectorized::Block::filter_block(output_block, result_column_id, result_column_id); + } } template @@ -637,7 +677,8 @@ Status ProcessHashTableProbe::finish_probing(HashTableType& hash_tab if (block_size) { if (mcol.size() < _right_col_len + _right_col_idx) { return Status::InternalError( - "output block invalid, mcol.size()={}, _right_col_len={}, _right_col_idx={}", + "output block invalid, mcol.size()={}, _right_col_len={}, " + "_right_col_idx={}", mcol.size(), _right_col_len, _right_col_idx); } for (size_t j = 0; j < _right_col_len; ++j) { @@ -646,8 +687,31 @@ Status ProcessHashTableProbe::finish_probing(HashTableType& hash_tab _build_indexs.data() + block_size); } + if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { + if (is_mark_join) { + if (mark_join_flags.empty() && _build_block != nullptr) { + mark_join_flags.resize(_build_block->rows(), 0); + } + + // mark column is nullable + auto* mark_column = assert_cast( + mcol[_parent->_mark_column_id].get()); + mark_column->resize(block_size); + auto* null_map = mark_column->get_null_map_data().data(); + auto* data = assert_cast(mark_column->get_nested_column()) + .get_data() + .data(); + for (size_t i = 0; i != block_size; ++i) { + const auto build_index = _build_indexs[i]; + null_map[i] = mark_join_flags[build_index] == -1; + data[i] = mark_join_flags[build_index] == 1; + } + } + } + // just resize the left table column in case with other conjunct to make block size is not zero - if (_is_right_semi_anti && _have_other_join_conjunct) { + if (_is_right_semi_anti && _right_col_idx != 0) { for (int i = 0; i < _right_col_idx; ++i) { mcol[i]->resize(block_size); } diff --git a/be/src/vec/common/hash_table/join_hash_table.h b/be/src/vec/common/hash_table/join_hash_table.h index 317987541cdbe1..88695ef9b43119 100644 --- a/be/src/vec/common/hash_table/join_hash_table.h +++ b/be/src/vec/common/hash_table/join_hash_table.h @@ -101,24 +101,23 @@ class JoinHashTable { } } - if constexpr (with_other_conjuncts || - (is_mark_join && JoinOpType != TJoinOp::RIGHT_SEMI_JOIN)) { - if constexpr (!with_other_conjuncts) { - constexpr bool is_null_aware_join = - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN; - constexpr bool is_left_half_join = JoinOpType == TJoinOp::LEFT_SEMI_JOIN || - JoinOpType == TJoinOp::LEFT_ANTI_JOIN; - - /// For null aware join or left half(semi/anti) join without other conjuncts and without - /// mark join conjunct. - /// If one row on probe side has one match in build side, we should stop searching the - /// hash table for this row. - if (is_null_aware_join || (is_left_half_join && !has_mark_join_conjunct)) { - return _find_batch_conjunct( - keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, - build_idxs); - } + if constexpr (with_other_conjuncts) { + return _find_batch_conjunct( + keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, build_idxs); + } else if constexpr (is_mark_join) { + constexpr bool is_null_aware_join = JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN; + constexpr bool is_left_half_join = + JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType == TJoinOp::LEFT_ANTI_JOIN; + + /// For null aware join or left half(semi/anti) join without other conjuncts and without + /// mark join conjunct. + /// If one row on probe side has one match in build side, we should stop searching the + /// hash table for this row. + if (is_null_aware_join || (is_left_half_join && !has_mark_join_conjunct)) { + return _find_batch_conjunct( + keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, + build_idxs); } return _find_batch_conjunct( @@ -339,14 +338,7 @@ class JoinHashTable { auto do_the_probe = [&]() { while (build_idx && matched_cnt < batch_size) { - if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || - JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - if (!visited[build_idx] && keys[probe_idx] == build_keys[build_idx]) { - probe_idxs[matched_cnt] = probe_idx; - build_idxs[matched_cnt] = build_idx; - matched_cnt++; - } - } else if constexpr (need_judge_null) { + if constexpr (need_judge_null) { if (build_idx == bucket_size) { build_idxs[matched_cnt] = build_idx; probe_idxs[matched_cnt] = probe_idx; diff --git a/regression-test/data/query_p0/join/mark_join/mark_join.out b/regression-test/data/query_p0/join/mark_join/mark_join.out index ed3575d0e14476..ea3b7ddd6a6d1a 100644 --- a/regression-test/data/query_p0/join/mark_join/mark_join.out +++ b/regression-test/data/query_p0/join/mark_join/mark_join.out @@ -17,3 +17,50 @@ 3 -3 \N c 3 3 \N c +-- !test_right_semi_mark_join -- +1 v1 o1 \N \N +2 v2 o2 \N \N +3 v3 o3 \N \N +4 v4 o4 \N \N +5 v5 o5 \N \N +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N + +-- !test_right_semi_mark_join_2 -- +1 v1 o1 \N \N +2 v2 o2 \N \N +3 v3 o3 \N \N +4 v4 o4 \N \N +5 v5 o5 \N \N +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N + +-- !test_right_semi_mark_join_no_null -- +1 v1 o1 false true +2 v2 o2 false true +3 v3 o3 false true +4 v4 o4 false true +5 v5 o5 false true +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N + +-- !test_right_semi_mark_join_no_null_2 -- +1 v1 o1 false true +2 v2 o2 false true +3 v3 o3 false true +4 v4 o4 false true +5 v5 o5 false true +6 v1 \N \N \N +7 v2 \N \N \N +8 v3 \N \N \N +9 v4 \N \N \N +10 v5 \N \N \N \ No newline at end of file diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy index 9759a0e9b4cd70..0292fd4ae30780 100644 --- a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy +++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy @@ -61,4 +61,218 @@ suite("mark_join") { qt_test """ select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 = t1.k2) or k1 < 10 order by k1, k2; """ + + sql "drop table if exists tbl1;" + sql "drop table if exists tbl2;" + sql "drop table if exists tbl3;" + + sql """ + CREATE TABLE `tbl1` ( + `unit_name` varchar(1080) NULL, + `cur_unit_name` varchar(1080) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`unit_name`) + DISTRIBUTED BY RANDOM BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + CREATE TABLE `tbl2` ( + `org_code` varchar(150) NOT NULL , + `org_name` varchar(300) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`org_code`) + DISTRIBUTED BY HASH(`org_code`) BUCKETS 4 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + CREATE TABLE `tbl3` ( + `id` bigint NOT NULL, + `acntm_name` varchar(500) NULL , + `vendor_name` varchar(500) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + insert into tbl1 (unit_name, cur_unit_name) values + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'), + (null, 'o1'), + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'), + (null, 'o1'), + (null, 'o2'), + (null, 'o3'), + (null, 'o4'), + (null, 'o5'), + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'); + """ + + sql """ + insert into tbl2(org_code, org_name) values + ('v1', 'o1'), + ('v2', 'o2'), + ('v3', 'o3'), + ('v4', 'o4'), + ('v5', 'o5'), + ('v1', null), + ('v2', null), + ('v3', null), + ('v4', null), + ('v5', null); + """ + + sql """ + insert into tbl3 (id, vendor_name, acntm_name) + values(1, 'o1', 'v1'), + (2, 'o2', 'v2'), + (3, 'o3', 'v3'), + (4, 'o4', 'v4'), + (5, 'o5', 'v5'), + (6, null, 'v1'), + (7, null, 'v2'), + (8, null, 'v3'), + (9, null, 'v4'), + (10, null, 'v5'); + """ + + sql " analyze table tbl1 with sync;" + sql " analyze table tbl2 with sync;" + sql " analyze table tbl3 with sync;" + + sql "set disable_join_reorder=0;" + qt_test_right_semi_mark_join """ + select + tbl3.id, + tbl3.acntm_name, + tbl3.vendor_name, + tbl3.vendor_name in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + ) v1, + tbl3.vendor_name not in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + ) v2 + from + tbl3 order by 1,2,3,4,5; + """ + + sql "set disable_join_reorder=1;" + qt_test_right_semi_mark_join_2 """ + select + tbl3.id, + tbl3.acntm_name, + tbl3.vendor_name, + tbl3.vendor_name in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + ) v1, + tbl3.vendor_name not in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + ) v2 + from + tbl3 order by 1,2,3,4,5; + """ + + sql "set disable_join_reorder=0;" + qt_test_right_semi_mark_join_no_null """ + select + tbl3.id, + tbl3.acntm_name, + tbl3.vendor_name, + tbl3.vendor_name in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + and tbl1.unit_name is not null + ) v1, + tbl3.vendor_name not in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + and tbl1.unit_name is not null + ) v2 + from + tbl3 order by 1,2,3,4,5; + """ + + sql "set disable_join_reorder=1;" + qt_test_right_semi_mark_join_no_null_2 """ + select + tbl3.id, + tbl3.acntm_name, + tbl3.vendor_name, + tbl3.vendor_name in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + and tbl1.unit_name is not null + ) v1, + tbl3.vendor_name not in ( + select + tbl1.unit_name + from + tbl2 + join tbl1 on tbl1.cur_unit_name = tbl2.org_name + where + tbl2.org_code = tbl3.acntm_name + and tbl1.unit_name is not null + ) v2 + from + tbl3 order by 1,2,3,4,5; + """ } From 1bafe7de8c37aae10f2f6bc567d84ea57bb39020 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Wed, 21 May 2025 12:11:17 +0800 Subject: [PATCH 2/4] [fix](join) Should not use the build block's size to resize mark_join_flags (#50993) (#51089) Pick #50993 Introduced by #51050 The build block maybe be `clear_column_mem_not_keep` in build phase when the operator is closed. ```cpp Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_status) { if (_closed) { return Status::OK(); } auto& p = _parent->cast(); Defer defer {[&]() { if (!_should_build_hash_table) { return; } // The build side hash key column maybe no need output, but we need to keep the column in block // because it is used to compare with probe side hash key column if (p._should_keep_hash_key_column && _build_col_ids.size() == 1) { p._should_keep_column_flags[_build_col_ids[0]] = true; } if (_shared_state->build_block) { // release the memory of unused column in probe stage _shared_state->build_block->clear_column_mem_not_keep(p._should_keep_column_flags, p._use_shared_hash_table); } if (p._use_shared_hash_table) { std::unique_lock lock(p._mutex); p._signaled = true; for (auto& dep : _shared_state->sink_deps) { dep->set_ready(); } for (auto& dep : p._finish_dependencies) { dep->set_ready(); } } }}; ``` ``` *** Aborted at 1747343165 (unix time) try "date -d @1747343165" if you are using GNU date *** *** Current BE git commitID: e7a3e78b97 *** *** SIGSEGV address not mapped to object (@0x1) received by PID 7474 (TID 9641 OR 0x7f3f8c0e5640) from PID 1; stack trace: *** 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /root/doris/be/src/common/signal_handler.h:421 1# PosixSignals::chained_handler(int, siginfo*, void*) [clone .part.0] in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so 2# JVM_handle_linux_signal in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so 3# 0x00007F4368F76520 in /lib/x86_64-linux-gnu/libc.so.6 4# doris::Status doris::pipeline::ProcessHashTableProbe<7>::finish_probing > > >(doris::vectorized::MethodKeysFixed > >&, doris::vectorized::MutableBlock&, doris::vectorized::Block*, bool*, bool) at /root/doris/be/src/pipeline/exec/join/process_hash_table_probe_impl.h:738 5# std::__detail::__variant::__gen_vtable_impl (*)(doris::pipeline::HashJoinProbeOperatorX::pull(doris::RuntimeState*, doris::vectorized::Block*, bool*) const::$_1&&, std::variant > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber, doris::JoinHashTable, HashCRC32 > > >, doris::vectorized::MethodOneNumber, doris::JoinHashTable, HashCRC32 > > >, doris::vectorized::MethodKeysFixed > >, doris::vectorized::MethodKeysFixed, HashCRC32 > > >, doris::vectorized::MethodKeysFixed > >, doris::vectorized::MethodKeysFixed, HashCRC32 > > >, doris::vectorized::MethodStringNoCache > > >&, std::variant, doris::pipeline::ProcessHashTableProbe<2>, doris::pipeline::ProcessHashTableProbe<8>, doris::pipeline::ProcessHashTableProbe<1>, doris::pipeline::ProcessHashTableProbe<4>, doris::pipeline::ProcessHashTableProbe<3>, doris::pipeline::ProcessHashTableProbe<7>, doris::pipeline::ProcessHashTableProbe<9>, doris::pipeline::ProcessHashTableProbe<10>, doris::pipeline::ProcessHashTableProbe<11> >&)>, std::integer_sequence >::__visit_invoke(doris::pipeline::HashJoinProbeOperatorX::pull(doris::RuntimeState*, doris::vectorized::Block*, bool*) const::$_1&&, std::variant > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber > >, doris::vectorized::MethodOneNumber, doris::JoinHashTable, HashCRC32 > > >, doris::vectorized::MethodOneNumber, doris::JoinHashTable, HashCRC32 > > >, doris::vectorized::MethodKeysFixed > >, doris::vectorized::MethodKeysFixed, HashCRC32 > > >, doris::vectorized::MethodKeysFixed > >, doris::vectorized::MethodKeysFixed, HashCRC32 > > >, doris::vectorized::MethodStringNoCache > > >&, std::variant, doris::pipeline::ProcessHashTableProbe<2>, doris::pipeline::ProcessHashTableProbe<8>, doris::pipeline::ProcessHashTableProbe<1>, doris::pipeline::ProcessHashTableProbe<4>, doris::pipeline::ProcessHashTableProbe<3>, doris::pipeline::ProcessHashTableProbe<7>, doris::pipeline::ProcessHashTableProbe<9>, doris::pipeline::ProcessHashTableProbe<10>, doris::pipeline::ProcessHashTableProbe<11> >&) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/variant:1013 6# doris::pipeline::HashJoinProbeOperatorX::pull(doris::RuntimeState*, doris::vectorized::Block*, bool*) const at /root/doris/be/src/pipeline/exec/hashjoin_probe_operator.cpp:281 7# doris::pipeline::StatefulOperatorX::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /root/doris/be/src/pipeline/exec/operator.cpp:670 8# doris::pipeline::OperatorXBase::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /root/doris/be/src/pipeline/exec/operator.cpp:381 9# doris::pipeline::PipelineTask::execute(bool*) in /mnt/hdd01/ci/doris-deploy-master-local/be/lib/doris_be 10# doris::pipeline::TaskScheduler::_do_work(int) at /root/doris/be/src/pipeline/task_scheduler.cpp:144 11# doris::ThreadPool::dispatch_thread() at /root/doris/be/src/util/threadpool.cpp:622 12# doris::Thread::supervise_thread(void*) at /root/doris/be/src/util/thread.cpp:469 13# start_thread at ./nptl/pthread_create.c:442 14# 0x00007F436905A850 at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:83 ``` Related PR: #xxx Problem Summary: None - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- .../exec/join/process_hash_table_probe_impl.h | 19 ++-- .../join/mark_join/right_semi_mark_join.out | 63 +++++++++++ .../mark_join/right_semi_mark_join.groovy | 104 ++++++++++++++++++ 3 files changed, 177 insertions(+), 9 deletions(-) create mode 100644 regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out create mode 100644 regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 079c1f5ff24e82..40c2f03749cf2f 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -289,6 +289,13 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c output_block->swap(mutable_block.to_block()); if constexpr (is_mark_join) { + if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || + JoinOpType == TJoinOp::RIGHT_ANTI_JOIN) { + if (mark_join_flags.empty()) { + mark_join_flags.resize(hash_table_ctx.hash_table->size(), 0); + } + } + return do_mark_join_conjuncts( output_block, hash_table_ctx.hash_table->get_bucket_size()); } else if constexpr (with_other_conjuncts) { @@ -379,12 +386,6 @@ Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Blo return Status::OK(); } - if constexpr (is_right_half_join) { - if (mark_join_flags.empty() && _build_block != nullptr) { - mark_join_flags.resize(_build_block->rows(), 0); - } - } - auto mark_column_mutable = output_block->get_by_position(_parent->_mark_column_id).column->assume_mutable(); auto& mark_column = assert_cast(*mark_column_mutable); @@ -515,7 +516,7 @@ Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Blo } } // For right semi/anti join, no rows will be output in probe phase. - output_block->swap(vectorized::Block()); + output_block->clear(); return Status::OK(); } else { if constexpr (is_anti_join) { @@ -690,8 +691,8 @@ Status ProcessHashTableProbe::finish_probing(HashTableType& hash_tab if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { if (is_mark_join) { - if (mark_join_flags.empty() && _build_block != nullptr) { - mark_join_flags.resize(_build_block->rows(), 0); + if (mark_join_flags.empty()) { + mark_join_flags.resize(hash_table_ctx.hash_table->size(), 0); } // mark column is nullable diff --git a/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out new file mode 100644 index 00000000000000..e00e19be11e07e --- /dev/null +++ b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out @@ -0,0 +1,63 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !test -- +\N \N +0 18332 +1 \N +1 \N +1 \N +2 -56 +3 72 +4 -5581 +5 -62 +5 -62 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +6 22979 +7 -41 +7 -41 +7 -41 +7 -41 +8 -54 +9 -6236 +9 -6236 +9 -6236 +9 -6236 +9 -6236 +9 -6236 +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +10 \N +11 \N +11 \N +11 \N +12 \N +12 \N +12 \N +12 \N +12 \N +12 \N +12 \N +13 -2343514 +13 -2343514 +13 -2343514 +13 -2343514 +13 -2343514 +13 -2343514 +14 -3361960 +14 -3361960 +14 -3361960 +14 -3361960 +14 -3361960 +14 -3361960 + diff --git a/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy new file mode 100644 index 00000000000000..3557475cdd22c2 --- /dev/null +++ b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("right_semi_mark_join") { + sql "drop table if exists tbl1;" + sql "drop table if exists tbl2;" + sql "drop table if exists tbl3;" + + sql """ + create table tbl1 (pk int, col1 bigint, col2 bigint) engine = olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 properties("replication_num" = "1"); + """ + + sql """ + insert into + tbl1(pk, col1, col2) + values + (0, null, 18332), (1, 788547, null), (2, 4644959, -56), (3, 8364628, 72), (4, null, -5581), + (5, 2344024, -62), (6, -2689177, 22979), (7, 1320, -41), (8, null, -54), (9, 12, -6236), + (10, -8321648, null), (11, 153691, null), (12, -8056, null), (13, -12, -2343514), (14, -35, -3361960); + """ + + sql """ + create table tbl2 ( + pk int, col1 bigint, col2 bigint + ) engine = olap + distributed by hash(pk) buckets 4 + properties("replication_num" = "1"); + """ + + sql """ + insert into + tbl2(pk, col1, col2) + values + (0, 108, 31161), (1, 1479175, 6764263), (2, 110, 25), (3, 110, -18656), (4, null, -51), + (5, 21, 27), (6, -6950217, 1585978), (7, null, null), (8, null, 3453467), (9, null, -6701140); + """ + + sql """ + create table tbl3 ( + pk int, col1 bigint, col2 bigint, col3 bigint + ) engine = olap + DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 + properties("replication_num" = "1"); + """ + + sql """ + insert into + tbl3(pk, col1, col2) + values + (0, 55, -58), (1, 49, 29792), (2, 95, 32361), (3, 31243, -27428), (4, -27400, null), + (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, -21951), (9, 13186, 24466), + (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, -18), (14, 21681, 14079), + (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, null, 20682), (19, 31243, -98), + (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, null, 20822), (77, 31243, -27428), + (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 31243, 4077), (82, null, 114), + (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 32361), (87, 26607, -27428), + (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, -21951), (9, 13186, 24466), + (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, -18), (14, 21681, 14079), + (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, null, 20682), (19, 31243, -98), + (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, null, 20822), (77, 31243, -27428), + (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 31243, 4077), (82, null, 114), + (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 32361), (87, 26607, -27428), + (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, -18), (14, 21681, 14079), + (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, null, 20682), (19, 31243, -98), + (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, null, 20822), (77, 31243, -27428), + (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 31243, 4077), (82, null, 114), + (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 32361), (87, 26607, -27428); + """ + + qt_test """ + SELECT + T1.pk AS C1, + T1.col2 AS C2 + FROM + tbl1 AS T1 FULL + OUTER JOIN tbl2 AS T2 ON T1.col1 <= T2.col2 + OR T2.col1 IN ( + SELECT + T3.col2 + FROM + tbl3 AS T3 + WHERE + T2.col2 = T3.col1 + ) + ORDER BY + C1, + C2 DESC; + """ +} \ No newline at end of file From 5b5f7ac357b7eba7cdbab662811d72198e270ab7 Mon Sep 17 00:00:00 2001 From: zhangdong Date: Thu, 22 May 2025 11:03:47 +0800 Subject: [PATCH 3/4] [fix](case)fix mark_join and right_semi_mark_join has same table name (#51124) Related PR: #50720 --- .../query_p0/join/mark_join/mark_join.groovy | 181 +++++++++--------- .../mark_join/right_semi_mark_join.groovy | 29 +-- 2 files changed, 111 insertions(+), 99 deletions(-) diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy index 0292fd4ae30780..289c9d5b06bad2 100644 --- a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy +++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy @@ -16,10 +16,17 @@ // under the License. suite("mark_join") { - sql "drop table if exists t1;" - sql "drop table if exists t2;" + String suiteName = "mark_join" + String table_t1 = "${suiteName}_table_t1" + String table_t2 = "${suiteName}_table_t2" + String table_tbl1 = "${suiteName}_table_tbl1" + String table_tbl2 = "${suiteName}_table_tbl2" + String table_tbl3 = "${suiteName}_table_tbl3" + + sql "drop table if exists ${table_t1};" + sql "drop table if exists ${table_t2};" sql """ - create table t1 ( + create table ${table_t1} ( k1 int null, k2 int null, k3 bigint null, @@ -31,7 +38,7 @@ suite("mark_join") { """ sql """ - create table t2 ( + create table ${table_t2} ( k1 int null, k2 int null, k3 bigint null, @@ -42,32 +49,32 @@ suite("mark_join") { properties("replication_num" = "1"); """ - sql "insert into t1 select 1,1,1,'a';" - sql "insert into t1 select 2,2,2,'b';" - sql "insert into t1 select 3,-3,null,'c';" - sql "insert into t1 select 3,3,null,'c';" + sql "insert into ${table_t1} select 1,1,1,'a';" + sql "insert into ${table_t1} select 2,2,2,'b';" + sql "insert into ${table_t1} select 3,-3,null,'c';" + sql "insert into ${table_t1} select 3,3,null,'c';" - sql "insert into t2 select 1,1,1,'a';" - sql "insert into t2 select 2,2,2,'b';" - sql "insert into t2 select 3,-3,null,'c';" - sql "insert into t2 select 3,3,null,'c';" + sql "insert into ${table_t2} select 1,1,1,'a';" + sql "insert into ${table_t2} select 2,2,2,'b';" + sql "insert into ${table_t2} select 3,-3,null,'c';" + sql "insert into ${table_t2} select 3,3,null,'c';" qt_test """ - select * from t1 where exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + select * from ${table_t1} where exists (select ${table_t2}.k3 from ${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2; """ qt_test """ - select * from t1 where not exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + select * from ${table_t1} where not exists (select ${table_t2}.k3 from ${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2; """ qt_test """ - select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 = t1.k2) or k1 < 10 order by k1, k2; + select * from ${table_t1} where ${table_t1}.k1 not in (select ${table_t2}.k3 from ${table_t2} where ${table_t2}.k2 = ${table_t1}.k2) or k1 < 10 order by k1, k2; """ - sql "drop table if exists tbl1;" - sql "drop table if exists tbl2;" - sql "drop table if exists tbl3;" + sql "drop table if exists ${table_tbl1};" + sql "drop table if exists ${table_tbl2};" + sql "drop table if exists ${table_tbl3};" sql """ - CREATE TABLE `tbl1` ( + CREATE TABLE `${table_tbl1}` ( `unit_name` varchar(1080) NULL, `cur_unit_name` varchar(1080) NOT NULL ) ENGINE=OLAP @@ -79,7 +86,7 @@ suite("mark_join") { """ sql """ - CREATE TABLE `tbl2` ( + CREATE TABLE `${table_tbl2}` ( `org_code` varchar(150) NOT NULL , `org_name` varchar(300) NULL ) ENGINE=OLAP @@ -91,7 +98,7 @@ suite("mark_join") { """ sql """ - CREATE TABLE `tbl3` ( + CREATE TABLE `${table_tbl3}` ( `id` bigint NOT NULL, `acntm_name` varchar(500) NULL , `vendor_name` varchar(500) NULL @@ -104,7 +111,7 @@ suite("mark_join") { """ sql """ - insert into tbl1 (unit_name, cur_unit_name) values + insert into ${table_tbl1} (unit_name, cur_unit_name) values ('v1', 'o1'), ('v2', 'o2'), ('v3', 'o3'), @@ -129,7 +136,7 @@ suite("mark_join") { """ sql """ - insert into tbl2(org_code, org_name) values + insert into ${table_tbl2}(org_code, org_name) values ('v1', 'o1'), ('v2', 'o2'), ('v3', 'o3'), @@ -143,7 +150,7 @@ suite("mark_join") { """ sql """ - insert into tbl3 (id, vendor_name, acntm_name) + insert into ${table_tbl3} (id, vendor_name, acntm_name) values(1, 'o1', 'v1'), (2, 'o2', 'v2'), (3, 'o3', 'v3'), @@ -156,123 +163,123 @@ suite("mark_join") { (10, null, 'v5'); """ - sql " analyze table tbl1 with sync;" - sql " analyze table tbl2 with sync;" - sql " analyze table tbl3 with sync;" + sql " analyze table ${table_tbl1} with sync;" + sql " analyze table ${table_tbl2} with sync;" + sql " analyze table ${table_tbl3} with sync;" sql "set disable_join_reorder=0;" qt_test_right_semi_mark_join """ select - tbl3.id, - tbl3.acntm_name, - tbl3.vendor_name, - tbl3.vendor_name in ( + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name ) v1, - tbl3.vendor_name not in ( + ${table_tbl3}.vendor_name not in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name ) v2 from - tbl3 order by 1,2,3,4,5; + ${table_tbl3} order by 1,2,3,4,5; """ sql "set disable_join_reorder=1;" qt_test_right_semi_mark_join_2 """ select - tbl3.id, - tbl3.acntm_name, - tbl3.vendor_name, - tbl3.vendor_name in ( + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name ) v1, - tbl3.vendor_name not in ( + ${table_tbl3}.vendor_name not in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name ) v2 from - tbl3 order by 1,2,3,4,5; + ${table_tbl3} order by 1,2,3,4,5; """ sql "set disable_join_reorder=0;" qt_test_right_semi_mark_join_no_null """ select - tbl3.id, - tbl3.acntm_name, - tbl3.vendor_name, - tbl3.vendor_name in ( + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name - and tbl1.unit_name is not null + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null ) v1, - tbl3.vendor_name not in ( + ${table_tbl3}.vendor_name not in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name - and tbl1.unit_name is not null + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null ) v2 from - tbl3 order by 1,2,3,4,5; + ${table_tbl3} order by 1,2,3,4,5; """ sql "set disable_join_reorder=1;" qt_test_right_semi_mark_join_no_null_2 """ select - tbl3.id, - tbl3.acntm_name, - tbl3.vendor_name, - tbl3.vendor_name in ( + ${table_tbl3}.id, + ${table_tbl3}.acntm_name, + ${table_tbl3}.vendor_name, + ${table_tbl3}.vendor_name in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name - and tbl1.unit_name is not null + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null ) v1, - tbl3.vendor_name not in ( + ${table_tbl3}.vendor_name not in ( select - tbl1.unit_name + ${table_tbl1}.unit_name from - tbl2 - join tbl1 on tbl1.cur_unit_name = tbl2.org_name + ${table_tbl2} + join ${table_tbl1} on ${table_tbl1}.cur_unit_name = ${table_tbl2}.org_name where - tbl2.org_code = tbl3.acntm_name - and tbl1.unit_name is not null + ${table_tbl2}.org_code = ${table_tbl3}.acntm_name + and ${table_tbl1}.unit_name is not null ) v2 from - tbl3 order by 1,2,3,4,5; + ${table_tbl3} order by 1,2,3,4,5; """ } diff --git a/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy index 3557475cdd22c2..cc287f1a6f791d 100644 --- a/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy +++ b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy @@ -17,17 +17,22 @@ suite("right_semi_mark_join") { - sql "drop table if exists tbl1;" - sql "drop table if exists tbl2;" - sql "drop table if exists tbl3;" + String suiteName = "right_semi_mark_join" + String table_tbl1 = "${suiteName}_table_tbl1" + String table_tbl2 = "${suiteName}_table_tbl2" + String table_tbl3 = "${suiteName}_table_tbl3" + + sql "drop table if exists ${table_tbl1};" + sql "drop table if exists ${table_tbl2};" + sql "drop table if exists ${table_tbl3};" sql """ - create table tbl1 (pk int, col1 bigint, col2 bigint) engine = olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 properties("replication_num" = "1"); + create table ${table_tbl1} (pk int, col1 bigint, col2 bigint) engine = olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 properties("replication_num" = "1"); """ sql """ insert into - tbl1(pk, col1, col2) + ${table_tbl1}(pk, col1, col2) values (0, null, 18332), (1, 788547, null), (2, 4644959, -56), (3, 8364628, 72), (4, null, -5581), (5, 2344024, -62), (6, -2689177, 22979), (7, 1320, -41), (8, null, -54), (9, 12, -6236), @@ -35,7 +40,7 @@ suite("right_semi_mark_join") { """ sql """ - create table tbl2 ( + create table ${table_tbl2} ( pk int, col1 bigint, col2 bigint ) engine = olap distributed by hash(pk) buckets 4 @@ -44,14 +49,14 @@ suite("right_semi_mark_join") { sql """ insert into - tbl2(pk, col1, col2) + ${table_tbl2}(pk, col1, col2) values (0, 108, 31161), (1, 1479175, 6764263), (2, 110, 25), (3, 110, -18656), (4, null, -51), (5, 21, 27), (6, -6950217, 1585978), (7, null, null), (8, null, 3453467), (9, null, -6701140); """ sql """ - create table tbl3 ( + create table ${table_tbl3} ( pk int, col1 bigint, col2 bigint, col3 bigint ) engine = olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 @@ -60,7 +65,7 @@ suite("right_semi_mark_join") { sql """ insert into - tbl3(pk, col1, col2) + ${table_tbl3}(pk, col1, col2) values (0, 55, -58), (1, 49, 29792), (2, 95, 32361), (3, 31243, -27428), (4, -27400, null), (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, -21951), (9, 13186, 24466), @@ -87,13 +92,13 @@ suite("right_semi_mark_join") { T1.pk AS C1, T1.col2 AS C2 FROM - tbl1 AS T1 FULL - OUTER JOIN tbl2 AS T2 ON T1.col1 <= T2.col2 + ${table_tbl1} AS T1 FULL + OUTER JOIN ${table_tbl2} AS T2 ON T1.col1 <= T2.col2 OR T2.col1 IN ( SELECT T3.col2 FROM - tbl3 AS T3 + ${table_tbl3} AS T3 WHERE T2.col2 = T3.col1 ) From 05c0c42ff07f7d001808c0350739221eeee7d5ad Mon Sep 17 00:00:00 2001 From: Hu Shenggang Date: Thu, 22 May 2025 15:48:13 +0800 Subject: [PATCH 4/4] fix compile error --- be/src/pipeline/exec/join/process_hash_table_probe_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 40c2f03749cf2f..b88d1f42541d34 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -527,7 +527,8 @@ Status ProcessHashTableProbe::do_mark_join_conjuncts(vectorized::Blo } auto result_column_id = output_block->columns(); - output_block->insert({std::move(filter_column), std::make_shared(), ""}); + output_block->insert( + {std::move(filter_column), std::make_shared(), ""}); return vectorized::Block::filter_block(output_block, result_column_id, result_column_id); } }