From 55e3edc0e63d94a4266f5e1bb96d84ad9c9620a3 Mon Sep 17 00:00:00 2001 From: Socrates Date: Sat, 8 Feb 2025 19:56:13 +0800 Subject: [PATCH] [fix](orc) dont't push down null aware predicate (#47625) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Related PR: #43255 Problem Summary: Example: ```sql CREATE TABLE table_a ( id INT, age INT ) STORED AS ORC; INSERT INTO table_a VALUES (1, null), (2, 18), (3, null), (4, 25); CREATE TABLE table_b ( id INT, age INT ) STORED AS ORC; INSERT INTO table_b VALUES (1, null), (2, null), (3, 1000000), (4, 100); ``` run sql ``` select * from table_a inner join table_b on table_a.age <=> table_b.age and table_b.id in (1,3); ``` When executing this SQL, the backend generates a runtime filter on the table_a side during the join operation, resulting in a condition like WHERE table_a.age IN (NULL, 1000000). It’s important to note that since <=> is a null-aware comparison operator, the IN predicate must also be null-aware. However, the ORC predicate pushdown API does not support null-aware IN predicates. As a result, our current approach ignores null values, leading to an empty result set for this query. To fix this bug, we’ve adjusted the logic so that predicates with null-aware comparisons are not pushed down, ensuring the correct result as follows: ```text +------+------+------+------+ | id | age | id | age | +------+------+------+------+ | 1 | NULL | 1 | NULL | | 3 | NULL | 1 | NULL | +------+------+------+------+ ``` --- be/src/vec/exec/format/orc/vorc_reader.cpp | 5 ++++- .../orc_predicate/orc_predicate_table.hql | 22 +++++++++++++++++++ .../hive/test_hive_orc_predicate.out | 8 +++++++ .../hive/test_hive_orc_predicate.groovy | 2 ++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 26e41afe3c901a..c75f5a58c00815 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -716,7 +716,10 @@ bool OrcReader::_check_expr_can_push_down(const VExprSPtr& expr) { case TExprOpcode::NE: case TExprOpcode::FILTER_IN: case TExprOpcode::FILTER_NOT_IN: - return _check_slot_can_push_down(expr) && _check_rest_children_can_push_down(expr); + // can't push down if expr is null aware predicate + return expr->node_type() != TExprNodeType::NULL_AWARE_BINARY_PRED && + expr->node_type() != TExprNodeType::NULL_AWARE_IN_PRED && + _check_slot_can_push_down(expr) && _check_rest_children_can_push_down(expr); case TExprOpcode::INVALID_OPCODE: if (expr->node_type() == TExprNodeType::FUNCTION_CALL) { diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql index a946b25ff1af04..6a1c9dce521415 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql @@ -14,3 +14,25 @@ create table type_changed_table ( ) stored as orc; insert into type_changed_table values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'); ALTER TABLE type_changed_table CHANGE COLUMN id id STRING; + +CREATE TABLE table_a ( + id INT, + age INT +) STORED AS ORC; + +INSERT INTO table_a VALUES +(1, null), +(2, 18), +(3, null), +(4, 25); + +CREATE TABLE table_b ( + id INT, + age INT +) STORED AS ORC; + +INSERT INTO table_b VALUES +(1, null), +(2, null), +(3, 1000000), +(4, 100); diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out index f42bb629550c88..d943f3f80c63be 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out +++ b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out @@ -13,6 +13,10 @@ -- !predicate_changed_type3 -- 3 Charlie +-- !predicate_null_aware_equal_in_rt -- +1 \N 1 \N +3 \N 1 \N + -- !predicate_fixed_char1 -- 1 a @@ -27,3 +31,7 @@ -- !predicate_changed_type3 -- 3 Charlie +-- !predicate_null_aware_equal_in_rt -- +1 \N 1 \N +3 \N 1 \N + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy index 2dd647aa2c1d8e..726354e1408135 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy @@ -43,6 +43,8 @@ suite("test_hive_orc_predicate", "p0,external,hive,external_docker,external_dock qt_predicate_changed_type2 """ select * from type_changed_table where id = '2';""" qt_predicate_changed_type3 """ select * from type_changed_table where id = '3';""" + qt_predicate_null_aware_equal_in_rt """select * from table_a inner join table_b on table_a.age <=> table_b.age and table_b.id in (1,3);""" + sql """drop catalog if exists ${catalog_name}""" } finally { }