From f57a9bfd8c1d22114203ff8115b7e6602dfecdd6 Mon Sep 17 00:00:00 2001 From: daidai Date: Mon, 1 Dec 2025 17:07:20 +0800 Subject: [PATCH] [fix](parquet)fix hudi parquet read hoodie.datasource.write.drop.partition.columns prop table cause be core. (#58532) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Related PR: #57771 Problem Summary: Fixed a core issue when reading Hudi Parquet format tables with the `hoodie.properties` `hoodie.datasource.write.drop.partition.columns=false`. ``` *** SIGSEGV address not mapped to object (@0x18) received by PID 12234 (TID 38368 OR 0x7f0bd279e640) from PID 24; stack trace: *** 11:01:31  0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /home/zcp/repo_center/doris_master/doris/be/src/common/signal_handler.h:420 11:01:31  1# PosixSignals::chained_handler(int, siginfo*, void*) [clone .part.0] in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so 11:01:31  2# JVM_handle_linux_signal in /usr/lib/jvm/java-17-openjdk-amd64/lib/server/libjvm.so 11:01:31  3# 0x00007F18963FB520 in /lib/x86_64-linux-gnu/libc.so.6 11:01:31  4# std::_Function_handler >, std::allocator > > > const&, doris::segment_v2::RowRanges*)::$_1>::_M_invoke(std::_Any_data const&, doris::vectorized::ParquetPredicate::PageIndexStat**&&, int&&) at /usr/local/ldb-toolchain-v0.26/bin/../lib/gcc/x86_64-pc-linux-gnu/15/include/g++-v15/bits/std_function.h:292 11:01:31  5# doris::InListPredicateBase<(doris::PrimitiveType)2, (doris::PredicateType)7, doris::HybridSet<(doris::PrimitiveType)2, doris::FixedContainer, doris::vectorized::PredicateColumnType<(doris::PrimitiveType)2> > >::evaluate_and(doris::vectorized::ParquetPredicate::CachedPageIndexStat*, doris::segment_v2::RowRanges*) const at /home/zcp/repo_center/doris_master/doris/be/src/olap/in_list_predicate.h:345 11:01:31  6# doris::AndBlockColumnPredicate::evaluate_and(doris::vectorized::ParquetPredicate::CachedPageIndexStat*, doris::segment_v2::RowRanges*) const at /home/zcp/repo_center/doris_master/doris/be/src/olap/block_column_predicate.cpp:148 11:01:31  7# doris::vectorized::ParquetReader::_process_page_index_filter(tparquet::RowGroup const&, doris::vectorized::RowGroupReader::RowGroupIndex const&, std::vector >, std::allocator > > > const&, doris::segment_v2::RowRanges*) in /mnt/hdd01/ci/doris-deploy-master-local/be/lib/doris_be 11:01:31  8# doris::vectorized::ParquetReader::_process_min_max_bloom_filter(doris::vectorized::RowGroupReader::RowGroupIndex const&, tparquet::RowGroup const&, std::vector >, std::allocator > > > const&, doris::segment_v2::RowRanges*) at /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/parquet/vparquet_reader.cpp:1082 11:01:31  9# doris::vectorized::ParquetReader::_next_row_group_reader() in /mnt/hdd01/ci/doris-deploy-master-local/be/lib/doris_be 11:01:31  10# doris::vectorized::ParquetReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) at /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/parquet/vparquet_reader.cpp:598 11:01:31  11# doris::vectorized::HudiReader::get_next_block_inner(doris::vectorized::Block*, unsigned long*, bool*) at /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/table/hudi_reader.cpp:29 11:01:31  12# doris::vectorized::TableFormatReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) at /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/table/table_format_reader.h:82 11:01:31  13# doris::vectorized::FileScanner::_get_block_wrapped(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/scan/file_scanner.cpp:472 ``` --- .../exec/format/parquet/vparquet_reader.cpp | 15 ++++++++++++++- .../vec/exec/format/parquet/vparquet_reader.h | 3 ++- .../exec/format/parquet/parquet_expr_test.cpp | 19 +++++++++++-------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index d4fee37a33bc1b..fa37837ecf6962 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -365,6 +365,7 @@ Status ParquetReader::init_reader( if (required_file_columns.contains(name)) { _read_file_columns.emplace_back(name); _read_table_columns.emplace_back(required_file_columns[name]); + _read_table_columns_set.insert(required_file_columns[name]); } } // build column predicates for column lazy read @@ -373,7 +374,13 @@ Status ParquetReader::init_reader( } bool ParquetReader::_exists_in_file(const VSlotRef* slot_ref) const { - return _table_info_node_ptr->children_column_exists(slot_ref->expr_name()); + // `_read_table_columns_set` is used to ensure that only columns actually read are subject to min-max filtering. + // This primarily handles cases where partition columns also exist in a file. The reason it's not modified + // in `_table_info_node_ptr` is that Iceberg、Hudi has inconsistent requirements for this node; + // Iceberg partition evolution need read partition columns from a file. + // hudi set `hoodie.datasource.write.drop.partition.columns=false` not need read partition columns from a file. + return _table_info_node_ptr->children_column_exists(slot_ref->expr_name()) && + _read_table_columns_set.contains(slot_ref->expr_name()); } bool ParquetReader::_type_matches(const VSlotRef* slot_ref) const { @@ -999,6 +1006,12 @@ Status ParquetReader::_process_page_index_filter( // complex type, not support page index yet. return false; } + if (!_col_offsets.contains(parquet_col_id)) { + // If the file contains partition columns and the query applies filters on those + // partition columns, then reading the page index is unnecessary. + return false; + } + auto& column_chunk = row_group.columns[parquet_col_id]; if (column_chunk.column_index_length == 0 || column_chunk.offset_index_length == 0) { // column no page index. diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index 2aee8fe2a820c1..0b15792da27972 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -299,7 +299,8 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { //sequence in file, need to read std::vector _read_table_columns; std::vector _read_file_columns; - + // The set of file columns to be read; only columns within this set will be filtered using the min-max predicate. + std::set _read_table_columns_set; // Deleted rows will be marked by Iceberg/Paimon. So we should filter deleted rows when reading it. const std::vector* _delete_rows = nullptr; int64_t _delete_rows_index = 0; diff --git a/be/test/vec/exec/format/parquet/parquet_expr_test.cpp b/be/test/vec/exec/format/parquet/parquet_expr_test.cpp index 7bf7d73808d68e..c3e6ae3b3e5277 100644 --- a/be/test/vec/exec/format/parquet/parquet_expr_test.cpp +++ b/be/test/vec/exec/format/parquet/parquet_expr_test.cpp @@ -401,6 +401,7 @@ TEST_F(ParquetExprTest, test_ne) { auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); + slot_ref->set_expr_name("int32_all_null_col"); fn_eq->add_child(slot_ref); fn_eq->add_child(const_val); fn_eq->_node_type = TExprNodeType::BINARY_PRED; @@ -419,7 +420,7 @@ TEST_F(ParquetExprTest, test_eq) { auto fn_eq = MockFnCall::create("eq"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); - + slot_ref->set_expr_name("int32_all_null_col"); fn_eq->add_child(slot_ref); fn_eq->add_child(const_val); fn_eq->_node_type = TExprNodeType::BINARY_PRED; @@ -439,7 +440,7 @@ TEST_F(ParquetExprTest, test_le) { auto fn_eq = MockFnCall::create("le"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); - + slot_ref->set_expr_name("int32_all_null_col"); fn_eq->add_child(slot_ref); fn_eq->add_child(const_val); fn_eq->_node_type = TExprNodeType::BINARY_PRED; @@ -459,7 +460,7 @@ TEST_F(ParquetExprTest, test_ge) { auto fn_eq = MockFnCall::create("ge"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); - + slot_ref->set_expr_name("int32_all_null_col"); fn_eq->add_child(slot_ref); fn_eq->add_child(const_val); fn_eq->_node_type = TExprNodeType::BINARY_PRED; @@ -479,7 +480,7 @@ TEST_F(ParquetExprTest, test_gt) { auto fn_eq = MockFnCall::create("gt"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); - + slot_ref->set_expr_name("int32_all_null_col"); fn_eq->add_child(slot_ref); fn_eq->add_child(const_val); fn_eq->_node_type = TExprNodeType::BINARY_PRED; @@ -499,7 +500,7 @@ TEST_F(ParquetExprTest, test_lt) { auto fn_eq = MockFnCall::create("lt"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); - + slot_ref->set_expr_name("int32_all_null_col"); fn_eq->add_child(slot_ref); fn_eq->add_child(const_val); fn_eq->_node_type = TExprNodeType::BINARY_PRED; @@ -1172,6 +1173,7 @@ TEST_F(ParquetExprTest, test_expr_push_down_and) { auto fn_le = MockFnCall::create("le"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({10000000002})); + slot_ref->set_expr_name("int64_col"); fn_le->add_child(slot_ref); fn_le->add_child(const_val); fn_le->_node_type = TExprNodeType::BINARY_PRED; @@ -1191,7 +1193,7 @@ TEST_F(ParquetExprTest, test_expr_push_down_and) { auto fn_le = MockFnCall::create("gt"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({100})); - + slot_ref->set_expr_name("int64_col"); fn_le->add_child(slot_ref); fn_le->add_child(const_val); fn_le->_node_type = TExprNodeType::BINARY_PRED; @@ -1211,7 +1213,7 @@ TEST_F(ParquetExprTest, test_expr_push_down_and) { auto fn_le = MockFnCall::create("ge"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({900})); - + slot_ref->set_expr_name("int64_col"); fn_le->add_child(slot_ref); fn_le->add_child(const_val); fn_le->_node_type = TExprNodeType::BINARY_PRED; @@ -1273,7 +1275,7 @@ TEST_F(ParquetExprTest, test_expr_push_down_or_string) { auto fn_lt = MockFnCall::create("lt"); auto const_val = std::make_shared( ColumnHelper::create_column_with_name({"name_1"})); - + slot_ref->set_expr_name("string_col"); fn_lt->add_child(slot_ref); fn_lt->add_child(const_val); fn_lt->_node_type = TExprNodeType::BINARY_PRED; @@ -1292,6 +1294,7 @@ TEST_F(ParquetExprTest, test_expr_push_down_or_string) { { auto slot_ref = std::make_shared(5, std::make_shared()); auto fn_is_not_null = MockFnCall::create("is_not_null_pred"); + slot_ref->set_expr_name("string_col"); fn_is_not_null->add_child(slot_ref); fn_is_not_null->_node_type = TExprNodeType::FUNCTION_CALL;