diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 0917ca7cd06fb2..dac6b76aecef54 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -932,10 +932,14 @@ Status StructColumnReader::read_column_data( size_t field_rows = 0; bool field_eof = false; - // Use root_node to get the correct child node for the reference column - // reference_file_column_name is the file column name, use get_children_node_by_file_column_name - auto ref_child_node = - root_node->get_children_node_by_file_column_name(reference_file_column_name); + // Use ConstNode for the reference column instead of looking up from root_node. + // The reference column is only used to get RL/DL information for determining the number + // of elements in the struct. It may be a column that has been dropped from the table + // schema (e.g., 'removed' field), but still exists in older parquet files. + // Since we don't need schema mapping for this column (we just need its RL/DL levels), + // using ConstNode is safe and avoids the issue where the reference column doesn't exist + // in root_node (because it was dropped from table schema). 
+ auto ref_child_node = TableSchemaChangeHelper::ConstNode::get_instance(); not_missing_orig_column_size = temp_column->size(); RETURN_IF_ERROR((*reference_reader) diff --git a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run24.sql b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run24.sql new file mode 100644 index 00000000000000..b5b19b1f15c717 --- /dev/null +++ b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run24.sql @@ -0,0 +1,151 @@ +use demo.test_db; + +DROP TABLE IF EXISTS test_struct_evolution; + +-- Test case for struct schema evolution bug +-- Bug scenario: When querying a struct field after schema evolution, if all queried fields are missing +-- in old Parquet files, the code tries to find a reference column from file schema. However, if the +-- reference column (e.g., 'removed') was dropped from table schema, accessing it via root_node will fail. +-- +-- Steps to reproduce: +-- 1. Create table with struct containing: removed, rename, keep, drop_and_add +-- 2. Insert data (creates Parquet file with these fields) +-- 3. DROP a_struct.removed - removes field from table schema +-- 4. DROP a_struct.drop_and_add then ADD a_struct.drop_and_add - gets new field ID +-- 5. ADD a_struct.added - adds new field +-- 6. 
Query struct_element(a_struct, 'drop_and_add') or struct_element(a_struct, 'added') +-- -> Before the fix this failed because all queried fields are missing in the old file, and the reference +-- column 'removed' doesn't exist in root_node (it was dropped from the table schema) + +-- Step 1: Create table (struct field types reconstructed as BIGINT - TODO confirm) +CREATE TABLE test_struct_evolution ( + id BIGINT, + a_struct STRUCT<removed: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT> +) USING ICEBERG +TBLPROPERTIES ('write.format.default' = 'parquet', 'format-version' = 2); + +-- Step 2: Insert data (creates Parquet file with original schema) +INSERT INTO test_struct_evolution +SELECT 1, named_struct('removed', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13); + +-- Step 3: Schema evolution - drop removed field +ALTER TABLE test_struct_evolution DROP COLUMN a_struct.removed; + +-- Step 4: Rename field (field ID stays the same) +ALTER TABLE test_struct_evolution RENAME COLUMN a_struct.rename TO renamed; + +-- Step 5: Drop and add drop_and_add (new field ID) +ALTER TABLE test_struct_evolution DROP COLUMN a_struct.drop_and_add; +ALTER TABLE test_struct_evolution ADD COLUMN a_struct.drop_and_add BIGINT; + +-- Step 6: Add new field +ALTER TABLE test_struct_evolution ADD COLUMN a_struct.added BIGINT; + +-- Step 7: Insert new data after schema evolution (creates new Parquet file) +INSERT INTO test_struct_evolution +SELECT 2, named_struct('renamed', 21, 'keep', 22, 'drop_and_add', 23, 'added', 24); + +-- Now the table contains two Parquet files: +-- - Old file: contains removed, rename, keep, drop_and_add (old field ID) +-- - New file: contains renamed, keep, drop_and_add (new field ID), added +-- +-- Querying struct_element(a_struct, 'drop_and_add') or struct_element(a_struct, 'added') +-- on the old file triggered the bug before the fix + +-- ============================================================ +-- ORC format test table (for completeness, though ORC doesn't have the same bug) +-- ============================================================ +DROP TABLE IF EXISTS test_struct_evolution_orc; + +-- Create ORC 
format table with same schema evolution scenario (struct field types reconstructed as BIGINT - TODO confirm) +CREATE TABLE test_struct_evolution_orc ( + id BIGINT, + a_struct STRUCT<removed: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT> +) USING ICEBERG +TBLPROPERTIES ('write.format.default' = 'orc', 'format-version' = 2); + +-- Insert initial data (creates ORC file with original schema) +INSERT INTO test_struct_evolution_orc +SELECT 1, named_struct('removed', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13); + +-- Schema evolution - same operations as Parquet table +ALTER TABLE test_struct_evolution_orc DROP COLUMN a_struct.removed; +ALTER TABLE test_struct_evolution_orc RENAME COLUMN a_struct.rename TO renamed; +ALTER TABLE test_struct_evolution_orc DROP COLUMN a_struct.drop_and_add; +ALTER TABLE test_struct_evolution_orc ADD COLUMN a_struct.drop_and_add BIGINT; +ALTER TABLE test_struct_evolution_orc ADD COLUMN a_struct.added BIGINT; + +-- Insert new data after schema evolution (creates new ORC file) +INSERT INTO test_struct_evolution_orc +SELECT 2, named_struct('renamed', 21, 'keep', 22, 'drop_and_add', 23, 'added', 24); + +-- ============================================================ +-- Case sensitivity test table (mixed case field names) +-- ============================================================ +DROP TABLE IF EXISTS test_struct_evolution_case; + +-- Test case for struct schema evolution with mixed case field names +-- This tests that case sensitivity is handled correctly when: +-- - Field names have mixed case (e.g., REMOVED, rename, keep, drop_and_add) +-- - Schema evolution operations are performed +-- - Struct fields are queried with different case patterns + +-- Step 1: Create table with mixed case field names (field types reconstructed as BIGINT - TODO confirm) +CREATE TABLE test_struct_evolution_case ( + id BIGINT, + a_struct STRUCT<REMOVED: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT> +) USING ICEBERG +TBLPROPERTIES ('write.format.default' = 'parquet', 'format-version' = 2); + +-- Step 2: Insert data (creates Parquet file with original schema) +INSERT INTO test_struct_evolution_case +SELECT 1, named_struct('REMOVED', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 
13); + +-- Step 3: Schema evolution - drop REMOVED field (uppercase) +ALTER TABLE test_struct_evolution_case DROP COLUMN a_struct.REMOVED; + +-- Step 4: Rename field (field ID stays the same) +ALTER TABLE test_struct_evolution_case RENAME COLUMN a_struct.rename TO renamed; + +-- Step 5: Drop and add drop_and_add with case change (new field ID) +-- Initial: drop_and_add (lowercase), after re-add: DROP_AND_ADD (uppercase) +ALTER TABLE test_struct_evolution_case DROP COLUMN a_struct.drop_and_add; +ALTER TABLE test_struct_evolution_case ADD COLUMN a_struct.DROP_AND_ADD BIGINT; + +-- Step 6: Add new field +ALTER TABLE test_struct_evolution_case ADD COLUMN a_struct.added BIGINT; + +-- Step 7: Insert new data after schema evolution (creates new Parquet file) +-- Note: Use DROP_AND_ADD (uppercase) in the new data +INSERT INTO test_struct_evolution_case +SELECT 2, named_struct('renamed', 21, 'keep', 22, 'DROP_AND_ADD', 23, 'added', 24); + +-- ============================================================ +-- ORC format test table with mixed case (for completeness) +-- ============================================================ +DROP TABLE IF EXISTS test_struct_evolution_case_orc; + +-- Create ORC format table with same schema evolution scenario and mixed case (field types reconstructed as BIGINT - TODO confirm) +CREATE TABLE test_struct_evolution_case_orc ( + id BIGINT, + a_struct STRUCT<REMOVED: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT> +) USING ICEBERG +TBLPROPERTIES ('write.format.default' = 'orc', 'format-version' = 2); + +-- Insert initial data (creates ORC file with original schema) +INSERT INTO test_struct_evolution_case_orc +SELECT 1, named_struct('REMOVED', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13); + +-- Schema evolution - same operations as Parquet table +ALTER TABLE test_struct_evolution_case_orc DROP COLUMN a_struct.REMOVED; +ALTER TABLE test_struct_evolution_case_orc RENAME COLUMN a_struct.rename TO renamed; +-- Drop and add with case change: drop_and_add (lowercase) -> DROP_AND_ADD (uppercase) +ALTER TABLE test_struct_evolution_case_orc DROP COLUMN 
a_struct.drop_and_add; +ALTER TABLE test_struct_evolution_case_orc ADD COLUMN a_struct.DROP_AND_ADD BIGINT; +ALTER TABLE test_struct_evolution_case_orc ADD COLUMN a_struct.added BIGINT; + +-- Insert new data after schema evolution (creates new ORC file) +-- Note: Use DROP_AND_ADD (uppercase) in the new data +INSERT INTO test_struct_evolution_case_orc +SELECT 2, named_struct('renamed', 21, 'keep', 22, 'DROP_AND_ADD', 23, 'added', 24); + diff --git a/regression-test/data/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.out b/regression-test/data/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.out new file mode 100644 index 00000000000000..a364316df427a9 --- /dev/null +++ b/regression-test/data/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.out @@ -0,0 +1,161 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !desc -- +id bigint Yes true \N +a_struct struct Yes true \N + +-- !select_all -- +1 {"renamed":11, "keep":12, "drop_and_add":null, "added":null} +2 {"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !struct_keep -- +12 +22 + +-- !struct_renamed -- +11 +21 + +-- !struct_drop_and_add -- +\N +23 + +-- !struct_added -- +\N +24 + +-- !struct_full -- +{"renamed":11, "keep":12, "drop_and_add":null, "added":null} +{"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !struct_predicate_1 -- +1 + +-- !struct_predicate_2 -- +1 + +-- !struct_predicate_3 -- +1 + +-- !struct_predicate_4 -- +2 + +-- !struct_multi -- +11 12 \N \N +21 22 23 24 + +-- !struct_distinct -- +11 \N 12 +21 24 22 + +-- !orc_desc -- +id bigint Yes true \N +a_struct struct Yes true \N + +-- !orc_select_all -- +1 {"renamed":11, "keep":12, "drop_and_add":null, "added":null} +2 {"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !orc_struct_keep -- +12 +22 + +-- !orc_struct_renamed -- +11 +21 + +-- !orc_struct_drop_and_add -- +\N +23 + +-- !orc_struct_added -- +\N +24 + +-- 
!orc_struct_full -- +{"renamed":11, "keep":12, "drop_and_add":null, "added":null} +{"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !orc_struct_multi -- +11 12 \N \N +21 22 23 24 + +-- !case_desc -- +id bigint Yes true \N +a_struct struct Yes true \N + +-- !case_select_all -- +1 {"renamed":11, "keep":12, "drop_and_add":null, "added":null} +2 {"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !case_struct_keep -- +12 +22 + +-- !case_struct_renamed -- +11 +21 + +-- !case_struct_drop_and_add -- +\N +23 + +-- !case_struct_added -- +\N +24 + +-- !case_struct_full -- +{"renamed":11, "keep":12, "drop_and_add":null, "added":null} +{"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !case_struct_predicate_1 -- +1 + +-- !case_struct_predicate_2 -- +1 + +-- !case_struct_predicate_3 -- +1 + +-- !case_struct_predicate_4 -- +2 + +-- !case_struct_multi -- +11 12 \N \N +21 22 23 24 + +-- !case_struct_distinct -- +11 \N 12 +21 24 22 + +-- !case_orc_desc -- +id bigint Yes true \N +a_struct struct Yes true \N + +-- !case_orc_select_all -- +1 {"renamed":11, "keep":12, "drop_and_add":null, "added":null} +2 {"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !case_orc_struct_keep -- +12 +22 + +-- !case_orc_struct_renamed -- +11 +21 + +-- !case_orc_struct_drop_and_add -- +\N +23 + +-- !case_orc_struct_added -- +\N +24 + +-- !case_orc_struct_full -- +{"renamed":11, "keep":12, "drop_and_add":null, "added":null} +{"renamed":21, "keep":22, "drop_and_add":23, "added":24} + +-- !case_orc_struct_multi -- +11 12 \N \N +21 22 23 24 + diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.groovy new file mode 100644 index 00000000000000..f4e95fa4fbd811 --- /dev/null +++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.groovy @@ -0,0 +1,194 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Test for struct field schema evolution in Iceberg tables. +// This test case verifies the fix for the bug where querying a struct field +// that was added after schema evolution fails when all queried columns are +// missing in the original file, and the reference column used for RL/DL +// was dropped from the table schema. 
+// +// Bug: "File column name 'removed' not found in struct children" +// Fix: Use ConstNode for the reference column when reading repetition/definition level (RL/DL) information +// +// Prerequisites: +// - Tables created by run24.sql in the docker iceberg preinstalled scripts + +suite("test_iceberg_struct_schema_evolution", "p0,external,doris,external_docker,external_docker_doris") { + + String enabled = context.config.otherConfigs.get("enableIcebergTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable iceberg test.") + return + } + + String rest_port = context.config.otherConfigs.get("iceberg_rest_uri_port") + String minio_port = context.config.otherConfigs.get("iceberg_minio_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String catalog_name = "test_iceberg_struct_schema_evolution" + + sql """drop catalog if exists ${catalog_name}""" + sql """ + CREATE CATALOG ${catalog_name} PROPERTIES ( + 'type'='iceberg', + 'iceberg.catalog.type'='rest', + 'uri' = 'http://${externalEnvIp}:${rest_port}', + "s3.access_key" = "admin", + "s3.secret_key" = "password", + "s3.endpoint" = "http://${externalEnvIp}:${minio_port}", + "s3.region" = "us-east-1" + );""" + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """use test_db;""" + + sql """set enable_fallback_to_original_planner=false;""" + + def table_name = "test_struct_evolution" + + // Verify the table schema after evolution + qt_desc """DESC ${table_name}""" + + // Test 1: Query all columns - should work + qt_select_all """SELECT * FROM ${table_name} ORDER BY id""" + + // Test 2: Query struct fields that exist in both old and new files + qt_struct_keep """SELECT struct_element(a_struct, 'keep') FROM ${table_name} ORDER BY id""" + qt_struct_renamed """SELECT struct_element(a_struct, 'renamed') FROM ${table_name} ORDER BY id""" + + // Test 3: Query struct field that was dropped and re-added (BUG FIX TEST) + 
// Before the fix, this query failed with one of: + // "Not support read struct 'a_struct' which columns are all missing" + // or "File column name 'removed' not found in struct children" + qt_struct_drop_and_add """SELECT struct_element(a_struct, 'drop_and_add') FROM ${table_name} ORDER BY id""" + + // Test 4: Query struct field that was newly added (BUG FIX TEST) + qt_struct_added """SELECT struct_element(a_struct, 'added') FROM ${table_name} ORDER BY id""" + + // Test 5: Query the entire struct column + qt_struct_full """SELECT a_struct FROM ${table_name} ORDER BY id""" + + // Test 6: Query with predicates on struct fields + qt_struct_predicate_1 """SELECT id FROM ${table_name} WHERE struct_element(a_struct, 'renamed') = 11 ORDER BY id""" + qt_struct_predicate_2 """SELECT id FROM ${table_name} WHERE struct_element(a_struct, 'drop_and_add') IS NULL ORDER BY id""" + qt_struct_predicate_3 """SELECT id FROM ${table_name} WHERE struct_element(a_struct, 'added') IS NULL ORDER BY id""" + qt_struct_predicate_4 """SELECT id FROM ${table_name} WHERE struct_element(a_struct, 'added') IS NOT NULL ORDER BY id""" + + // Test 7: Multiple struct fields in one query + qt_struct_multi """SELECT struct_element(a_struct, 'renamed'), struct_element(a_struct, 'keep'), struct_element(a_struct, 'drop_and_add'), struct_element(a_struct, 'added') FROM ${table_name} ORDER BY id""" + + // Test 8: DISTINCT query on struct fields + qt_struct_distinct """SELECT DISTINCT struct_element(a_struct, 'renamed'), struct_element(a_struct, 'added'), struct_element(a_struct, 'keep') FROM ${table_name} ORDER BY 1, 2, 3""" + + // ============================================================ + // Test with ORC format (for completeness) + // ============================================================ + def orc_table_name = "test_struct_evolution_orc" + + // Verify the ORC table schema after evolution + qt_orc_desc """DESC ${orc_table_name}""" + + // Test 1: Query all columns - should work + qt_orc_select_all """SELECT 
* FROM ${orc_table_name} ORDER BY id""" + + // Test 2: Query struct fields that exist in both old and new files + qt_orc_struct_keep """SELECT struct_element(a_struct, 'keep') FROM ${orc_table_name} ORDER BY id""" + qt_orc_struct_renamed """SELECT struct_element(a_struct, 'renamed') FROM ${orc_table_name} ORDER BY id""" + + // Test 3: Query struct field that was dropped and re-added + qt_orc_struct_drop_and_add """SELECT struct_element(a_struct, 'drop_and_add') FROM ${orc_table_name} ORDER BY id""" + + // Test 4: Query struct field that was newly added + qt_orc_struct_added """SELECT struct_element(a_struct, 'added') FROM ${orc_table_name} ORDER BY id""" + + // Test 5: Query the entire struct column + qt_orc_struct_full """SELECT a_struct FROM ${orc_table_name} ORDER BY id""" + + // Test 6: Multiple struct fields in one query + qt_orc_struct_multi """SELECT struct_element(a_struct, 'renamed'), struct_element(a_struct, 'keep'), struct_element(a_struct, 'drop_and_add'), struct_element(a_struct, 'added') FROM ${orc_table_name} ORDER BY id""" + + // ============================================================ + // Test with mixed case field names (case sensitivity test) + // ============================================================ + def case_table_name = "test_struct_evolution_case" + + // Verify the mixed-case table schema after evolution + qt_case_desc """DESC ${case_table_name}""" + + // Test 1: Query all columns - should work + qt_case_select_all """SELECT * FROM ${case_table_name} ORDER BY id""" + + // Test 2: Query struct fields that exist in both old and new files + qt_case_struct_keep """SELECT struct_element(a_struct, 'keep') FROM ${case_table_name} ORDER BY id""" + qt_case_struct_renamed """SELECT struct_element(a_struct, 'renamed') FROM ${case_table_name} ORDER BY id""" + + // Test 3: Query struct field that was dropped and re-added with case change + // Note: Even though run24.sql uses DROP_AND_ADD (uppercase), the system normalizes + // field names to 
lowercase, so we query with 'drop_and_add' (lowercase) + qt_case_struct_drop_and_add """SELECT struct_element(a_struct, 'drop_and_add') FROM ${case_table_name} ORDER BY id""" + + // Test 4: Query struct field that was newly added + qt_case_struct_added """SELECT struct_element(a_struct, 'added') FROM ${case_table_name} ORDER BY id""" + + // Test 5: Query the entire struct column + qt_case_struct_full """SELECT a_struct FROM ${case_table_name} ORDER BY id""" + + // Test 6: Query with predicates on struct fields + qt_case_struct_predicate_1 """SELECT id FROM ${case_table_name} WHERE struct_element(a_struct, 'renamed') = 11 ORDER BY id""" + qt_case_struct_predicate_2 """SELECT id FROM ${case_table_name} WHERE struct_element(a_struct, 'drop_and_add') IS NULL ORDER BY id""" + qt_case_struct_predicate_3 """SELECT id FROM ${case_table_name} WHERE struct_element(a_struct, 'added') IS NULL ORDER BY id""" + qt_case_struct_predicate_4 """SELECT id FROM ${case_table_name} WHERE struct_element(a_struct, 'added') IS NOT NULL ORDER BY id""" + + // Test 7: Multiple struct fields in one query + qt_case_struct_multi """SELECT struct_element(a_struct, 'renamed'), struct_element(a_struct, 'keep'), struct_element(a_struct, 'drop_and_add'), struct_element(a_struct, 'added') FROM ${case_table_name} ORDER BY id""" + + // Test 8: DISTINCT query on struct fields + qt_case_struct_distinct """SELECT DISTINCT struct_element(a_struct, 'renamed'), struct_element(a_struct, 'added'), struct_element(a_struct, 'keep') FROM ${case_table_name} ORDER BY 1, 2, 3""" + + // ============================================================ + // Test with ORC format and mixed case field names + // ============================================================ + def case_orc_table_name = "test_struct_evolution_case_orc" + + // Verify the ORC mixed-case table schema after evolution + qt_case_orc_desc """DESC ${case_orc_table_name}""" + + // Test 1: Query all columns - should work + qt_case_orc_select_all """SELECT * FROM 
${case_orc_table_name} ORDER BY id""" + + // Test 2: Query struct fields that exist in both old and new files + qt_case_orc_struct_keep """SELECT struct_element(a_struct, 'keep') FROM ${case_orc_table_name} ORDER BY id""" + qt_case_orc_struct_renamed """SELECT struct_element(a_struct, 'renamed') FROM ${case_orc_table_name} ORDER BY id""" + + // Test 3: Query struct field that was dropped and re-added with case change + // Note: Even though run24.sql uses DROP_AND_ADD (uppercase), the system normalizes + // field names to lowercase, so we query with 'drop_and_add' (lowercase) + qt_case_orc_struct_drop_and_add """SELECT struct_element(a_struct, 'drop_and_add') FROM ${case_orc_table_name} ORDER BY id""" + + // Test 4: Query struct field that was newly added + qt_case_orc_struct_added """SELECT struct_element(a_struct, 'added') FROM ${case_orc_table_name} ORDER BY id""" + + // Test 5: Query the entire struct column + qt_case_orc_struct_full """SELECT a_struct FROM ${case_orc_table_name} ORDER BY id""" + + // Test 6: Multiple struct fields in one query + qt_case_orc_struct_multi """SELECT struct_element(a_struct, 'renamed'), struct_element(a_struct, 'keep'), struct_element(a_struct, 'drop_and_add'), struct_element(a_struct, 'added') FROM ${case_orc_table_name} ORDER BY id""" + + // Clean up: drop the catalog created by this suite + sql """drop catalog if exists ${catalog_name}""" +}