From 3db0265d900d1b1c456feb46d2ddc611230af3f8 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 4 Sep 2025 14:02:08 +0800 Subject: [PATCH] [fix](hive)fix querying hive text table with NULL DEFINED AS '' (#55626) Problem Summary: This pull request improves the handling of empty string null formats and delimiter properties for Hive external tables, ensuring more robust and consistent behavior when parsing data. For hive text table like this: ```sql CREATE TABLE test_empty_null_defined_text ( id INT, name STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' NULL DEFINED AS '' STORED AS TEXTFILE; INSERT INTO TABLE test_empty_null_defined_text VALUES (1, 'Alice'), (2, NULL); ``` Query in Doris: ```sql select * from test_empty_null_defined_text; ``` Before Result: ```text +------+-------+ | id | name | +------+-------+ | 1 | Alice | | 2 | | +------+-------+ ``` After Result: ```text +------+-------+ | id | name | +------+-------+ | 1 | Alice | | 2 | NULL | +------+-------+ ``` --- .../serde_prop/some_serde_table.hql | 30 +++++++++++++ .../hive/HiveMetaStoreClientHelper.java | 8 ++-- .../doris/datasource/hive/HiveProperties.java | 15 +++---- .../hive/test_hive_serde_prop.out | 44 +++++++++++++++++++ .../hive/test_hive_serde_prop.groovy | 8 ++++ 5 files changed, 92 insertions(+), 13 deletions(-) diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index 0368547f8be224..81bdf03da8e6c4 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -170,3 +170,33 @@ INSERT INTO TABLE test_open_csv_standard_prop VALUES INSERT INTO TABLE test_open_csv_custom_prop VALUES (1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'), (2, 
'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"'); + +CREATE TABLE test_empty_null_format_text ( + id INT, + name STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +STORED AS TEXTFILE +TBLPROPERTIES ( + "serialization.null.format"="" +); + +INSERT INTO TABLE test_empty_null_format_text VALUES + (1, 'Alice'), + (2, NULL), + (3, ''); + +CREATE TABLE test_empty_null_defined_text ( + id INT, + name STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +NULL DEFINED AS '' +STORED AS TEXTFILE; + +INSERT INTO TABLE test_empty_null_defined_text VALUES + (1, 'Alice'), + (2, NULL), + (3, ''); \ No newline at end of file diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index eb63aa1e5410f1..38b689b46d75d3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -851,7 +851,7 @@ public static Optional getSerdeProperty(Table table, String key) { private static Optional firstNonNullable(String... values) { for (String value : values) { - if (!Strings.isNullOrEmpty(value)) { + if (value != null) { return Optional.of(value); } } @@ -872,8 +872,10 @@ public static String firstPresentOrDefault(String defaultValue, Optional * * @param altValue * The string containing a number. + * @param defValue + * The default value to return if altValue is null or empty. 
*/ - public static String getByte(String altValue) { + public static String getByte(String altValue, String defValue) { if (altValue != null && altValue.length() > 0) { try { return Character.toString((char) ((Byte.parseByte(altValue) + 256) % 256)); @@ -881,6 +883,6 @@ public static String getByte(String altValue) { return altValue.substring(0, 1); } } - return null; + return defValue; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java index bdc8e0cacd9c04..e1061887c64063 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java @@ -85,7 +85,7 @@ public static String getFieldDelimiter(Table table) { Optional fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER); Optional serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat)); + "", fieldDelim, serFormat), DEFAULT_FIELD_DELIMITER); } public static String getSeparatorChar(Table table) { @@ -97,13 +97,13 @@ public static String getSeparatorChar(Table table) { public static String getLineDelimiter(Table table) { Optional lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_LINE_DELIMITER, lineDelim)); + "", lineDelim), DEFAULT_LINE_DELIMITER); } public static String getMapKvDelimiter(Table table) { Optional mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_MAP_KV_DELIMITER, mapkvDelim)); + "", mapkvDelim), 
DEFAULT_MAP_KV_DELIMITER); } public static String getCollectionDelimiter(Table table) { @@ -112,18 +112,13 @@ public static String getCollectionDelimiter(Table table) { Optional collectionDelimHive3 = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_COLLECTION_DELIMITER_HIVE3); return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault( - DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3)); + "", collectionDelimHive2, collectionDelimHive3), DEFAULT_COLLECTION_DELIMITER); } public static Optional getEscapeDelimiter(Table table) { Optional escapeDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER); if (escapeDelim.isPresent()) { - String escape = HiveMetaStoreClientHelper.getByte(escapeDelim.get()); - if (escape != null) { - return Optional.of(escape); - } else { - return Optional.of(DEFAULT_ESCAPE_DELIMIER); - } + return Optional.of(HiveMetaStoreClientHelper.getByte(escapeDelim.get(), DEFAULT_ESCAPE_DELIMIER)); } return Optional.empty(); } diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index c2415c058f14f1..cda92c0519ad51 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -51,6 +51,28 @@ b 2.2 1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer 2 Jane,Smith 2020-05-20 "Project Manager" +-- !test_empty_null_format_text -- +1 Alice +2 \N +3 \N + +-- !test_empty_null_format_text2 -- +2 \N +3 \N + +-- !test_empty_null_format_text3 -- + +-- !test_empty_null_defined_text -- +1 Alice +2 \N +3 \N + +-- !test_empty_null_defined_text2 -- +2 \N +3 \N + +-- !test_empty_null_defined_text3 -- + -- !1 -- a 1.1 b 2.2 @@ -103,3 +125,25 @@ b 2.2 1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer 2 Jane,Smith FALSE 2020-05-20 
"Project Manager" +-- !test_empty_null_format_text -- +1 Alice +2 \N +3 \N + +-- !test_empty_null_format_text2 -- +2 \N +3 \N + +-- !test_empty_null_format_text3 -- + +-- !test_empty_null_defined_text -- +1 Alice +2 \N +3 \N + +-- !test_empty_null_defined_text2 -- +2 \N +3 \N + +-- !test_empty_null_defined_text3 -- + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index 52cdd25eb07b2c..d4bb051214d724 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -55,6 +55,14 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte qt_test_open_csv_default_prop """select * from ${catalog_name}.regression.test_open_csv_default_prop order by id;""" qt_test_open_csv_standard_prop """select * from ${catalog_name}.regression.test_open_csv_standard_prop order by id;""" qt_test_open_csv_custom_prop """select * from ${catalog_name}.regression.test_open_csv_custom_prop order by id;""" + + qt_test_empty_null_format_text """select * from ${catalog_name}.regression.test_empty_null_format_text order by id;""" + qt_test_empty_null_format_text2 """select * from ${catalog_name}.regression.test_empty_null_format_text where name is null order by id;""" + qt_test_empty_null_format_text3 """select * from ${catalog_name}.regression.test_empty_null_format_text where name = '' order by id;""" + + qt_test_empty_null_defined_text """select * from ${catalog_name}.regression.test_empty_null_defined_text order by id;""" + qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;""" + qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;""" } }