From 21aa4cdc08faec40d2c81ed1ed38eda1dba28be7 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 12 Dec 2024 07:47:47 -0800 Subject: [PATCH] ORC-1813: [C++] Fix has_null forward compatibility close: #2079 relate pr: #2055 Introduce fallback logic in the C++ reader to set hasNull to true when the field is missing, similar to the Java implementation. The Java implementation includes the following logic: ```java if (stats.hasHasNull()) { hasNull = stats.getHasNull(); } else { hasNull = true; } ``` In contrast, the C++ implementation directly uses the has_null value without any fallback logic: ```c++ ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); stats_.setHasNull(pb.has_null()); } ``` We encountered an issue with the C++ implementation of the ORC reader when handling ORC files written with version 0.12. Specifically, files written in this version do not include the hasNull field in the column statistics metadata. While the Java implementation of the ORC reader handles this gracefully by defaulting hasNull to true when the field is absent, the C++ implementation does not handle this scenario correctly. **This issue prevents predicates like IS NULL from being pushed down to the ORC reader!!! As a result, all rows in the file are filtered out, leading to incorrect query results :(** I have tested this using [Doris](https://github.com/apache/doris) external pipeline: https://github.com/apache/doris/pull/45104 https://github.com/apache/doris-thirdparty/pull/259 No Closes #2082 from suxiaogang223/fix_has_null. Authored-by: Socrates Signed-off-by: Dongjoon Hyun --- c++/src/Statistics.cc | 20 ++++++------- c++/test/TestStripeIndexStatistics.cc | 13 +++++---- tools/test/TestFileStatistics.cc | 42 +++++++++++++-------------- 3 files changed, 38 insertions(+), 37 deletions(-) diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 8ed29d0e7c..e8aeb51830 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -181,13 +181,13 @@ namespace orc { ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_binary_statistics() && statContext.correctStats) { _stats.setHasTotalLength(pb.binary_statistics().has_sum()); _stats.setTotalLength(static_cast(pb.binary_statistics().sum())); @@ -197,7 +197,7 @@ namespace orc { BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_bucket_statistics() && statContext.correctStats) { _hasCount = true; _trueCount = pb.bucket_statistics().count(0); @@ -210,7 +210,7 @@ namespace orc { DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_date_statistics() || !statContext.correctStats) { // hasMinimum_ is false by default; // hasMaximum_ is false by default; @@ -227,7 +227,7 @@ namespace orc { DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_decimal_statistics() && statContext.correctStats) { const proto::DecimalStatistics& stats = pb.decimal_statistics(); _stats.setHasMinimum(stats.has_minimum()); @@ -242,7 +242,7 @@ namespace orc { DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_double_statistics()) { _stats.setMinimum(0); _stats.setMaximum(0); @@ -261,7 +261,7 @@ namespace orc { IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_int_statistics()) { _stats.setMinimum(0); _stats.setMaximum(0); @@ -281,7 +281,7 @@ namespace orc { StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_string_statistics() || !statContext.correctStats) { _stats.setTotalLength(0); } else { @@ -299,7 +299,7 @@ namespace orc { TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_timestamp_statistics() || !statContext.correctStats) { _stats.setMinimum(0); _stats.setMaximum(0); @@ -365,7 +365,7 @@ namespace orc { CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl( const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_collection_statistics()) { _stats.setMinimum(0); _stats.setMaximum(0); diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index 34a4649c35..85fdb80e4f 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -46,18 +46,19 @@ namespace orc { intColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(1, 0)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum: 2000\nSum: " + "2001000\n", intColStats->toString()); intColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(1, 1)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nSum: " + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nSum: " "6001000\n", intColStats->toString()); intColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(1, 2)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nSum: " + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nSum: " "10001000\n", intColStats->toString()); @@ -65,20 +66,20 @@ namespace orc { stringColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(2, 0)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " "7892\n", stringColStats->toString()); stringColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(2, 1)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nTotal " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nTotal " "length: " "8000\n", stringColStats->toString()); stringColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(2, 2)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nTotal " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nTotal " "length: " "8000\n", stringColStats->toString()); diff --git a/tools/test/TestFileStatistics.cc b/tools/test/TestFileStatistics.cc index 1b2a396dc4..051f2fb3f9 100644 --- a/tools/test/TestFileStatistics.cc +++ b/tools/test/TestFileStatistics.cc @@ -30,12 +30,12 @@ TEST(TestFileStatistics, testNormal) { const std::string expected = "File " + file + " has 3 columns\n" "*** Column 0 ***\n" - "Column has 6000 values and has null value: no\n" + "Column has 6000 values and has null value: yes\n" "\n" "*** Column 1 ***\n" "Data type: Integer\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1\n" "Maximum: 6000\n" "Sum: 18003000\n" @@ -43,7 +43,7 @@ TEST(TestFileStatistics, testNormal) { "*** Column 2 ***\n" "Data type: String\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1000\n" "Maximum: 9a\n" "Total length: 23892\n" @@ -54,12 +54,12 @@ TEST(TestFileStatistics, testNormal) { "*** Stripe 0 ***\n" "\n" "--- Column 0 ---\n" - "Column has 6000 values and has null value: no\n" + "Column has 6000 values and has null value: yes\n" "\n" "--- Column 1 ---\n" "Data type: Integer\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1\n" "Maximum: 6000\n" "Sum: 18003000\n" @@ -67,7 +67,7 @@ TEST(TestFileStatistics, testNormal) { "--- Column 2 ---\n" "Data type: String\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1000\n" "Maximum: 9a\n" "Total length: 23892\n\n"; @@ -86,12 +86,12 @@ TEST(TestFileStatistics, testOptions) { const std::string expected = "File " + file + " has 3 columns\n" "*** Column 0 ***\n" - "Column has 6000 values and has null value: no\n" + "Column has 6000 values and has null value: yes\n" "\n" "*** Column 1 ***\n" "Data type: Integer\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1\n" "Maximum: 6000\n" "Sum: 18003000\n" @@ -99,7 +99,7 @@ TEST(TestFileStatistics, testOptions) { "*** Column 2 ***\n" "Data type: String\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1000\n" "Maximum: 9a\n" "Total length: 23892\n" @@ -110,21 +110,21 @@ TEST(TestFileStatistics, testOptions) { "*** Stripe 0 ***\n" "\n" "--- Column 0 ---\n" - "Column has 6000 values and has null value: no\n" + "Column has 6000 values and has null value: yes\n" "\n" "--- RowIndex 0 ---\n" - "Column has 2000 values and has null value: no\n" + "Column has 2000 values and has null value: yes\n" "\n" "--- RowIndex 1 ---\n" - "Column has 2000 values and has null value: no\n" + "Column has 2000 values and has null value: yes\n" "\n" "--- RowIndex 2 ---\n" - "Column has 2000 values and has null value: no\n" + "Column has 2000 values and has null value: yes\n" "\n" "--- Column 1 ---\n" "Data type: Integer\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1\n" "Maximum: 6000\n" "Sum: 18003000\n" @@ -132,7 +132,7 @@ TEST(TestFileStatistics, testOptions) { "--- RowIndex 0 ---\n" "Data type: Integer\n" "Values: 2000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1\n" "Maximum: 2000\n" "Sum: 2001000\n" @@ -140,7 +140,7 @@ TEST(TestFileStatistics, testOptions) { "--- RowIndex 1 ---\n" "Data type: Integer\n" "Values: 2000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 2001\n" "Maximum: 4000\n" "Sum: 6001000\n" @@ -148,7 +148,7 @@ TEST(TestFileStatistics, testOptions) { "--- RowIndex 2 ---\n" "Data type: Integer\n" "Values: 2000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 4001\n" "Maximum: 6000\n" "Sum: 10001000\n" @@ -156,7 +156,7 @@ TEST(TestFileStatistics, testOptions) { "--- Column 2 ---\n" "Data type: String\n" "Values: 6000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1000\n" "Maximum: 9a\n" "Total length: 23892\n" @@ -164,7 +164,7 @@ TEST(TestFileStatistics, testOptions) { "--- RowIndex 0 ---\n" "Data type: String\n" "Values: 2000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 1000\n" "Maximum: 9a\n" "Total length: 7892\n" @@ -172,7 +172,7 @@ TEST(TestFileStatistics, testOptions) { "--- RowIndex 1 ---\n" "Data type: String\n" "Values: 2000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 2001\n" "Maximum: 4000\n" "Total length: 8000\n" @@ -180,7 +180,7 @@ TEST(TestFileStatistics, testOptions) { "--- RowIndex 2 ---\n" "Data type: String\n" "Values: 2000\n" - "Has null: no\n" + "Has null: yes\n" "Minimum: 4001\n" "Maximum: 6000\n" "Total length: 8000\n\n";