Skip to content

Commit

Permalink
ORC-1813: [C++] Fix has_null forward compatibility
Browse files Browse the repository at this point in the history
close: apache#2079
relate pr: apache#2055
Introduce fallback logic in the C++ reader to set hasNull to true when the field is missing, similar to the Java implementation.
The Java implementation includes the following logic:
```java
if (stats.hasHasNull()) {
    hasNull = stats.getHasNull();
} else {
    hasNull = true;
}
```
In contrast, the C++ implementation directly uses the has_null value without any fallback logic:
```c++
ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
    stats_.setNumberOfValues(pb.number_of_values());
    stats_.setHasNull(pb.has_null());
}
```
We encountered an issue with the C++ implementation of the ORC reader when handling ORC files written with version 0.12. Specifically, files written in this version do not include the hasNull field in the column statistics metadata. While the Java implementation of the ORC reader handles this gracefully by defaulting hasNull to true when the field is absent, the C++ implementation does not handle this scenario correctly.
**This issue prevents predicates like IS NULL from being pushed down to the ORC reader!!! As a result, all rows in the file are filtered out, leading to incorrect query results :(**
I have tested this using [Doris](https://github.com/apache/doris) external pipeline:
apache/doris#45104
apache/doris-thirdparty#259
No

Closes apache#2082 from suxiaogang223/fix_has_null.

Authored-by: Socrates <suxiaogang223@icloud.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
suxiaogang223 committed Dec 16, 2024
1 parent 24b8e5b commit 21aa4cd
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 37 deletions.
20 changes: 10 additions & 10 deletions c++/src/Statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,13 @@ namespace orc {

ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
}

BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_binary_statistics() && statContext.correctStats) {
_stats.setHasTotalLength(pb.binary_statistics().has_sum());
_stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
Expand All @@ -197,7 +197,7 @@ namespace orc {
BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_bucket_statistics() && statContext.correctStats) {
_hasCount = true;
_trueCount = pb.bucket_statistics().count(0);
Expand All @@ -210,7 +210,7 @@ namespace orc {
DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_date_statistics() || !statContext.correctStats) {
// hasMinimum_ is false by default;
// hasMaximum_ is false by default;
Expand All @@ -227,7 +227,7 @@ namespace orc {
DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_decimal_statistics() && statContext.correctStats) {
const proto::DecimalStatistics& stats = pb.decimal_statistics();
_stats.setHasMinimum(stats.has_minimum());
Expand All @@ -242,7 +242,7 @@ namespace orc {

DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_double_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand All @@ -261,7 +261,7 @@ namespace orc {

IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_int_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand All @@ -281,7 +281,7 @@ namespace orc {
StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_string_statistics() || !statContext.correctStats) {
_stats.setTotalLength(0);
} else {
Expand All @@ -299,7 +299,7 @@ namespace orc {
TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_timestamp_statistics() || !statContext.correctStats) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand Down Expand Up @@ -365,7 +365,7 @@ namespace orc {
CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
_stats.setHasNull(pb.has_null());
_stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_collection_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand Down
13 changes: 7 additions & 6 deletions c++/test/TestStripeIndexStatistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,39 +46,40 @@ namespace orc {
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeStats->getRowIndexStatistics(1, 0));
EXPECT_EQ(
"Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n",
"Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum: 2000\nSum: "
"2001000\n",
intColStats->toString());
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeStats->getRowIndexStatistics(1, 1));
EXPECT_EQ(
"Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nSum: "
"Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nSum: "
"6001000\n",
intColStats->toString());
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeStats->getRowIndexStatistics(1, 2));
EXPECT_EQ(
"Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nSum: "
"Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nSum: "
"10001000\n",
intColStats->toString());

const orc::StringColumnStatistics* stringColStats;
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeStats->getRowIndexStatistics(2, 0));
EXPECT_EQ(
"Data type: String\nValues: 2000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: "
"Data type: String\nValues: 2000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: "
"7892\n",
stringColStats->toString());
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeStats->getRowIndexStatistics(2, 1));
EXPECT_EQ(
"Data type: String\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nTotal "
"Data type: String\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nTotal "
"length: "
"8000\n",
stringColStats->toString());
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeStats->getRowIndexStatistics(2, 2));
EXPECT_EQ(
"Data type: String\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nTotal "
"Data type: String\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nTotal "
"length: "
"8000\n",
stringColStats->toString());
Expand Down
42 changes: 21 additions & 21 deletions tools/test/TestFileStatistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,20 @@ TEST(TestFileStatistics, testNormal) {
const std::string expected = "File " + file +
" has 3 columns\n"
"*** Column 0 ***\n"
"Column has 6000 values and has null value: no\n"
"Column has 6000 values and has null value: yes\n"
"\n"
"*** Column 1 ***\n"
"Data type: Integer\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
"\n"
"*** Column 2 ***\n"
"Data type: String\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n"
Expand All @@ -54,20 +54,20 @@ TEST(TestFileStatistics, testNormal) {
"*** Stripe 0 ***\n"
"\n"
"--- Column 0 ---\n"
"Column has 6000 values and has null value: no\n"
"Column has 6000 values and has null value: yes\n"
"\n"
"--- Column 1 ---\n"
"Data type: Integer\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
"\n"
"--- Column 2 ---\n"
"Data type: String\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n\n";
Expand All @@ -86,20 +86,20 @@ TEST(TestFileStatistics, testOptions) {
const std::string expected = "File " + file +
" has 3 columns\n"
"*** Column 0 ***\n"
"Column has 6000 values and has null value: no\n"
"Column has 6000 values and has null value: yes\n"
"\n"
"*** Column 1 ***\n"
"Data type: Integer\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
"\n"
"*** Column 2 ***\n"
"Data type: String\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n"
Expand All @@ -110,77 +110,77 @@ TEST(TestFileStatistics, testOptions) {
"*** Stripe 0 ***\n"
"\n"
"--- Column 0 ---\n"
"Column has 6000 values and has null value: no\n"
"Column has 6000 values and has null value: yes\n"
"\n"
"--- RowIndex 0 ---\n"
"Column has 2000 values and has null value: no\n"
"Column has 2000 values and has null value: yes\n"
"\n"
"--- RowIndex 1 ---\n"
"Column has 2000 values and has null value: no\n"
"Column has 2000 values and has null value: yes\n"
"\n"
"--- RowIndex 2 ---\n"
"Column has 2000 values and has null value: no\n"
"Column has 2000 values and has null value: yes\n"
"\n"
"--- Column 1 ---\n"
"Data type: Integer\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
"\n"
"--- RowIndex 0 ---\n"
"Data type: Integer\n"
"Values: 2000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1\n"
"Maximum: 2000\n"
"Sum: 2001000\n"
"\n"
"--- RowIndex 1 ---\n"
"Data type: Integer\n"
"Values: 2000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 2001\n"
"Maximum: 4000\n"
"Sum: 6001000\n"
"\n"
"--- RowIndex 2 ---\n"
"Data type: Integer\n"
"Values: 2000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 4001\n"
"Maximum: 6000\n"
"Sum: 10001000\n"
"\n"
"--- Column 2 ---\n"
"Data type: String\n"
"Values: 6000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n"
"\n"
"--- RowIndex 0 ---\n"
"Data type: String\n"
"Values: 2000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 7892\n"
"\n"
"--- RowIndex 1 ---\n"
"Data type: String\n"
"Values: 2000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 2001\n"
"Maximum: 4000\n"
"Total length: 8000\n"
"\n"
"--- RowIndex 2 ---\n"
"Data type: String\n"
"Values: 2000\n"
"Has null: no\n"
"Has null: yes\n"
"Minimum: 4001\n"
"Maximum: 6000\n"
"Total length: 8000\n\n";
Expand Down

0 comments on commit 21aa4cd

Please sign in to comment.