Skip to content

Commit 5c20cef

Browse files
author
Gabor Szadovszky
committed
PARQUET-1217: Incorrect handling of missing values in Statistics
In parquet-format every value in Statistics is optional while parquet-mr does not properly handle these scenarios: - null_count is set but min/max or min_value/max_value are not: filtering may fail with NPE or incorrect filtering occurs fix: check if min/max is set before comparing to the related values - null_count is not set: filtering handles null_count as if it would be 0 -> incorrect filtering may occur fix: introduce new method in Statistics object to check if num_nulls is set; check if num_nulls is set by the new method before using its value for filtering Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com> Closes #458 from gszadovszky/PARQUET-1217 and squashes the following commits: 9d14090 [Gabor Szadovszky] Updates according to rdblue's comments 116d1d3 [Gabor Szadovszky] PARQUET-1217: Updates according to zi's comments c264b50 [Gabor Szadovszky] PARQUET-1217: fix handling of unset nullCount 2ec2fb1 [Gabor Szadovszky] PARQUET-1217: Incorrect handling of missing values in Statistics This change is based on b82d962 but is not a clean cherry-pick.
1 parent d59b32a commit 5c20cef

File tree

8 files changed

+240
-42
lines changed

8 files changed

+240
-42
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,44 @@
3131
*/
3232
public abstract class Statistics<T extends Comparable<T>> {
3333

34+
/**
35+
* Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
36+
*/
37+
public static class Builder {
38+
private final PrimitiveTypeName type;
39+
private byte[] min;
40+
private byte[] max;
41+
private long numNulls = -1;
42+
43+
private Builder(PrimitiveTypeName type) {
44+
this.type = type;
45+
}
46+
47+
public Builder withMin(byte[] min) {
48+
this.min = min;
49+
return this;
50+
}
51+
52+
public Builder withMax(byte[] max) {
53+
this.max = max;
54+
return this;
55+
}
56+
57+
public Builder withNumNulls(long numNulls) {
58+
this.numNulls = numNulls;
59+
return this;
60+
}
61+
62+
public Statistics<?> build() {
63+
Statistics<?> stats = getStatsBasedOnType(type);
64+
if (min != null && max != null) {
65+
stats.setMinMaxFromBytes(min, max);
66+
}
67+
stats.num_nulls = this.numNulls;
68+
return stats;
69+
}
70+
}
71+
3472
private boolean hasNonNullValue;
3573
private long num_nulls;
3674

@@ -67,6 +105,17 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) {
67105
}
68106
}
69107

108+
/**
109+
* Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
110+
*
111+
* @param type
112+
* type of the column
113+
* @return builder to create new statistics object
114+
*/
115+
public static Builder getBuilder(PrimitiveTypeName type) {
116+
return new Builder(type);
117+
}
118+
70119
/**
71120
* updates statistics min and max using the passed value
72121
* @param value value to use to update min and max
@@ -172,7 +221,9 @@ public void mergeStatistics(Statistics stats) {
172221
* Abstract method to set min and max values from byte arrays.
173222
* @param minBytes byte array to set the min value to
174223
* @param maxBytes byte array to set the max value to
224+
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
175225
*/
226+
@Deprecated
176227
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
177228

178229
abstract public T genericGetMin();
@@ -221,16 +272,20 @@ public void incrementNumNulls(long increment) {
221272

222273
/**
223274
* Returns the null count
224-
* @return null count
275+
* @return null count or {@code -1} if the null count is not set
225276
*/
226277
public long getNumNulls() {
227278
return num_nulls;
228279
}
229280

230281
/**
231282
* Sets the number of nulls to the parameter value
232-
* @param nulls null count to set the count to
283+
*
284+
* @param nulls
285+
* null count to set the count to
286+
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
233287
*/
288+
@Deprecated
234289
public void setNumNulls(long nulls) {
235290
num_nulls = nulls;
236291
}
@@ -241,7 +296,7 @@ public void setNumNulls(long nulls) {
241296
* @return true if object is empty, false otherwise
242297
*/
243298
public boolean isEmpty() {
244-
return !hasNonNullValue && num_nulls == 0;
299+
return !hasNonNullValue && !isNumNullsSet();
245300
}
246301

247302
/**
@@ -251,6 +306,13 @@ public boolean hasNonNullValue() {
251306
return hasNonNullValue;
252307
}
253308

309+
/**
310+
* @return whether numNulls is set and can be used
311+
*/
312+
public boolean isNumNullsSet() {
313+
return num_nulls >= 0;
314+
}
315+
254316
/**
255317
* Sets the page/column as having a valid non-null value
256318
* kind of misnomer here

parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ public class TestStatistics {
3737
@Test
3838
public void testNumNulls() {
3939
IntStatistics stats = new IntStatistics();
40+
assertTrue(stats.isNumNullsSet());
4041
assertEquals(stats.getNumNulls(), 0);
4142

4243
stats.incrementNumNulls();

parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
4141
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
4242

43-
import static org.apache.parquet.Preconditions.checkArgument;
4443
import static org.apache.parquet.Preconditions.checkNotNull;
4544

4645
/**
@@ -122,6 +121,10 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
122121
}
123122

124123
if (value == null) {
124+
// We don't know anything about the nulls in this chunk
125+
if (!stats.isNumNullsSet()) {
126+
return BLOCK_MIGHT_MATCH;
127+
}
125128
// we are looking for records where v eq(null)
126129
// so drop if there are no nulls in this chunk
127130
return !hasNulls(meta);
@@ -133,6 +136,11 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
133136
return BLOCK_CANNOT_MATCH;
134137
}
135138

139+
if (!stats.hasNonNullValue()) {
140+
// stats does not contain min/max values, we cannot drop any chunks
141+
return BLOCK_MIGHT_MATCH;
142+
}
143+
136144
// drop if value < min || value > max
137145
return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0;
138146
}
@@ -166,12 +174,17 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
166174
return isAllNulls(meta);
167175
}
168176

169-
if (hasNulls(meta)) {
177+
if (stats.isNumNullsSet() && hasNulls(meta)) {
170178
// we are looking for records where v notEq(someNonNull)
171179
// but this chunk contains nulls, we cannot drop it
172180
return BLOCK_MIGHT_MATCH;
173181
}
174182

183+
if (!stats.hasNonNullValue()) {
184+
// stats does not contain min/max values, we cannot drop any chunks
185+
return BLOCK_MIGHT_MATCH;
186+
}
187+
175188
// drop if this is a column where min = max = value
176189
return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0;
177190
}
@@ -201,6 +214,11 @@ public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
201214
return BLOCK_CANNOT_MATCH;
202215
}
203216

217+
if (!stats.hasNonNullValue()) {
218+
// stats does not contain min/max values, we cannot drop any chunks
219+
return BLOCK_MIGHT_MATCH;
220+
}
221+
204222
T value = lt.getValue();
205223

206224
// drop if value <= min
@@ -232,6 +250,11 @@ public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
232250
return BLOCK_CANNOT_MATCH;
233251
}
234252

253+
if (!stats.hasNonNullValue()) {
254+
// stats does not contain min/max values, we cannot drop any chunks
255+
return BLOCK_MIGHT_MATCH;
256+
}
257+
235258
T value = ltEq.getValue();
236259

237260
// drop if value < min
@@ -263,6 +286,11 @@ public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
263286
return BLOCK_CANNOT_MATCH;
264287
}
265288

289+
if (!stats.hasNonNullValue()) {
290+
// stats does not contain min/max values, we cannot drop any chunks
291+
return BLOCK_MIGHT_MATCH;
292+
}
293+
266294
T value = gt.getValue();
267295

268296
// drop if value >= max
@@ -294,6 +322,11 @@ public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
294322
return BLOCK_CANNOT_MATCH;
295323
}
296324

325+
if (!stats.hasNonNullValue()) {
326+
// stats does not contain min/max values, we cannot drop any chunks
327+
return BLOCK_MIGHT_MATCH;
328+
}
329+
297330
T value = gtEq.getValue();
298331

299332
// drop if value >= max
@@ -355,6 +388,11 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
355388
}
356389
}
357390

391+
if (!stats.hasNonNullValue()) {
392+
// stats does not contain min/max values, we cannot drop any chunks
393+
return BLOCK_MIGHT_MATCH;
394+
}
395+
358396
org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
359397
new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax());
360398

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,8 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
337337
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
338338
(String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) {
339339
// create stats object based on the column type
340-
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type);
340+
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
341+
org.apache.parquet.column.statistics.Statistics.getBuilder(type);
341342
// If there was no statistics written to the footer, create an empty Statistics object and return
342343

343344
// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
@@ -347,11 +348,14 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
347348
if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy, type) &&
348349
SortOrder.SIGNED == typeSortOrder) {
349350
if (statistics.isSetMax() && statistics.isSetMin()) {
350-
stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
351+
statsBuilder.withMin(statistics.min.array());
352+
statsBuilder.withMax(statistics.max.array());
353+
}
354+
if (statistics.isSetNull_count()) {
355+
statsBuilder.withNumNulls(statistics.null_count);
351356
}
352-
stats.setNumNulls(statistics.null_count);
353357
}
354-
return stats;
358+
return statsBuilder.build();
355359
}
356360

357361
public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(

0 commit comments

Comments
 (0)