Skip to content

Commit 2ec2fb1

Browse files
author
Gabor Szadovszky
committed
PARQUET-1217: Incorrect handling of missing values in Statistics
1 parent 6a4bbe9 commit 2ec2fb1

File tree

2 files changed

+90
-2
lines changed

2 files changed

+90
-2
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
4141
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
4242

43-
import static org.apache.parquet.Preconditions.checkArgument;
4443
import static org.apache.parquet.Preconditions.checkNotNull;
4544

4645
/**
@@ -133,6 +132,11 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
133132
return BLOCK_CANNOT_MATCH;
134133
}
135134

135+
if (!stats.hasNonNullValue()) {
136+
// stats does not contain min/max values, we cannot drop any chunks
137+
return BLOCK_MIGHT_MATCH;
138+
}
139+
136140
// drop if value < min || value > max
137141
return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0;
138142
}
@@ -172,6 +176,11 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
172176
return BLOCK_MIGHT_MATCH;
173177
}
174178

179+
if (!stats.hasNonNullValue()) {
180+
// stats does not contain min/max values, we cannot drop any chunks
181+
return BLOCK_MIGHT_MATCH;
182+
}
183+
175184
// drop if this is a column where min = max = value
176185
return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0;
177186
}
@@ -201,6 +210,11 @@ public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
201210
return BLOCK_CANNOT_MATCH;
202211
}
203212

213+
if (!stats.hasNonNullValue()) {
214+
// stats does not contain min/max values, we cannot drop any chunks
215+
return BLOCK_MIGHT_MATCH;
216+
}
217+
204218
T value = lt.getValue();
205219

206220
// drop if value <= min
@@ -232,6 +246,11 @@ public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
232246
return BLOCK_CANNOT_MATCH;
233247
}
234248

249+
if (!stats.hasNonNullValue()) {
250+
// stats does not contain min/max values, we cannot drop any chunks
251+
return BLOCK_MIGHT_MATCH;
252+
}
253+
235254
T value = ltEq.getValue();
236255

237256
// drop if value < min
@@ -263,6 +282,11 @@ public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
263282
return BLOCK_CANNOT_MATCH;
264283
}
265284

285+
if (!stats.hasNonNullValue()) {
286+
// stats does not contain min/max values, we cannot drop any chunks
287+
return BLOCK_MIGHT_MATCH;
288+
}
289+
266290
T value = gt.getValue();
267291

268292
// drop if value >= max
@@ -294,6 +318,11 @@ public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
294318
return BLOCK_CANNOT_MATCH;
295319
}
296320

321+
if (!stats.hasNonNullValue()) {
322+
// stats does not contain min/max values, we cannot drop any chunks
323+
return BLOCK_MIGHT_MATCH;
324+
}
325+
297326
T value = gtEq.getValue();
298327

299328
// drop if value > max
@@ -355,6 +384,11 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
355384
}
356385
}
357386

387+
if (!stats.hasNonNullValue()) {
388+
// stats does not contain min/max values, we cannot drop any chunks
389+
return BLOCK_MIGHT_MATCH;
390+
}
391+
358392
org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
359393
new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(),
360394
stats.comparator());

parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import java.util.HashSet;
2323
import java.util.List;
2424

25-
import org.apache.parquet.io.api.Binary;
2625
import org.junit.Test;
2726

2827
import org.apache.parquet.column.Encoding;
@@ -87,14 +86,18 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l
8786

8887
private static final IntStatistics intStats = new IntStatistics();
8988
private static final IntStatistics nullIntStats = new IntStatistics();
89+
private static final IntStatistics missingMinMaxIntStats = new IntStatistics();
9090
private static final DoubleStatistics doubleStats = new DoubleStatistics();
91+
private static final DoubleStatistics missingMinMaxDoubleStats = new DoubleStatistics();
9192

9293
static {
9394
intStats.setMinMax(10, 100);
9495
doubleStats.setMinMax(10, 100);
9596

9697
nullIntStats.setMinMax(0, 0);
9798
nullIntStats.setNumNulls(177);
99+
100+
missingMinMaxDoubleStats.setNumNulls(100);
98101
}
99102

100103
private static final List<ColumnChunkMetaData> columnMetas = Arrays.asList(
@@ -105,6 +108,9 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l
105108
getIntColumnMeta(nullIntStats, 177L), // column of all nulls
106109
getDoubleColumnMeta(doubleStats, 177L));
107110

111+
private static final List<ColumnChunkMetaData> missingMinMaxColumnMetas = Arrays.asList(
112+
getIntColumnMeta(missingMinMaxIntStats, 177L), // missing min/max, no null values => stats is empty
113+
getDoubleColumnMeta(missingMinMaxDoubleStats, 177L)); // missing min/max, some null values
108114

109115
@Test
110116
public void testEqNonNull() {
@@ -116,6 +122,9 @@ public void testEqNonNull() {
116122
// drop columns of all nulls when looking for non-null value
117123
assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas));
118124
assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas));
125+
126+
assertFalse(canDrop(eq(intColumn, 50), missingMinMaxColumnMetas));
127+
assertFalse(canDrop(eq(doubleColumn, 50.0), missingMinMaxColumnMetas));
119128
}
120129

121130
@Test
@@ -137,6 +146,9 @@ public void testEqNull() {
137146
getDoubleColumnMeta(doubleStats, 177L))));
138147

139148
assertFalse(canDrop(eq(missingColumn, null), columnMetas));
149+
150+
assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas));
151+
assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas));
140152
}
141153

142154
@Test
@@ -163,6 +175,9 @@ public void testNotEqNonNull() {
163175
getDoubleColumnMeta(doubleStats, 177L))));
164176

165177
assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));
178+
179+
assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas));
180+
assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas));
166181
}
167182

168183
@Test
@@ -192,6 +207,9 @@ public void testNotEqNull() {
192207
getDoubleColumnMeta(doubleStats, 177L))));
193208

194209
assertTrue(canDrop(notEq(missingColumn, null), columnMetas));
210+
211+
assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas));
212+
assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas));
195213
}
196214

197215
@Test
@@ -205,6 +223,9 @@ public void testLt() {
205223
assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas));
206224

207225
assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas));
226+
227+
assertFalse(canDrop(lt(intColumn, 0), missingMinMaxColumnMetas));
228+
assertFalse(canDrop(lt(doubleColumn, 0.0), missingMinMaxColumnMetas));
208229
}
209230

210231
@Test
@@ -218,6 +239,9 @@ public void testLtEq() {
218239
assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas));
219240

220241
assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas));
242+
243+
assertFalse(canDrop(ltEq(intColumn, -1), missingMinMaxColumnMetas));
244+
assertFalse(canDrop(ltEq(doubleColumn, -0.1), missingMinMaxColumnMetas));
221245
}
222246

223247
@Test
@@ -231,6 +255,9 @@ public void testGt() {
231255
assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas));
232256

233257
assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas));
258+
259+
assertFalse(canDrop(gt(intColumn, 0), missingMinMaxColumnMetas));
260+
assertFalse(canDrop(gt(doubleColumn, 0.0), missingMinMaxColumnMetas));
234261
}
235262

236263
@Test
@@ -244,6 +271,9 @@ public void testGtEq() {
244271
assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas));
245272

246273
assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas));
274+
275+
assertFalse(canDrop(gtEq(intColumn, 1), missingMinMaxColumnMetas));
276+
assertFalse(canDrop(gtEq(doubleColumn, 0.1), missingMinMaxColumnMetas));
247277
}
248278

249279
@Test
@@ -297,6 +327,26 @@ public boolean keep(Integer value) {
297327
}
298328
}
299329

330+
public static class AllPositiveUdp extends UserDefinedPredicate<Double> {
331+
@Override
332+
public boolean keep(Double value) {
333+
if (value == null) {
334+
return true;
335+
}
336+
throw new RuntimeException("this method should not be called with value != null");
337+
}
338+
339+
@Override
340+
public boolean canDrop(Statistics<Double> statistics) {
341+
return statistics.getMin() <= 0.0;
342+
}
343+
344+
@Override
345+
public boolean inverseCanDrop(Statistics<Double> statistics) {
346+
return statistics.getMin() > 0.0;
347+
}
348+
}
349+
300350
@Test
301351
public void testUdp() {
302352
FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
@@ -308,6 +358,8 @@ public void testUdp() {
308358
FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
309359
FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));
310360

361+
FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class);
362+
311363
IntStatistics seven = new IntStatistics();
312364
seven.setMinMax(7, 7);
313365

@@ -392,6 +444,8 @@ public void testUdp() {
392444
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
393445
getIntColumnMeta(neither, 177L),
394446
getDoubleColumnMeta(doubleStats, 177L))));
447+
448+
assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas));
395449
}
396450

397451
@Test

0 commit comments

Comments
 (0)