Skip to content

Commit 71cff7c

Browse files
viirya authored and rdblue committed
PARQUET-791: Add missing column support for UserDefinedPredicate
This extends the fix from #354 to UserDefinedPredicate. Author: Liang-Chi Hsieh <viirya@gmail.com> Closes #389 from viirya/PARQUET-791 and squashes the following commits: d6be37d [Liang-Chi Hsieh] Address comment. 7e929c3 [Liang-Chi Hsieh] PARQUET-791: Add missing column support for UserDefinedPredicate.
1 parent 98c2769 commit 71cff7c

File tree

2 files changed

+91
-6
lines changed

2 files changed

+91
-6
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -328,18 +328,31 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
328328
Column<T> filterColumn = ud.getColumn();
329329
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
330330
U udp = ud.getUserDefinedPredicate();
331+
332+
if (columnChunk == null) {
333+
// the column isn't in this file so all values are null.
334+
// let's run the udp with a null value to see if it keeps null or not.
335+
if (inverted) {
336+
return udp.keep(null);
337+
} else {
338+
return !udp.keep(null);
339+
}
340+
}
341+
331342
Statistics<T> stats = columnChunk.getStatistics();
332343

333344
if (stats.isEmpty()) {
334345
// we have no statistics available, we cannot drop any chunks
335-
return false;
346+
return BLOCK_MIGHT_MATCH;
336347
}
337348

338349
if (isAllNulls(columnChunk)) {
339-
// there is no min max, there is nothing
340-
// else we can say about this chunk, we
341-
// cannot drop it.
342-
return false;
350+
// let's run the udp with a null value to see if it keeps null or not.
351+
if (inverted) {
352+
return udp.keep(null);
353+
} else {
354+
return !udp.keep(null);
355+
}
343356
}
344357

345358
org.apache.parquet.filter2.predicate.Statistics<T> udpStats =

parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java

Lines changed: 73 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l
8383
private static final IntColumn intColumn = intColumn("int.column");
8484
private static final DoubleColumn doubleColumn = doubleColumn("double.column");
8585
private static final BinaryColumn missingColumn = binaryColumn("missing");
86+
private static final IntColumn missingColumn2 = intColumn("missing.int");
8687

8788
private static final IntStatistics intStats = new IntStatistics();
8889
private static final IntStatistics nullIntStats = new IntStatistics();
@@ -269,7 +270,10 @@ public static class SevensAndEightsUdp extends UserDefinedPredicate<Integer> {
269270

270271
@Override
271272
public boolean keep(Integer value) {
272-
throw new RuntimeException("this method should not be called");
273+
if (value == null) {
274+
return true;
275+
}
276+
throw new RuntimeException("this method should not be called with value != null");
273277
}
274278

275279
@Override
@@ -283,11 +287,27 @@ public boolean inverseCanDrop(Statistics<Integer> statistics) {
283287
}
284288
}
285289

290+
public static class DropNullUdp extends SevensAndEightsUdp {
291+
@Override
292+
public boolean keep(Integer value) {
293+
if (value == null) {
294+
return false;
295+
}
296+
throw new RuntimeException("this method should not be called with value != null");
297+
}
298+
}
299+
286300
@Test
287301
public void testUdp() {
288302
FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
289303
FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class)));
290304

305+
FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class);
306+
FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class)));
307+
308+
FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
309+
FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));
310+
291311
IntStatistics seven = new IntStatistics();
292312
seven.setMinMax(7, 7);
293313

@@ -320,6 +340,58 @@ public void testUdp() {
320340
assertFalse(canDrop(invPred, Arrays.asList(
321341
getIntColumnMeta(neither, 177L),
322342
getDoubleColumnMeta(doubleStats, 177L))));
343+
344+
// udpDropMissingColumn drops null column.
345+
assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
346+
getIntColumnMeta(seven, 177L),
347+
getDoubleColumnMeta(doubleStats, 177L))));
348+
349+
assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
350+
getIntColumnMeta(eight, 177L),
351+
getDoubleColumnMeta(doubleStats, 177L))));
352+
353+
assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
354+
getIntColumnMeta(neither, 177L),
355+
getDoubleColumnMeta(doubleStats, 177L))));
356+
357+
// invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column.
358+
assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
359+
getIntColumnMeta(seven, 177L),
360+
getDoubleColumnMeta(doubleStats, 177L))));
361+
362+
assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
363+
getIntColumnMeta(eight, 177L),
364+
getDoubleColumnMeta(doubleStats, 177L))));
365+
366+
assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
367+
getIntColumnMeta(neither, 177L),
368+
getDoubleColumnMeta(doubleStats, 177L))));
369+
370+
// udpKeepMissingColumn keeps null column.
371+
assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
372+
getIntColumnMeta(seven, 177L),
373+
getDoubleColumnMeta(doubleStats, 177L))));
374+
375+
assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
376+
getIntColumnMeta(eight, 177L),
377+
getDoubleColumnMeta(doubleStats, 177L))));
378+
379+
assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
380+
getIntColumnMeta(neither, 177L),
381+
getDoubleColumnMeta(doubleStats, 177L))));
382+
383+
// invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column.
384+
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
385+
getIntColumnMeta(seven, 177L),
386+
getDoubleColumnMeta(doubleStats, 177L))));
387+
388+
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
389+
getIntColumnMeta(eight, 177L),
390+
getDoubleColumnMeta(doubleStats, 177L))));
391+
392+
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
393+
getIntColumnMeta(neither, 177L),
394+
getDoubleColumnMeta(doubleStats, 177L))));
323395
}
324396

325397
@Test

0 commit comments

Comments
 (0)