Skip to content

Commit 91b841c

Browse files
committed
PARQUET-389: Add missing column support to StatisticsFilter.
1 parent 275f950 commit 91b841c

File tree

2 files changed

+127
-50
lines changed

2 files changed

+127
-50
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 99 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
// TODO: (https://issues.apache.org/jira/browse/PARQUET-38)
6565
public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
6666

67+
private static final boolean BLOCK_MIGHT_MATCH = false;
68+
private static final boolean BLOCK_CANNOT_MATCH = true;
69+
6770
public static boolean canDrop(FilterPredicate pred, List<ColumnChunkMetaData> columns) {
6871
checkNotNull(pred, "pred");
6972
checkNotNull(columns, "columns");
@@ -79,9 +82,7 @@ private StatisticsFilter(List<ColumnChunkMetaData> columnsList) {
7982
}
8083

8184
private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) {
82-
ColumnChunkMetaData c = columns.get(columnPath);
83-
checkArgument(c != null, "Column " + columnPath.toDotString() + " not found in schema!");
84-
return c;
85+
return columns.get(columnPath);
8586
}
8687

8788
// is this column chunk composed entirely of nulls?
@@ -97,145 +98,204 @@ private boolean hasNulls(ColumnChunkMetaData column) {
9798
}
9899

99100
@Override
101+
@SuppressWarnings("unchecked")
100102
public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
101103
Column<T> filterColumn = eq.getColumn();
104+
ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
105+
102106
T value = eq.getValue();
103-
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
104-
Statistics<T> stats = columnChunk.getStatistics();
107+
108+
if (meta == null) {
109+
// the column isn't in this file so all values are null.
110+
if (value != null) {
111+
// non-null is never null
112+
return BLOCK_CANNOT_MATCH;
113+
}
114+
return BLOCK_MIGHT_MATCH;
115+
}
116+
117+
Statistics<T> stats = meta.getStatistics();
105118

106119
if (stats.isEmpty()) {
107120
// we have no statistics available, we cannot drop any chunks
108-
return false;
121+
return BLOCK_MIGHT_MATCH;
109122
}
110123

111124
if (value == null) {
112125
// we are looking for records where v eq(null)
113126
// so drop if there are no nulls in this chunk
114-
return !hasNulls(columnChunk);
127+
return !hasNulls(meta);
115128
}
116129

117-
if (isAllNulls(columnChunk)) {
130+
if (isAllNulls(meta)) {
118131
// we are looking for records where v eq(someNonNull)
119132
// and this is a column of all nulls, so drop it
120-
return true;
133+
return BLOCK_CANNOT_MATCH;
121134
}
122135

123136
// drop if value < min || value > max
124137
return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0;
125138
}
126139

127140
@Override
141+
@SuppressWarnings("unchecked")
128142
public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
129143
Column<T> filterColumn = notEq.getColumn();
144+
ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
145+
130146
T value = notEq.getValue();
131-
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
132-
Statistics<T> stats = columnChunk.getStatistics();
147+
148+
if (meta == null) {
149+
if (value == null) {
150+
// null is always equal to null
151+
return BLOCK_CANNOT_MATCH;
152+
}
153+
return BLOCK_MIGHT_MATCH;
154+
}
155+
156+
Statistics<T> stats = meta.getStatistics();
133157

134158
if (stats.isEmpty()) {
135159
// we have no statistics available, we cannot drop any chunks
136-
return false;
160+
return BLOCK_MIGHT_MATCH;
137161
}
138162

139163
if (value == null) {
140164
// we are looking for records where v notEq(null)
141165
// so, if this is a column of all nulls, we can drop it
142-
return isAllNulls(columnChunk);
166+
return isAllNulls(meta);
143167
}
144168

145-
if (hasNulls(columnChunk)) {
169+
if (hasNulls(meta)) {
146170
// we are looking for records where v notEq(someNonNull)
147171
// but this chunk contains nulls, we cannot drop it
148-
return false;
172+
return BLOCK_MIGHT_MATCH;
149173
}
150174

151175
// drop if this is a column where min = max = value
152176
return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0;
153177
}
154178

155179
@Override
180+
@SuppressWarnings("unchecked")
156181
public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
157182
Column<T> filterColumn = lt.getColumn();
158-
T value = lt.getValue();
159-
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
160-
Statistics<T> stats = columnChunk.getStatistics();
183+
ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
184+
185+
if (meta == null) {
186+
// the column is missing and always null, which is never less than a
187+
// value. for all x, null is never < x.
188+
return BLOCK_CANNOT_MATCH;
189+
}
190+
191+
Statistics<T> stats = meta.getStatistics();
161192

162193
if (stats.isEmpty()) {
163194
// we have no statistics available, we cannot drop any chunks
164-
return false;
195+
return BLOCK_MIGHT_MATCH;
165196
}
166197

167-
if (isAllNulls(columnChunk)) {
198+
if (isAllNulls(meta)) {
168199
// we are looking for records where v < someValue
169200
// this chunk is all nulls, so we can drop it
170-
return true;
201+
return BLOCK_CANNOT_MATCH;
171202
}
172203

204+
T value = lt.getValue();
205+
173206
// drop if value <= min
174207
return value.compareTo(stats.genericGetMin()) <= 0;
175208
}
176209

177210
@Override
211+
@SuppressWarnings("unchecked")
178212
public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
179213
Column<T> filterColumn = ltEq.getColumn();
180-
T value = ltEq.getValue();
181-
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
182-
Statistics<T> stats = columnChunk.getStatistics();
214+
ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
215+
216+
if (meta == null) {
217+
// the column is missing and always null, which is never less than or
218+
// equal to a value. for all x, null is never <= x.
219+
return BLOCK_CANNOT_MATCH;
220+
}
221+
222+
Statistics<T> stats = meta.getStatistics();
183223

184224
if (stats.isEmpty()) {
185225
// we have no statistics available, we cannot drop any chunks
186-
return false;
226+
return BLOCK_MIGHT_MATCH;
187227
}
188228

189-
if (isAllNulls(columnChunk)) {
229+
if (isAllNulls(meta)) {
190230
// we are looking for records where v <= someValue
191231
// this chunk is all nulls, so we can drop it
192-
return true;
232+
return BLOCK_CANNOT_MATCH;
193233
}
194234

235+
T value = ltEq.getValue();
236+
195237
// drop if value < min
196238
return value.compareTo(stats.genericGetMin()) < 0;
197239
}
198240

199241
@Override
242+
@SuppressWarnings("unchecked")
200243
public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
201244
Column<T> filterColumn = gt.getColumn();
202-
T value = gt.getValue();
203-
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
204-
Statistics<T> stats = columnChunk.getStatistics();
245+
ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
246+
247+
if (meta == null) {
248+
// the column is missing and always null, which is never greater than a
249+
// value. for all x, null is never > x.
250+
return BLOCK_CANNOT_MATCH;
251+
}
252+
253+
Statistics<T> stats = meta.getStatistics();
205254

206255
if (stats.isEmpty()) {
207256
// we have no statistics available, we cannot drop any chunks
208-
return false;
257+
return BLOCK_MIGHT_MATCH;
209258
}
210259

211-
if (isAllNulls(columnChunk)) {
260+
if (isAllNulls(meta)) {
212261
// we are looking for records where v > someValue
213262
// this chunk is all nulls, so we can drop it
214-
return true;
263+
return BLOCK_CANNOT_MATCH;
215264
}
216265

266+
T value = gt.getValue();
267+
217268
// drop if value >= max
218269
return value.compareTo(stats.genericGetMax()) >= 0;
219270
}
220271

221272
@Override
273+
@SuppressWarnings("unchecked")
222274
public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
223275
Column<T> filterColumn = gtEq.getColumn();
224-
T value = gtEq.getValue();
225-
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
226-
Statistics<T> stats = columnChunk.getStatistics();
276+
ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
277+
278+
if (meta == null) {
279+
// the column is missing and always null, which is never greater than or
280+
// equal to a value. for all x, null is never >= x.
281+
return BLOCK_CANNOT_MATCH;
282+
}
283+
284+
Statistics<T> stats = meta.getStatistics();
227285

228286
if (stats.isEmpty()) {
229287
// we have no statistics available, we cannot drop any chunks
230-
return false;
288+
return BLOCK_MIGHT_MATCH;
231289
}
232290

233-
if (isAllNulls(columnChunk)) {
291+
if (isAllNulls(meta)) {
234292
// we are looking for records where v >= someValue
235293
// this chunk is all nulls, so we can drop it
236-
return true;
294+
return BLOCK_CANNOT_MATCH;
237295
}
238296

297+
T value = gtEq.getValue();
298+
239299
// drop if value > max
240300
return value.compareTo(stats.genericGetMax()) > 0;
241301
}

parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.HashSet;
2323
import java.util.List;
2424

25+
import org.apache.parquet.io.api.Binary;
2526
import org.junit.Test;
2627

2728
import org.apache.parquet.column.Encoding;
@@ -30,6 +31,7 @@
3031
import org.apache.parquet.hadoop.metadata.ColumnPath;
3132
import org.apache.parquet.filter2.predicate.FilterPredicate;
3233
import org.apache.parquet.filter2.predicate.LogicalInverseRewriter;
34+
import org.apache.parquet.filter2.predicate.Operators.BinaryColumn;
3335
import org.apache.parquet.filter2.predicate.Operators.DoubleColumn;
3436
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
3537
import org.apache.parquet.filter2.predicate.Statistics;
@@ -38,6 +40,8 @@
3840
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
3941
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
4042

43+
import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
44+
import static org.apache.parquet.io.api.Binary.fromString;
4145
import static org.junit.Assert.assertEquals;
4246
import static org.junit.Assert.assertFalse;
4347
import static org.junit.Assert.assertTrue;
@@ -78,6 +82,7 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l
7882

7983
private static final IntColumn intColumn = intColumn("int.column");
8084
private static final DoubleColumn doubleColumn = doubleColumn("double.column");
85+
private static final BinaryColumn missingColumn = binaryColumn("missing");
8186

8287
private static final IntStatistics intStats = new IntStatistics();
8388
private static final IntStatistics nullIntStats = new IntStatistics();
@@ -109,6 +114,7 @@ public void testEqNonNull() {
109114

110115
// drop columns of all nulls when looking for non-null value
111116
assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas));
117+
assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas));
112118
}
113119

114120
@Test
@@ -129,6 +135,7 @@ public void testEqNull() {
129135
getIntColumnMeta(statsSomeNulls, 177L),
130136
getDoubleColumnMeta(doubleStats, 177L))));
131137

138+
assertFalse(canDrop(eq(missingColumn, null), columnMetas));
132139
}
133140

134141
@Test
@@ -144,6 +151,17 @@ public void testNotEqNonNull() {
144151
getIntColumnMeta(allSevens, 177L),
145152
getDoubleColumnMeta(doubleStats, 177L))));
146153

154+
allSevens.setNumNulls(100L);
155+
assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
156+
getIntColumnMeta(allSevens, 177L),
157+
getDoubleColumnMeta(doubleStats, 177L))));
158+
159+
allSevens.setNumNulls(177L);
160+
assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
161+
getIntColumnMeta(allSevens, 177L),
162+
getDoubleColumnMeta(doubleStats, 177L))));
163+
164+
assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));
147165
}
148166

149167
@Test
@@ -171,6 +189,8 @@ public void testNotEqNull() {
171189
assertTrue(canDrop(notEq(intColumn, null), Arrays.asList(
172190
getIntColumnMeta(statsAllNulls, 177L),
173191
getDoubleColumnMeta(doubleStats, 177L))));
192+
193+
assertTrue(canDrop(notEq(missingColumn, null), columnMetas));
174194
}
175195

176196
@Test
@@ -182,6 +202,8 @@ public void testLt() {
182202

183203
assertTrue(canDrop(lt(intColumn, 0), nullColumnMetas));
184204
assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas));
205+
206+
assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas));
185207
}
186208

187209
@Test
@@ -193,6 +215,8 @@ public void testLtEq() {
193215

194216
assertTrue(canDrop(ltEq(intColumn, 0), nullColumnMetas));
195217
assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas));
218+
219+
assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas));
196220
}
197221

198222
@Test
@@ -204,6 +228,8 @@ public void testGt() {
204228

205229
assertTrue(canDrop(gt(intColumn, 0), nullColumnMetas));
206230
assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas));
231+
232+
assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas));
207233
}
208234

209235
@Test
@@ -215,6 +241,8 @@ public void testGtEq() {
215241

216242
assertTrue(canDrop(gtEq(intColumn, 0), nullColumnMetas));
217243
assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas));
244+
245+
assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas));
218246
}
219247

220248
@Test
@@ -311,15 +339,4 @@ public void testClearExceptionForNots() {
311339
}
312340
}
313341

314-
@Test
315-
public void testMissingColumn() {
316-
List<ColumnChunkMetaData> columnMetas = Arrays.asList(getIntColumnMeta(new IntStatistics(), 0L));
317-
try {
318-
canDrop(and(eq(doubleColumn, 12.0), eq(intColumn, 17)), columnMetas);
319-
fail("This should throw");
320-
} catch (IllegalArgumentException e) {
321-
assertEquals("Column double.column not found in schema!", e.getMessage());
322-
}
323-
}
324-
325342
}

0 commit comments

Comments
 (0)