Skip to content

Commit b1001eb

Browse files
committed
Fix statistics isEmpty, handle more edge cases in statistics filter
1 parent 0c88be0 commit b1001eb

File tree

2 files changed

+39
-40
lines changed

2 files changed

+39
-40
lines changed

parquet-column/src/main/java/parquet/column/statistics/Statistics.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@
2828
*/
2929
public abstract class Statistics<T extends Comparable<T>> {
3030

31-
private boolean firstValueAccountedFor;
31+
private boolean hasNonNullValue;
3232
private long num_nulls;
3333

3434
public Statistics() {
35-
firstValueAccountedFor = false;
35+
hasNonNullValue = false;
3636
num_nulls = 0;
3737
}
3838

@@ -142,7 +142,10 @@ public void mergeStatistics(Statistics stats) {
142142

143143
if (this.getClass() == stats.getClass()) {
144144
incrementNumNulls(stats.getNumNulls());
145-
mergeStatisticsMinMax(stats);
145+
if (stats.hasNonNullValue()) {
146+
mergeStatisticsMinMax(stats);
147+
markAsNotEmpty();
148+
}
146149
} else {
147150
throw new StatisticsClassException(this.getClass().toString(), stats.getClass().toString());
148151
}
@@ -220,11 +223,18 @@ public void setNumNulls(long nulls) {
220223
* @return true if object is empty, false otherwise
221224
*/
222225
public boolean isEmpty() {
223-
return !firstValueAccountedFor;
226+
return !hasNonNullValue && num_nulls == 0;
224227
}
225228

229+
/**
230+
* Returns whether there have been non-null values added to this statistics
231+
*/
232+
public boolean hasNonNullValue() {
233+
return hasNonNullValue;
234+
}
235+
226236
protected void markAsNotEmpty() {
227-
firstValueAccountedFor = true;
237+
hasNonNullValue = true;
228238
}
229239
}
230240

parquet-hadoop/src/main/java/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 24 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,13 @@ private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) {
6767
}
6868

6969
// is this column chunk composed entirely of nulls?
70+
// assumes the column chunk's statistics is not empty
7071
private boolean isAllNulls(ColumnChunkMetaData column) {
7172
return column.getStatistics().getNumNulls() == column.getValueCount();
7273
}
7374

7475
// are there any nulls in this column chunk?
76+
// assumes the column chunk's statistics is not empty
7577
private boolean hasNulls(ColumnChunkMetaData column) {
7678
return column.getStatistics().getNumNulls() > 0;
7779
}
@@ -84,11 +86,7 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
8486
Statistics<T> stats = columnChunk.getStatistics();
8587

8688
if (stats.isEmpty()) {
87-
// Do not filter in case of empty statistics
88-
// pre-statistics files will have no such info
89-
// best not to filter these.
90-
// Also, in case of all nulls in a column, empty statistics object
91-
// is returned. This fixes the read path because of that bug
89+
// we have no statistics available, we cannot drop any chunks
9290
return false;
9391
}
9492

@@ -116,11 +114,7 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
116114
Statistics<T> stats = columnChunk.getStatistics();
117115

118116
if (stats.isEmpty()) {
119-
// Do not filter in case of empty statistics
120-
// pre-statistics files will have no such info
121-
// best not to filter these.
122-
// Also, in case of all nulls in a column, empty statistics object
123-
// is returned. This fixes the read path because of that bug
117+
// we have no statistics available, we cannot drop any chunks
124118
return false;
125119
}
126120

@@ -136,6 +130,13 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
136130
return false;
137131
}
138132

133+
if (isAllNulls(columnChunk)) {
134+
// there is no min max, there is nothing
135+
// else we can say about this chunk, we
136+
// cannot drop it.
137+
return false;
138+
}
139+
139140
// drop if this is a column where min = max = value
140141
return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0;
141142
}
@@ -148,11 +149,7 @@ public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
148149
Statistics<T> stats = columnChunk.getStatistics();
149150

150151
if (stats.isEmpty()) {
151-
// Do not filter in case of empty statistics
152-
// pre-statistics files will have no such info
153-
// best not to filter these.
154-
// Also, in case of all nulls in a column, empty statistics object
155-
// is returned. This fixes the read path because of that bug
152+
// we have no statistics available, we cannot drop any chunks
156153
return false;
157154
}
158155

@@ -174,11 +171,7 @@ public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
174171
Statistics<T> stats = columnChunk.getStatistics();
175172

176173
if (stats.isEmpty()) {
177-
// Do not filter in case of empty statistics
178-
// pre-statistics files will have no such info
179-
// best not to filter these.
180-
// Also, in case of all nulls in a column, empty statistics object
181-
// is returned. This fixes the read path because of that bug
174+
// we have no statistics available, we cannot drop any chunks
182175
return false;
183176
}
184177

@@ -200,11 +193,7 @@ public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
200193
Statistics<T> stats = columnChunk.getStatistics();
201194

202195
if (stats.isEmpty()) {
203-
// Do not filter in case of empty statistics
204-
// pre-statistics files will have no such info
205-
// best not to filter these.
206-
// Also, in case of all nulls in a column, empty statistics object
207-
// is returned. This fixes the read path because of that bug
196+
// we have no statistics available, we cannot drop any chunks
208197
return false;
209198
}
210199

@@ -226,11 +215,7 @@ public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
226215
Statistics<T> stats = columnChunk.getStatistics();
227216

228217
if (stats.isEmpty()) {
229-
// Do not filter in case of empty statistics
230-
// pre-statistics files will have no such info
231-
// best not to filter these.
232-
// Also, in case of all nulls in a column, empty statistics object
233-
// is returned. This fixes the read path because of that bug
218+
// we have no statistics available, we cannot drop any chunks
234219
return false;
235220
}
236221

@@ -269,12 +254,16 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
269254
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
270255
U udp = ud.getUserDefinedPredicate();
271256
Statistics<T> stats = columnChunk.getStatistics();
257+
272258
if (stats.isEmpty()) {
273-
// Do not filter in case of empty statistics
274-
// pre-statistics files will have no such info
275-
// best not to filter these.
276-
// Also, in case of all nulls in a column, empty statistics object
277-
// is returned. This fixes the read path because of that bug
259+
// we have no statistics available, we cannot drop any chunks
260+
return false;
261+
}
262+
263+
if (isAllNulls(columnChunk)) {
264+
// there is no min max, there is nothing
265+
// else we can say about this chunk, we
266+
// cannot drop it.
278267
return false;
279268
}
280269

0 commit comments

Comments
 (0)