6464// TODO: (https://issues.apache.org/jira/browse/PARQUET-38)
6565public class StatisticsFilter implements FilterPredicate .Visitor <Boolean > {
6666
// Named results for the visit methods in this class. Each visit answers the
// question: can this block be dropped? So the polarity is:
//   BLOCK_MIGHT_MATCH  = false -> keep the block, it may contain matching records
//   BLOCK_CANNOT_MATCH = true  -> drop the block, no record in it can match
67+ private static final boolean BLOCK_MIGHT_MATCH = false ;
68+ private static final boolean BLOCK_CANNOT_MATCH = true ;
69+
6770 public static boolean canDrop (FilterPredicate pred , List <ColumnChunkMetaData > columns ) {
6871 checkNotNull (pred , "pred" );
6972 checkNotNull (columns , "columns" );
@@ -79,9 +82,7 @@ private StatisticsFilter(List<ColumnChunkMetaData> columnsList) {
7982 }
8083
// Looks up the chunk metadata registered for columnPath.
// After this change (the + line) the result of columns.get is returned as-is,
// so it may be null for a path with no chunk; the visit methods in this class
// check for that null themselves. The removed (-) lines previously rejected
// that case through checkArgument with a "not found in schema" message.
8184 private ColumnChunkMetaData getColumnChunk (ColumnPath columnPath ) {
82- ColumnChunkMetaData c = columns .get (columnPath );
83- checkArgument (c != null , "Column " + columnPath .toDotString () + " not found in schema!" );
84- return c ;
85+ return columns .get (columnPath );
8586 }
8687
8788 // is this column chunk composed entirely of nulls?
@@ -97,145 +98,204 @@ private boolean hasNulls(ColumnChunkMetaData column) {
9798 }
9899
// Evaluates eq(column, value) against one column chunk. Returns
// BLOCK_CANNOT_MATCH (true) when the chunk can be dropped and
// BLOCK_MIGHT_MATCH (false) when it has to be kept. Checks, in order:
//   1. no chunk metadata for the column: drop only when value is non-null
//   2. statistics are empty: keep
//   3. value is null: drop when hasNulls(meta) is false
//   4. isAllNulls(meta) with a non-null value: drop
//   5. otherwise: drop when value is below min or above max
// The removed (-) lines are the earlier version, which used literal
// true/false and read the statistics without a null check on the chunk.
99100 @ Override
101+ @ SuppressWarnings ("unchecked" )
100102 public <T extends Comparable <T >> Boolean visit (Eq <T > eq ) {
101103 Column <T > filterColumn = eq .getColumn ();
104+ ColumnChunkMetaData meta = getColumnChunk (filterColumn .getColumnPath ());
105+
102106 T value = eq .getValue ();
103- ColumnChunkMetaData columnChunk = getColumnChunk (filterColumn .getColumnPath ());
104- Statistics <T > stats = columnChunk .getStatistics ();
107+
108+ if (meta == null ) {
109+ // the column isn't in this file so all values are null.
110+ if (value != null ) {
111+ // non-null is never null
112+ return BLOCK_CANNOT_MATCH ;
113+ }
114+ return BLOCK_MIGHT_MATCH ;
115+ }
116+
117+ Statistics <T > stats = meta .getStatistics ();
105118
106119 if (stats .isEmpty ()) {
107120 // we have no statistics available, we cannot drop any chunks
108- return false ;
121+ return BLOCK_MIGHT_MATCH ;
109122 }
110123
111124 if (value == null ) {
112125 // we are looking for records where v eq(null)
113126 // so drop if there are no nulls in this chunk
114- return !hasNulls (columnChunk );
127+ return !hasNulls (meta );
115128 }
116129
117- if (isAllNulls (columnChunk )) {
130+ if (isAllNulls (meta )) {
118131 // we are looking for records where v eq(someNonNull)
119132 // and this is a column of all nulls, so drop it
120- return true ;
133+ return BLOCK_CANNOT_MATCH ;
121134 }
122135
123136 // drop if value < min || value > max
124137 return value .compareTo (stats .genericGetMin ()) < 0 || value .compareTo (stats .genericGetMax ()) > 0 ;
125138 }
126139
// Evaluates notEq(column, value) against one column chunk. Returns
// BLOCK_CANNOT_MATCH (true) when the chunk can be dropped and
// BLOCK_MIGHT_MATCH (false) when it has to be kept. Checks, in order:
//   1. no chunk metadata for the column: drop only when value is null
//      (the missing column is treated as all null, so nothing differs from null)
//   2. statistics are empty: keep
//   3. value is null: drop when isAllNulls(meta)
//   4. hasNulls(meta) with a non-null value: keep, since a null record
//      satisfies notEq(someNonNull)
//   5. otherwise: drop only when min and max both compare equal to value
// The removed (-) lines are the earlier version, which used literal
// true/false and read the statistics without a null check on the chunk.
127140 @ Override
141+ @ SuppressWarnings ("unchecked" )
128142 public <T extends Comparable <T >> Boolean visit (NotEq <T > notEq ) {
129143 Column <T > filterColumn = notEq .getColumn ();
144+ ColumnChunkMetaData meta = getColumnChunk (filterColumn .getColumnPath ());
145+
130146 T value = notEq .getValue ();
131- ColumnChunkMetaData columnChunk = getColumnChunk (filterColumn .getColumnPath ());
132- Statistics <T > stats = columnChunk .getStatistics ();
147+
148+ if (meta == null ) {
149+ if (value == null ) {
150+ // null is always equal to null
151+ return BLOCK_CANNOT_MATCH ;
152+ }
153+ return BLOCK_MIGHT_MATCH ;
154+ }
155+
156+ Statistics <T > stats = meta .getStatistics ();
133157
134158 if (stats .isEmpty ()) {
135159 // we have no statistics available, we cannot drop any chunks
136- return false ;
160+ return BLOCK_MIGHT_MATCH ;
137161 }
138162
139163 if (value == null ) {
140164 // we are looking for records where v notEq(null)
141165 // so, if this is a column of all nulls, we can drop it
142- return isAllNulls (columnChunk );
166+ return isAllNulls (meta );
143167 }
144168
145- if (hasNulls (columnChunk )) {
169+ if (hasNulls (meta )) {
146170 // we are looking for records where v notEq(someNonNull)
147171 // but this chunk contains nulls, we cannot drop it
148- return false ;
172+ return BLOCK_MIGHT_MATCH ;
149173 }
150174
151175 // drop if this is a column where min = max = value
152176 return value .compareTo (stats .genericGetMin ()) == 0 && value .compareTo (stats .genericGetMax ()) == 0 ;
153177 }
154178
// Evaluates lt(column, value) against one column chunk. Returns
// BLOCK_CANNOT_MATCH (true) when the chunk can be dropped and
// BLOCK_MIGHT_MATCH (false) when it has to be kept. Checks, in order:
//   1. no chunk metadata for the column: drop (missing column is all null)
//   2. statistics are empty: keep
//   3. isAllNulls(meta): drop
//   4. otherwise: drop when value <= min, so no stored value is below it
// NOTE(review): value is dereferenced without a null check; presumably an Lt
// predicate never carries a null value - confirm against the predicate class.
155179 @ Override
180+ @ SuppressWarnings ("unchecked" )
156181 public <T extends Comparable <T >> Boolean visit (Lt <T > lt ) {
157182 Column <T > filterColumn = lt .getColumn ();
158- T value = lt .getValue ();
159- ColumnChunkMetaData columnChunk = getColumnChunk (filterColumn .getColumnPath ());
160- Statistics <T > stats = columnChunk .getStatistics ();
183+ ColumnChunkMetaData meta = getColumnChunk (filterColumn .getColumnPath ());
184+
185+ if (meta == null ) {
186+ // the column is missing and always null, which is never less than a
187+ // value. for all x, null is never < x.
188+ return BLOCK_CANNOT_MATCH ;
189+ }
190+
191+ Statistics <T > stats = meta .getStatistics ();
161192
162193 if (stats .isEmpty ()) {
163194 // we have no statistics available, we cannot drop any chunks
164- return false ;
195+ return BLOCK_MIGHT_MATCH ;
165196 }
166197
167- if (isAllNulls (columnChunk )) {
198+ if (isAllNulls (meta )) {
168199 // we are looking for records where v < someValue
169200 // this chunk is all nulls, so we can drop it
170- return true ;
201+ return BLOCK_CANNOT_MATCH ;
171202 }
172203
204+ T value = lt .getValue ();
205+
173206 // drop if value <= min
174207 return value .compareTo (stats .genericGetMin ()) <= 0 ;
175208 }
176209
// Evaluates ltEq(column, value) against one column chunk. Returns
// BLOCK_CANNOT_MATCH (true) when the chunk can be dropped and
// BLOCK_MIGHT_MATCH (false) when it has to be kept. Checks, in order:
//   1. no chunk metadata for the column: drop (missing column is all null)
//   2. statistics are empty: keep
//   3. isAllNulls(meta): drop
//   4. otherwise: drop when value < min; a value equal to min still matches
// NOTE(review): value is dereferenced without a null check; presumably an LtEq
// predicate never carries a null value - confirm against the predicate class.
177210 @ Override
211+ @ SuppressWarnings ("unchecked" )
178212 public <T extends Comparable <T >> Boolean visit (LtEq <T > ltEq ) {
179213 Column <T > filterColumn = ltEq .getColumn ();
180- T value = ltEq .getValue ();
181- ColumnChunkMetaData columnChunk = getColumnChunk (filterColumn .getColumnPath ());
182- Statistics <T > stats = columnChunk .getStatistics ();
214+ ColumnChunkMetaData meta = getColumnChunk (filterColumn .getColumnPath ());
215+
216+ if (meta == null ) {
217+ // the column is missing and always null, which is never less than or
218+ // equal to a value. for all x, null is never <= x.
219+ return BLOCK_CANNOT_MATCH ;
220+ }
221+
222+ Statistics <T > stats = meta .getStatistics ();
183223
184224 if (stats .isEmpty ()) {
185225 // we have no statistics available, we cannot drop any chunks
186- return false ;
226+ return BLOCK_MIGHT_MATCH ;
187227 }
188228
189- if (isAllNulls (columnChunk )) {
229+ if (isAllNulls (meta )) {
190230 // we are looking for records where v <= someValue
191231 // this chunk is all nulls, so we can drop it
192- return true ;
232+ return BLOCK_CANNOT_MATCH ;
193233 }
194234
235+ T value = ltEq .getValue ();
236+
195237 // drop if value < min
196238 return value .compareTo (stats .genericGetMin ()) < 0 ;
197239 }
198240
// Evaluates gt(column, value) against one column chunk. Returns
// BLOCK_CANNOT_MATCH (true) when the chunk can be dropped and
// BLOCK_MIGHT_MATCH (false) when it has to be kept. Checks, in order:
//   1. no chunk metadata for the column: drop (missing column is all null)
//   2. statistics are empty: keep
//   3. isAllNulls(meta): drop
//   4. otherwise: drop when value >= max, so no stored value is above it
// NOTE(review): value is dereferenced without a null check; presumably a Gt
// predicate never carries a null value - confirm against the predicate class.
199241 @ Override
242+ @ SuppressWarnings ("unchecked" )
200243 public <T extends Comparable <T >> Boolean visit (Gt <T > gt ) {
201244 Column <T > filterColumn = gt .getColumn ();
202- T value = gt .getValue ();
203- ColumnChunkMetaData columnChunk = getColumnChunk (filterColumn .getColumnPath ());
204- Statistics <T > stats = columnChunk .getStatistics ();
245+ ColumnChunkMetaData meta = getColumnChunk (filterColumn .getColumnPath ());
246+
247+ if (meta == null ) {
248+ // the column is missing and always null, which is never greater than a
249+ // value. for all x, null is never > x.
250+ return BLOCK_CANNOT_MATCH ;
251+ }
252+
253+ Statistics <T > stats = meta .getStatistics ();
205254
206255 if (stats .isEmpty ()) {
207256 // we have no statistics available, we cannot drop any chunks
208- return false ;
257+ return BLOCK_MIGHT_MATCH ;
209258 }
210259
211- if (isAllNulls (columnChunk )) {
260+ if (isAllNulls (meta )) {
212261 // we are looking for records where v > someValue
213262 // this chunk is all nulls, so we can drop it
214- return true ;
263+ return BLOCK_CANNOT_MATCH ;
215264 }
216265
266+ T value = gt .getValue ();
267+
217268 // drop if value >= max
218269 return value .compareTo (stats .genericGetMax ()) >= 0 ;
219270 }
220271
// Evaluates gtEq(column, value) against one column chunk. Returns
// BLOCK_CANNOT_MATCH (true) when the chunk can be dropped and
// BLOCK_MIGHT_MATCH (false) when it has to be kept. Checks, in order:
//   1. no chunk metadata for the column: drop (missing column is all null)
//   2. statistics are empty: keep
//   3. isAllNulls(meta): drop
//   4. otherwise: drop when value > max; a value equal to max still matches
// NOTE(review): value is dereferenced without a null check; presumably a GtEq
// predicate never carries a null value - confirm against the predicate class.
221272 @ Override
273+ @ SuppressWarnings ("unchecked" )
222274 public <T extends Comparable <T >> Boolean visit (GtEq <T > gtEq ) {
223275 Column <T > filterColumn = gtEq .getColumn ();
224- T value = gtEq .getValue ();
225- ColumnChunkMetaData columnChunk = getColumnChunk (filterColumn .getColumnPath ());
226- Statistics <T > stats = columnChunk .getStatistics ();
276+ ColumnChunkMetaData meta = getColumnChunk (filterColumn .getColumnPath ());
277+
278+ if (meta == null ) {
279+ // the column is missing and always null, which is never greater than or
280+ // equal to a value. for all x, null is never >= x.
281+ return BLOCK_CANNOT_MATCH ;
282+ }
283+
284+ Statistics <T > stats = meta .getStatistics ();
227285
228286 if (stats .isEmpty ()) {
229287 // we have no statistics available, we cannot drop any chunks
230- return false ;
288+ return BLOCK_MIGHT_MATCH ;
231289 }
232290
233- if (isAllNulls (columnChunk )) {
291+ if (isAllNulls (meta )) {
234292 // we are looking for records where v >= someValue
235293 // this chunk is all nulls, so we can drop it
236- return true ;
294+ return BLOCK_CANNOT_MATCH ;
237295 }
238296
297+ T value = gtEq .getValue ();
298+
239299 // drop if value > max
240300 return value .compareTo (stats .genericGetMax ()) > 0 ;
241301 }
0 commit comments