@@ -127,6 +127,57 @@ pub struct SerializedFileReader<R: ChunkReader> {
127127 metadata : ParquetMetaData ,
128128}
129129
130+ /// A builder for [`ReadOptions`].
131+ /// For the predicates that are added to the builder,
132+ /// they will be chained using 'AND' to filter the row groups.
133+ pub struct ReadOptionsBuilder {
134+ predicates : Vec < Box < dyn FnMut ( & RowGroupMetaData , usize ) -> bool > > ,
135+ }
136+
137+ impl ReadOptionsBuilder {
138+ /// New builder
139+ pub fn new ( ) -> Self {
140+ ReadOptionsBuilder { predicates : vec ! [ ] }
141+ }
142+
143+ /// Add a predicate on row group metadata to the reading option,
144+ /// Filter only row groups that match the predicate criteria
145+ pub fn with_predicate (
146+ mut self ,
147+ predicate : Box < dyn FnMut ( & RowGroupMetaData , usize ) -> bool > ,
148+ ) -> Self {
149+ self . predicates . push ( predicate) ;
150+ self
151+ }
152+
153+ /// Add a range predicate on filtering row groups if their midpoints are within
154+ /// the Closed-Open range `[start..end) {x | start <= x < end}`
155+ pub fn with_range ( mut self , start : i64 , end : i64 ) -> Self {
156+ assert ! ( start < end) ;
157+ let predicate = move |rg : & RowGroupMetaData , _: usize | {
158+ let mid = get_midpoint_offset ( rg) ;
159+ mid >= start && mid < end
160+ } ;
161+ self . predicates . push ( Box :: new ( predicate) ) ;
162+ self
163+ }
164+
165+ /// Seal the builder and return the read options
166+ pub fn build ( self ) -> ReadOptions {
167+ ReadOptions {
168+ predicates : self . predicates ,
169+ }
170+ }
171+ }
172+
/// A collection of options for reading a Parquet file.
///
/// Currently, only predicates on row group metadata are supported.
/// All predicates are chained using 'AND' to filter the row groups: a row
/// group is kept only when every predicate returns `true` for it.
///
/// Values are produced by `ReadOptionsBuilder::build` and consumed by
/// `SerializedFileReader::new_with_options`.
pub struct ReadOptions {
    // Each predicate is called with a row group's metadata and that row
    // group's position in the file's row group list.
    predicates: Vec<Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>>,
}
180+
130181impl < R : ' static + ChunkReader > SerializedFileReader < R > {
131182 /// Creates file reader from a Parquet file.
132183 /// Returns error if Parquet file does not exist or is corrupt.
@@ -138,25 +189,48 @@ impl<R: 'static + ChunkReader> SerializedFileReader<R> {
138189 } )
139190 }
140191
141- /// Filters row group metadata to only those row groups,
142- /// for which the predicate function returns true
143- pub fn filter_row_groups (
144- & mut self ,
145- predicate : & dyn Fn ( & RowGroupMetaData , usize ) -> bool ,
146- ) {
192+ /// Creates file reader from a Parquet file with read options.
193+ /// Returns error if Parquet file does not exist or is corrupt.
194+ pub fn new_with_options ( chunk_reader : R , options : ReadOptions ) -> Result < Self > {
195+ let metadata = footer :: parse_metadata ( & chunk_reader ) ? ;
196+ let mut predicates = options . predicates ;
197+ let row_groups = metadata . row_groups ( ) . to_vec ( ) ;
147198 let mut filtered_row_groups = Vec :: < RowGroupMetaData > :: new ( ) ;
148- for ( i, row_group_metadata) in self . metadata . row_groups ( ) . iter ( ) . enumerate ( ) {
149- if predicate ( row_group_metadata, i) {
150- filtered_row_groups. push ( row_group_metadata. clone ( ) ) ;
199+ for ( i, rg_meta) in row_groups. into_iter ( ) . enumerate ( ) {
200+ let mut keep = true ;
201+ for predicate in & mut predicates {
202+ if !predicate ( & rg_meta, i) {
203+ keep = false ;
204+ break ;
205+ }
206+ }
207+ if keep {
208+ filtered_row_groups. push ( rg_meta) ;
151209 }
152210 }
153- self . metadata = ParquetMetaData :: new (
154- self . metadata . file_metadata ( ) . clone ( ) ,
155- filtered_row_groups,
156- ) ;
211+
212+ Ok ( Self {
213+ chunk_reader : Arc :: new ( chunk_reader) ,
214+ metadata : ParquetMetaData :: new (
215+ metadata. file_metadata ( ) . clone ( ) ,
216+ filtered_row_groups,
217+ ) ,
218+ } )
157219 }
158220}
159221
222+ /// Get midpoint offset for a row group
223+ fn get_midpoint_offset ( meta : & RowGroupMetaData ) -> i64 {
224+ let col = meta. column ( 0 ) ;
225+ let mut offset = col. data_page_offset ( ) ;
226+ if let Some ( dic_offset) = col. dictionary_page_offset ( ) {
227+ if offset > dic_offset {
228+ offset = dic_offset
229+ }
230+ } ;
231+ offset + meta. compressed_size ( ) / 2
232+ }
233+
160234impl < R : ' static + ChunkReader > FileReader for SerializedFileReader < R > {
161235 fn metadata ( & self ) -> & ParquetMetaData {
162236 & self . metadata
@@ -790,19 +864,96 @@ mod tests {
790864 }
791865
#[test]
fn test_file_reader_with_no_filter() -> Result<()> {
    // Without any read options the reader reports the file's single row group.
    let unfiltered = SerializedFileReader::new(get_test_file("alltypes_plain.parquet"))?;
    assert_eq!(unfiltered.metadata().num_row_groups(), 1);
    Ok(())
}
875+
#[test]
fn test_file_reader_filter_row_groups_with_predicate() -> Result<()> {
    // A predicate that rejects everything must leave no row groups behind.
    let reject_all = ReadOptionsBuilder::new()
        .with_predicate(Box::new(|_, _| false))
        .build();
    let filtered = SerializedFileReader::new_with_options(
        get_test_file("alltypes_plain.parquet"),
        reject_all,
    )?;
    assert_eq!(filtered.metadata().num_row_groups(), 0);
    Ok(())
}
796887
#[test]
fn test_file_reader_filter_row_groups_with_range() -> Result<()> {
    // Baseline: the unfiltered file has exactly one row group.
    let baseline = SerializedFileReader::new(get_test_file("alltypes_plain.parquet"))?;
    assert_eq!(baseline.metadata().num_row_groups(), 1);
    let mid = get_midpoint_offset(baseline.metadata().row_group(0));

    // The range end is exclusive: `mid + 1` still covers the midpoint,
    // while `mid` itself does not.
    let cases = [(mid + 1, 1), (mid, 0)];
    for &(range_end, expected_row_groups) in cases.iter() {
        let options = ReadOptionsBuilder::new().with_range(0, range_end).build();
        let filtered = SerializedFileReader::new_with_options(
            get_test_file("alltypes_plain.parquet"),
            options,
        )?;
        assert_eq!(filtered.metadata().num_row_groups(), expected_row_groups);
    }
    Ok(())
}
910+
#[test]
fn test_file_reader_filter_row_groups_and_range() -> Result<()> {
    let baseline = SerializedFileReader::new(get_test_file("alltypes_plain.parquet"))?;
    let mid = get_midpoint_offset(baseline.metadata().row_group(0));

    // Opens the file with a constant predicate chained with a range
    // predicate and reports how many row groups survive.
    let surviving_row_groups = |accept: bool, start: i64, end: i64| -> Result<usize> {
        let options = ReadOptionsBuilder::new()
            .with_predicate(Box::new(move |_, _| accept))
            .with_range(start, end)
            .build();
        let filtered = SerializedFileReader::new_with_options(
            get_test_file("alltypes_plain.parquet"),
            options,
        )?;
        Ok(filtered.metadata().num_row_groups())
    };

    // (constant predicate result, range start, range end, expected row groups)
    let cases = [
        (true, mid, mid + 1, 1),  // true, true predicate
        (true, 0, mid, 0),        // true, false predicate
        (false, mid, mid + 1, 0), // false, true predicate
        (false, 0, mid, 0),       // false, false predicate
    ];
    for &(accept, start, end, expected_row_groups) in cases.iter() {
        assert_eq!(surviving_row_groups(accept, start, end)?, expected_row_groups);
    }
    Ok(())
}
808959}
0 commit comments