Skip to content

Commit 2bca71e

Browse files
authored
Introduce ReadOptions with a builder API so that the Parquet reader keeps only the row groups that satisfy all filters, and enable filtering row groups by range. (#1389)
* Filter row groups by comparing midpoint with offset range * lint * ReadOptions with builder API * fix comments * precise range doc * tab to space
1 parent 1efd81d commit 2bca71e

File tree

1 file changed

+168
-17
lines changed

1 file changed

+168
-17
lines changed

parquet/src/file/serialized_reader.rs

Lines changed: 168 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,57 @@ pub struct SerializedFileReader<R: ChunkReader> {
127127
metadata: ParquetMetaData,
128128
}
129129

130+
/// A builder for [`ReadOptions`].
131+
///
/// Predicates added to the builder are combined with a logical 'AND':
132+
/// a row group is kept only when every predicate returns `true` for it.
133+
pub struct ReadOptionsBuilder {
134+
/// Row group predicates, in the order they were added. Each one is called
/// with the row group's metadata and its index among the file's row groups.
predicates: Vec<Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>>,
135+
}
136+
137+
impl ReadOptionsBuilder {
138+
/// Creates a builder with no predicates.
139+
pub fn new() -> Self {
140+
ReadOptionsBuilder { predicates: vec![] }
141+
}
142+
143+
/// Adds a predicate on row group metadata to the read options.
144+
///
/// The predicate receives the row group's metadata and its index; only row
/// groups for which it returns `true` are kept.
145+
pub fn with_predicate(
146+
mut self,
147+
predicate: Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>,
148+
) -> Self {
149+
self.predicates.push(predicate);
150+
self
151+
}
152+
153+
/// Adds a range predicate that keeps a row group only if its midpoint offset
154+
/// lies within the half-open range `[start, end)`, i.e. `start <= mid < end`.
///
/// The midpoint is computed by `get_midpoint_offset`.
///
/// # Panics
///
/// Panics if `start >= end`.
155+
pub fn with_range(mut self, start: i64, end: i64) -> Self {
156+
assert!(start < end);
157+
let predicate = move |rg: &RowGroupMetaData, _: usize| {
158+
let mid = get_midpoint_offset(rg);
159+
mid >= start && mid < end
160+
};
161+
self.predicates.push(Box::new(predicate));
162+
self
163+
}
164+
165+
/// Consumes the builder and returns the [`ReadOptions`] holding every
/// predicate that was added.
166+
pub fn build(self) -> ReadOptions {
167+
ReadOptions {
168+
predicates: self.predicates,
169+
}
170+
}
171+
}
172+
173+
/// A collection of options for reading a Parquet file.
174+
///
175+
/// Currently, only predicates on row group metadata are supported.
176+
/// All predicates are combined with a logical 'AND' to filter the row groups.
///
/// Values are produced by [`ReadOptionsBuilder::build`] and consumed by
/// `SerializedFileReader::new_with_options`.
177+
pub struct ReadOptions {
178+
/// Row group predicates; each is called with the row group's metadata and
/// its index among the file's row groups.
predicates: Vec<Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>>,
179+
}
180+
130181
impl<R: 'static + ChunkReader> SerializedFileReader<R> {
131182
/// Creates file reader from a Parquet file.
132183
/// Returns error if Parquet file does not exist or is corrupt.
@@ -138,25 +189,48 @@ impl<R: 'static + ChunkReader> SerializedFileReader<R> {
138189
})
139190
}
140191

141-
/// Filters row group metadata to only those row groups,
142-
/// for which the predicate function returns true
143-
pub fn filter_row_groups(
144-
&mut self,
145-
predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool,
146-
) {
192+
/// Creates a file reader from a Parquet file, applying the given read options.
193+
///
/// The metadata is parsed from `chunk_reader`, and only the row groups for
/// which every predicate in `options` returns `true` are kept in the
/// reader's metadata. With no predicates, all row groups are kept.
///
/// Returns an error if parsing the metadata fails.
194+
pub fn new_with_options(chunk_reader: R, options: ReadOptions) -> Result<Self> {
195+
let metadata = footer::parse_metadata(&chunk_reader)?;
196+
let mut predicates = options.predicates;
197+
let row_groups = metadata.row_groups().to_vec();
147198
let mut filtered_row_groups = Vec::<RowGroupMetaData>::new();
148-
for (i, row_group_metadata) in self.metadata.row_groups().iter().enumerate() {
149-
if predicate(row_group_metadata, i) {
150-
filtered_row_groups.push(row_group_metadata.clone());
199+
// Predicates are ANDed: evaluation stops at the first one that rejects the
// row group, so later predicates are not called for that row group.
for (i, rg_meta) in row_groups.into_iter().enumerate() {
200+
let mut keep = true;
201+
for predicate in &mut predicates {
202+
if !predicate(&rg_meta, i) {
203+
keep = false;
204+
break;
205+
}
206+
}
207+
if keep {
208+
filtered_row_groups.push(rg_meta);
151209
}
152210
}
153-
self.metadata = ParquetMetaData::new(
154-
self.metadata.file_metadata().clone(),
155-
filtered_row_groups,
156-
);
211+
212+
Ok(Self {
213+
chunk_reader: Arc::new(chunk_reader),
214+
metadata: ParquetMetaData::new(
215+
metadata.file_metadata().clone(),
216+
filtered_row_groups,
217+
),
218+
})
157219
}
158220
}
159221

222+
/// Returns the midpoint offset of a row group.
///
/// The start offset is taken from the row group's first column (`column(0)`):
/// its data page offset, or its dictionary page offset when one is present
/// and smaller. Half of the row group's `compressed_size()` is added to it.
///
/// NOTE(review): presumably `column(0)` panics for a row group with no
/// columns — confirm against `RowGroupMetaData::column`.
223+
fn get_midpoint_offset(meta: &RowGroupMetaData) -> i64 {
224+
let col = meta.column(0);
225+
let mut offset = col.data_page_offset();
226+
// Use the dictionary page offset when it precedes the data page offset.
if let Some(dic_offset) = col.dictionary_page_offset() {
227+
if offset > dic_offset {
228+
offset = dic_offset
229+
}
230+
};
231+
offset + meta.compressed_size() / 2
232+
}
233+
160234
impl<R: 'static + ChunkReader> FileReader for SerializedFileReader<R> {
161235
fn metadata(&self) -> &ParquetMetaData {
162236
&self.metadata
@@ -790,19 +864,96 @@ mod tests {
790864
}
791865

792866
// Baseline: a reader created without read options reports one row group for
// `alltypes_plain.parquet`.
#[test]
793-
fn test_file_reader_filter_row_groups() -> Result<()> {
867+
fn test_file_reader_with_no_filter() -> Result<()> {
868+
let test_file = get_test_file("alltypes_plain.parquet");
869+
let origin_reader = SerializedFileReader::new(test_file)?;
870+
// check the initial (unfiltered) number of row groups
871+
let metadata = origin_reader.metadata();
872+
assert_eq!(metadata.num_row_groups(), 1);
873+
Ok(())
874+
}
875+
876+
// A predicate that always returns `false` must filter out every row group.
#[test]
877+
fn test_file_reader_filter_row_groups_with_predicate() -> Result<()> {
794878
let test_file = get_test_file("alltypes_plain.parquet");
795-
let mut reader = SerializedFileReader::new(test_file)?;
879+
let read_options = ReadOptionsBuilder::new()
880+
.with_predicate(Box::new(|_, _| false))
881+
.build();
882+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
883+
let metadata = reader.metadata();
884+
assert_eq!(metadata.num_row_groups(), 0);
885+
Ok(())
886+
}
796887

888+
// Checks the half-open range semantics of `with_range`: the row group is kept
// when its midpoint is inside `[start, end)` and dropped when `end` equals the
// midpoint.
#[test]
889+
fn test_file_reader_filter_row_groups_with_range() -> Result<()> {
890+
let test_file = get_test_file("alltypes_plain.parquet");
891+
let origin_reader = SerializedFileReader::new(test_file)?;
797892
// check the initial (unfiltered) number of row groups
893+
let metadata = origin_reader.metadata();
894+
assert_eq!(metadata.num_row_groups(), 1);
895+
let mid = get_midpoint_offset(metadata.row_group(0));
896+
897+
// range [0, mid + 1) contains the midpoint: the row group is kept
let test_file = get_test_file("alltypes_plain.parquet");
898+
let read_options = ReadOptionsBuilder::new().with_range(0, mid + 1).build();
899+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
900+
let metadata = reader.metadata();
901+
assert_eq!(metadata.num_row_groups(), 1);
902+
903+
// range [0, mid) excludes the midpoint (end is exclusive): the row group is dropped
let test_file = get_test_file("alltypes_plain.parquet");
904+
let read_options = ReadOptionsBuilder::new().with_range(0, mid).build();
905+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
906+
let metadata = reader.metadata();
907+
assert_eq!(metadata.num_row_groups(), 0);
908+
Ok(())
909+
}
910+
911+
// Checks that a custom predicate and a range predicate are combined with AND:
// the row group is kept only when both accept it.
#[test]
912+
fn test_file_reader_filter_row_groups_and_range() -> Result<()> {
913+
let test_file = get_test_file("alltypes_plain.parquet");
914+
let origin_reader = SerializedFileReader::new(test_file)?;
915+
let metadata = origin_reader.metadata();
916+
let mid = get_midpoint_offset(metadata.row_group(0));
917+
918+
// predicate accepts, range [mid, mid + 1) contains the midpoint: kept
919+
let test_file = get_test_file("alltypes_plain.parquet");
920+
let read_options = ReadOptionsBuilder::new()
921+
.with_predicate(Box::new(|_, _| true))
922+
.with_range(mid, mid + 1)
923+
.build();
924+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
798925
let metadata = reader.metadata();
799926
assert_eq!(metadata.num_row_groups(), 1);
800927

801-
// test filtering out all row groups
802-
reader.filter_row_groups(&|_, _| false);
928+
// predicate accepts, range [0, mid) excludes the midpoint: dropped
929+
let test_file = get_test_file("alltypes_plain.parquet");
930+
let read_options = ReadOptionsBuilder::new()
931+
.with_predicate(Box::new(|_, _| true))
932+
.with_range(0, mid)
933+
.build();
934+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
803935
let metadata = reader.metadata();
804936
assert_eq!(metadata.num_row_groups(), 0);
805937

938+
// predicate rejects, range [mid, mid + 1) contains the midpoint: dropped
939+
let test_file = get_test_file("alltypes_plain.parquet");
940+
let read_options = ReadOptionsBuilder::new()
941+
.with_predicate(Box::new(|_, _| false))
942+
.with_range(mid, mid + 1)
943+
.build();
944+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
945+
let metadata = reader.metadata();
946+
assert_eq!(metadata.num_row_groups(), 0);
947+
948+
// predicate rejects, range [0, mid) excludes the midpoint: dropped
949+
let test_file = get_test_file("alltypes_plain.parquet");
950+
let read_options = ReadOptionsBuilder::new()
951+
.with_predicate(Box::new(|_, _| false))
952+
.with_range(0, mid)
953+
.build();
954+
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
955+
let metadata = reader.metadata();
956+
assert_eq!(metadata.num_row_groups(), 0);
806957
Ok(())
807958
}
808959
}

0 commit comments

Comments
 (0)