# Expose page-level arrow reader API (#4298) #4307
*Changes from 2 commits*
```diff
@@ -26,12 +26,9 @@ use arrow_array::{RecordBatch, RecordBatchReader};
 use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef};
 use arrow_select::filter::prep_null_mask_filter;

-use crate::arrow::array_reader::{
-    build_array_reader, ArrayReader, FileReaderRowGroupCollection, RowGroupCollection,
-};
-use crate::arrow::schema::parquet_to_array_schema_and_fields;
-use crate::arrow::schema::ParquetField;
-use crate::arrow::ProjectionMask;
+use crate::arrow::array_reader::{build_array_reader, ArrayReader, FileReaderRowGroups};
+use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
+use crate::arrow::{FieldLevels, ProjectionMask};
 use crate::errors::{ParquetError, Result};
 use crate::file::metadata::ParquetMetaData;
 use crate::file::reader::{ChunkReader, SerializedFileReader};
```
```diff
@@ -41,6 +38,7 @@ use crate::schema::types::SchemaDescriptor;
 mod filter;
 mod selection;

+pub use crate::arrow::array_reader::RowGroups;
 pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
 pub use selection::{RowSelection, RowSelector};
```
```diff
@@ -87,7 +85,7 @@ impl<T> ArrowReaderBuilder<T> {
            false => metadata.file_metadata().key_value_metadata(),
        };

-       let (schema, fields) = parquet_to_array_schema_and_fields(
+       let (schema, fields) = parquet_to_arrow_schema_and_fields(
            metadata.file_metadata().schema_descr(),
            ProjectionMask::all(),
            kv_metadata,
```
```diff
@@ -269,8 +267,7 @@ impl<T: ChunkReader + 'static> ArrowReaderBuilder<SyncReader<T>> {
    ///
    /// Note: this will eagerly evaluate any `RowFilter` before returning
    pub fn build(self) -> Result<ParquetRecordBatchReader> {
-       let reader =
-           FileReaderRowGroupCollection::new(Arc::new(self.input.0), self.row_groups);
+       let reader = FileReaderRowGroups::new(Arc::new(self.input.0), self.row_groups);

        let mut filter = self.filter;
        let mut selection = self.selection;
```
```diff
@@ -420,6 +417,30 @@ impl ParquetRecordBatchReader {
            .build()
    }

+   /// Create a new [`ParquetRecordBatchReader`] from the provided [`RowGroups`]
+   ///
+   /// Note: this is a low-level interface see [`ParquetRecordBatchReader::try_new`] for a
+   /// higher-level interface for reading parquet data from a file
+   pub fn try_new_with_row_groups(
+       levels: &FieldLevels,
+       row_groups: &dyn RowGroups,
+       batch_size: usize,
+       selection: Option<RowSelection>,
+   ) -> Result<Self> {
```

> **Review comment:** We don't need to include a […]
```diff
+       let array_reader = build_array_reader(
+           levels.levels.as_ref(),
+           &ProjectionMask::all(),
+           row_groups,
+       )?;
+
+       Ok(Self {
+           batch_size,
+           array_reader,
+           schema: Arc::new(Schema::new(levels.fields.clone())),
+           selection: selection.map(|s| s.trim().into()),
+       })
+   }
+
    /// Create a new [`ParquetRecordBatchReader`] that will read at most `batch_size` rows at
    /// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None`
    /// all rows will be returned
```
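For context, a sketch of how this new constructor might be driven end-to-end. Everything here beyond `try_new_with_row_groups` itself — the helper name, the 1024 batch size, and how `levels` and `row_groups` are produced — is an illustrative assumption, not something this diff defines:

```rust
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection};
use parquet::arrow::FieldLevels;
use parquet::errors::Result;

/// Hypothetical helper that drains all batches from an arbitrary page source.
/// `levels` and `row_groups` are assumed to be obtained elsewhere (schema
/// conversion and a custom `RowGroups` implementation, neither shown here).
fn read_all(
    levels: &FieldLevels,
    row_groups: &dyn RowGroups,
    selection: Option<RowSelection>,
) -> Result<usize> {
    // 1024 is an arbitrary batch size chosen for illustration
    let reader = ParquetRecordBatchReader::try_new_with_row_groups(
        levels,
        row_groups,
        1024,
        selection,
    )?;

    let mut rows = 0;
    for batch in reader {
        // The iterator yields `Result<RecordBatch, ArrowError>`;
        // `ParquetError` converts from `ArrowError`, so `?` works here
        rows += batch?.num_rows();
    }
    Ok(rows)
}
```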
```diff
@@ -173,9 +173,14 @@ impl RowSelection {
        }
    }

-   /// Given an offset index, return the offset ranges for all data pages selected by `self`
-   #[cfg(any(test, feature = "async"))]
-   pub(crate) fn scan_ranges(
+   /// Given an offset index, return the byte ranges for all data pages selected by `self`
+   ///
+   /// This is useful for determining what byte ranges to fetch from underlying storage
+   ///
+   /// Note: this method does not make any effort to combine consecutive ranges, nor coalesce
+   /// ranges that are close together. This is instead delegated to the IO subsystem to optimise,
+   /// e.g. [`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges)
+   pub fn scan_ranges(
        &self,
        page_locations: &[crate::format::PageLocation],
    ) -> Vec<Range<usize>> {
```

> **Review comment:** 👍
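With `scan_ranges` now public, a caller can ask a `RowSelection` which byte ranges it actually needs before issuing any IO. A small hedged example; the page layout below is fabricated, and `PageLocation::new`'s argument order is assumed from the thrift-generated bindings:

```rust
use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
use parquet::format::PageLocation;

fn main() {
    // Three fabricated 100-row data pages of 4096 bytes each,
    // laid out back-to-back starting at byte offset 4
    let pages: Vec<PageLocation> = (0..3i64)
        .map(|i| PageLocation::new(4 + i * 4096, 4096, i * 100))
        .collect();

    // Skip the first 150 rows, then select the next 50 (rows 150..200),
    // all of which fall inside the second page (rows 100..200)
    let selection = RowSelection::from(vec![
        RowSelector::skip(150),
        RowSelector::select(50),
    ]);

    // Only the second page's byte range should need fetching
    let ranges = selection.scan_ranges(&pages);
    println!("{ranges:?}"); // expected: [4100..8196]
}
```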
```diff
@@ -24,7 +24,7 @@ use crate::basic::{ConvertedType, Repetition};
 use crate::errors::ParquetError;
 use crate::errors::Result;
 use crate::schema::types::{SchemaDescriptor, Type, TypePtr};
-use arrow_schema::{DataType, Field, Schema, SchemaBuilder};
+use arrow_schema::{DataType, Field, Fields, SchemaBuilder};

 fn get_repetition(t: &Type) -> Repetition {
     let info = t.get_basic_info();
```
```diff
@@ -35,6 +35,7 @@ fn get_repetition(t: &Type) -> Repetition {
 }

+/// Representation of a parquet file, in terms of arrow schema elements
 #[derive(Debug, Clone)]
 pub struct ParquetField {
     /// The level which represents an insertion into the current list
     /// i.e. guaranteed to be > 0 for a list type
```

> **Review comment:** When reading this PR I see this comment and wonder "is it really a parquet file", or is it more like a "parquet field"? Or a "possibly nested field"? 🤔
>
> **Reply:** Reworded, I agree the use of "file" here is very misleading.
```diff
@@ -82,6 +83,7 @@ impl ParquetField {
    }
 }

+#[derive(Debug, Clone)]
 pub enum ParquetFieldType {
     Primitive {
         /// The index of the column in parquet
```
```diff
@@ -554,13 +556,13 @@ fn convert_field(

 /// Computes the [`ParquetField`] for the provided [`SchemaDescriptor`] with `leaf_columns` listing
 /// the indexes of leaf columns to project, and `embedded_arrow_schema` the optional
-/// [`Schema`] embedded in the parquet metadata
+/// [`Fields`] embedded in the parquet metadata
 ///
 /// Note: This does not support out of order column projection
 pub fn convert_schema(
     schema: &SchemaDescriptor,
     mask: ProjectionMask,
-    embedded_arrow_schema: Option<&Schema>,
+    embedded_arrow_schema: Option<&Fields>,
 ) -> Result<Option<ParquetField>> {
     let mut visitor = Visitor {
         next_col_idx: 0,
```
```diff
@@ -570,7 +572,7 @@ fn convert_schema(
     let context = VisitorContext {
         rep_level: 0,
         def_level: 0,
-        data_type: embedded_arrow_schema.map(|s| DataType::Struct(s.fields().clone())),
+        data_type: embedded_arrow_schema.map(|fields| DataType::Struct(fields.clone())),
     };

     visitor.dispatch(&schema.root_schema_ptr(), context)
```
> **Review comment:** This lets you construct a `ParquetRecordBatchReader` from an arbitrary source of pages.
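To make that concrete, here is a rough sketch of what such an "arbitrary source of pages" could look like. The two trait methods follow the `RowGroups` surface as currently published; the struct, its fields, and the error stub are entirely hypothetical:

```rust
use parquet::arrow::arrow_reader::RowGroups;
use parquet::column::page::PageIterator;
use parquet::errors::{ParquetError, Result};

/// Hypothetical page source, e.g. pages cached in memory or served over
/// the network rather than decoded from a local parquet file
struct CachedPages {
    num_rows: usize,
    // ... per-leaf-column page data would live here ...
}

impl RowGroups for CachedPages {
    fn num_rows(&self) -> usize {
        self.num_rows
    }

    fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>> {
        // A real implementation would return an iterator of `PageReader`s
        // yielding the stored pages for leaf column `i`
        Err(ParquetError::NYI(format!("serve pages for column {i}")))
    }
}
```

A value of this type could then be passed as the `row_groups` argument to `ParquetRecordBatchReader::try_new_with_row_groups`, as in the earlier sketch.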