-
Notifications
You must be signed in to change notification settings - Fork 875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Minor: Update doc strings about Page Index / Column Index #3625
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,7 +50,25 @@ use crate::schema::types::{ | |
Type as SchemaType, | ||
}; | ||
|
||
/// [`Index`] for each row group of each column. | ||
/// | ||
/// `column_index[row_group_number][column_number]` holds the | ||
/// [`Index`] corresponding to column `column_number` of row group | ||
/// `row_group_number`. | ||
/// | ||
/// For example `column_index[2][3]` holds the [`Index`] for the fourth | ||
/// column in the third row group of the parquet file. | ||
pub type ParquetColumnIndex = Vec<Vec<Index>>; | ||
|
||
/// [`PageLocation`] for each datapage of each row group of each column. | ||
/// | ||
/// `offset_index[row_group_number][column_number][page_number]` holds | ||
/// the [`PageLocation`] corresponding to page `page_number` of column | ||
/// `column_number` of row group `row_group_number`. | ||
/// | ||
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice write up! 👍 |
||
/// the fifth page of the fourth column in the third row group of the | ||
/// parquet file. | ||
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>; | ||
|
||
/// Global Parquet metadata. | ||
|
@@ -65,8 +83,8 @@ pub struct ParquetMetaData { | |
} | ||
|
||
impl ParquetMetaData { | ||
/// Creates Parquet metadata from file metadata and a list of row group metadata `Arc`s | ||
/// for each available row group. | ||
/// Creates Parquet metadata from file metadata and a list of row | ||
/// group metadata | ||
pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self { | ||
ParquetMetaData { | ||
file_metadata, | ||
|
@@ -76,6 +94,8 @@ impl ParquetMetaData { | |
} | ||
} | ||
|
||
/// Creates Parquet metadata from file metadata, a list of row | ||
/// group metadata, and the column index structures. | ||
pub fn new_with_page_index( | ||
file_metadata: FileMetaData, | ||
row_groups: Vec<RowGroupMetaData>, | ||
|
@@ -232,6 +252,7 @@ pub struct RowGroupMetaData { | |
sorting_columns: Option<Vec<SortingColumn>>, | ||
total_byte_size: i64, | ||
schema_descr: SchemaDescPtr, | ||
/// `page_offset_index[column_number][page_number]` | ||
alamb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
page_offset_index: Option<Vec<Vec<PageLocation>>>, | ||
} | ||
|
||
|
@@ -277,6 +298,8 @@ impl RowGroupMetaData { | |
} | ||
|
||
/// Returns a reference to the page offset index of all columns in this row group. | ||
/// | ||
/// The returned vector contains `page_offset[column_number][page_number]` | ||
pub fn page_offset_index(&self) -> Option<&Vec<Vec<PageLocation>>> { | ||
self.page_offset_index.as_ref() | ||
} | ||
|
@@ -292,6 +315,8 @@ impl RowGroupMetaData { | |
} | ||
|
||
/// Sets page offset index for this row group. | ||
/// | ||
/// The vector represents `page_offset[column_number][page_number]` | ||
pub fn set_page_offset(&mut self, page_offset: Vec<Vec<PageLocation>>) { | ||
self.page_offset_index = Some(page_offset); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,8 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! [`Index`] structures holding decoded [`ColumnIndex`] information | ||
|
||
use crate::basic::Type; | ||
use crate::data_type::private::ParquetValueType; | ||
use crate::data_type::{ByteArray, Int96}; | ||
|
@@ -23,7 +25,14 @@ use crate::format::{BoundaryOrder, ColumnIndex}; | |
use crate::util::bit_util::from_le_slice; | ||
use std::fmt::Debug; | ||
|
||
/// The statistics in one page | ||
/// PageIndex Statistics for one data page, as described in [Column Index]. | ||
/// | ||
/// One significant difference from the row group level | ||
/// [`Statistics`](crate::format::Statistics) is that page level | ||
/// statistics may not store actual column values as min and max | ||
/// (e.g. they may store truncated strings to save space) | ||
/// | ||
/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
#[derive(Debug, Clone, PartialEq, Eq, Hash)] | ||
pub struct PageIndex<T> { | ||
/// The minimum value. It is `None` when all values are null | ||
|
@@ -48,6 +57,10 @@ impl<T> PageIndex<T> { | |
|
||
#[derive(Debug, Clone, PartialEq)] | ||
#[allow(non_camel_case_types)] | ||
/// Typed statistics for a data page in a column chunk. This structure | ||
/// is obtained from decoding the [ColumnIndex] in the parquet file | ||
/// and can be used to skip decoding pages while reading the file | ||
/// data. | ||
pub enum Index { | ||
/// Sometimes reading page index from parquet file | ||
/// will only return pageLocations without min_max index, | ||
|
@@ -90,14 +103,17 @@ impl Index { | |
} | ||
} | ||
|
||
/// An index of a column of [`Type`] physical representation | ||
/// Stores the [`PageIndex`] for each page of a column with [`Type`] | ||
#[derive(Debug, Clone, PartialEq, Eq, Hash)] | ||
pub struct NativeIndex<T: ParquetValueType> { | ||
/// The physical type | ||
/// The physical type of this column | ||
pub physical_type: Type, | ||
/// The indexes, one item per page | ||
pub indexes: Vec<PageIndex<T>>, | ||
/// the order | ||
/// If the min/max elements are ordered, and if so in which | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the correct description. |
||
/// direction. See [source] for details. | ||
/// | ||
/// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964 | ||
pub boundary_order: BoundaryOrder, | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,8 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Support for reading [`Index`] and [`PageLocation`] from parquet metadata. | ||
|
||
use crate::basic::Type; | ||
use crate::data_type::Int96; | ||
use crate::errors::ParquetError; | ||
|
@@ -25,8 +27,17 @@ use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; | |
use std::io::{Cursor, Read}; | ||
use thrift::protocol::{TCompactInputProtocol, TSerializable}; | ||
|
||
/// Read on row group's all columns indexes and change into [`Index`] | ||
/// If not the format not available return an empty vector. | ||
/// Reads per-column [`Index`] for all columns of a row group by | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I may rant a little, the use of the terms |
||
/// decoding [`ColumnIndex`]. | ||
/// | ||
/// Returns a vector of `index[column_number]`. | ||
/// | ||
/// Returns an empty vector if this row group does not contain a | ||
/// [`ColumnIndex`]. | ||
/// | ||
/// See [Column Index Documentation] for more details. | ||
/// | ||
/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
pub fn read_columns_indexes<R: ChunkReader>( | ||
reader: &R, | ||
chunks: &[ColumnChunkMetaData], | ||
|
@@ -60,8 +71,17 @@ pub fn read_columns_indexes<R: ChunkReader>( | |
.collect() | ||
} | ||
|
||
/// Read on row group's all indexes and change into [`Index`] | ||
/// If not the format not available return an empty vector. | ||
/// Reads per-page [`PageLocation`] for all columns of a row group by | ||
/// decoding the [`OffsetIndex`]. | ||
/// | ||
/// Returns a vector of `location[column_number][page_number]` | ||
/// | ||
/// Returns an empty vector if this row group does not contain an | ||
/// [`OffsetIndex`]. | ||
/// | ||
/// See [Column Index Documentation] for more details. | ||
/// | ||
/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
pub fn read_pages_locations<R: ChunkReader>( | ||
reader: &R, | ||
chunks: &[ColumnChunkMetaData], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I read how
ArrowReaderBuilder
populates columns_indexes
and so this looks correct. 👍