From 0099e344938cc1acb8149627d3d453db0001c9ae Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Oct 2024 11:55:02 -0700 Subject: [PATCH 1/4] detect missing page indexes on read --- parquet/src/arrow/mod.rs | 60 +++++++++++++++++++++++++++++ parquet/src/file/metadata/reader.rs | 26 ++++++++++--- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index afe7ed1ebec9..737843326825 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -236,3 +236,63 @@ pub fn parquet_column<'a>( .find(|x| parquet_schema.get_column_root_idx(*x) == root_idx)?; Some((parquet_idx, field)) } + +#[cfg(test)] +mod test { + use crate::arrow::ArrowWriter; + use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter}; + use crate::file::properties::{EnabledStatistics, WriterProperties}; + use arrow_array::{ArrayRef, Int32Array, RecordBatch}; + use bytes::Bytes; + use std::sync::Arc; + + #[test] + // Reproducer for https://github.com/apache/arrow-rs/issues/6464 + fn test_metadata_read_write_partial_offset() { + let parquet_bytes = create_parquet_file(); + + // read the metadata from the file WITHOUT the page index structures + let original_metadata = ParquetMetaDataReader::new() + .parse_and_finish(&parquet_bytes) + .unwrap(); + + // this should error because the page indexes are not present, but have offsets specified + let metadata_bytes = metadata_to_bytes(&original_metadata); + let err = ParquetMetaDataReader::new() + .with_page_indexes(true) // there are no page indexes in the metadata + .parse_and_finish(&metadata_bytes) + .err() + .unwrap(); + assert_eq!( + err.to_string(), + "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..341" + ); + } + + /// Write a parquet filed into an in memory buffer + fn create_parquet_file() -> Bytes { + let mut buf = vec![]; + let data = vec![100, 200, 201, 300, 102, 33]; + let array: ArrayRef = Arc::new(Int32Array::from(data)); + let batch = RecordBatch::try_from_iter(vec![("id", array)]).unwrap(); + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + + let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + drop(writer); + + Bytes::from(buf) + } + + /// Serializes `ParquetMetaData` into a memory buffer, using `ParquetMetadataWriter + fn metadata_to_bytes(metadata: &ParquetMetaData) -> Bytes { + let mut buf = vec![]; + ParquetMetaDataWriter::new(&mut buf, metadata) + .finish() + .unwrap(); + Bytes::from(buf) + } +} diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 88f53d02f3f4..4da775ae0af9 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -58,6 +58,8 @@ pub struct ParquetMetaDataReader { column_index: bool, offset_index: bool, prefetch_hint: Option, + // size of the serialized thrift metadata plus the 8 byte footer + metadata_size: Option, } impl ParquetMetaDataReader { @@ -195,7 +197,7 @@ impl ParquetMetaDataReader { /// let metadata = reader.finish().unwrap(); /// ``` pub fn try_parse_sized(&mut self, reader: &R, file_size: usize) -> Result<()> { - self.metadata = match Self::parse_metadata(reader) { + self.metadata = match self.parse_metadata(reader) { Ok(metadata) => Some(metadata), // FIXME: throughout this module ParquetError::IndexOutOfBound is used to indicate the // need for more data. This is not it's intended use. The plan is to add a NeedMoreData @@ -282,6 +284,19 @@ impl ParquetMetaDataReader { } } + // Perform extra sanity check to make sure `range` and the footer metadata don't + // overlap. + if let Some(metadata_size) = self.metadata_size { + let metadata_range = file_size.saturating_sub(metadata_size)..file_size; + if metadata_range.contains(&range.end) { + return Err(eof_err!( + "Parquet file too small. Page index range {:?} overlaps with file metadata {:?}", + range, + metadata_range + )); + } + } + let bytes_needed = range.end - range.start; let bytes = reader.get_bytes((range.start - file_range.start) as u64, bytes_needed)?; let offset = range.start; @@ -456,7 +471,7 @@ impl ParquetMetaDataReader { } // one-shot parse of footer - fn parse_metadata(chunk_reader: &R) -> Result { + fn parse_metadata(&mut self, chunk_reader: &R) -> Result { // check file is large enough to hold footer let file_size = chunk_reader.len(); if file_size < (FOOTER_SIZE as u64) { @@ -473,6 +488,7 @@ impl ParquetMetaDataReader { let metadata_len = Self::decode_footer(&footer)?; let footer_metadata_len = FOOTER_SIZE + metadata_len; + self.metadata_size = Some(footer_metadata_len); if footer_metadata_len > file_size as usize { return Err(ParquetError::IndexOutOfBound( @@ -654,14 +670,14 @@ mod tests { #[test] fn test_parse_metadata_size_smaller_than_footer() { let test_file = tempfile::tempfile().unwrap(); - let err = ParquetMetaDataReader::parse_metadata(&test_file).unwrap_err(); + let err = ParquetMetaDataReader::new().parse_metadata(&test_file).unwrap_err(); assert!(matches!(err, ParquetError::IndexOutOfBound(8, _))); } #[test] fn test_parse_metadata_corrupt_footer() { let data = Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]); - let reader_result = ParquetMetaDataReader::parse_metadata(&data); + let reader_result = ParquetMetaDataReader::new().parse_metadata(&data); assert_eq!( reader_result.unwrap_err().to_string(), "Parquet error: Invalid Parquet file. Corrupt footer" @@ -671,7 +687,7 @@ mod tests { #[test] fn test_parse_metadata_invalid_start() { let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); - let err = ParquetMetaDataReader::parse_metadata(&test_file).unwrap_err(); + let err = ParquetMetaDataReader::new().parse_metadata(&test_file).unwrap_err(); assert!(matches!(err, ParquetError::IndexOutOfBound(263, _))); } From 571931278f5f523a127bb78eb50695ace7f14a82 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Oct 2024 12:05:42 -0700 Subject: [PATCH 2/4] lint --- parquet/src/file/metadata/reader.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 4da775ae0af9..36d8b08b2398 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -670,7 +670,9 @@ mod tests { #[test] fn test_parse_metadata_size_smaller_than_footer() { let test_file = tempfile::tempfile().unwrap(); - let err = ParquetMetaDataReader::new().parse_metadata(&test_file).unwrap_err(); + let err = ParquetMetaDataReader::new() + .parse_metadata(&test_file) + .unwrap_err(); assert!(matches!(err, ParquetError::IndexOutOfBound(8, _))); } @@ -687,7 +689,9 @@ mod tests { #[test] fn test_parse_metadata_invalid_start() { let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); - let err = ParquetMetaDataReader::new().parse_metadata(&test_file).unwrap_err(); + let err = ParquetMetaDataReader::new() + .parse_metadata(&test_file) + .unwrap_err(); assert!(matches!(err, ParquetError::IndexOutOfBound(263, _))); } From 792d13b499995d28f0e585217126c04b25064dc3 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Oct 2024 12:15:08 -0700 Subject: [PATCH 3/4] fix overlap test --- parquet/src/file/metadata/reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 36d8b08b2398..134eb832395d 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -288,7 +288,7 @@ impl ParquetMetaDataReader { // overlap. if let Some(metadata_size) = self.metadata_size { let metadata_range = file_size.saturating_sub(metadata_size)..file_size; - if metadata_range.contains(&range.end) { + if range.end > metadata_range.start { return Err(eof_err!( "Parquet file too small. Page index range {:?} overlaps with file metadata {:?}", range, From ebfb67e44a14cac220c325c792854d4e950c7b2d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Oct 2024 13:27:57 -0700 Subject: [PATCH 4/4] add some clarifying comments per review suggestion --- parquet/src/file/metadata/reader.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 134eb832395d..ec4b97bb5727 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -58,7 +58,8 @@ pub struct ParquetMetaDataReader { column_index: bool, offset_index: bool, prefetch_hint: Option, - // size of the serialized thrift metadata plus the 8 byte footer + // Size of the serialized thrift metadata plus the 8 byte footer. Only set if + // `self.parse_metadata` is called. metadata_size: Option, } @@ -470,7 +471,8 @@ impl ParquetMetaDataReader { range } - // one-shot parse of footer + // One-shot parse of footer. + // Side effect: this will set `self.metadata_size` fn parse_metadata(&mut self, chunk_reader: &R) -> Result { // check file is large enough to hold footer let file_size = chunk_reader.len();