-
Notifications
You must be signed in to change notification settings - Fork 1k
POC: Avoid a copy for uncompressed pages #8756
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -389,31 +389,15 @@ pub(crate) fn decode_page( | |
|
|
||
| // TODO: page header could be huge because of statistics. We should set a | ||
| // maximum page header size and abort if that is exceeded. | ||
| let buffer = match decompressor { | ||
| Some(decompressor) if can_decompress => { | ||
| let uncompressed_page_size = usize::try_from(page_header.uncompressed_page_size)?; | ||
| if offset > buffer.len() || offset > uncompressed_page_size { | ||
| return Err(general_err!("Invalid page header")); | ||
| } | ||
| let decompressed_size = uncompressed_page_size - offset; | ||
| let mut decompressed = Vec::with_capacity(uncompressed_page_size); | ||
| decompressed.extend_from_slice(&buffer.as_ref()[..offset]); | ||
| if decompressed_size > 0 { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can see here that if I changed the code to return early I could add it to this location directly to make the diff smaller, but the logic was already hard to follow and highly indented, so I moved it into its own function while I was at it |
||
| let compressed = &buffer.as_ref()[offset..]; | ||
| decompressor.decompress(compressed, &mut decompressed, Some(decompressed_size))?; | ||
| } | ||
|
|
||
| if decompressed.len() != uncompressed_page_size { | ||
| return Err(general_err!( | ||
| "Actual decompressed size doesn't match the expected one ({} vs {})", | ||
| decompressed.len(), | ||
| uncompressed_page_size | ||
| )); | ||
| } | ||
|
|
||
| Bytes::from(decompressed) | ||
| } | ||
| _ => buffer, | ||
| let buffer = if can_decompress { | ||
| decompress_buffer( | ||
| buffer, | ||
| offset, | ||
| decompressor, | ||
| page_header.uncompressed_page_size, | ||
| )? | ||
| } else { | ||
| buffer | ||
| }; | ||
|
|
||
| let result = match page_header.r#type { | ||
|
|
@@ -471,6 +455,47 @@ pub(crate) fn decode_page( | |
| Ok(result) | ||
| } | ||
|
|
||
| /// Decompresses the specified buffer, starting from the specified offset, using | ||
| /// the provided decompressor if available and applicable. If the buffer is not | ||
| /// compressed, it will be returned as is. | ||
|
Comment on lines
+459
to
+460
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code looks good to me but I don't know if the comment "not compressed" can be replaced, if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I actually also do not know what decompressed_size really means -- you are probably right. I will research it more carefully
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I read https://github.com/apache/parquet-format/blob/master/README.md#data-pages and I agree with your assessment that if the decompressed_size=0 it corresponds to no non-null values |
||
| fn decompress_buffer( | ||
| buffer: Bytes, | ||
| offset: usize, | ||
| mut decompressor: Option<&mut Box<dyn Codec>>, | ||
| uncompressed_page_size: i32, | ||
| ) -> Result<Bytes> { | ||
| let Some(decompressor) = decompressor.as_mut() else { | ||
| return Ok(buffer); | ||
| }; | ||
|
|
||
| let uncompressed_page_size = usize::try_from(uncompressed_page_size)?; | ||
| if offset > buffer.len() || offset > uncompressed_page_size { | ||
| return Err(general_err!("Invalid page header")); | ||
| } | ||
|
|
||
| let decompressed_size = uncompressed_page_size - offset; | ||
|
|
||
| if decompressed_size == 0 { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense to me 👍 |
||
| // Not compressed, return buffer as is | ||
| return Ok(buffer.slice(..offset)); | ||
| } | ||
|
|
||
| let mut decompressed = Vec::with_capacity(uncompressed_page_size); | ||
| decompressed.extend_from_slice(&buffer[..offset]); | ||
| let compressed = &buffer.as_ref()[offset..]; | ||
| decompressor.decompress(compressed, &mut decompressed, Some(decompressed_size))?; | ||
|
|
||
| if decompressed.len() != uncompressed_page_size { | ||
| return Err(general_err!( | ||
| "Actual decompressed size doesn't match the expected one ({} vs {})", | ||
| decompressed.len(), | ||
| uncompressed_page_size | ||
| )); | ||
| } | ||
|
|
||
| Ok(Bytes::from(decompressed)) | ||
| } | ||
|
|
||
| enum SerializedPageReaderState { | ||
| Values { | ||
| /// The current byte offset in the reader | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not relevant to this PR, but I think this TODO has largely been addressed by #8376 which enabled skipping the decoding of the page statistics.