-
Notifications
You must be signed in to change notification settings - Fork 1k
parquet-concat: handle large number of files. #8651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,47 +66,55 @@ impl Args { | |
|
|
||
| let output = File::create(&self.output)?; | ||
|
|
||
| let inputs = self | ||
| .input | ||
| .iter() | ||
| .map(|x| { | ||
| let reader = File::open(x)?; | ||
| let metadata = ParquetMetaDataReader::new().parse_and_finish(&reader)?; | ||
| Ok((reader, metadata)) | ||
| }) | ||
| .collect::<Result<Vec<_>>>()?; | ||
| let schema = { | ||
| let inputs = self | ||
| .input | ||
| .iter() | ||
| .map(|x| { | ||
| let reader = File::open(x)?; | ||
| let metadata = ParquetMetaDataReader::new().parse_and_finish(&reader)?; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is still parsing the metadata from all the files into memory before checking the schema. If you are looking to support the large-file use case better, it would require fewer resources (memory) to read the schema from the first file, and then verify the schema of the remaining files one at a time, rather than reading the metadata for all files before validating. |
||
| Ok(metadata) | ||
| }) | ||
| .collect::<Result<Vec<_>>>()?; | ||
|
|
||
| let expected = inputs[0].1.file_metadata().schema(); | ||
| for (_, metadata) in inputs.iter().skip(1) { | ||
| let actual = metadata.file_metadata().schema(); | ||
| if expected != actual { | ||
| return Err(ParquetError::General(format!( | ||
| "inputs must have the same schema, {expected:#?} vs {actual:#?}" | ||
| ))); | ||
| let expected = inputs[0].file_metadata().schema(); | ||
| for metadata in inputs.iter().skip(1) { | ||
| let actual = metadata.file_metadata().schema(); | ||
| if expected != actual { | ||
| return Err(ParquetError::General(format!( | ||
| "inputs must have the same schema, {expected:#?} vs {actual:#?}" | ||
| ))); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| inputs[0].file_metadata().schema_descr().root_schema_ptr() | ||
| }; | ||
| let props = Arc::new(WriterProperties::builder().build()); | ||
| let schema = inputs[0].1.file_metadata().schema_descr().root_schema_ptr(); | ||
| let mut writer = SerializedFileWriter::new(output, schema, props)?; | ||
|
|
||
| for (input, metadata) in inputs { | ||
| for rg in metadata.row_groups() { | ||
| let mut rg_out = writer.next_row_group()?; | ||
| for column in rg.columns() { | ||
| let result = ColumnCloseResult { | ||
| bytes_written: column.compressed_size() as _, | ||
| rows_written: rg.num_rows() as _, | ||
| metadata: column.clone(), | ||
| bloom_filter: None, | ||
| column_index: None, | ||
| offset_index: None, | ||
| }; | ||
| rg_out.append_column(&input, result)?; | ||
| self.input | ||
| .iter() | ||
| .map(|x| { | ||
| let input = File::open(x)?; | ||
| let metadata = ParquetMetaDataReader::new().parse_and_finish(&input)?; | ||
| for rg in metadata.row_groups() { | ||
| let mut rg_out = writer.next_row_group()?; | ||
| for column in rg.columns() { | ||
| let result = ColumnCloseResult { | ||
| bytes_written: column.compressed_size() as _, | ||
| rows_written: rg.num_rows() as _, | ||
| metadata: column.clone(), | ||
| bloom_filter: None, | ||
| column_index: None, | ||
| offset_index: None, | ||
| }; | ||
| rg_out.append_column(&input, result)?; | ||
| } | ||
| rg_out.close()?; | ||
| } | ||
| rg_out.close()?; | ||
| } | ||
| } | ||
| Ok(()) | ||
| }) | ||
| .collect::<Result<Vec<_>>>()?; | ||
|
|
||
| writer.close()?; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since there are no tests, I think a comment here explaining the rationale for not keeping the files open is probably good