DataSink Dynamic Execution Time Demux #7791

Merged: alamb merged 11 commits into apache:main from devinjdangelo:dynamic_demux_for_fileoutput on Oct 18, 2023

Commits (the diff below shows changes from all 11 commits):
bb09e79  dynamic partition (devinjdangelo)
41ad035  linting (devinjdangelo)
ea98e9b  update docs (devinjdangelo)
330d6b3  fix all tests (devinjdangelo)
49bd358  cargo doc fix (devinjdangelo)
6abcba9  test correct number of files (devinjdangelo)
134eda5  add asci art and reduce buffer size (devinjdangelo)
b951c2d  fix config tests (devinjdangelo)
81fc1eb  review comments (devinjdangelo)
234371e  cleanup post rebase (devinjdangelo)
5accdc2  fix test (devinjdangelo)
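
The "dynamic execution-time demux" in the title is the key idea of the diff below: `write_all` now receives a single merged `SendableRecordBatchStream` instead of one stream per input partition, and the write path decides how many output files to create while it consumes the stream. As a rough illustration of the pattern only, here is a minimal, self-contained sketch; the rotation rule, file naming, and plain-string "batches" are hypothetical stand-ins, not the PR's actual implementation:

```rust
use std::fs::File;
use std::io::{BufWriter, Write};

/// Split one incoming stream of batches across multiple output files,
/// deciding file boundaries at execution time rather than up front.
fn demux_to_files(
    batches: impl Iterator<Item = String>, // stand-in for a RecordBatch stream
    max_batches_per_file: usize,           // hypothetical rotation rule
    write_id: &str,                        // unique run id, avoids overwriting
) -> std::io::Result<Vec<String>> {
    let mut paths: Vec<String> = Vec::new();
    let mut writer: Option<BufWriter<File>> = None;
    let mut in_current = 0;

    for batch in batches {
        // Rotate to a fresh file once the current one holds enough batches.
        if writer.is_none() || in_current >= max_batches_per_file {
            let path = format!("{}_{}.csv", write_id, paths.len());
            writer = Some(BufWriter::new(File::create(&path)?));
            paths.push(path);
            in_current = 0;
        }
        writer.as_mut().unwrap().write_all(batch.as_bytes())?;
        in_current += 1;
    }
    if let Some(mut w) = writer {
        w.flush()?;
    }
    Ok(paths)
}

fn main() -> std::io::Result<()> {
    let batches = (0..5).map(|i| format!("{i},{}\n", i * 10));
    let files = demux_to_files(batches, 2, "demo_run");
    println!("wrote files: {:?}", files?);
    Ok(())
}
```

The `{write_id}_{index}.csv` naming mirrors the scheme visible in the removed code further down, where a random alphanumeric `write_id` prevented collisions between concurrent writes.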
@@ -23,11 +23,10 @@ use std::fmt;
 use std::fmt::Debug;
 use std::sync::Arc;

+use super::write::{stateless_append_all, stateless_multipart_put};
 use super::{FileFormat, DEFAULT_SCHEMA_INFER_MAX_RECORD};
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
-use crate::datasource::file_format::write::{
-    create_writer, stateless_serialize_and_write_files, BatchSerializer, FileWriterMode,
-};
+use crate::datasource::file_format::write::{BatchSerializer, FileWriterMode};
 use crate::datasource::physical_plan::{
     CsvExec, FileGroupDisplay, FileScanConfig, FileSinkConfig,
 };

@@ -51,7 +50,6 @@
 use bytes::{Buf, Bytes};
 use futures::stream::BoxStream;
 use futures::{pin_mut, Stream, StreamExt, TryStreamExt};
 use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore};
-use rand::distributions::{Alphanumeric, DistString};

 /// Character Separated Value `FileFormat` implementation.
 #[derive(Debug)]
@@ -481,6 +479,82 @@ impl CsvSink {
     fn new(config: FileSinkConfig) -> Self {
         Self { config }
     }
+
+    async fn append_all(
+        &self,
+        data: SendableRecordBatchStream,
+        context: &Arc<TaskContext>,
+    ) -> Result<u64> {
+        let writer_options = self.config.file_type_writer_options.try_into_csv()?;
+        let (builder, compression) =
+            (&writer_options.writer_options, &writer_options.compression);
+        let compression = FileCompressionType::from(*compression);
+
+        let object_store = context
+            .runtime_env()
+            .object_store(&self.config.object_store_url)?;
+        let file_groups = &self.config.file_groups;
+
+        let builder_clone = builder.clone();
+        let options_clone = writer_options.clone();
+        let get_serializer = move |file_size| {
+            let inner_clone = builder_clone.clone();
+            // In append mode, consider has_header flag only when file is empty (at the start).
+            // For other modes, use has_header flag as is.
+            let serializer: Box<dyn BatchSerializer> = Box::new(if file_size > 0 {
+                CsvSerializer::new()
+                    .with_builder(inner_clone)
+                    .with_header(false)
+            } else {
+                CsvSerializer::new()
+                    .with_builder(inner_clone)
+                    .with_header(options_clone.has_header)
+            });
+            serializer
+        };
+
+        stateless_append_all(
+            data,
+            context,
+            object_store,
+            file_groups,
+            self.config.unbounded_input,
+            compression,
+            Box::new(get_serializer),
+        )
+        .await
+    }
+
+    async fn multipartput_all(
+        &self,
+        data: SendableRecordBatchStream,
+        context: &Arc<TaskContext>,
+    ) -> Result<u64> {
+        let writer_options = self.config.file_type_writer_options.try_into_csv()?;
+        let builder = &writer_options.writer_options;
+
+        let builder_clone = builder.clone();
+        let options_clone = writer_options.clone();
+        let get_serializer = move || {
+            let inner_clone = builder_clone.clone();
+            let serializer: Box<dyn BatchSerializer> = Box::new(
+                CsvSerializer::new()
+                    .with_builder(inner_clone)
+                    .with_header(options_clone.has_header),
+            );
+            serializer
+        };
+
+        stateless_multipart_put(
+            data,
+            context,
+            "csv".into(),
+            Box::new(get_serializer),
+            &self.config,
+            writer_options.compression.into(),
+        )
+        .await
+    }
 }

 #[async_trait]

Review comment on `async fn append_all`: "this is a very nice cleanup / refactor"
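
A detail worth noting in the hunk above is the `get_serializer` factory closure: instead of pre-building one serializer per input partition, the sink hands the shared write path a closure that mints a fresh `BatchSerializer` for each output file, and in append mode the closure receives the current file size so a header is only emitted into empty files. A minimal sketch of that pattern with stand-in types (`Serialize` and `CsvLike` here are hypothetical, not DataFusion types):

```rust
// Stand-in for DataFusion's BatchSerializer.
trait Serialize {
    fn serialize(&mut self, row: &str) -> String;
}

struct CsvLike {
    header_pending: Option<String>,
}

impl Serialize for CsvLike {
    fn serialize(&mut self, row: &str) -> String {
        // Emit the header at most once, then plain rows.
        match self.header_pending.take() {
            Some(header) => format!("{header}\n{row}\n"),
            None => format!("{row}\n"),
        }
    }
}

fn main() {
    let has_header = true; // table-level option, captured by the closure

    // Mirrors `get_serializer` in the diff: append mode passes the current
    // file size, and non-empty files never receive a second header.
    let get_serializer = move |file_size: u64| -> Box<dyn Serialize> {
        Box::new(CsvLike {
            header_pending: (file_size == 0 && has_header)
                .then(|| "col_a,col_b".to_string()),
        })
    };

    let mut fresh = get_serializer(0);     // empty file: header included
    let mut existing = get_serializer(42); // already has bytes: no header
    assert_eq!(fresh.serialize("1,2"), "col_a,col_b\n1,2\n");
    assert_eq!(existing.serialize("1,2"), "1,2\n");
}
```

Because each file gets its own serializer instance, per-file state such as the pending header stays independent, which is what lets the number of files be decided dynamically at execution time.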
@@ -495,116 +569,22 @@ impl DataSink for CsvSink {

     async fn write_all(
         &self,
-        data: Vec<SendableRecordBatchStream>,
+        data: SendableRecordBatchStream,
         context: &Arc<TaskContext>,
     ) -> Result<u64> {
-        let num_partitions = data.len();
-        let writer_options = self.config.file_type_writer_options.try_into_csv()?;
-        let (builder, compression) =
-            (&writer_options.writer_options, &writer_options.compression);
-        let mut has_header = writer_options.has_header;
-        let compression = FileCompressionType::from(*compression);
-
-        let object_store = context
-            .runtime_env()
-            .object_store(&self.config.object_store_url)?;
-        // Construct serializer and writer for each file group
-        let mut serializers: Vec<Box<dyn BatchSerializer>> = vec![];
-        let mut writers = vec![];
         match self.config.writer_mode {
             FileWriterMode::Append => {
-                for file_group in &self.config.file_groups {
-                    let mut append_builder = builder.clone();
-                    // In append mode, consider has_header flag only when file is empty (at the start).
-                    // For other modes, use has_header flag as is.
-                    if file_group.object_meta.size != 0 {
-                        has_header = false;
-                        append_builder = append_builder.has_headers(false);
-                    }
-                    let serializer = CsvSerializer::new()
-                        .with_builder(append_builder)
-                        .with_header(has_header);
-                    serializers.push(Box::new(serializer));
-
-                    let file = file_group.clone();
-                    let writer = create_writer(
-                        self.config.writer_mode,
-                        compression,
-                        file.object_meta.clone().into(),
-                        object_store.clone(),
-                    )
-                    .await?;
-                    writers.push(writer);
-                }
-            }
-            FileWriterMode::Put => {
-                return not_impl_err!("Put Mode is not implemented for CSV Sink yet")
+                let total_count = self.append_all(data, context).await?;
+                Ok(total_count)
             }
             FileWriterMode::PutMultipart => {
-                // Currently assuming only 1 partition path (i.e. not hive-style partitioning on a column)
-                let base_path = &self.config.table_paths[0];
-                match self.config.single_file_output {
-                    false => {
-                        // Uniquely identify this batch of files with a random string, to prevent collisions overwriting files
-                        let write_id =
-                            Alphanumeric.sample_string(&mut rand::thread_rng(), 16);
-                        for part_idx in 0..num_partitions {
-                            let serializer = CsvSerializer::new()
-                                .with_builder(builder.clone())
-                                .with_header(has_header);
-                            serializers.push(Box::new(serializer));
-                            let file_path = base_path
-                                .prefix()
-                                .child(format!("{}_{}.csv", write_id, part_idx));
-                            let object_meta = ObjectMeta {
-                                location: file_path,
-                                last_modified: chrono::offset::Utc::now(),
-                                size: 0,
-                                e_tag: None,
-                            };
-                            let writer = create_writer(
-                                self.config.writer_mode,
-                                compression,
-                                object_meta.into(),
-                                object_store.clone(),
-                            )
-                            .await?;
-                            writers.push(writer);
-                        }
-                    }
-                    true => {
-                        let serializer = CsvSerializer::new()
-                            .with_builder(builder.clone())
-                            .with_header(has_header);
-                        serializers.push(Box::new(serializer));
-                        let file_path = base_path.prefix();
-                        let object_meta = ObjectMeta {
-                            location: file_path.clone(),
-                            last_modified: chrono::offset::Utc::now(),
-                            size: 0,
-                            e_tag: None,
-                        };
-                        let writer = create_writer(
-                            self.config.writer_mode,
-                            compression,
-                            object_meta.into(),
-                            object_store.clone(),
-                        )
-                        .await?;
-                        writers.push(writer);
-                    }
-                }
+                let total_count = self.multipartput_all(data, context).await?;
+                Ok(total_count)
+            }
+            FileWriterMode::Put => {
+                return not_impl_err!("FileWriterMode::Put is not supported yet!")
             }
         }
-
-        stateless_serialize_and_write_files(
-            data,
-            serializers,
-            writers,
-            self.config.single_file_output,
-            self.config.unbounded_input,
-        )
-        .await
     }
 }
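
Taken together, this hunk reduces `CsvSink::write_all` to a dispatch on the writer mode. For anyone implementing `DataSink`, the visible API change is the signature: one merged stream in, with fan-out to files handled inside the shared write helpers. Reduced to synchronous stand-in types (the real trait is async via `#[async_trait]` and uses `SendableRecordBatchStream`, `Arc<TaskContext>`, and DataFusion's `Result<u64>`), the change looks roughly like this:

```rust
use std::sync::Arc;

// Stand-ins for SendableRecordBatchStream and TaskContext.
struct Stream;
struct TaskContext;

// Before this PR: one stream per input partition, so each sink built and
// tracked its own per-partition serializers and writers.
trait DataSinkBefore {
    fn write_all(&self, data: Vec<Stream>, context: &Arc<TaskContext>)
        -> Result<u64, String>;
}

// After this PR: a single stream in; the decision of how many files to
// write (the "demux") moves into the shared write path at execution time.
trait DataSinkAfter {
    fn write_all(&self, data: Stream, context: &Arc<TaskContext>)
        -> Result<u64, String>;
}

fn main() {}
```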
Review comment: "The ideal value here is very situational, so definitely need to make this configurable at the statement and table level."
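
Since this comment asks for statement- and table-level configurability, a hedged sketch of how a session-scoped option is typically set in DataFusion, via a SQL `SET` statement. The option key below is purely illustrative and not confirmed by this PR; `SHOW ALL` lists the real option names in a given DataFusion version:

```rust
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();

    // Statement-level: a SET applies to subsequent statements in this
    // session. The key name here is a guess for illustration only.
    ctx.sql("SET datafusion.execution.max_buffered_batches_per_output_file = 4")
        .await?;

    // ... run COPY / INSERT INTO statements that write files here ...
    Ok(())
}
```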