From 3776aebef93ab3cd856b5311ebe42e68609b7d41 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Fri, 21 Feb 2025 08:10:06 +0000 Subject: [PATCH 01/12] feat: delete file manager loading --- .../iceberg/src/arrow/delete_file_manager.rs | 544 +++++++++++++++++- crates/iceberg/src/arrow/reader.rs | 50 +- crates/iceberg/src/delete_vector.rs | 12 +- 3 files changed, 569 insertions(+), 37 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_file_manager.rs b/crates/iceberg/src/arrow/delete_file_manager.rs index e1ca476793..388166cae2 100644 --- a/crates/iceberg/src/arrow/delete_file_manager.rs +++ b/crates/iceberg/src/arrow/delete_file_manager.rs @@ -15,13 +15,23 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +use std::collections::HashMap; +use std::future::Future; +use std::pin::Pin; +use std::sync::{Arc, OnceLock, RwLock}; +use std::task::{Context, Poll}; +use futures::channel::oneshot; +use futures::future::join_all; +use futures::{StreamExt, TryStreamExt}; + +use crate::arrow::ArrowReader; use crate::delete_vector::DeleteVector; -use crate::expr::BoundPredicate; +use crate::expr::Predicate::AlwaysTrue; +use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; -use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; -use crate::spec::SchemaRef; +use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskDeleteFile}; +use crate::spec::DataContentType; use crate::{Error, ErrorKind, Result}; #[allow(unused)] @@ -37,6 +47,7 @@ pub trait DeleteFileManager { pub(crate) struct CachingDeleteFileManager { file_io: FileIO, concurrency_limit_data_files: usize, + state: Arc>, } impl DeleteFileManager for CachingDeleteFileManager { @@ -49,47 +60,532 @@ impl DeleteFileManager for CachingDeleteFileManager { )) } } +// Equality deletes may apply to more than one DataFile in a scan, and so +// the same equality delete file may be present in more than one invocation of +// DeleteFileManager::load_deletes in the same scan. We want to deduplicate these +// to avoid having to load them twice, so we immediately store cloneable futures in the +// state that can be awaited upon to get te EQ deletes. That way we can check to see if +// a load of each Eq delete file is already in progress and avoid starting another one. +#[derive(Debug, Clone)] +struct EqDelFuture { + result: OnceLock, +} + +impl EqDelFuture { + pub fn new() -> (oneshot::Sender, Self) { + let (tx, rx) = oneshot::channel(); + let result = OnceLock::new(); + + crate::runtime::spawn({ + let result = result.clone(); + async move { result.set(rx.await.unwrap()) } + }); + + (tx, Self { result }) + } +} + +impl Future for EqDelFuture { + type Output = Predicate; + + fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { + match self.result.get() { + None => Poll::Pending, + Some(predicate) => Poll::Ready(predicate.clone()), + } + } +} + +#[derive(Debug, Default)] +struct DeleteFileManagerState { + // delete vectors and positional deletes get merged when loaded into a single delete vector + // per data file + delete_vectors: HashMap>>, + + // equality delete files are parsed into unbound `Predicate`s. We store them here as + // cloneable futures (see note below) + equality_deletes: HashMap, +} + +type StateRef = Arc>; + +// Intermediate context during processing of a delete file task. 
+enum DeleteFileContext { + // TODO: Delete Vector loader from Puffin files + InProgEqDel(EqDelFuture), + PosDels(ArrowRecordBatchStream), + FreshEqDel { + batch_stream: ArrowRecordBatchStream, + sender: oneshot::Sender, + }, +} + +// Final result of the processing of a delete file task before +// results are fully merged into the DeleteFileManager's state +enum ParsedDeleteFileContext { + InProgEqDel(EqDelFuture), + DelVecs(HashMap), + EqDel, +} #[allow(unused_variables)] impl CachingDeleteFileManager { - pub fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> CachingDeleteFileManager { - Self { + pub(crate) fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> Self { + CachingDeleteFileManager { file_io, concurrency_limit_data_files, + state: Arc::new(Default::default()), } } pub(crate) async fn load_deletes( &self, - delete_file_entries: Vec, + delete_file_entries: &[FileScanTaskDeleteFile], ) -> Result<()> { - // TODO + /* + * Create a single stream of all delete file tasks irrespective of type, + so that we can respect the combined concurrency limit + * We then process each in two phases: load and parse. + * for positional deletes the load phase instantiates an ArrowRecordBatchStream to + stream the file contents out + * for eq deletes, we first check if the EQ delete is already loaded or being loaded by + another concurrently processing data file scan task. If it is, we return a future + for the pre-existing task from the load phase. If not, we create such a future + and store it in the state to prevent other data file tasks from starting to load + the same equality delete file, and return a record batch stream from the load phase + as per the other delete file types - only this time it is accompanied by a one-shot + channel sender that we will eventually use to resolve the shared future that we stored + in the state. + * When this gets updated to add support for delete vectors, the load phase will return + a PuffinReader for them. + * The parse phase parses each record batch stream according to its associated data type. + The result of this is a map of data file paths to delete vectors for the positional + delete tasks (and in future for the delete vector tasks). For equality delete + file tasks, this results in an unbound Predicate. + * The unbound Predicates resulting from equality deletes are sent to their associated oneshot + channel to store them in the right place in the delete file manager's state. + * The results of all of these futures are awaited on in parallel with the specified + level of concurrency and collected into a vec. We then combine all of the delete + vector maps that resulted from any positional delete or delete vector files into a + single map and persist it in the state. 
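Editorial aside, not part of the patch: the bullet points above describe a two-phase, concurrency-limited pipeline. Its overall shape (a stream of per-file futures capped by a buffering combinator) can be sketched standalone as below, using the infallible `buffer_unordered` for brevity where the patch itself uses `try_buffer_unordered` so that errors short-circuit. The file names and the `format!` stand-in for parsing are placeholders.

```rust
use futures::{stream, StreamExt};

fn main() {
    let delete_files = vec!["pos-del-1.parquet", "pos-del-2.parquet", "eq-del-1.parquet"];
    let concurrency_limit = 2;

    let parsed: Vec<String> = futures::executor::block_on(
        stream::iter(delete_files)
            // "load" phase: one future per delete file entry
            .map(|path| async move { format!("parsed {path}") })
            // "parse" phase: at most `concurrency_limit` futures are in flight at once
            .buffer_unordered(concurrency_limit)
            .collect(),
    );

    assert_eq!(parsed.len(), 3);
}
```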
- if !delete_file_entries.is_empty() { - Err(Error::new( - ErrorKind::FeatureUnsupported, - "Reading delete files is not yet supported", - )) - } else { - Ok(()) + + Conceptually, the data flow is like this: + + FileScanTaskDeleteFile + | + Already-loading EQ Delete | Everything Else + +---------------------------------------------------+ + | | + [get existing future] [load recordbatch stream / puffin] + DeleteFileContext::InProgEqDel DeleteFileContext + | | + | | + | +-----------------------------+--------------------------+ + | Pos Del Del Vec (Not yet Implemented) EQ Del + | | | | + | [parse pos del stream] [parse del vec puffin] [parse eq del] + | HashMap HashMap (Predicate, Sender) + | | | | + | | | [persist to state] + | | | () + | | | | + | +-----------------------------+--------------------------+ + | | + | [buffer unordered] + | | + | [combine del vectors] + | HashMap + | | + | [persist del vectors to state] + | () + | | + +-------------------------+-------------------------+ + | + [join!] + */ + + let stream_items = delete_file_entries + .iter() + .map(|t| (t.clone(), self.file_io.clone(), self.state.clone())) + .collect::>(); + // NOTE: removing the collect and just passing the iterator to futures::stream:iter + // results in an error 'implementation of `std::ops::FnOnce` is not general enough' + + let task_stream = futures::stream::iter(stream_items.into_iter()); + + let results: Vec = task_stream + .map(move |(task, file_io, state_ref)| async { + Self::load_file_for_task(task, file_io, state_ref).await + }) + .map(move |ctx| Ok(async { Self::parse_file_content_for_task(ctx.await?).await })) + .try_buffer_unordered(self.concurrency_limit_data_files) + .try_collect::>() + .await?; + + // wait for all in-progress EQ deletes from other tasks + let _ = join_all(results.iter().filter_map(|i| { + if let ParsedDeleteFileContext::InProgEqDel(fut) = i { + Some(fut.clone()) + } else { + None + } + })) + .await; + + let merged_delete_vectors = results + .into_iter() + .fold(HashMap::default(), Self::merge_delete_vectors); + + self.state.write().unwrap().delete_vectors = merged_delete_vectors; + + Ok(()) + } + + async fn load_file_for_task( + task: FileScanTaskDeleteFile, + file_io: FileIO, + state: StateRef, + ) -> Result { + match task.file_type { + DataContentType::PositionDeletes => Ok(DeleteFileContext::PosDels( + Self::parquet_to_batch_stream(&task.file_path, file_io).await?, + )), + + DataContentType::EqualityDeletes => { + let (sender, fut) = EqDelFuture::new(); + { + let mut state = state.write().unwrap(); + + if let Some(existing) = state.equality_deletes.get(&task.file_path) { + return Ok(DeleteFileContext::InProgEqDel(existing.clone())); + } + + state + .equality_deletes + .insert(task.file_path.to_string(), fut); + } + + Ok(DeleteFileContext::FreshEqDel { + batch_stream: Self::parquet_to_batch_stream(&task.file_path, file_io).await?, + sender, + }) + } + + DataContentType::Data => Err(Error::new( + ErrorKind::Unexpected, + "tasks with files of type Data not expected here", + )), + } + } + + async fn parse_file_content_for_task( + ctx: DeleteFileContext, + ) -> Result { + match ctx { + DeleteFileContext::InProgEqDel(fut) => Ok(ParsedDeleteFileContext::InProgEqDel(fut)), + DeleteFileContext::PosDels(batch_stream) => { + let del_vecs = + Self::parse_positional_deletes_record_batch_stream(batch_stream).await?; + Ok(ParsedDeleteFileContext::DelVecs(del_vecs)) + } + DeleteFileContext::FreshEqDel { + sender, + batch_stream, + } => { + let predicate = + 
Self::parse_equality_deletes_record_batch_stream(batch_stream).await?; + + sender + .send(predicate) + .map_err(|err| { + Error::new( + ErrorKind::Unexpected, + "Could not send eq delete predicate to state", + ) + }) + .map(|_| ParsedDeleteFileContext::EqDel) + } } } - pub(crate) fn build_delete_predicate( + fn merge_delete_vectors( + mut merged_delete_vectors: HashMap>>, + item: ParsedDeleteFileContext, + ) -> HashMap>> { + if let ParsedDeleteFileContext::DelVecs(del_vecs) = item { + del_vecs.into_iter().for_each(|(key, val)| { + let entry = merged_delete_vectors.entry(key).or_default(); + { + let mut inner = entry.write().unwrap(); + (*inner).intersect_assign(&val); + } + }); + } + + merged_delete_vectors + } + + /// Loads a RecordBatchStream for a given datafile. + async fn parquet_to_batch_stream( + data_file_path: &str, + file_io: FileIO, + ) -> Result { + /* + Essentially a super-cut-down ArrowReader. We can't use ArrowReader directly + as that introduces a circular dependency. + */ + let record_batch_stream = ArrowReader::create_parquet_record_batch_stream_builder( + data_file_path, + file_io.clone(), + false, + ) + .await? + .build()? + .map_err(|e| Error::new(ErrorKind::Unexpected, format!("{}", e))); + + Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) + } + + /// Parses a record batch stream coming from positional delete files + /// + /// Returns a map of data file path to a delete vector + async fn parse_positional_deletes_record_batch_stream( + stream: ArrowRecordBatchStream, + ) -> Result> { + // TODO + + Ok(HashMap::default()) + } + + /// Parses record batch streams from individual equality delete files + /// + /// Returns an unbound Predicate for each batch stream + async fn parse_equality_deletes_record_batch_stream( + streams: ArrowRecordBatchStream, + ) -> Result { + // TODO + + Ok(AlwaysTrue) + } + + /// Builds eq delete predicate for the provided task. + /// + /// Must await on load_deletes before calling this. + pub(crate) async fn build_delete_predicate_for_task( &self, - snapshot_schema: SchemaRef, + file_scan_task: &FileScanTask, ) -> Result> { - // TODO + // * Filter the task's deletes into just the Equality deletes + // * Retrieve the unbound predicate for each from self.state.equality_deletes + // * Logical-AND them all together to get a single combined `Predicate` + // * Bind the predicate to the task's schema to get a `BoundPredicate` + + let mut combined_predicate = AlwaysTrue; + for delete in &file_scan_task.deletes { + if !is_equality_delete(delete) { + continue; + } + + let predicate = { + let state = self.state.read().unwrap(); + + let Some(predicate) = state.equality_deletes.get(&delete.file_path) else { + return Err(Error::new( + ErrorKind::Unexpected, + format!( + "Missing predicate for equality delete file '{}'", + delete.file_path + ), + )); + }; - Ok(None) + predicate.clone() + }; + + combined_predicate = combined_predicate.and(predicate.await); + } + + if combined_predicate == AlwaysTrue { + return Ok(None); + } + + // TODO: handle case-insensitive case + let bound_predicate = combined_predicate.bind(file_scan_task.schema.clone(), false)?; + Ok(Some(bound_predicate)) } - pub(crate) fn get_positional_delete_indexes_for_data_file( + /// Retrieve a delete vector for the data file associated with a given file scan task + /// + /// Should only be called after awaiting on load_deletes. 
Takes the vector to avoid a + /// clone since each item is specific to a single data file and won't need to be used again + pub(crate) fn get_delete_vector_for_task( &self, - data_file_path: &str, - ) -> Option> { - // TODO + file_scan_task: &FileScanTask, + ) -> Option>> { + self.state + .write() + .unwrap() + .delete_vectors + .get(file_scan_task.data_file_path()) + .map(Clone::clone) + } +} + +pub(crate) fn is_equality_delete(f: &FileScanTaskDeleteFile) -> bool { + matches!(f.file_type, DataContentType::EqualityDeletes) +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::path::Path; + use std::sync::Arc; + + use arrow_array::{Int64Array, RecordBatch, StringArray}; + use arrow_schema::Schema as ArrowSchema; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use super::*; + use crate::spec::{DataFileFormat, Schema}; + + type ArrowSchemaRef = Arc; + + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + #[tokio::test] + async fn test_delete_file_manager_load_deletes() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path(); + let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) + .unwrap() + .build() + .unwrap(); + + // Note that with the delete file parsing not yet in place, all we can test here is that + // the call to the loader does not fail. + let delete_file_manager = CachingDeleteFileManager::new(file_io.clone(), 10); + + let file_scan_tasks = setup(table_location); + + delete_file_manager + .load_deletes(&file_scan_tasks[0].deletes) + .await + .unwrap(); + } + + fn setup(table_location: &Path) -> Vec { + let data_file_schema = Arc::new(Schema::builder().build().unwrap()); + let positional_delete_schema = create_pos_del_schema(); + + let file_path_values = vec![format!("{}/1.parquet", table_location.to_str().unwrap()); 8]; + let pos_values = vec![0, 1, 3, 5, 6, 8, 1022, 1023]; + + let file_path_col = Arc::new(StringArray::from_iter_values(file_path_values)); + let pos_col = Arc::new(Int64Array::from_iter_values(pos_values)); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + for n in 1..=3 { + let positional_deletes_to_write = + RecordBatch::try_new(positional_delete_schema.clone(), vec![ + file_path_col.clone(), + pos_col.clone(), + ]) + .unwrap(); + + let file = File::create(format!( + "{}/pos-del-{}.parquet", + table_location.to_str().unwrap(), + n + )) + .unwrap(); + let mut writer = ArrowWriter::try_new( + file, + positional_deletes_to_write.schema(), + Some(props.clone()), + ) + .unwrap(); + + writer + .write(&positional_deletes_to_write) + .expect("Writing batch"); + + // writer must be closed to write footer + writer.close().unwrap(); + } + + let pos_del_1 = FileScanTaskDeleteFile { + file_path: format!("{}/pos-del-1.parquet", table_location.to_str().unwrap()), + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: vec![], + }; + + let pos_del_2 = FileScanTaskDeleteFile { + file_path: format!("{}/pos-del-2.parquet", table_location.to_str().unwrap()), + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: vec![], + }; + + let pos_del_3 = FileScanTaskDeleteFile { + file_path: format!("{}/pos-del-3.parquet", table_location.to_str().unwrap()), + file_type: DataContentType::PositionDeletes, + partition_spec_id: 
0, + equality_ids: vec![], + }; + + let file_scan_tasks = vec![ + FileScanTask { + start: 0, + length: 0, + record_count: None, + data_file_path: "".to_string(), + data_file_content: DataContentType::Data, + data_file_format: DataFileFormat::Parquet, + schema: data_file_schema.clone(), + project_field_ids: vec![], + predicate: None, + deletes: vec![pos_del_1, pos_del_2.clone()], + }, + FileScanTask { + start: 0, + length: 0, + record_count: None, + data_file_path: "".to_string(), + data_file_content: DataContentType::Data, + data_file_format: DataFileFormat::Parquet, + schema: data_file_schema.clone(), + project_field_ids: vec![], + predicate: None, + deletes: vec![pos_del_2, pos_del_3], + }, + ]; + + file_scan_tasks + } - None + fn create_pos_del_schema() -> ArrowSchemaRef { + let fields = vec![ + arrow_schema::Field::new("file_path", arrow_schema::DataType::Utf8, false) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + arrow_schema::Field::new("pos", arrow_schema::DataType::Int64, false).with_metadata( + HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )]), + ), + ]; + Arc::new(arrow_schema::Schema::new(fields)) } } diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index a3462edec6..5f12c3e3be 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -64,6 +64,7 @@ pub struct ArrowReaderBuilder { concurrency_limit_data_files: usize, row_group_filtering_enabled: bool, row_selection_enabled: bool, + delete_file_support_enabled: bool, } impl ArrowReaderBuilder { @@ -77,6 +78,7 @@ impl ArrowReaderBuilder { concurrency_limit_data_files: num_cpus, row_group_filtering_enabled: true, row_selection_enabled: false, + delete_file_support_enabled: false, } } @@ -105,6 +107,12 @@ impl ArrowReaderBuilder { self } + /// Determines whether to enable delete file support. + pub fn with_delete_file_support_enabled(mut self, delete_file_support_enabled: bool) -> Self { + self.delete_file_support_enabled = delete_file_support_enabled; + self + } + /// Build the ArrowReader. 
pub fn build(self) -> ArrowReader { ArrowReader { @@ -117,6 +125,7 @@ impl ArrowReaderBuilder { concurrency_limit_data_files: self.concurrency_limit_data_files, row_group_filtering_enabled: self.row_group_filtering_enabled, row_selection_enabled: self.row_selection_enabled, + delete_file_support_enabled: self.delete_file_support_enabled, } } } @@ -133,6 +142,7 @@ pub struct ArrowReader { row_group_filtering_enabled: bool, row_selection_enabled: bool, + delete_file_support_enabled: bool, } impl ArrowReader { @@ -144,6 +154,7 @@ impl ArrowReader { let concurrency_limit_data_files = self.concurrency_limit_data_files; let row_group_filtering_enabled = self.row_group_filtering_enabled; let row_selection_enabled = self.row_selection_enabled; + let delete_file_support_enabled = self.delete_file_support_enabled; let stream = tasks .map_ok(move |task| { @@ -156,6 +167,7 @@ impl ArrowReader { self.delete_file_manager.clone(), row_group_filtering_enabled, row_selection_enabled, + delete_file_support_enabled, ) }) .map_err(|err| { @@ -167,6 +179,7 @@ impl ArrowReader { Ok(Box::pin(stream) as ArrowRecordBatchStream) } + #[allow(clippy::too_many_arguments)] async fn process_file_scan_task( task: FileScanTask, batch_size: Option, @@ -174,13 +187,25 @@ impl ArrowReader { delete_file_manager: CachingDeleteFileManager, row_group_filtering_enabled: bool, row_selection_enabled: bool, + delete_file_support_enabled: bool, ) -> Result { + if !delete_file_support_enabled && !task.deletes.is_empty() { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Delete file support is not enabled", + )); + } + let should_load_page_index = (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); // concurrently retrieve delete files and create RecordBatchStreamBuilder let (_, mut record_batch_stream_builder) = try_join!( - delete_file_manager.load_deletes(task.deletes.clone()), + delete_file_manager.load_deletes(if delete_file_support_enabled { + &task.deletes + } else { + &[] + },), Self::create_parquet_record_batch_stream_builder( &task.data_file_path, file_io.clone(), @@ -208,7 +233,9 @@ impl ArrowReader { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); } - let delete_predicate = delete_file_manager.build_delete_predicate(task.schema.clone())?; + let delete_predicate = delete_file_manager + .build_delete_predicate_for_task(&task) + .await?; // In addition to the optional predicate supplied in the `FileScanTask`, // we also have an optional predicate resulting from equality delete files. @@ -276,15 +303,18 @@ impl ArrowReader { } } - let positional_delete_indexes = - delete_file_manager.get_positional_delete_indexes_for_data_file(&task.data_file_path); + let positional_delete_indexes = delete_file_manager.get_delete_vector_for_task(&task); if let Some(positional_delete_indexes) = positional_delete_indexes { - let delete_row_selection = Self::build_deletes_row_selection( - record_batch_stream_builder.metadata().row_groups(), - &selected_row_group_indices, - positional_delete_indexes.as_ref(), - )?; + let delete_row_selection = { + let positional_delete_indexes = positional_delete_indexes.read().unwrap(); + + Self::build_deletes_row_selection( + record_batch_stream_builder.metadata().row_groups(), + &selected_row_group_indices, + &positional_delete_indexes, + )? 
+ }; // merge the row selection from the delete files with the row selection // from the filter predicate, if there is one from the filter predicate @@ -319,7 +349,7 @@ impl ArrowReader { Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) } - async fn create_parquet_record_batch_stream_builder( + pub(crate) async fn create_parquet_record_batch_stream_builder( data_file_path: &str, file_io: FileIO, should_load_page_index: bool, diff --git a/crates/iceberg/src/delete_vector.rs b/crates/iceberg/src/delete_vector.rs index 7bde3c43d0..1dba780e15 100644 --- a/crates/iceberg/src/delete_vector.rs +++ b/crates/iceberg/src/delete_vector.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. -use roaring::RoaringTreemap; +use std::ops::BitOrAssign; + use roaring::bitmap::Iter; use roaring::treemap::BitmapIter; +use roaring::RoaringTreemap; -#[allow(unused)] +#[derive(Debug, Default)] pub struct DeleteVector { inner: RoaringTreemap, } @@ -36,6 +38,10 @@ impl DeleteVector { let outer = self.inner.bitmaps(); DeleteVectorIterator { outer, inner: None } } + + pub fn intersect_assign(&mut self, other: &DeleteVector) { + self.inner.bitor_assign(&other.inner); + } } // Ideally, we'd just wrap `roaring::RoaringTreemap`'s iterator, `roaring::treemap::Iter` here. @@ -61,7 +67,7 @@ impl Iterator for DeleteVectorIterator<'_> { type Item = u64; fn next(&mut self) -> Option { - if let Some(inner) = &mut self.inner { + if let Some(ref mut inner) = &mut self.inner { if let Some(inner_next) = inner.bitmap_iter.next() { return Some(u64::from(inner.high_bits) << 32 | u64::from(inner_next)); } From 0078be5c0ad85edb79799434c07a967bf5982f22 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Fri, 28 Mar 2025 08:05:02 +0000 Subject: [PATCH 02/12] feat: changes suggested in review --- .../iceberg/src/arrow/delete_file_manager.rs | 138 +++++++++--------- 1 file changed, 71 insertions(+), 67 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_file_manager.rs b/crates/iceberg/src/arrow/delete_file_manager.rs index 388166cae2..9b1472eb44 100644 --- a/crates/iceberg/src/arrow/delete_file_manager.rs +++ b/crates/iceberg/src/arrow/delete_file_manager.rs @@ -138,73 +138,75 @@ impl CachingDeleteFileManager { } } + /// Load the deletes for all the specified tasks + /// + /// Returned future completes once all loading has finished. + /// + /// * Create a single stream of all delete file tasks irrespective of type, + /// so that we can respect the combined concurrency limit + /// * We then process each in two phases: load and parse. + /// * for positional deletes the load phase instantiates an ArrowRecordBatchStream to + /// stream the file contents out + /// * for eq deletes, we first check if the EQ delete is already loaded or being loaded by + /// another concurrently processing data file scan task. If it is, we return a future + /// for the pre-existing task from the load phase. If not, we create such a future + /// and store it in the state to prevent other data file tasks from starting to load + /// the same equality delete file, and return a record batch stream from the load phase + /// as per the other delete file types - only this time it is accompanied by a one-shot + /// channel sender that we will eventually use to resolve the shared future that we stored + /// in the state. + /// * When this gets updated to add support for delete vectors, the load phase will return + /// a PuffinReader for them. 
+ /// * The parse phase parses each record batch stream according to its associated data type. + /// The result of this is a map of data file paths to delete vectors for the positional + /// delete tasks (and in future for the delete vector tasks). For equality delete + /// file tasks, this results in an unbound Predicate. + /// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot + /// channel to store them in the right place in the delete file managers state. + /// * The results of all of these futures are awaited on in parallel with the specified + /// level of concurrency and collected into a vec. We then combine all of the delete + /// vector maps that resulted from any positional delete or delete vector files into a + /// single map and persist it in the state. + /// + /// + /// Conceptually, the data flow is like this: + /// ```none + /// FileScanTaskDeleteFile + /// | + /// Already-loading EQ Delete | Everything Else + /// +---------------------------------------------------+ + /// | | + /// [get existing future] [load recordbatch stream / puffin] + /// DeleteFileContext::InProgEqDel DeleteFileContext + /// | | + /// | | + /// | +-----------------------------+--------------------------+ + /// | Pos Del Del Vec (Not yet Implemented) EQ Del + /// | | | | + /// | [parse pos del stream] [parse del vec puffin] [parse eq del] + /// | HashMap HashMap (Predicate, Sender) + /// | | | | + /// | | | [persist to state] + /// | | | () + /// | | | | + /// | +-----------------------------+--------------------------+ + /// | | + /// | [buffer unordered] + /// | | + /// | [combine del vectors] + /// | HashMap + /// | | + /// | [persist del vectors to state] + /// | () + /// | | + /// +-------------------------+-------------------------+ + /// | + /// [join!] + /// ``` pub(crate) async fn load_deletes( &self, delete_file_entries: &[FileScanTaskDeleteFile], ) -> Result<()> { - /* - * Create a single stream of all delete file tasks irrespective of type, - so that we can respect the combined concurrency limit - * We then process each in two phases: load and parse. - * for positional deletes the load phase instantiates an ArrowRecordBatchStream to - stream the file contents out - * for eq deletes, we first check if the EQ delete is already loaded or being loaded by - another concurrently processing data file scan task. If it is, we return a future - for the pre-existing task from the load phase. If not, we create such a future - and store it in the state to prevent other data file tasks from starting to load - the same equality delete file, and return a record batch stream from the load phase - as per the other delete file types - only this time it is accompanied by a one-shot - channel sender that we will eventually use to resolve the shared future that we stored - in the state. - * When this gets updated to add support for delete vectors, the load phase will return - a PuffinReader for them. - * The parse phase parses each record batch stream according to its associated data type. - The result of this is a map of data file paths to delete vectors for the positional - delete tasks (and in future for the delete vector tasks). For equality delete - file tasks, this results in an unbound Predicate. - * The unbound Predicates resulting from equality deletes are sent to their associated oneshot - channel to store them in the right place in the delete file manager's state. 
- * The results of all of these futures are awaited on in parallel with the specified - level of concurrency and collected into a vec. We then combine all of the delete - vector maps that resulted from any positional delete or delete vector files into a - single map and persist it in the state. - - - Conceptually, the data flow is like this: - - FileScanTaskDeleteFile - | - Already-loading EQ Delete | Everything Else - +---------------------------------------------------+ - | | - [get existing future] [load recordbatch stream / puffin] - DeleteFileContext::InProgEqDel DeleteFileContext - | | - | | - | +-----------------------------+--------------------------+ - | Pos Del Del Vec (Not yet Implemented) EQ Del - | | | | - | [parse pos del stream] [parse del vec puffin] [parse eq del] - | HashMap HashMap (Predicate, Sender) - | | | | - | | | [persist to state] - | | | () - | | | | - | +-----------------------------+--------------------------+ - | | - | [buffer unordered] - | | - | [combine del vectors] - | HashMap - | | - | [persist del vectors to state] - | () - | | - +-------------------------+-------------------------+ - | - [join!] - */ - let stream_items = delete_file_entries .iter() .map(|t| (t.clone(), self.file_io.clone(), self.state.clone())) @@ -253,18 +255,20 @@ impl CachingDeleteFileManager { )), DataContentType::EqualityDeletes => { - let (sender, fut) = EqDelFuture::new(); - { + let sender = { let mut state = state.write().unwrap(); - if let Some(existing) = state.equality_deletes.get(&task.file_path) { return Ok(DeleteFileContext::InProgEqDel(existing.clone())); } + let (sender, fut) = EqDelFuture::new(); + state .equality_deletes .insert(task.file_path.to_string(), fut); - } + + sender + }; Ok(DeleteFileContext::FreshEqDel { batch_stream: Self::parquet_to_batch_stream(&task.file_path, file_io).await?, From e2903e1ab95017b8a877e99e1f630d2a14e1df50 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Thu, 17 Apr 2025 07:35:03 +0100 Subject: [PATCH 03/12] feat: return Err for unimplemented delete vec parse methods and make DeleteVector::intersect_assign pub(crate) --- .../iceberg/src/arrow/delete_file_manager.rs | 19 +++++++++++++------ crates/iceberg/src/arrow/reader.rs | 2 +- crates/iceberg/src/delete_vector.rs | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_file_manager.rs b/crates/iceberg/src/arrow/delete_file_manager.rs index 9b1472eb44..7eb84aff71 100644 --- a/crates/iceberg/src/arrow/delete_file_manager.rs +++ b/crates/iceberg/src/arrow/delete_file_manager.rs @@ -359,7 +359,10 @@ impl CachingDeleteFileManager { ) -> Result> { // TODO - Ok(HashMap::default()) + Err(Error::new( + ErrorKind::FeatureUnsupported, + "parsing of positional deletes is not yet supported", + )) } /// Parses record batch streams from individual equality delete files @@ -370,7 +373,10 @@ impl CachingDeleteFileManager { ) -> Result { // TODO - Ok(AlwaysTrue) + Err(Error::new( + ErrorKind::FeatureUnsupported, + "parsing of equality deletes is not yet supported", + )) } /// Builds eq delete predicate for the provided task. @@ -471,15 +477,16 @@ mod tests { .unwrap(); // Note that with the delete file parsing not yet in place, all we can test here is that - // the call to the loader does not fail. + // the call to the loader fails with the expected FeatureUnsupportedError. 
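Editorial aside, not part of the patch: the positional-delete parsing that remains stubbed out at this point reduces to reading `file_path`/`pos` pairs into per-data-file roaring bitmaps. A minimal sketch of that step, assuming the standard two-column layout that the test fixtures in this file also use; the helper name is hypothetical.

```rust
use std::collections::HashMap;

use arrow_array::{Int64Array, RecordBatch, StringArray};
use roaring::RoaringTreemap;

/// Collects the deleted row positions recorded in one positional-delete batch,
/// keyed by the data file that each position applies to.
fn delete_positions_by_file(
    batch: &RecordBatch,
) -> Result<HashMap<String, RoaringTreemap>, &'static str> {
    let file_paths = batch
        .column(0)
        .as_any()
        .downcast_ref::<StringArray>()
        .ok_or("expected a Utf8 `file_path` column")?;
    let positions = batch
        .column(1)
        .as_any()
        .downcast_ref::<Int64Array>()
        .ok_or("expected an Int64 `pos` column")?;

    let mut result: HashMap<String, RoaringTreemap> = HashMap::new();
    for row in 0..batch.num_rows() {
        // Each row marks a single deleted row position within the named data file.
        let path = file_paths.value(row).to_string();
        result.entry(path).or_default().insert(positions.value(row) as u64);
    }
    Ok(result)
}
```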
let delete_file_manager = CachingDeleteFileManager::new(file_io.clone(), 10); let file_scan_tasks = setup(table_location); - delete_file_manager + let result = delete_file_manager .load_deletes(&file_scan_tasks[0].deletes) - .await - .unwrap(); + .await; + + assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported)); } fn setup(table_location: &Path) -> Vec { diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 5f12c3e3be..854bf07426 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -308,7 +308,7 @@ impl ArrowReader { if let Some(positional_delete_indexes) = positional_delete_indexes { let delete_row_selection = { let positional_delete_indexes = positional_delete_indexes.read().unwrap(); - + Self::build_deletes_row_selection( record_batch_stream_builder.metadata().row_groups(), &selected_row_group_indices, diff --git a/crates/iceberg/src/delete_vector.rs b/crates/iceberg/src/delete_vector.rs index 1dba780e15..fa8ef69349 100644 --- a/crates/iceberg/src/delete_vector.rs +++ b/crates/iceberg/src/delete_vector.rs @@ -39,7 +39,7 @@ impl DeleteVector { DeleteVectorIterator { outer, inner: None } } - pub fn intersect_assign(&mut self, other: &DeleteVector) { + pub(crate) fn intersect_assign(&mut self, other: &DeleteVector) { self.inner.bitor_assign(&other.inner); } } From 65dd6388844904161ba4892747d9eae9c1c7c78a Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Wed, 23 Apr 2025 21:48:14 +0100 Subject: [PATCH 04/12] feat: schema evolution of equality delete file record batches --- .../iceberg/src/arrow/delete_file_manager.rs | 52 ++++++++++++++++--- crates/iceberg/src/arrow/reader.rs | 13 +++-- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_file_manager.rs b/crates/iceberg/src/arrow/delete_file_manager.rs index 7eb84aff71..96034d917e 100644 --- a/crates/iceberg/src/arrow/delete_file_manager.rs +++ b/crates/iceberg/src/arrow/delete_file_manager.rs @@ -25,13 +25,14 @@ use futures::channel::oneshot; use futures::future::join_all; use futures::{StreamExt, TryStreamExt}; +use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::ArrowReader; use crate::delete_vector::DeleteVector; use crate::expr::Predicate::AlwaysTrue; use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskDeleteFile}; -use crate::spec::DataContentType; +use crate::spec::{DataContentType, Schema, SchemaRef}; use crate::{Error, ErrorKind, Result}; #[allow(unused)] @@ -164,7 +165,7 @@ impl CachingDeleteFileManager { /// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot /// channel to store them in the right place in the delete file managers state. /// * The results of all of these futures are awaited on in parallel with the specified - /// level of concurrency and collected into a vec. We then combine all of the delete + /// level of concurrency and collected into a vec. We then combine all the delete /// vector maps that resulted from any positional delete or delete vector files into a /// single map and persist it in the state. 
/// @@ -206,10 +207,18 @@ impl CachingDeleteFileManager { pub(crate) async fn load_deletes( &self, delete_file_entries: &[FileScanTaskDeleteFile], + schema: SchemaRef, ) -> Result<()> { let stream_items = delete_file_entries .iter() - .map(|t| (t.clone(), self.file_io.clone(), self.state.clone())) + .map(|t| { + ( + t.clone(), + self.file_io.clone(), + self.state.clone(), + schema.clone(), + ) + }) .collect::>(); // NOTE: removing the collect and just passing the iterator to futures::stream:iter // results in an error 'implementation of `std::ops::FnOnce` is not general enough' @@ -217,8 +226,8 @@ impl CachingDeleteFileManager { let task_stream = futures::stream::iter(stream_items.into_iter()); let results: Vec = task_stream - .map(move |(task, file_io, state_ref)| async { - Self::load_file_for_task(task, file_io, state_ref).await + .map(move |(task, file_io, state_ref, schema)| async { + Self::load_file_for_task(task, file_io, state_ref, schema).await }) .map(move |ctx| Ok(async { Self::parse_file_content_for_task(ctx.await?).await })) .try_buffer_unordered(self.concurrency_limit_data_files) @@ -248,6 +257,7 @@ impl CachingDeleteFileManager { task: FileScanTaskDeleteFile, file_io: FileIO, state: StateRef, + schema: SchemaRef, ) -> Result { match task.file_type { DataContentType::PositionDeletes => Ok(DeleteFileContext::PosDels( @@ -271,7 +281,11 @@ impl CachingDeleteFileManager { }; Ok(DeleteFileContext::FreshEqDel { - batch_stream: Self::parquet_to_batch_stream(&task.file_path, file_io).await?, + batch_stream: Self::evolve_schema( + Self::parquet_to_batch_stream(&task.file_path, file_io).await?, + schema, + ) + .await?, sender, }) } @@ -351,6 +365,30 @@ impl CachingDeleteFileManager { Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) } + /// Evolves the schema of the RecordBatches from an equality delete file + async fn evolve_schema( + record_batch_stream: ArrowRecordBatchStream, + target_schema: Arc, + ) -> Result { + let eq_ids = target_schema + .as_ref() + .field_id_to_name_map() + .keys() + .cloned() + .collect::>(); + + let mut record_batch_transformer = + RecordBatchTransformer::build(target_schema.clone(), &eq_ids); + + let record_batch_stream = record_batch_stream.map(move |record_batch| { + record_batch.and_then(|record_batch| { + record_batch_transformer.process_record_batch(record_batch) + }) + }); + + Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) + } + /// Parses a record batch stream coming from positional delete files /// /// Returns a map of data file path to a delete vector @@ -483,7 +521,7 @@ mod tests { let file_scan_tasks = setup(table_location); let result = delete_file_manager - .load_deletes(&file_scan_tasks[0].deletes) + .load_deletes(&file_scan_tasks[0].deletes, file_scan_tasks[0].schema_ref()) .await; assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported)); diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 854bf07426..19cbc00e67 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -201,11 +201,14 @@ impl ArrowReader { // concurrently retrieve delete files and create RecordBatchStreamBuilder let (_, mut record_batch_stream_builder) = try_join!( - delete_file_manager.load_deletes(if delete_file_support_enabled { - &task.deletes - } else { - &[] - },), + delete_file_manager.load_deletes( + if delete_file_support_enabled { + &task.deletes + } else { + &[] + }, + task.schema.clone() + ), Self::create_parquet_record_batch_stream_builder( 
&task.data_file_path, file_io.clone(), From 79415ac048ed6a7a64ecb2250ff1c773ec25558c Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Wed, 30 Apr 2025 20:07:35 +0100 Subject: [PATCH 05/12] refactor: split DeleteFileManager into DeleteFileLoader and DeleteFilter --- ..._file_manager.rs => delete_file_loader.rs} | 276 +++++--------- crates/iceberg/src/arrow/delete_filter.rs | 337 ++++++++++++++++++ crates/iceberg/src/arrow/mod.rs | 3 +- crates/iceberg/src/arrow/reader.rs | 55 ++- crates/iceberg/src/delete_vector.rs | 10 +- 5 files changed, 454 insertions(+), 227 deletions(-) rename crates/iceberg/src/arrow/{delete_file_manager.rs => delete_file_loader.rs} (70%) create mode 100644 crates/iceberg/src/arrow/delete_filter.rs diff --git a/crates/iceberg/src/arrow/delete_file_manager.rs b/crates/iceberg/src/arrow/delete_file_loader.rs similarity index 70% rename from crates/iceberg/src/arrow/delete_file_manager.rs rename to crates/iceberg/src/arrow/delete_file_loader.rs index 96034d917e..663b7e77cf 100644 --- a/crates/iceberg/src/arrow/delete_file_manager.rs +++ b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -16,100 +16,57 @@ // under the License. use std::collections::HashMap; -use std::future::Future; -use std::pin::Pin; -use std::sync::{Arc, OnceLock, RwLock}; -use std::task::{Context, Poll}; +use std::sync::Arc; use futures::channel::oneshot; use futures::future::join_all; use futures::{StreamExt, TryStreamExt}; +use tokio::sync::oneshot::{channel, Receiver}; +use super::delete_filter::{DeleteFilter, EqDelFuture}; use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::ArrowReader; use crate::delete_vector::DeleteVector; -use crate::expr::Predicate::AlwaysTrue; -use crate::expr::{Bind, BoundPredicate, Predicate}; +use crate::expr::Predicate; use crate::io::FileIO; -use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskDeleteFile}; +use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; use crate::spec::{DataContentType, Schema, SchemaRef}; use crate::{Error, ErrorKind, Result}; #[allow(unused)] -pub trait DeleteFileManager { +pub trait DeleteFileLoader { /// Read the delete file referred to in the task /// - /// Returns the raw contents of the delete file as a RecordBatch stream - fn read_delete_file(task: &FileScanTaskDeleteFile) -> Result; + /// Returns the contents of the delete file as a RecordBatch stream. Applies schema evolution. + async fn read_delete_file( + &self, + task: &FileScanTaskDeleteFile, + schema: SchemaRef, + ) -> Result; } #[allow(unused)] #[derive(Clone, Debug)] -pub(crate) struct CachingDeleteFileManager { +pub(crate) struct CachingDeleteFileLoader { file_io: FileIO, concurrency_limit_data_files: usize, - state: Arc>, -} - -impl DeleteFileManager for CachingDeleteFileManager { - fn read_delete_file(_task: &FileScanTaskDeleteFile) -> Result { - // TODO, implementation in https://github.com/apache/iceberg-rust/pull/982 - - Err(Error::new( - ErrorKind::FeatureUnsupported, - "Reading delete files is not yet supported", - )) - } -} -// Equality deletes may apply to more than one DataFile in a scan, and so -// the same equality delete file may be present in more than one invocation of -// DeleteFileManager::load_deletes in the same scan. We want to deduplicate these -// to avoid having to load them twice, so we immediately store cloneable futures in the -// state that can be awaited upon to get te EQ deletes. 
That way we can check to see if -// a load of each Eq delete file is already in progress and avoid starting another one. -#[derive(Debug, Clone)] -struct EqDelFuture { - result: OnceLock, -} - -impl EqDelFuture { - pub fn new() -> (oneshot::Sender, Self) { - let (tx, rx) = oneshot::channel(); - let result = OnceLock::new(); - - crate::runtime::spawn({ - let result = result.clone(); - async move { result.set(rx.await.unwrap()) } - }); - - (tx, Self { result }) - } + del_filter: DeleteFilter, } -impl Future for EqDelFuture { - type Output = Predicate; +impl DeleteFileLoader for CachingDeleteFileLoader { + async fn read_delete_file( + &self, + task: &FileScanTaskDeleteFile, + schema: SchemaRef, + ) -> Result { + let raw_batch_stream = + CachingDeleteFileLoader::parquet_to_batch_stream(&task.file_path, self.file_io.clone()) + .await?; - fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { - match self.result.get() { - None => Poll::Pending, - Some(predicate) => Poll::Ready(predicate.clone()), - } + Self::evolve_schema(raw_batch_stream, schema).await } } -#[derive(Debug, Default)] -struct DeleteFileManagerState { - // delete vectors and positional deletes get merged when loaded into a single delete vector - // per data file - delete_vectors: HashMap>>, - - // equality delete files are parsed into unbound `Predicate`s. We store them here as - // cloneable futures (see note below) - equality_deletes: HashMap, -} - -type StateRef = Arc>; - // Intermediate context during processing of a delete file task. enum DeleteFileContext { // TODO: Delete Vector loader from Puffin files @@ -130,12 +87,12 @@ enum ParsedDeleteFileContext { } #[allow(unused_variables)] -impl CachingDeleteFileManager { +impl CachingDeleteFileLoader { pub(crate) fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> Self { - CachingDeleteFileManager { + CachingDeleteFileLoader { file_io, concurrency_limit_data_files, - state: Arc::new(Default::default()), + del_filter: DeleteFilter::default(), } } @@ -169,7 +126,7 @@ impl CachingDeleteFileManager { /// vector maps that resulted from any positional delete or delete vector files into a /// single map and persist it in the state. /// - /// + /// /// Conceptually, the data flow is like this: /// ```none /// FileScanTaskDeleteFile @@ -204,59 +161,74 @@ impl CachingDeleteFileManager { /// | /// [join!] 
/// ``` - pub(crate) async fn load_deletes( + pub(crate) fn load_deletes( &self, delete_file_entries: &[FileScanTaskDeleteFile], schema: SchemaRef, - ) -> Result<()> { + ) -> Receiver> { + let (tx, rx) = channel(); + let stream_items = delete_file_entries .iter() .map(|t| { ( t.clone(), self.file_io.clone(), - self.state.clone(), + self.del_filter.clone(), schema.clone(), ) }) .collect::>(); - // NOTE: removing the collect and just passing the iterator to futures::stream:iter - // results in an error 'implementation of `std::ops::FnOnce` is not general enough' - - let task_stream = futures::stream::iter(stream_items.into_iter()); + let task_stream = futures::stream::iter(stream_items); + let del_filter = self.del_filter.clone(); + let concurrency_limit_data_files = self.concurrency_limit_data_files; + crate::runtime::spawn(async move { + let result = async move { + let mut del_filter = del_filter; + + let results: Vec = task_stream + .map(move |(task, file_io, del_filter, schema)| async move { + Self::load_file_for_task(&task, file_io, del_filter, schema).await + }) + .map(move |ctx| { + Ok(async { Self::parse_file_content_for_task(ctx.await?).await }) + }) + .try_buffer_unordered(concurrency_limit_data_files) + .try_collect::>() + .await?; + + // wait for all in-progress EQ deletes from other tasks + let _ = join_all(results.iter().filter_map(|i| { + if let ParsedDeleteFileContext::InProgEqDel(fut) = i { + Some(fut.clone()) + } else { + None + } + })) + .await; + + for item in results { + if let ParsedDeleteFileContext::DelVecs(hash_map) = item { + for (data_file_path, delete_vector) in hash_map.into_iter() { + del_filter.upsert_delete_vector(data_file_path, delete_vector); + } + } + } - let results: Vec = task_stream - .map(move |(task, file_io, state_ref, schema)| async { - Self::load_file_for_task(task, file_io, state_ref, schema).await - }) - .map(move |ctx| Ok(async { Self::parse_file_content_for_task(ctx.await?).await })) - .try_buffer_unordered(self.concurrency_limit_data_files) - .try_collect::>() - .await?; - - // wait for all in-progress EQ deletes from other tasks - let _ = join_all(results.iter().filter_map(|i| { - if let ParsedDeleteFileContext::InProgEqDel(fut) = i { - Some(fut.clone()) - } else { - None + Ok(del_filter) } - })) - .await; - - let merged_delete_vectors = results - .into_iter() - .fold(HashMap::default(), Self::merge_delete_vectors); + .await; - self.state.write().unwrap().delete_vectors = merged_delete_vectors; + let _ = tx.send(result); + }); - Ok(()) + rx } async fn load_file_for_task( - task: FileScanTaskDeleteFile, + task: &FileScanTaskDeleteFile, file_io: FileIO, - state: StateRef, + del_filter: DeleteFilter, schema: SchemaRef, ) -> Result { match task.file_type { @@ -266,16 +238,15 @@ impl CachingDeleteFileManager { DataContentType::EqualityDeletes => { let sender = { - let mut state = state.write().unwrap(); - if let Some(existing) = state.equality_deletes.get(&task.file_path) { + if let Some(existing) = del_filter + .get_equality_delete_predicate_for_delete_file_path(&task.file_path) + { return Ok(DeleteFileContext::InProgEqDel(existing.clone())); } let (sender, fut) = EqDelFuture::new(); - state - .equality_deletes - .insert(task.file_path.to_string(), fut); + del_filter.insert_equality_delete(task.file_path.to_string(), fut); sender }; @@ -327,23 +298,6 @@ impl CachingDeleteFileManager { } } - fn merge_delete_vectors( - mut merged_delete_vectors: HashMap>>, - item: ParsedDeleteFileContext, - ) -> HashMap>> { - if let 
ParsedDeleteFileContext::DelVecs(del_vecs) = item { - del_vecs.into_iter().for_each(|(key, val)| { - let entry = merged_delete_vectors.entry(key).or_default(); - { - let mut inner = entry.write().unwrap(); - (*inner).intersect_assign(&val); - } - }); - } - - merged_delete_vectors - } - /// Loads a RecordBatchStream for a given datafile. async fn parquet_to_batch_stream( data_file_path: &str, @@ -416,72 +370,6 @@ impl CachingDeleteFileManager { "parsing of equality deletes is not yet supported", )) } - - /// Builds eq delete predicate for the provided task. - /// - /// Must await on load_deletes before calling this. - pub(crate) async fn build_delete_predicate_for_task( - &self, - file_scan_task: &FileScanTask, - ) -> Result> { - // * Filter the task's deletes into just the Equality deletes - // * Retrieve the unbound predicate for each from self.state.equality_deletes - // * Logical-AND them all together to get a single combined `Predicate` - // * Bind the predicate to the task's schema to get a `BoundPredicate` - - let mut combined_predicate = AlwaysTrue; - for delete in &file_scan_task.deletes { - if !is_equality_delete(delete) { - continue; - } - - let predicate = { - let state = self.state.read().unwrap(); - - let Some(predicate) = state.equality_deletes.get(&delete.file_path) else { - return Err(Error::new( - ErrorKind::Unexpected, - format!( - "Missing predicate for equality delete file '{}'", - delete.file_path - ), - )); - }; - - predicate.clone() - }; - - combined_predicate = combined_predicate.and(predicate.await); - } - - if combined_predicate == AlwaysTrue { - return Ok(None); - } - - // TODO: handle case-insensitive case - let bound_predicate = combined_predicate.bind(file_scan_task.schema.clone(), false)?; - Ok(Some(bound_predicate)) - } - - /// Retrieve a delete vector for the data file associated with a given file scan task - /// - /// Should only be called after awaiting on load_deletes. Takes the vector to avoid a - /// clone since each item is specific to a single data file and won't need to be used again - pub(crate) fn get_delete_vector_for_task( - &self, - file_scan_task: &FileScanTask, - ) -> Option>> { - self.state - .write() - .unwrap() - .delete_vectors - .get(file_scan_task.data_file_path()) - .map(Clone::clone) - } -} - -pub(crate) fn is_equality_delete(f: &FileScanTaskDeleteFile) -> bool { - matches!(f.file_type, DataContentType::EqualityDeletes) } #[cfg(test)] @@ -498,6 +386,7 @@ mod tests { use tempfile::TempDir; use super::*; + use crate::scan::FileScanTask; use crate::spec::{DataFileFormat, Schema}; type ArrowSchemaRef = Arc; @@ -516,13 +405,14 @@ mod tests { // Note that with the delete file parsing not yet in place, all we can test here is that // the call to the loader fails with the expected FeatureUnsupportedError. 
- let delete_file_manager = CachingDeleteFileManager::new(file_io.clone(), 10); + let delete_file_manager = CachingDeleteFileLoader::new(file_io.clone(), 10); let file_scan_tasks = setup(table_location); let result = delete_file_manager .load_deletes(&file_scan_tasks[0].deletes, file_scan_tasks[0].schema_ref()) - .await; + .await + .unwrap(); assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported)); } diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs new file mode 100644 index 0000000000..aee69c3cce --- /dev/null +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -0,0 +1,337 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::future::Future; +use std::pin::Pin; +use std::sync::{Arc, Mutex, OnceLock, RwLock}; +use std::task::{Context, Poll}; + +use futures::channel::oneshot; + +use crate::delete_vector::DeleteVector; +use crate::expr::Predicate::AlwaysTrue; +use crate::expr::{Bind, BoundPredicate, Predicate}; +use crate::scan::{FileScanTask, FileScanTaskDeleteFile}; +use crate::spec::DataContentType; +use crate::{Error, ErrorKind, Result}; + +// Equality deletes may apply to more than one DataFile in a scan, and so +// the same equality delete file may be present in more than one invocation of +// DeleteFileManager::load_deletes in the same scan. We want to deduplicate these +// to avoid having to load them twice, so we immediately store cloneable futures in the +// state that can be awaited upon to get te EQ deletes. That way we can check to see if +// a load of each Eq delete file is already in progress and avoid starting another one. 
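Editorial illustration, not part of the patch: the deduplication described in the comment above, where many scan tasks await a single in-flight equality-delete load, has the same shape as a shared future. The `EqDelFuture` defined next is the PR's own mechanism; the snippet below only sketches the concept with `futures::future::Shared`, and every name in it is hypothetical.

```rust
use futures::channel::oneshot;
use futures::future::FutureExt;

fn main() {
    // One sender resolves the load exactly once; every interested task awaits a
    // clone of the same shared future instead of starting its own load.
    let (tx, rx) = oneshot::channel::<String>();
    let shared = rx.shared();

    let waiter_a = shared.clone();
    let waiter_b = shared.clone();

    // Whoever actually loaded and parsed the equality delete publishes the result.
    tx.send("parsed equality-delete predicate".to_string()).unwrap();

    futures::executor::block_on(async {
        // Both waiters observe the same resolved value.
        assert_eq!(waiter_a.await.unwrap(), "parsed equality-delete predicate");
        assert_eq!(waiter_b.await.unwrap(), "parsed equality-delete predicate");
    });
}
```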
+#[derive(Debug, Clone)] +pub(crate) struct EqDelFuture { + result: OnceLock, +} + +impl EqDelFuture { + pub(crate) fn new() -> (oneshot::Sender, Self) { + let (tx, rx) = oneshot::channel(); + let result = OnceLock::new(); + + crate::runtime::spawn({ + let result = result.clone(); + async move { result.set(rx.await.unwrap()) } + }); + + (tx, Self { result }) + } +} + +impl Future for EqDelFuture { + type Output = Predicate; + + fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { + match self.result.get() { + None => Poll::Pending, + Some(predicate) => Poll::Ready(predicate.clone()), + } + } +} + +#[derive(Debug, Default)] +struct DeleteFileFilterState { + delete_vectors: HashMap>>, + equality_deletes: HashMap, +} + +#[derive(Clone, Debug, Default)] +pub struct DeleteFilter { + state: Arc>, +} + +impl DeleteFilter { + /// Retrieve a delete vector for the data file associated with a given file scan task + pub fn get_delete_vector( + &self, + file_scan_task: &FileScanTask, + ) -> Option>> { + self.get_delete_vector_for_path(file_scan_task.data_file_path()) + } + + /// Retrieve a delete vector for a data file + pub fn get_delete_vector_for_path( + &self, + delete_file_path: &str, + ) -> Option>> { + self.state + .read() + .ok() + .and_then(|st| st.delete_vectors.get(delete_file_path).cloned()) + } + + /// Retrieve the equality delete predicate for a given eq delete file path + pub(crate) fn get_equality_delete_predicate_for_delete_file_path( + &self, + file_path: &str, + ) -> Option { + self.state + .read() + .unwrap() + .equality_deletes + .get(file_path) + .cloned() + } + + /// Builds eq delete predicate for the provided task. + pub async fn build_equality_delete_predicate( + &self, + file_scan_task: &FileScanTask, + ) -> Result> { + // * Filter the task's deletes into just the Equality deletes + // * Retrieve the unbound predicate for each from self.state.equality_deletes + // * Logical-AND them all together to get a single combined `Predicate` + // * Bind the predicate to the task's schema to get a `BoundPredicate` + + let mut combined_predicate = AlwaysTrue; + for delete in &file_scan_task.deletes { + if !is_equality_delete(delete) { + continue; + } + + let Some(predicate) = + self.get_equality_delete_predicate_for_delete_file_path(&delete.file_path) + else { + return Err(Error::new( + ErrorKind::Unexpected, + format!( + "Missing predicate for equality delete file '{}'", + delete.file_path + ), + )); + }; + + combined_predicate = combined_predicate.and(predicate.await); + } + + if combined_predicate == AlwaysTrue { + return Ok(None); + } + + // TODO: handle case-insensitive case + let bound_predicate = combined_predicate.bind(file_scan_task.schema.clone(), false)?; + Ok(Some(bound_predicate)) + } + + pub(crate) fn upsert_delete_vector( + &mut self, + data_file_path: String, + delete_vector: DeleteVector, + ) { + let mut state = self.state.write().unwrap(); + + let Some(entry) = state.delete_vectors.get_mut(&data_file_path) else { + state + .delete_vectors + .insert(data_file_path, Arc::new(Mutex::new(delete_vector))); + return; + }; + + *entry.lock().unwrap() |= delete_vector; + } + + pub(crate) fn insert_equality_delete(&self, delete_file_path: String, eq_del: EqDelFuture) { + let mut state = self.state.write().unwrap(); + + state.equality_deletes.insert(delete_file_path, eq_del); + } +} + +pub(crate) fn is_equality_delete(f: &FileScanTaskDeleteFile) -> bool { + matches!(f.file_type, DataContentType::EqualityDeletes) +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use 
std::path::Path; + use std::sync::Arc; + + use arrow_array::{Int64Array, RecordBatch, StringArray}; + use arrow_schema::Schema as ArrowSchema; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use super::*; + use crate::arrow::delete_file_loader::CachingDeleteFileLoader; + use crate::io::FileIO; + use crate::spec::{DataFileFormat, Schema}; + + type ArrowSchemaRef = Arc; + + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + #[tokio::test] + async fn test_delete_file_manager_load_deletes() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path(); + let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) + .unwrap() + .build() + .unwrap(); + + // Note that with the delete file parsing not yet in place, all we can test here is that + // the call to the loader fails with the expected FeatureUnsupportedError. + let delete_file_manager = CachingDeleteFileLoader::new(file_io.clone(), 10); + + let file_scan_tasks = setup(table_location); + + let result = delete_file_manager + .load_deletes(&file_scan_tasks[0].deletes, file_scan_tasks[0].schema_ref()) + .await + .unwrap(); + + assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported)); + } + + fn setup(table_location: &Path) -> Vec { + let data_file_schema = Arc::new(Schema::builder().build().unwrap()); + let positional_delete_schema = create_pos_del_schema(); + + let file_path_values = vec![format!("{}/1.parquet", table_location.to_str().unwrap()); 8]; + let pos_values = vec![0, 1, 3, 5, 6, 8, 1022, 1023]; + + let file_path_col = Arc::new(StringArray::from_iter_values(file_path_values)); + let pos_col = Arc::new(Int64Array::from_iter_values(pos_values)); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + for n in 1..=3 { + let positional_deletes_to_write = + RecordBatch::try_new(positional_delete_schema.clone(), vec![ + file_path_col.clone(), + pos_col.clone(), + ]) + .unwrap(); + + let file = File::create(format!( + "{}/pos-del-{}.parquet", + table_location.to_str().unwrap(), + n + )) + .unwrap(); + let mut writer = ArrowWriter::try_new( + file, + positional_deletes_to_write.schema(), + Some(props.clone()), + ) + .unwrap(); + + writer + .write(&positional_deletes_to_write) + .expect("Writing batch"); + + // writer must be closed to write footer + writer.close().unwrap(); + } + + let pos_del_1 = FileScanTaskDeleteFile { + file_path: format!("{}/pos-del-1.parquet", table_location.to_str().unwrap()), + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: vec![], + }; + + let pos_del_2 = FileScanTaskDeleteFile { + file_path: format!("{}/pos-del-2.parquet", table_location.to_str().unwrap()), + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: vec![], + }; + + let pos_del_3 = FileScanTaskDeleteFile { + file_path: format!("{}/pos-del-3.parquet", table_location.to_str().unwrap()), + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: vec![], + }; + + let file_scan_tasks = vec![ + FileScanTask { + start: 0, + length: 0, + record_count: None, + data_file_path: "".to_string(), + data_file_content: DataContentType::Data, + data_file_format: DataFileFormat::Parquet, + schema: data_file_schema.clone(), + project_field_ids: vec![], + predicate: None, + 
deletes: vec![pos_del_1, pos_del_2.clone()], + }, + FileScanTask { + start: 0, + length: 0, + record_count: None, + data_file_path: "".to_string(), + data_file_content: DataContentType::Data, + data_file_format: DataFileFormat::Parquet, + schema: data_file_schema.clone(), + project_field_ids: vec![], + predicate: None, + deletes: vec![pos_del_2, pos_del_3], + }, + ]; + + file_scan_tasks + } + + fn create_pos_del_schema() -> ArrowSchemaRef { + let fields = vec![ + arrow_schema::Field::new("file_path", arrow_schema::DataType::Utf8, false) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + arrow_schema::Field::new("pos", arrow_schema::DataType::Int64, false).with_metadata( + HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )]), + ), + ]; + Arc::new(arrow_schema::Schema::new(fields)) + } +} diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index 56caeaf559..c5c144853a 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -23,7 +23,8 @@ pub use schema::*; mod nan_val_cnt_visitor; pub(crate) use nan_val_cnt_visitor::*; -pub(crate) mod delete_file_manager; +pub(crate) mod delete_file_loader; +pub(crate) mod delete_filter; mod reader; pub(crate) mod record_batch_projector; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 19cbc00e67..42c8f00a09 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -42,7 +42,7 @@ use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder, use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; -use crate::arrow::delete_file_manager::CachingDeleteFileManager; +use crate::arrow::delete_file_loader::CachingDeleteFileLoader; use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; @@ -118,7 +118,7 @@ impl ArrowReaderBuilder { ArrowReader { batch_size: self.batch_size, file_io: self.file_io.clone(), - delete_file_manager: CachingDeleteFileManager::new( + delete_file_loader: CachingDeleteFileLoader::new( self.file_io.clone(), self.concurrency_limit_data_files, ), @@ -135,7 +135,7 @@ impl ArrowReaderBuilder { pub struct ArrowReader { batch_size: Option, file_io: FileIO, - delete_file_manager: CachingDeleteFileManager, + delete_file_loader: CachingDeleteFileLoader, /// the maximum number of data files that can be fetched at the same time concurrency_limit_data_files: usize, @@ -164,7 +164,7 @@ impl ArrowReader { task, batch_size, file_io, - self.delete_file_manager.clone(), + self.delete_file_loader.clone(), row_group_filtering_enabled, row_selection_enabled, delete_file_support_enabled, @@ -184,7 +184,7 @@ impl ArrowReader { task: FileScanTask, batch_size: Option, file_io: FileIO, - delete_file_manager: CachingDeleteFileManager, + delete_file_loader: CachingDeleteFileLoader, row_group_filtering_enabled: bool, row_selection_enabled: bool, delete_file_support_enabled: bool, @@ -199,22 +199,20 @@ impl ArrowReader { let should_load_page_index = (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); - // concurrently retrieve delete files and create RecordBatchStreamBuilder - let (_, mut record_batch_stream_builder) = try_join!( - 
delete_file_manager.load_deletes( - if delete_file_support_enabled { - &task.deletes - } else { - &[] - }, - task.schema.clone() - ), - Self::create_parquet_record_batch_stream_builder( - &task.data_file_path, - file_io.clone(), - should_load_page_index, - ) - )?; + let delete_filter_rx = delete_file_loader.load_deletes( + if delete_file_support_enabled { + &task.deletes + } else { + &[] + }, + task.schema.clone(), + ); + let mut record_batch_stream_builder = Self::create_parquet_record_batch_stream_builder( + &task.data_file_path, + file_io.clone(), + should_load_page_index, + ) + .await?; // Create a projection mask for the batch stream to select which columns in the // Parquet file that we want in the response @@ -236,9 +234,8 @@ impl ArrowReader { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); } - let delete_predicate = delete_file_manager - .build_delete_predicate_for_task(&task) - .await?; + let delete_filter = delete_filter_rx.await.unwrap()?; + let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?; // In addition to the optional predicate supplied in the `FileScanTask`, // we also have an optional predicate resulting from equality delete files. @@ -306,18 +303,18 @@ impl ArrowReader { } } - let positional_delete_indexes = delete_file_manager.get_delete_vector_for_task(&task); + let positional_delete_indexes = delete_filter.get_delete_vector(&task); if let Some(positional_delete_indexes) = positional_delete_indexes { let delete_row_selection = { - let positional_delete_indexes = positional_delete_indexes.read().unwrap(); + let positional_delete_indexes = positional_delete_indexes.lock().unwrap(); Self::build_deletes_row_selection( record_batch_stream_builder.metadata().row_groups(), &selected_row_group_indices, &positional_delete_indexes, - )? - }; + ) + }?; // merge the row selection from the delete files with the row selection // from the filter predicate, if there is one from the filter predicate @@ -1872,7 +1869,7 @@ message schema { /* cases to cover: * {skip|select} {first|intermediate|last} {one row|multiple rows} in - {first|imtermediate|last} {skipped|selected} row group + {first|intermediate|last} {skipped|selected} row group * row group selection disabled */ diff --git a/crates/iceberg/src/delete_vector.rs b/crates/iceberg/src/delete_vector.rs index fa8ef69349..feb4eeea99 100644 --- a/crates/iceberg/src/delete_vector.rs +++ b/crates/iceberg/src/delete_vector.rs @@ -38,10 +38,6 @@ impl DeleteVector { let outer = self.inner.bitmaps(); DeleteVectorIterator { outer, inner: None } } - - pub(crate) fn intersect_assign(&mut self, other: &DeleteVector) { - self.inner.bitor_assign(&other.inner); - } } // Ideally, we'd just wrap `roaring::RoaringTreemap`'s iterator, `roaring::treemap::Iter` here. 
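The replacement for `intersect_assign` arrives in the next hunk as a `BitOrAssign` impl that unions the underlying treemaps. A minimal sketch of that union semantics, written against the `roaring` crate directly (the row positions are invented for illustration):

```rust
use roaring::RoaringTreemap;

fn main() {
    // Positions deleted by one positional delete file for a given data file.
    let mut combined = RoaringTreemap::new();
    for pos in [0u64, 1, 3] {
        combined.insert(pos);
    }

    // Positions deleted by a second file that references the same data file.
    let mut other = RoaringTreemap::new();
    for pos in [3u64, 5, 1022] {
        other.insert(pos);
    }

    // Merging is a bitwise OR, the same operation `DeleteVector::bitor_assign`
    // delegates to on its inner treemap.
    combined |= &other;

    assert_eq!(combined.iter().collect::<Vec<u64>>(), vec![0, 1, 3, 5, 1022]);
}
```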
@@ -109,3 +105,9 @@ impl DeleteVectorIterator<'_> { inner.bitmap_iter.advance_to(lo); } } + +impl BitOrAssign for DeleteVector { + fn bitor_assign(&mut self, other: Self) { + self.inner.bitor_assign(&other.inner); + } +} From 220725728cfa2d7ba90ca4eb2c3ac921d8bc1ec8 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Sat, 10 May 2025 12:19:03 +0100 Subject: [PATCH 06/12] fix: add waker for DeleteFileIndex --- Cargo.lock | 11 ++++++++ Cargo.toml | 1 + crates/iceberg/Cargo.toml | 1 + crates/iceberg/src/delete_file_index.rs | 37 ++++++++++++++++++++----- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0a2987a89b..dfd8ea8fbe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3547,6 +3547,7 @@ dependencies = [ "typed-builder 0.20.1", "url", "uuid", + "waker-set", "zstd", ] @@ -7514,6 +7515,16 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-set" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e958152c46345e1af5c61812030ac85200573a0b384c137e83ce2c01ac4bc07" +dependencies = [ + "crossbeam-utils", + "slab", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/Cargo.toml b/Cargo.toml index 432411b2de..a3b3c6a037 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -118,4 +118,5 @@ url = "2.5.4" uuid = { version = "1.16", features = ["v7"] } volo = "0.10.6" volo-thrift = "0.10.6" +waker-set = "0.2.0" zstd = "0.13.2" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 471b0cbe9f..b9614c91a6 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -85,6 +85,7 @@ tokio = { workspace = true, optional = false, features = ["sync"] } typed-builder = { workspace = true } url = { workspace = true } uuid = { workspace = true } +waker-set = { workspace = true } zstd = { workspace = true } [dev-dependencies] diff --git a/crates/iceberg/src/delete_file_index.rs b/crates/iceberg/src/delete_file_index.rs index 3f25bbda36..e06237c12e 100644 --- a/crates/iceberg/src/delete_file_index.rs +++ b/crates/iceberg/src/delete_file_index.rs @@ -16,6 +16,7 @@ // under the License. use std::collections::HashMap; +use std::fmt::{self, Debug}; use std::future::Future; use std::ops::Deref; use std::pin::Pin; @@ -24,6 +25,7 @@ use std::task::{Context, Poll}; use futures::StreamExt; use futures::channel::mpsc::{Sender, channel}; +use waker_set::WakerSet; use crate::runtime::spawn; use crate::scan::{DeleteFileContext, FileScanTaskDeleteFile}; @@ -31,9 +33,10 @@ use crate::spec::{DataContentType, DataFile, Struct}; use crate::{Error, ErrorKind, Result}; /// Index of delete files -#[derive(Clone, Debug)] +#[derive(Clone)] pub(crate) struct DeleteFileIndex { state: Arc>, + waker_set: Arc, } #[derive(Debug)] @@ -42,6 +45,15 @@ enum DeleteFileIndexState { Populated(PopulatedDeleteFileIndex), } +impl Debug for DeleteFileIndex { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("DeleteFileIndex") + .field("state", &self.state) + .field("waker_set", &"") + .finish() + } +} + #[derive(Debug)] struct PopulatedDeleteFileIndex { #[allow(dead_code)] @@ -59,22 +71,28 @@ impl DeleteFileIndex { pub(crate) fn new() -> (DeleteFileIndex, Sender) { // TODO: what should the channel limit be? 
let (tx, rx) = channel(10); + let waker_set = Arc::new(WakerSet::new()); let state = Arc::new(RwLock::new(DeleteFileIndexState::Populating)); let delete_file_stream = rx.boxed(); spawn({ let state = state.clone(); + let waker_set = waker_set.clone(); async move { let delete_files = delete_file_stream.collect::>().await; let populated_delete_file_index = PopulatedDeleteFileIndex::new(delete_files); - let mut guard = state.write().unwrap(); - *guard = DeleteFileIndexState::Populated(populated_delete_file_index); + { + let mut guard = state.write().unwrap(); + *guard = DeleteFileIndexState::Populated(populated_delete_file_index); + } + + waker_set.notify_all(); } }); - (DeleteFileIndex { state }, tx) + (DeleteFileIndex { state, waker_set }, tx) } /// Gets all the delete files that apply to the specified data file. @@ -89,6 +107,7 @@ impl DeleteFileIndex { state: self.state.clone(), data_file, seq_num, + waker_set: self.waker_set.clone(), } } } @@ -99,7 +118,7 @@ impl PopulatedDeleteFileIndex { /// /// 1. The partition information is extracted from each delete file's manifest entry. /// 2. If the partition is empty and the delete file is not a positional delete, - /// it is added to the `global_delees` vector + /// it is added to the `global_deletes` vector /// 3. Otherwise, the delete file is added to one of two hash maps based on its content type. fn new(files: Vec) -> PopulatedDeleteFileIndex { let mut eq_deletes_by_partition: HashMap>> = @@ -199,18 +218,22 @@ pub(crate) struct DeletesForDataFile<'a> { state: Arc>, data_file: &'a DataFile, seq_num: Option, + waker_set: Arc, } impl Future for DeletesForDataFile<'_> { type Output = Result>; - fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { match self.state.try_read() { Ok(guard) => match guard.deref() { DeleteFileIndexState::Populated(idx) => Poll::Ready(Ok( idx.get_deletes_for_data_file(self.data_file, self.seq_num) )), - _ => Poll::Pending, + _ => { + self.waker_set.insert(cx); + Poll::Pending + } }, Err(err) => Poll::Ready(Err(Error::new(ErrorKind::Unexpected, err.to_string()))), } From 2cf769228e6f0a46083d47cd3c8cd5669a2f65a3 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Wed, 14 May 2025 19:58:17 +0100 Subject: [PATCH 07/12] refactor: remove DeleteFileFilter from CachingDeleteFileLoader --- crates/iceberg/src/arrow/delete_file_loader.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_file_loader.rs b/crates/iceberg/src/arrow/delete_file_loader.rs index 663b7e77cf..b3519305b6 100644 --- a/crates/iceberg/src/arrow/delete_file_loader.rs +++ b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -50,7 +50,6 @@ pub trait DeleteFileLoader { pub(crate) struct CachingDeleteFileLoader { file_io: FileIO, concurrency_limit_data_files: usize, - del_filter: DeleteFilter, } impl DeleteFileLoader for CachingDeleteFileLoader { @@ -92,7 +91,6 @@ impl CachingDeleteFileLoader { CachingDeleteFileLoader { file_io, concurrency_limit_data_files, - del_filter: DeleteFilter::default(), } } @@ -167,6 +165,7 @@ impl CachingDeleteFileLoader { schema: SchemaRef, ) -> Receiver> { let (tx, rx) = channel(); + let del_filter = DeleteFilter::default(); let stream_items = delete_file_entries .iter() @@ -174,13 +173,13 @@ impl CachingDeleteFileLoader { ( t.clone(), self.file_io.clone(), - self.del_filter.clone(), + del_filter.clone(), schema.clone(), ) }) .collect::>(); let task_stream = futures::stream::iter(stream_items); - let 
del_filter = self.del_filter.clone(); + let del_filter = del_filter.clone(); let concurrency_limit_data_files = self.concurrency_limit_data_files; crate::runtime::spawn(async move { let result = async move { From 839232fd76e9956c7cb96329cd2ee0d6c739a591 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Thu, 15 May 2025 07:46:35 +0100 Subject: [PATCH 08/12] refactor: extract BasicDeleteFileLoader from CachingDeleteFileLoader --- .../src/arrow/caching_delete_file_loader.rs | 344 ++++++++++++++++++ .../iceberg/src/arrow/delete_file_loader.rs | 328 ++--------------- crates/iceberg/src/arrow/delete_filter.rs | 2 +- crates/iceberg/src/arrow/mod.rs | 6 +- crates/iceberg/src/arrow/reader.rs | 2 +- 5 files changed, 386 insertions(+), 296 deletions(-) create mode 100644 crates/iceberg/src/arrow/caching_delete_file_loader.rs diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs new file mode 100644 index 0000000000..8bd4f43a7c --- /dev/null +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; + +use futures::channel::oneshot; +use futures::future::join_all; +use futures::{StreamExt, TryStreamExt}; +use tokio::sync::oneshot::{channel, Receiver}; + +use super::delete_filter::{DeleteFilter, EqDelFuture}; +use crate::arrow::delete_file_loader::BasicDeleteFileLoader; +use crate::delete_vector::DeleteVector; +use crate::expr::Predicate; +use crate::io::FileIO; +use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; +use crate::spec::{DataContentType, SchemaRef}; +use crate::{Error, ErrorKind, Result}; + +#[derive(Clone, Debug)] +pub(crate) struct CachingDeleteFileLoader { + basic_delete_file_loader: BasicDeleteFileLoader, + concurrency_limit_data_files: usize, +} + +// Intermediate context during processing of a delete file task. +enum DeleteFileContext { + // TODO: Delete Vector loader from Puffin files + InProgEqDel(EqDelFuture), + PosDels(ArrowRecordBatchStream), + FreshEqDel { + batch_stream: ArrowRecordBatchStream, + sender: oneshot::Sender, + }, +} + +// Final result of the processing of a delete file task before +// results are fully merged into the DeleteFileManager's state +enum ParsedDeleteFileContext { + InProgEqDel(EqDelFuture), + DelVecs(HashMap), + EqDel, +} + +#[allow(unused_variables)] +impl CachingDeleteFileLoader { + pub(crate) fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> Self { + CachingDeleteFileLoader { + basic_delete_file_loader: BasicDeleteFileLoader::new(file_io), + concurrency_limit_data_files, + } + } + + /// Load the deletes for all the specified tasks + /// + /// Returned future completes once all loading has finished. 
+ /// + /// * Create a single stream of all delete file tasks irrespective of type, + /// so that we can respect the combined concurrency limit + /// * We then process each in two phases: load and parse. + /// * for positional deletes the load phase instantiates an ArrowRecordBatchStream to + /// stream the file contents out + /// * for eq deletes, we first check if the EQ delete is already loaded or being loaded by + /// another concurrently processing data file scan task. If it is, we return a future + /// for the pre-existing task from the load phase. If not, we create such a future + /// and store it in the state to prevent other data file tasks from starting to load + /// the same equality delete file, and return a record batch stream from the load phase + /// as per the other delete file types - only this time it is accompanied by a one-shot + /// channel sender that we will eventually use to resolve the shared future that we stored + /// in the state. + /// * When this gets updated to add support for delete vectors, the load phase will return + /// a PuffinReader for them. + /// * The parse phase parses each record batch stream according to its associated data type. + /// The result of this is a map of data file paths to delete vectors for the positional + /// delete tasks (and in future for the delete vector tasks). For equality delete + /// file tasks, this results in an unbound Predicate. + /// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot + /// channel to store them in the right place in the delete file managers state. + /// * The results of all of these futures are awaited on in parallel with the specified + /// level of concurrency and collected into a vec. We then combine all the delete + /// vector maps that resulted from any positional delete or delete vector files into a + /// single map and persist it in the state. + /// + /// + /// Conceptually, the data flow is like this: + /// ```none + /// FileScanTaskDeleteFile + /// | + /// Already-loading EQ Delete | Everything Else + /// +---------------------------------------------------+ + /// | | + /// [get existing future] [load recordbatch stream / puffin] + /// DeleteFileContext::InProgEqDel DeleteFileContext + /// | | + /// | | + /// | +-----------------------------+--------------------------+ + /// | Pos Del Del Vec (Not yet Implemented) EQ Del + /// | | | | + /// | [parse pos del stream] [parse del vec puffin] [parse eq del] + /// | HashMap HashMap (Predicate, Sender) + /// | | | | + /// | | | [persist to state] + /// | | | () + /// | | | | + /// | +-----------------------------+--------------------------+ + /// | | + /// | [buffer unordered] + /// | | + /// | [combine del vectors] + /// | HashMap + /// | | + /// | [persist del vectors to state] + /// | () + /// | | + /// +-------------------------+-------------------------+ + /// | + /// [join!] 
+ /// ``` + pub(crate) fn load_deletes( + &self, + delete_file_entries: &[FileScanTaskDeleteFile], + schema: SchemaRef, + ) -> Receiver> { + let (tx, rx) = channel(); + let del_filter = DeleteFilter::default(); + + let stream_items = delete_file_entries + .iter() + .map(|t| { + ( + t.clone(), + self.basic_delete_file_loader.clone(), + del_filter.clone(), + schema.clone(), + ) + }) + .collect::>(); + let task_stream = futures::stream::iter(stream_items); + let del_filter = del_filter.clone(); + let concurrency_limit_data_files = self.concurrency_limit_data_files; + let basic_delete_file_loader = self.basic_delete_file_loader.clone(); + crate::runtime::spawn(async move { + let result = async move { + let mut del_filter = del_filter; + let basic_delete_file_loader = basic_delete_file_loader.clone(); + + let results: Vec = task_stream + .map(move |(task, file_io, del_filter, schema)| { + let basic_delete_file_loader = basic_delete_file_loader.clone(); + async move { + Self::load_file_for_task( + &task, + basic_delete_file_loader.clone(), + del_filter, + schema, + ) + .await + } + }) + .map(move |ctx| { + Ok(async { Self::parse_file_content_for_task(ctx.await?).await }) + }) + .try_buffer_unordered(concurrency_limit_data_files) + .try_collect::>() + .await?; + + // wait for all in-progress EQ deletes from other tasks + let _ = join_all(results.iter().filter_map(|i| { + if let ParsedDeleteFileContext::InProgEqDel(fut) = i { + Some(fut.clone()) + } else { + None + } + })) + .await; + + for item in results { + if let ParsedDeleteFileContext::DelVecs(hash_map) = item { + for (data_file_path, delete_vector) in hash_map.into_iter() { + del_filter.upsert_delete_vector(data_file_path, delete_vector); + } + } + } + + Ok(del_filter) + } + .await; + + let _ = tx.send(result); + }); + + rx + } + + async fn load_file_for_task( + task: &FileScanTaskDeleteFile, + basic_delete_file_loader: BasicDeleteFileLoader, + del_filter: DeleteFilter, + schema: SchemaRef, + ) -> Result { + match task.file_type { + DataContentType::PositionDeletes => Ok(DeleteFileContext::PosDels( + basic_delete_file_loader + .parquet_to_batch_stream(&task.file_path) + .await?, + )), + + DataContentType::EqualityDeletes => { + let sender = { + if let Some(existing) = del_filter + .get_equality_delete_predicate_for_delete_file_path(&task.file_path) + { + return Ok(DeleteFileContext::InProgEqDel(existing.clone())); + } + + let (sender, fut) = EqDelFuture::new(); + + del_filter.insert_equality_delete(task.file_path.to_string(), fut); + + sender + }; + + Ok(DeleteFileContext::FreshEqDel { + batch_stream: BasicDeleteFileLoader::evolve_schema( + basic_delete_file_loader + .parquet_to_batch_stream(&task.file_path) + .await?, + schema, + ) + .await?, + sender, + }) + } + + DataContentType::Data => Err(Error::new( + ErrorKind::Unexpected, + "tasks with files of type Data not expected here", + )), + } + } + + async fn parse_file_content_for_task( + ctx: DeleteFileContext, + ) -> Result { + match ctx { + DeleteFileContext::InProgEqDel(fut) => Ok(ParsedDeleteFileContext::InProgEqDel(fut)), + DeleteFileContext::PosDels(batch_stream) => { + let del_vecs = + Self::parse_positional_deletes_record_batch_stream(batch_stream).await?; + Ok(ParsedDeleteFileContext::DelVecs(del_vecs)) + } + DeleteFileContext::FreshEqDel { + sender, + batch_stream, + } => { + let predicate = + Self::parse_equality_deletes_record_batch_stream(batch_stream).await?; + + sender + .send(predicate) + .map_err(|err| { + Error::new( + ErrorKind::Unexpected, + "Could not send eq delete 
predicate to state", + ) + }) + .map(|_| ParsedDeleteFileContext::EqDel) + } + } + } + + /// Parses a record batch stream coming from positional delete files + /// + /// Returns a map of data file path to a delete vector + async fn parse_positional_deletes_record_batch_stream( + stream: ArrowRecordBatchStream, + ) -> Result> { + // TODO + + Err(Error::new( + ErrorKind::FeatureUnsupported, + "parsing of positional deletes is not yet supported", + )) + } + + /// Parses record batch streams from individual equality delete files + /// + /// Returns an unbound Predicate for each batch stream + async fn parse_equality_deletes_record_batch_stream( + streams: ArrowRecordBatchStream, + ) -> Result { + // TODO + + Err(Error::new( + ErrorKind::FeatureUnsupported, + "parsing of equality deletes is not yet supported", + )) + } +} + +#[cfg(test)] +mod tests { + use tempfile::TempDir; + + use super::*; + use crate::arrow::delete_file_loader::tests::setup; + + #[tokio::test] + async fn test_delete_file_manager_load_deletes() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path(); + let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) + .unwrap() + .build() + .unwrap(); + + // Note that with the delete file parsing not yet in place, all we can test here is that + // the call to the loader fails with the expected FeatureUnsupportedError. + let delete_file_manager = CachingDeleteFileLoader::new(file_io.clone(), 10); + + let file_scan_tasks = setup(table_location); + + let result = delete_file_manager + .load_deletes(&file_scan_tasks[0].deletes, file_scan_tasks[0].schema_ref()) + .await + .unwrap(); + + assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported)); + } +} diff --git a/crates/iceberg/src/arrow/delete_file_loader.rs b/crates/iceberg/src/arrow/delete_file_loader.rs index b3519305b6..608d9b6ead 100644 --- a/crates/iceberg/src/arrow/delete_file_loader.rs +++ b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -15,25 +15,20 @@ // specific language governing permissions and limitations // under the License. 
-use std::collections::HashMap; use std::sync::Arc; -use futures::channel::oneshot; -use futures::future::join_all; use futures::{StreamExt, TryStreamExt}; -use tokio::sync::oneshot::{channel, Receiver}; -use super::delete_filter::{DeleteFilter, EqDelFuture}; use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::ArrowReader; -use crate::delete_vector::DeleteVector; -use crate::expr::Predicate; use crate::io::FileIO; use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; -use crate::spec::{DataContentType, Schema, SchemaRef}; +use crate::spec::{Schema, SchemaRef}; use crate::{Error, ErrorKind, Result}; +/// Delete File Loader #[allow(unused)] +#[async_trait::async_trait] pub trait DeleteFileLoader { /// Read the delete file referred to in the task /// @@ -45,262 +40,20 @@ pub trait DeleteFileLoader { ) -> Result; } -#[allow(unused)] #[derive(Clone, Debug)] -pub(crate) struct CachingDeleteFileLoader { +pub(crate) struct BasicDeleteFileLoader { file_io: FileIO, - concurrency_limit_data_files: usize, -} - -impl DeleteFileLoader for CachingDeleteFileLoader { - async fn read_delete_file( - &self, - task: &FileScanTaskDeleteFile, - schema: SchemaRef, - ) -> Result { - let raw_batch_stream = - CachingDeleteFileLoader::parquet_to_batch_stream(&task.file_path, self.file_io.clone()) - .await?; - - Self::evolve_schema(raw_batch_stream, schema).await - } -} - -// Intermediate context during processing of a delete file task. -enum DeleteFileContext { - // TODO: Delete Vector loader from Puffin files - InProgEqDel(EqDelFuture), - PosDels(ArrowRecordBatchStream), - FreshEqDel { - batch_stream: ArrowRecordBatchStream, - sender: oneshot::Sender, - }, -} - -// Final result of the processing of a delete file task before -// results are fully merged into the DeleteFileManager's state -enum ParsedDeleteFileContext { - InProgEqDel(EqDelFuture), - DelVecs(HashMap), - EqDel, } #[allow(unused_variables)] -impl CachingDeleteFileLoader { - pub(crate) fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> Self { - CachingDeleteFileLoader { - file_io, - concurrency_limit_data_files, - } - } - - /// Load the deletes for all the specified tasks - /// - /// Returned future completes once all loading has finished. - /// - /// * Create a single stream of all delete file tasks irrespective of type, - /// so that we can respect the combined concurrency limit - /// * We then process each in two phases: load and parse. - /// * for positional deletes the load phase instantiates an ArrowRecordBatchStream to - /// stream the file contents out - /// * for eq deletes, we first check if the EQ delete is already loaded or being loaded by - /// another concurrently processing data file scan task. If it is, we return a future - /// for the pre-existing task from the load phase. If not, we create such a future - /// and store it in the state to prevent other data file tasks from starting to load - /// the same equality delete file, and return a record batch stream from the load phase - /// as per the other delete file types - only this time it is accompanied by a one-shot - /// channel sender that we will eventually use to resolve the shared future that we stored - /// in the state. - /// * When this gets updated to add support for delete vectors, the load phase will return - /// a PuffinReader for them. - /// * The parse phase parses each record batch stream according to its associated data type. 
- /// The result of this is a map of data file paths to delete vectors for the positional - /// delete tasks (and in future for the delete vector tasks). For equality delete - /// file tasks, this results in an unbound Predicate. - /// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot - /// channel to store them in the right place in the delete file managers state. - /// * The results of all of these futures are awaited on in parallel with the specified - /// level of concurrency and collected into a vec. We then combine all the delete - /// vector maps that resulted from any positional delete or delete vector files into a - /// single map and persist it in the state. - /// - /// - /// Conceptually, the data flow is like this: - /// ```none - /// FileScanTaskDeleteFile - /// | - /// Already-loading EQ Delete | Everything Else - /// +---------------------------------------------------+ - /// | | - /// [get existing future] [load recordbatch stream / puffin] - /// DeleteFileContext::InProgEqDel DeleteFileContext - /// | | - /// | | - /// | +-----------------------------+--------------------------+ - /// | Pos Del Del Vec (Not yet Implemented) EQ Del - /// | | | | - /// | [parse pos del stream] [parse del vec puffin] [parse eq del] - /// | HashMap HashMap (Predicate, Sender) - /// | | | | - /// | | | [persist to state] - /// | | | () - /// | | | | - /// | +-----------------------------+--------------------------+ - /// | | - /// | [buffer unordered] - /// | | - /// | [combine del vectors] - /// | HashMap - /// | | - /// | [persist del vectors to state] - /// | () - /// | | - /// +-------------------------+-------------------------+ - /// | - /// [join!] - /// ``` - pub(crate) fn load_deletes( - &self, - delete_file_entries: &[FileScanTaskDeleteFile], - schema: SchemaRef, - ) -> Receiver> { - let (tx, rx) = channel(); - let del_filter = DeleteFilter::default(); - - let stream_items = delete_file_entries - .iter() - .map(|t| { - ( - t.clone(), - self.file_io.clone(), - del_filter.clone(), - schema.clone(), - ) - }) - .collect::>(); - let task_stream = futures::stream::iter(stream_items); - let del_filter = del_filter.clone(); - let concurrency_limit_data_files = self.concurrency_limit_data_files; - crate::runtime::spawn(async move { - let result = async move { - let mut del_filter = del_filter; - - let results: Vec = task_stream - .map(move |(task, file_io, del_filter, schema)| async move { - Self::load_file_for_task(&task, file_io, del_filter, schema).await - }) - .map(move |ctx| { - Ok(async { Self::parse_file_content_for_task(ctx.await?).await }) - }) - .try_buffer_unordered(concurrency_limit_data_files) - .try_collect::>() - .await?; - - // wait for all in-progress EQ deletes from other tasks - let _ = join_all(results.iter().filter_map(|i| { - if let ParsedDeleteFileContext::InProgEqDel(fut) = i { - Some(fut.clone()) - } else { - None - } - })) - .await; - - for item in results { - if let ParsedDeleteFileContext::DelVecs(hash_map) = item { - for (data_file_path, delete_vector) in hash_map.into_iter() { - del_filter.upsert_delete_vector(data_file_path, delete_vector); - } - } - } - - Ok(del_filter) - } - .await; - - let _ = tx.send(result); - }); - - rx +impl BasicDeleteFileLoader { + pub fn new(file_io: FileIO) -> Self { + BasicDeleteFileLoader { file_io } } - - async fn load_file_for_task( - task: &FileScanTaskDeleteFile, - file_io: FileIO, - del_filter: DeleteFilter, - schema: SchemaRef, - ) -> Result { - match task.file_type { - 
DataContentType::PositionDeletes => Ok(DeleteFileContext::PosDels( - Self::parquet_to_batch_stream(&task.file_path, file_io).await?, - )), - - DataContentType::EqualityDeletes => { - let sender = { - if let Some(existing) = del_filter - .get_equality_delete_predicate_for_delete_file_path(&task.file_path) - { - return Ok(DeleteFileContext::InProgEqDel(existing.clone())); - } - - let (sender, fut) = EqDelFuture::new(); - - del_filter.insert_equality_delete(task.file_path.to_string(), fut); - - sender - }; - - Ok(DeleteFileContext::FreshEqDel { - batch_stream: Self::evolve_schema( - Self::parquet_to_batch_stream(&task.file_path, file_io).await?, - schema, - ) - .await?, - sender, - }) - } - - DataContentType::Data => Err(Error::new( - ErrorKind::Unexpected, - "tasks with files of type Data not expected here", - )), - } - } - - async fn parse_file_content_for_task( - ctx: DeleteFileContext, - ) -> Result { - match ctx { - DeleteFileContext::InProgEqDel(fut) => Ok(ParsedDeleteFileContext::InProgEqDel(fut)), - DeleteFileContext::PosDels(batch_stream) => { - let del_vecs = - Self::parse_positional_deletes_record_batch_stream(batch_stream).await?; - Ok(ParsedDeleteFileContext::DelVecs(del_vecs)) - } - DeleteFileContext::FreshEqDel { - sender, - batch_stream, - } => { - let predicate = - Self::parse_equality_deletes_record_batch_stream(batch_stream).await?; - - sender - .send(predicate) - .map_err(|err| { - Error::new( - ErrorKind::Unexpected, - "Could not send eq delete predicate to state", - ) - }) - .map(|_| ParsedDeleteFileContext::EqDel) - } - } - } - /// Loads a RecordBatchStream for a given datafile. - async fn parquet_to_batch_stream( + pub(crate) async fn parquet_to_batch_stream( + &self, data_file_path: &str, - file_io: FileIO, ) -> Result { /* Essentially a super-cut-down ArrowReader. We can't use ArrowReader directly @@ -308,7 +61,7 @@ impl CachingDeleteFileLoader { */ let record_batch_stream = ArrowReader::create_parquet_record_batch_stream_builder( data_file_path, - file_io.clone(), + self.file_io.clone(), false, ) .await? 
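For context, the "super-cut-down ArrowReader" that `parquet_to_batch_stream` boils down to is roughly the following when written straight against the parquet crate's async API instead of going through `FileIO`. This is a standalone sketch under assumptions (the parquet `async` feature, a tokio runtime, and an illustrative local file path), not the crate's actual code path:

```rust
use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Illustrative path; any readable parquet file would do here.
    let file = tokio::fs::File::open("/tmp/pos-del-1.parquet").await?;

    // No row filters, no projection, no page index: just stream the batches out.
    let stream = ParquetRecordBatchStreamBuilder::new(file)
        .await?
        .with_batch_size(1024)
        .build()?;

    let batches: Vec<_> = stream.try_collect().await?;
    println!("read {} record batches", batches.len());
    Ok(())
}
```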
@@ -319,7 +72,7 @@ impl CachingDeleteFileLoader { } /// Evolves the schema of the RecordBatches from an equality delete file - async fn evolve_schema( + pub(crate) async fn evolve_schema( record_batch_stream: ArrowRecordBatchStream, target_schema: Arc, ) -> Result { @@ -341,38 +94,24 @@ impl CachingDeleteFileLoader { Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) } +} - /// Parses a record batch stream coming from positional delete files - /// - /// Returns a map of data file path to a delete vector - async fn parse_positional_deletes_record_batch_stream( - stream: ArrowRecordBatchStream, - ) -> Result> { - // TODO - - Err(Error::new( - ErrorKind::FeatureUnsupported, - "parsing of positional deletes is not yet supported", - )) - } +#[async_trait::async_trait] +impl DeleteFileLoader for BasicDeleteFileLoader { + async fn read_delete_file( + &self, + task: &FileScanTaskDeleteFile, + schema: SchemaRef, + ) -> Result { + let raw_batch_stream = self.parquet_to_batch_stream(&task.file_path).await?; - /// Parses record batch streams from individual equality delete files - /// - /// Returns an unbound Predicate for each batch stream - async fn parse_equality_deletes_record_batch_stream( - streams: ArrowRecordBatchStream, - ) -> Result { - // TODO - - Err(Error::new( - ErrorKind::FeatureUnsupported, - "parsing of equality deletes is not yet supported", - )) + Self::evolve_schema(raw_batch_stream, schema).await } } #[cfg(test)] -mod tests { +pub(crate) mod tests { + use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::sync::Arc; @@ -386,7 +125,7 @@ mod tests { use super::*; use crate::scan::FileScanTask; - use crate::spec::{DataFileFormat, Schema}; + use crate::spec::{DataContentType, DataFileFormat, Schema}; type ArrowSchemaRef = Arc; @@ -394,7 +133,7 @@ mod tests { const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; #[tokio::test] - async fn test_delete_file_manager_load_deletes() { + async fn test_basic_delete_file_loader_read_delete_file() { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path(); let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) @@ -404,19 +143,24 @@ mod tests { // Note that with the delete file parsing not yet in place, all we can test here is that // the call to the loader fails with the expected FeatureUnsupportedError. 
- let delete_file_manager = CachingDeleteFileLoader::new(file_io.clone(), 10); + let delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); let file_scan_tasks = setup(table_location); - let result = delete_file_manager - .load_deletes(&file_scan_tasks[0].deletes, file_scan_tasks[0].schema_ref()) + let result = delete_file_loader + .read_delete_file( + &file_scan_tasks[0].deletes[0], + file_scan_tasks[0].schema_ref(), + ) .await .unwrap(); - assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported)); + let result = result.try_collect::>().await.unwrap(); + + assert_eq!(result.len(), 1); } - fn setup(table_location: &Path) -> Vec { + pub(crate) fn setup(table_location: &Path) -> Vec { let data_file_schema = Arc::new(Schema::builder().build().unwrap()); let positional_delete_schema = create_pos_del_schema(); @@ -510,7 +254,7 @@ mod tests { file_scan_tasks } - fn create_pos_del_schema() -> ArrowSchemaRef { + pub(crate) fn create_pos_del_schema() -> ArrowSchemaRef { let fields = vec![ arrow_schema::Field::new("file_path", arrow_schema::DataType::Utf8, false) .with_metadata(HashMap::from([( diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index aee69c3cce..6950efd8d2 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -192,7 +192,7 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::arrow::delete_file_loader::CachingDeleteFileLoader; + use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; use crate::io::FileIO; use crate::spec::{DataFileFormat, Schema}; diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index c5c144853a..949f842412 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -22,13 +22,15 @@ pub use schema::*; mod nan_val_cnt_visitor; pub(crate) use nan_val_cnt_visitor::*; - -pub(crate) mod delete_file_loader; +pub(crate) mod caching_delete_file_loader; +/// Delete File loader +pub mod delete_file_loader; pub(crate) mod delete_filter; mod reader; pub(crate) mod record_batch_projector; pub(crate) mod record_batch_transformer; mod value; + pub use reader::*; pub use value::*; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 42c8f00a09..e0f6a77f97 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -42,7 +42,7 @@ use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder, use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; -use crate::arrow::delete_file_loader::CachingDeleteFileLoader; +use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; From d6a4a4d650480b31aecc8aada895278fbfcc5440 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Thu, 15 May 2025 19:47:10 +0100 Subject: [PATCH 09/12] feat: remove flag to selectively enable delete file processing in vavour of it being always on --- crates/iceberg/src/arrow/delete_filter.rs | 8 +- crates/iceberg/src/arrow/reader.rs | 30 +------ crates/iceberg/src/scan/context.rs | 40 ++++------ crates/iceberg/src/scan/mod.rs | 80 ++++++------------- .../shared_tests/read_positional_deletes.rs | 6 +- 5 files changed, 48 insertions(+), 116 
deletions(-) diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index 6950efd8d2..b6e50da1c0 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -73,13 +73,13 @@ struct DeleteFileFilterState { } #[derive(Clone, Debug, Default)] -pub struct DeleteFilter { +pub(crate) struct DeleteFilter { state: Arc>, } impl DeleteFilter { /// Retrieve a delete vector for the data file associated with a given file scan task - pub fn get_delete_vector( + pub(crate) fn get_delete_vector( &self, file_scan_task: &FileScanTask, ) -> Option>> { @@ -87,7 +87,7 @@ impl DeleteFilter { } /// Retrieve a delete vector for a data file - pub fn get_delete_vector_for_path( + pub(crate) fn get_delete_vector_for_path( &self, delete_file_path: &str, ) -> Option>> { @@ -111,7 +111,7 @@ impl DeleteFilter { } /// Builds eq delete predicate for the provided task. - pub async fn build_equality_delete_predicate( + pub(crate) async fn build_equality_delete_predicate( &self, file_scan_task: &FileScanTask, ) -> Result> { diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index e0f6a77f97..b1903a94db 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -64,7 +64,6 @@ pub struct ArrowReaderBuilder { concurrency_limit_data_files: usize, row_group_filtering_enabled: bool, row_selection_enabled: bool, - delete_file_support_enabled: bool, } impl ArrowReaderBuilder { @@ -78,7 +77,6 @@ impl ArrowReaderBuilder { concurrency_limit_data_files: num_cpus, row_group_filtering_enabled: true, row_selection_enabled: false, - delete_file_support_enabled: false, } } @@ -107,12 +105,6 @@ impl ArrowReaderBuilder { self } - /// Determines whether to enable delete file support. - pub fn with_delete_file_support_enabled(mut self, delete_file_support_enabled: bool) -> Self { - self.delete_file_support_enabled = delete_file_support_enabled; - self - } - /// Build the ArrowReader. 
pub fn build(self) -> ArrowReader { ArrowReader { @@ -125,7 +117,6 @@ impl ArrowReaderBuilder { concurrency_limit_data_files: self.concurrency_limit_data_files, row_group_filtering_enabled: self.row_group_filtering_enabled, row_selection_enabled: self.row_selection_enabled, - delete_file_support_enabled: self.delete_file_support_enabled, } } } @@ -142,7 +133,6 @@ pub struct ArrowReader { row_group_filtering_enabled: bool, row_selection_enabled: bool, - delete_file_support_enabled: bool, } impl ArrowReader { @@ -154,7 +144,6 @@ impl ArrowReader { let concurrency_limit_data_files = self.concurrency_limit_data_files; let row_group_filtering_enabled = self.row_group_filtering_enabled; let row_selection_enabled = self.row_selection_enabled; - let delete_file_support_enabled = self.delete_file_support_enabled; let stream = tasks .map_ok(move |task| { @@ -167,7 +156,6 @@ impl ArrowReader { self.delete_file_loader.clone(), row_group_filtering_enabled, row_selection_enabled, - delete_file_support_enabled, ) }) .map_err(|err| { @@ -187,26 +175,12 @@ impl ArrowReader { delete_file_loader: CachingDeleteFileLoader, row_group_filtering_enabled: bool, row_selection_enabled: bool, - delete_file_support_enabled: bool, ) -> Result { - if !delete_file_support_enabled && !task.deletes.is_empty() { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - "Delete file support is not enabled", - )); - } - let should_load_page_index = (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); - let delete_filter_rx = delete_file_loader.load_deletes( - if delete_file_support_enabled { - &task.deletes - } else { - &[] - }, - task.schema.clone(), - ); + let delete_filter_rx = delete_file_loader.load_deletes(&task.deletes, task.schema.clone()); + let mut record_batch_stream_builder = Self::create_parquet_record_batch_stream_builder( &task.data_file_path, file_io.clone(), diff --git a/crates/iceberg/src/scan/context.rs b/crates/iceberg/src/scan/context.rs index 6bfb12b23a..f40d0ea6b0 100644 --- a/crates/iceberg/src/scan/context.rs +++ b/crates/iceberg/src/scan/context.rs @@ -45,7 +45,7 @@ pub(crate) struct ManifestFileContext { object_cache: Arc, snapshot_schema: SchemaRef, expression_evaluator_cache: Arc, - delete_file_index: Option, + delete_file_index: DeleteFileIndex, } /// Wraps a [`ManifestEntryRef`] alongside the objects that are needed @@ -58,7 +58,7 @@ pub(crate) struct ManifestEntryContext { pub bound_predicates: Option>, pub partition_spec_id: i32, pub snapshot_schema: SchemaRef, - pub delete_file_index: Option, + pub delete_file_index: DeleteFileIndex, } impl ManifestFileContext { @@ -105,16 +105,13 @@ impl ManifestEntryContext { /// consume this `ManifestEntryContext`, returning a `FileScanTask` /// created from it pub(crate) async fn into_file_scan_task(self) -> Result { - let deletes = if let Some(delete_file_index) = self.delete_file_index { - delete_file_index - .get_deletes_for_data_file( - self.manifest_entry.data_file(), - self.manifest_entry.sequence_number(), - ) - .await? 
- } else { - vec![] - }; + let deletes = self + .delete_file_index + .get_deletes_for_data_file( + self.manifest_entry.data_file(), + self.manifest_entry.sequence_number(), + ) + .await?; Ok(FileScanTask { start: 0, @@ -188,7 +185,8 @@ impl PlanContext { &self, manifest_list: Arc, tx_data: Sender, - delete_file_idx_and_tx: Option<(DeleteFileIndex, Sender)>, + delete_file_idx: DeleteFileIndex, + delete_file_tx: Sender, ) -> Result> + 'static>> { let manifest_files = manifest_list.entries().iter(); @@ -196,16 +194,10 @@ impl PlanContext { let mut filtered_mfcs = vec![]; for manifest_file in manifest_files { - let (delete_file_idx, tx) = if manifest_file.content == ManifestContentType::Deletes { - let Some((delete_file_idx, tx)) = delete_file_idx_and_tx.as_ref() else { - continue; - }; - (Some(delete_file_idx.clone()), tx.clone()) + let tx = if manifest_file.content == ManifestContentType::Deletes { + delete_file_tx.clone() } else { - ( - delete_file_idx_and_tx.as_ref().map(|x| x.0.clone()), - tx_data.clone(), - ) + tx_data.clone() }; let partition_bound_predicate = if self.predicate.is_some() { @@ -233,7 +225,7 @@ impl PlanContext { manifest_file, partition_bound_predicate, tx, - delete_file_idx, + delete_file_idx.clone(), ); filtered_mfcs.push(Ok(mfc)); @@ -247,7 +239,7 @@ impl PlanContext { manifest_file: &ManifestFile, partition_filter: Option>, sender: Sender, - delete_file_index: Option, + delete_file_index: DeleteFileIndex, ) -> ManifestFileContext { let bound_predicates = if let (Some(ref partition_bound_predicate), Some(snapshot_bound_predicate)) = diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 8280d43080..f7a43a6475 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -59,11 +59,6 @@ pub struct TableScanBuilder<'a> { concurrency_limit_manifest_files: usize, row_group_filtering_enabled: bool, row_selection_enabled: bool, - - // TODO: defaults to false for now whilst delete file processing - // is still being worked on but will switch to a default of true - // once this work is complete - delete_file_processing_enabled: bool, } impl<'a> TableScanBuilder<'a> { @@ -82,7 +77,6 @@ impl<'a> TableScanBuilder<'a> { concurrency_limit_manifest_files: num_cpus, row_group_filtering_enabled: true, row_selection_enabled: false, - delete_file_processing_enabled: false, } } @@ -189,17 +183,6 @@ impl<'a> TableScanBuilder<'a> { self } - /// Determines whether to enable delete file processing (currently disabled by default) - /// - /// When disabled, delete files are ignored. - pub fn with_delete_file_processing_enabled( - mut self, - delete_file_processing_enabled: bool, - ) -> Self { - self.delete_file_processing_enabled = delete_file_processing_enabled; - self - } - /// Build the table scan. 
pub fn build(self) -> Result { let snapshot = match self.snapshot_id { @@ -226,7 +209,6 @@ impl<'a> TableScanBuilder<'a> { concurrency_limit_manifest_files: self.concurrency_limit_manifest_files, row_group_filtering_enabled: self.row_group_filtering_enabled, row_selection_enabled: self.row_selection_enabled, - delete_file_processing_enabled: self.delete_file_processing_enabled, }); }; current_snapshot_id.clone() @@ -317,7 +299,6 @@ impl<'a> TableScanBuilder<'a> { concurrency_limit_manifest_files: self.concurrency_limit_manifest_files, row_group_filtering_enabled: self.row_group_filtering_enabled, row_selection_enabled: self.row_selection_enabled, - delete_file_processing_enabled: self.delete_file_processing_enabled, }) } } @@ -346,7 +327,6 @@ pub struct TableScan { row_group_filtering_enabled: bool, row_selection_enabled: bool, - delete_file_processing_enabled: bool, } impl TableScan { @@ -368,12 +348,7 @@ impl TableScan { // used to stream the results back to the caller let (file_scan_task_tx, file_scan_task_rx) = channel(concurrency_limit_manifest_entries); - let delete_file_idx_and_tx: Option<(DeleteFileIndex, Sender)> = - if self.delete_file_processing_enabled { - Some(DeleteFileIndex::new()) - } else { - None - }; + let (delete_file_idx, delete_file_tx) = DeleteFileIndex::new(); let manifest_list = plan_context.get_manifest_list().await?; @@ -383,9 +358,8 @@ impl TableScan { let manifest_file_contexts = plan_context.build_manifest_file_contexts( manifest_list, manifest_entry_data_ctx_tx, - delete_file_idx_and_tx.as_ref().map(|(delete_file_idx, _)| { - (delete_file_idx.clone(), manifest_entry_delete_ctx_tx) - }), + delete_file_idx.clone(), + manifest_entry_delete_ctx_tx, )?; let mut channel_for_manifest_error = file_scan_task_tx.clone(); @@ -404,34 +378,30 @@ impl TableScan { }); let mut channel_for_data_manifest_entry_error = file_scan_task_tx.clone(); + let mut channel_for_delete_manifest_entry_error = file_scan_task_tx.clone(); - if let Some((_, delete_file_tx)) = delete_file_idx_and_tx { - let mut channel_for_delete_manifest_entry_error = file_scan_task_tx.clone(); - - // Process the delete file [`ManifestEntry`] stream in parallel - spawn(async move { - let result = manifest_entry_delete_ctx_rx - .map(|me_ctx| Ok((me_ctx, delete_file_tx.clone()))) - .try_for_each_concurrent( - concurrency_limit_manifest_entries, - |(manifest_entry_context, tx)| async move { - spawn(async move { - Self::process_delete_manifest_entry(manifest_entry_context, tx) - .await - }) - .await - }, - ) - .await; + // Process the delete file [`ManifestEntry`] stream in parallel + spawn(async move { + let result = manifest_entry_delete_ctx_rx + .map(|me_ctx| Ok((me_ctx, delete_file_tx.clone()))) + .try_for_each_concurrent( + concurrency_limit_manifest_entries, + |(manifest_entry_context, tx)| async move { + spawn(async move { + Self::process_delete_manifest_entry(manifest_entry_context, tx).await + }) + .await + }, + ) + .await; - if let Err(error) = result { - let _ = channel_for_delete_manifest_entry_error - .send(Err(error)) - .await; - } - }) - .await; - } + if let Err(error) = result { + let _ = channel_for_delete_manifest_entry_error + .send(Err(error)) + .await; + } + }) + .await; // Process the data file [`ManifestEntry`] stream in parallel spawn(async move { diff --git a/crates/integration_tests/tests/shared_tests/read_positional_deletes.rs b/crates/integration_tests/tests/shared_tests/read_positional_deletes.rs index 43a50c65f5..34085eeee4 100644 --- 
a/crates/integration_tests/tests/shared_tests/read_positional_deletes.rs +++ b/crates/integration_tests/tests/shared_tests/read_positional_deletes.rs @@ -37,11 +37,7 @@ async fn test_read_table_with_positional_deletes() { .await .unwrap(); - let scan = table - .scan() - .with_delete_file_processing_enabled(true) - .build() - .unwrap(); + let scan = table.scan().build().unwrap(); println!("{:?}", scan); let plan: Vec<_> = scan From 462b2f77c4096b4efd1239373cb8dcfd6b2cb90a Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Sat, 17 May 2025 21:21:26 +0100 Subject: [PATCH 10/12] changes required after rebase on main --- .../src/arrow/caching_delete_file_loader.rs | 36 +++++++++---------- .../iceberg/src/arrow/delete_file_loader.rs | 2 +- crates/iceberg/src/delete_vector.rs | 4 +-- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index 8bd4f43a7c..3c52ba49c0 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -20,7 +20,7 @@ use std::collections::HashMap; use futures::channel::oneshot; use futures::future::join_all; use futures::{StreamExt, TryStreamExt}; -use tokio::sync::oneshot::{channel, Receiver}; +use tokio::sync::oneshot::{Receiver, channel}; use super::delete_filter::{DeleteFilter, EqDelFuture}; use crate::arrow::delete_file_loader::BasicDeleteFileLoader; @@ -70,30 +70,30 @@ impl CachingDeleteFileLoader { /// Returned future completes once all loading has finished. /// /// * Create a single stream of all delete file tasks irrespective of type, - /// so that we can respect the combined concurrency limit + /// so that we can respect the combined concurrency limit /// * We then process each in two phases: load and parse. /// * for positional deletes the load phase instantiates an ArrowRecordBatchStream to - /// stream the file contents out + /// stream the file contents out /// * for eq deletes, we first check if the EQ delete is already loaded or being loaded by - /// another concurrently processing data file scan task. If it is, we return a future - /// for the pre-existing task from the load phase. If not, we create such a future - /// and store it in the state to prevent other data file tasks from starting to load - /// the same equality delete file, and return a record batch stream from the load phase - /// as per the other delete file types - only this time it is accompanied by a one-shot - /// channel sender that we will eventually use to resolve the shared future that we stored - /// in the state. + /// another concurrently processing data file scan task. If it is, we return a future + /// for the pre-existing task from the load phase. If not, we create such a future + /// and store it in the state to prevent other data file tasks from starting to load + /// the same equality delete file, and return a record batch stream from the load phase + /// as per the other delete file types - only this time it is accompanied by a one-shot + /// channel sender that we will eventually use to resolve the shared future that we stored + /// in the state. /// * When this gets updated to add support for delete vectors, the load phase will return - /// a PuffinReader for them. + /// a PuffinReader for them. /// * The parse phase parses each record batch stream according to its associated data type. 
- /// The result of this is a map of data file paths to delete vectors for the positional - /// delete tasks (and in future for the delete vector tasks). For equality delete - /// file tasks, this results in an unbound Predicate. + /// The result of this is a map of data file paths to delete vectors for the positional + /// delete tasks (and in future for the delete vector tasks). For equality delete + /// file tasks, this results in an unbound Predicate. /// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot - /// channel to store them in the right place in the delete file managers state. + /// channel to store them in the right place in the delete file managers state. /// * The results of all of these futures are awaited on in parallel with the specified - /// level of concurrency and collected into a vec. We then combine all the delete - /// vector maps that resulted from any positional delete or delete vector files into a - /// single map and persist it in the state. + /// level of concurrency and collected into a vec. We then combine all the delete + /// vector maps that resulted from any positional delete or delete vector files into a + /// single map and persist it in the state. /// /// /// Conceptually, the data flow is like this: diff --git a/crates/iceberg/src/arrow/delete_file_loader.rs b/crates/iceberg/src/arrow/delete_file_loader.rs index 608d9b6ead..802ea794c4 100644 --- a/crates/iceberg/src/arrow/delete_file_loader.rs +++ b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -19,8 +19,8 @@ use std::sync::Arc; use futures::{StreamExt, TryStreamExt}; -use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::ArrowReader; +use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::io::FileIO; use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; use crate::spec::{Schema, SchemaRef}; diff --git a/crates/iceberg/src/delete_vector.rs b/crates/iceberg/src/delete_vector.rs index feb4eeea99..e4ab74f108 100644 --- a/crates/iceberg/src/delete_vector.rs +++ b/crates/iceberg/src/delete_vector.rs @@ -17,9 +17,9 @@ use std::ops::BitOrAssign; +use roaring::RoaringTreemap; use roaring::bitmap::Iter; use roaring::treemap::BitmapIter; -use roaring::RoaringTreemap; #[derive(Debug, Default)] pub struct DeleteVector { @@ -63,7 +63,7 @@ impl Iterator for DeleteVectorIterator<'_> { type Item = u64; fn next(&mut self) -> Option { - if let Some(ref mut inner) = &mut self.inner { + if let Some(inner) = &mut self.inner { if let Some(inner_next) = inner.bitmap_iter.next() { return Some(u64::from(inner.high_bits) << 32 | u64::from(inner_next)); } From ef288093fb0f0d4dcc1a77214c072ff61cae0aa5 Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Mon, 19 May 2025 07:57:05 +0100 Subject: [PATCH 11/12] fix: handle WouldBlock correctly in DeleteFileIndex --- crates/iceberg/src/delete_file_index.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/iceberg/src/delete_file_index.rs b/crates/iceberg/src/delete_file_index.rs index e06237c12e..f9d5cba99b 100644 --- a/crates/iceberg/src/delete_file_index.rs +++ b/crates/iceberg/src/delete_file_index.rs @@ -20,7 +20,7 @@ use std::fmt::{self, Debug}; use std::future::Future; use std::ops::Deref; use std::pin::Pin; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, RwLock, TryLockError}; use std::task::{Context, Poll}; use futures::StreamExt; @@ -235,6 +235,10 @@ impl Future for DeletesForDataFile<'_> { Poll::Pending } }, + 
Err(TryLockError::WouldBlock) => { + self.waker_set.insert(cx); + Poll::Pending + } Err(err) => Poll::Ready(Err(Error::new(ErrorKind::Unexpected, err.to_string()))), } } From b1470983adb534f856abe2e49379f980ba2afb5b Mon Sep 17 00:00:00 2001 From: Scott Donnelly Date: Wed, 21 May 2025 19:32:03 +0100 Subject: [PATCH 12/12] refactor: use Notify and oneshot channel rather than custom Future --- Cargo.toml | 1 - crates/iceberg/Cargo.toml | 1 - .../src/arrow/caching_delete_file_loader.rs | 114 +++++++--------- crates/iceberg/src/arrow/delete_filter.rs | 124 ++++++++++-------- crates/iceberg/src/delete_file_index.rs | 97 +++++--------- crates/iceberg/src/scan/context.rs | 2 +- 6 files changed, 149 insertions(+), 190 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a3b3c6a037..432411b2de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -118,5 +118,4 @@ url = "2.5.4" uuid = { version = "1.16", features = ["v7"] } volo = "0.10.6" volo-thrift = "0.10.6" -waker-set = "0.2.0" zstd = "0.13.2" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index b9614c91a6..471b0cbe9f 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -85,7 +85,6 @@ tokio = { workspace = true, optional = false, features = ["sync"] } typed-builder = { workspace = true } url = { workspace = true } uuid = { workspace = true } -waker-set = { workspace = true } zstd = { workspace = true } [dev-dependencies] diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index 3c52ba49c0..a48ebe5dfd 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -17,12 +17,10 @@ use std::collections::HashMap; -use futures::channel::oneshot; -use futures::future::join_all; use futures::{StreamExt, TryStreamExt}; use tokio::sync::oneshot::{Receiver, channel}; -use super::delete_filter::{DeleteFilter, EqDelFuture}; +use super::delete_filter::DeleteFilter; use crate::arrow::delete_file_loader::BasicDeleteFileLoader; use crate::delete_vector::DeleteVector; use crate::expr::Predicate; @@ -40,18 +38,17 @@ pub(crate) struct CachingDeleteFileLoader { // Intermediate context during processing of a delete file task. enum DeleteFileContext { // TODO: Delete Vector loader from Puffin files - InProgEqDel(EqDelFuture), + ExistingEqDel, PosDels(ArrowRecordBatchStream), FreshEqDel { batch_stream: ArrowRecordBatchStream, - sender: oneshot::Sender, + sender: tokio::sync::oneshot::Sender, }, } // Final result of the processing of a delete file task before // results are fully merged into the DeleteFileManager's state enum ParsedDeleteFileContext { - InProgEqDel(EqDelFuture), DelVecs(HashMap), EqDel, } @@ -65,9 +62,11 @@ impl CachingDeleteFileLoader { } } - /// Load the deletes for all the specified tasks + /// Initiates loading of all deletes for all the specified tasks /// - /// Returned future completes once all loading has finished. + /// Returned future completes once all positional deletes and delete vectors + /// have loaded. EQ deletes are not waited for in this method but the returned + /// DeleteFilter will await their loading when queried for them. 
/// /// * Create a single stream of all delete file tasks irrespective of type, /// so that we can respect the combined concurrency limit @@ -75,13 +74,11 @@ impl CachingDeleteFileLoader { /// * for positional deletes the load phase instantiates an ArrowRecordBatchStream to /// stream the file contents out /// * for eq deletes, we first check if the EQ delete is already loaded or being loaded by - /// another concurrently processing data file scan task. If it is, we return a future - /// for the pre-existing task from the load phase. If not, we create such a future - /// and store it in the state to prevent other data file tasks from starting to load - /// the same equality delete file, and return a record batch stream from the load phase - /// as per the other delete file types - only this time it is accompanied by a one-shot - /// channel sender that we will eventually use to resolve the shared future that we stored - /// in the state. + /// another concurrently processing data file scan task. If it is, we skip it. + /// If not, the DeleteFilter is updated to contain a notifier to prevent other data file + /// tasks from starting to load the same equality delete file. We spawn a task to load + /// the EQ delete's record batch stream, convert it to a predicate, update the delete filter, + /// and notify any task that was waiting for it. /// * When this gets updated to add support for delete vectors, the load phase will return /// a PuffinReader for them. /// * The parse phase parses each record batch stream according to its associated data type. @@ -100,35 +97,34 @@ impl CachingDeleteFileLoader { /// ```none /// FileScanTaskDeleteFile /// | - /// Already-loading EQ Delete | Everything Else - /// +---------------------------------------------------+ - /// | | - /// [get existing future] [load recordbatch stream / puffin] - /// DeleteFileContext::InProgEqDel DeleteFileContext - /// | | - /// | | - /// | +-----------------------------+--------------------------+ - /// | Pos Del Del Vec (Not yet Implemented) EQ Del - /// | | | | - /// | [parse pos del stream] [parse del vec puffin] [parse eq del] - /// | HashMap HashMap (Predicate, Sender) - /// | | | | - /// | | | [persist to state] - /// | | | () - /// | | | | - /// | +-----------------------------+--------------------------+ - /// | | - /// | [buffer unordered] - /// | | - /// | [combine del vectors] - /// | HashMap - /// | | - /// | [persist del vectors to state] - /// | () - /// | | - /// +-------------------------+-------------------------+ - /// | - /// [join!] + /// Skip Started EQ Deletes + /// | + /// | + /// [load recordbatch stream / puffin] + /// DeleteFileContext + /// | + /// | + /// +-----------------------------+--------------------------+ + /// Pos Del Del Vec (Not yet Implemented) EQ Del + /// | | | + /// [parse pos del stream] [parse del vec puffin] [parse eq del] + /// HashMap HashMap (Predicate, Sender) + /// | | | + /// | | [persist to state] + /// | | () + /// | | | + /// +-----------------------------+--------------------------+ + /// | + /// [buffer unordered] + /// | + /// [combine del vectors] + /// HashMap + /// | + /// [persist del vectors to state] + /// () + /// | + /// | + /// [join!] 
/// ``` pub(crate) fn load_deletes( &self, @@ -150,6 +146,7 @@ impl CachingDeleteFileLoader { }) .collect::>(); let task_stream = futures::stream::iter(stream_items); + let del_filter = del_filter.clone(); let concurrency_limit_data_files = self.concurrency_limit_data_files; let basic_delete_file_loader = self.basic_delete_file_loader.clone(); @@ -178,16 +175,6 @@ impl CachingDeleteFileLoader { .try_collect::>() .await?; - // wait for all in-progress EQ deletes from other tasks - let _ = join_all(results.iter().filter_map(|i| { - if let ParsedDeleteFileContext::InProgEqDel(fut) = i { - Some(fut.clone()) - } else { - None - } - })) - .await; - for item in results { if let ParsedDeleteFileContext::DelVecs(hash_map) = item { for (data_file_path, delete_vector) in hash_map.into_iter() { @@ -220,20 +207,13 @@ impl CachingDeleteFileLoader { )), DataContentType::EqualityDeletes => { - let sender = { - if let Some(existing) = del_filter - .get_equality_delete_predicate_for_delete_file_path(&task.file_path) - { - return Ok(DeleteFileContext::InProgEqDel(existing.clone())); - } - - let (sender, fut) = EqDelFuture::new(); - - del_filter.insert_equality_delete(task.file_path.to_string(), fut); - - sender + let Some(notify) = del_filter.try_start_eq_del_load(&task.file_path) else { + return Ok(DeleteFileContext::ExistingEqDel); }; + let (sender, receiver) = channel(); + del_filter.insert_equality_delete(&task.file_path, receiver); + Ok(DeleteFileContext::FreshEqDel { batch_stream: BasicDeleteFileLoader::evolve_schema( basic_delete_file_loader @@ -257,7 +237,7 @@ impl CachingDeleteFileLoader { ctx: DeleteFileContext, ) -> Result { match ctx { - DeleteFileContext::InProgEqDel(fut) => Ok(ParsedDeleteFileContext::InProgEqDel(fut)), + DeleteFileContext::ExistingEqDel => Ok(ParsedDeleteFileContext::EqDel), DeleteFileContext::PosDels(batch_stream) => { let del_vecs = Self::parse_positional_deletes_record_batch_stream(batch_stream).await?; diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index b6e50da1c0..e2acab1923 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -16,12 +16,10 @@ // under the License. use std::collections::HashMap; -use std::future::Future; -use std::pin::Pin; -use std::sync::{Arc, Mutex, OnceLock, RwLock}; -use std::task::{Context, Poll}; +use std::sync::{Arc, Mutex, RwLock}; -use futures::channel::oneshot; +use tokio::sync::Notify; +use tokio::sync::oneshot::Receiver; use crate::delete_vector::DeleteVector; use crate::expr::Predicate::AlwaysTrue; @@ -30,46 +28,16 @@ use crate::scan::{FileScanTask, FileScanTaskDeleteFile}; use crate::spec::DataContentType; use crate::{Error, ErrorKind, Result}; -// Equality deletes may apply to more than one DataFile in a scan, and so -// the same equality delete file may be present in more than one invocation of -// DeleteFileManager::load_deletes in the same scan. We want to deduplicate these -// to avoid having to load them twice, so we immediately store cloneable futures in the -// state that can be awaited upon to get te EQ deletes. That way we can check to see if -// a load of each Eq delete file is already in progress and avoid starting another one. 
-#[derive(Debug, Clone)] -pub(crate) struct EqDelFuture { - result: OnceLock, -} - -impl EqDelFuture { - pub(crate) fn new() -> (oneshot::Sender, Self) { - let (tx, rx) = oneshot::channel(); - let result = OnceLock::new(); - - crate::runtime::spawn({ - let result = result.clone(); - async move { result.set(rx.await.unwrap()) } - }); - - (tx, Self { result }) - } -} - -impl Future for EqDelFuture { - type Output = Predicate; - - fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll { - match self.result.get() { - None => Poll::Pending, - Some(predicate) => Poll::Ready(predicate.clone()), - } - } +#[derive(Debug)] +enum EqDelState { + Loading(Arc), + Loaded(Predicate), } #[derive(Debug, Default)] struct DeleteFileFilterState { delete_vectors: HashMap>>, - equality_deletes: HashMap, + equality_deletes: HashMap, } #[derive(Clone, Debug, Default)] @@ -97,17 +65,42 @@ impl DeleteFilter { .and_then(|st| st.delete_vectors.get(delete_file_path).cloned()) } + pub(crate) fn try_start_eq_del_load(&self, file_path: &str) -> Option> { + let mut state = self.state.write().unwrap(); + + if !state.equality_deletes.contains_key(file_path) { + return None; + } + + let notifier = Arc::new(Notify::new()); + state + .equality_deletes + .insert(file_path.to_string(), EqDelState::Loading(notifier.clone())); + + Some(notifier) + } + /// Retrieve the equality delete predicate for a given eq delete file path - pub(crate) fn get_equality_delete_predicate_for_delete_file_path( + pub(crate) async fn get_equality_delete_predicate_for_delete_file_path( &self, file_path: &str, - ) -> Option { - self.state - .read() - .unwrap() - .equality_deletes - .get(file_path) - .cloned() + ) -> Option { + let notifier = { + match self.state.read().unwrap().equality_deletes.get(file_path) { + None => return None, + Some(EqDelState::Loading(notifier)) => notifier.clone(), + Some(EqDelState::Loaded(predicate)) => { + return Some(predicate.clone()); + } + } + }; + + notifier.notified().await; + + match self.state.read().unwrap().equality_deletes.get(file_path) { + Some(EqDelState::Loaded(predicate)) => Some(predicate.clone()), + _ => unreachable!("Cannot be any other state than loaded"), + } } /// Builds eq delete predicate for the provided task. 
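A minimal, self-contained sketch (not part of the patch) of the Loading/Loaded handoff that the new `EqDelState` models, assuming tokio with the `sync`, `macros`, and `rt-multi-thread` features; `Shared`, `LoadState`, and `finish_loading` are illustrative stand-ins rather than items from this diff. A consumer that finds a `Loading` entry registers with the `Notify` before re-checking the state and then awaits; the loader flips the entry to `Loaded` and calls `notify_waiters()`.

```rust
// Illustrative sketch only; not code from the patch. A String stands in for
// the parsed equality-delete Predicate.
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

use tokio::sync::Notify;

#[derive(Debug)]
enum LoadState {
    Loading(Arc<Notify>), // value still being produced; wait on this notifier
    Loaded(String),       // finished value (the Predicate in the real code)
}

#[derive(Default)]
struct Shared {
    entries: RwLock<HashMap<String, LoadState>>,
}

impl Shared {
    // Consumer side: check, register interest, re-check, then await.
    async fn get(&self, key: &str) -> Option<String> {
        loop {
            let notify = match self.entries.read().unwrap().get(key) {
                None => return None,
                Some(LoadState::Loaded(v)) => return Some(v.clone()),
                Some(LoadState::Loading(n)) => n.clone(),
            };
            // Register with the notifier *before* re-checking the state so a
            // notify_waiters() that runs concurrently cannot be missed.
            let notified = notify.notified();
            tokio::pin!(notified);
            notified.as_mut().enable();
            if let Some(LoadState::Loaded(v)) = self.entries.read().unwrap().get(key) {
                return Some(v.clone());
            }
            notified.await;
            // State has changed; loop and read it again.
        }
    }

    // Producer side: store the finished value, then wake every waiter.
    fn finish_loading(&self, key: &str, value: String) {
        let previous = self
            .entries
            .write()
            .unwrap()
            .insert(key.to_string(), LoadState::Loaded(value));
        if let Some(LoadState::Loading(notify)) = previous {
            notify.notify_waiters();
        }
    }
}

#[tokio::main]
async fn main() {
    let shared = Arc::new(Shared::default());
    shared.entries.write().unwrap().insert(
        "eq-del-1.parquet".to_string(),
        LoadState::Loading(Arc::new(Notify::new())),
    );

    let reader = {
        let shared = Arc::clone(&shared);
        tokio::spawn(async move { shared.get("eq-del-1.parquet").await })
    };

    shared.finish_loading("eq-del-1.parquet", "id = 42".to_string());
    assert_eq!(reader.await.unwrap(), Some("id = 42".to_string()));
}
```

Using `Notify` this way avoids the custom `Future`/`OnceLock` polling that the removed `EqDelFuture` needed, at the cost of re-reading the shared state after waking.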
@@ -126,8 +119,9 @@ impl DeleteFilter { continue; } - let Some(predicate) = - self.get_equality_delete_predicate_for_delete_file_path(&delete.file_path) + let Some(predicate) = self + .get_equality_delete_predicate_for_delete_file_path(&delete.file_path) + .await else { return Err(Error::new( ErrorKind::Unexpected, @@ -138,7 +132,7 @@ impl DeleteFilter { )); }; - combined_predicate = combined_predicate.and(predicate.await); + combined_predicate = combined_predicate.and(predicate); } if combined_predicate == AlwaysTrue { @@ -167,10 +161,32 @@ impl DeleteFilter { *entry.lock().unwrap() |= delete_vector; } - pub(crate) fn insert_equality_delete(&self, delete_file_path: String, eq_del: EqDelFuture) { - let mut state = self.state.write().unwrap(); + pub(crate) fn insert_equality_delete( + &self, + delete_file_path: &str, + eq_del: Receiver, + ) { + let notify = Arc::new(Notify::new()); + { + let mut state = self.state.write().unwrap(); + state.equality_deletes.insert( + delete_file_path.to_string(), + EqDelState::Loading(notify.clone()), + ); + } - state.equality_deletes.insert(delete_file_path, eq_del); + let state = self.state.clone(); + let delete_file_path = delete_file_path.to_string(); + crate::runtime::spawn(async move { + let eq_del = eq_del.await.unwrap(); + { + let mut state = state.write().unwrap(); + state + .equality_deletes + .insert(delete_file_path, EqDelState::Loaded(eq_del)); + } + notify.notify_waiters(); + }); } } diff --git a/crates/iceberg/src/delete_file_index.rs b/crates/iceberg/src/delete_file_index.rs index f9d5cba99b..d8f7a872e1 100644 --- a/crates/iceberg/src/delete_file_index.rs +++ b/crates/iceberg/src/delete_file_index.rs @@ -16,44 +16,29 @@ // under the License. use std::collections::HashMap; -use std::fmt::{self, Debug}; -use std::future::Future; use std::ops::Deref; -use std::pin::Pin; -use std::sync::{Arc, RwLock, TryLockError}; -use std::task::{Context, Poll}; +use std::sync::{Arc, RwLock}; use futures::StreamExt; use futures::channel::mpsc::{Sender, channel}; -use waker_set::WakerSet; +use tokio::sync::Notify; use crate::runtime::spawn; use crate::scan::{DeleteFileContext, FileScanTaskDeleteFile}; use crate::spec::{DataContentType, DataFile, Struct}; -use crate::{Error, ErrorKind, Result}; /// Index of delete files -#[derive(Clone)] +#[derive(Debug, Clone)] pub(crate) struct DeleteFileIndex { state: Arc>, - waker_set: Arc, } #[derive(Debug)] enum DeleteFileIndexState { - Populating, + Populating(Arc), Populated(PopulatedDeleteFileIndex), } -impl Debug for DeleteFileIndex { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("DeleteFileIndex") - .field("state", &self.state) - .field("waker_set", &"") - .finish() - } -} - #[derive(Debug)] struct PopulatedDeleteFileIndex { #[allow(dead_code)] @@ -71,13 +56,14 @@ impl DeleteFileIndex { pub(crate) fn new() -> (DeleteFileIndex, Sender) { // TODO: what should the channel limit be? 
let (tx, rx) = channel(10); - let waker_set = Arc::new(WakerSet::new()); - let state = Arc::new(RwLock::new(DeleteFileIndexState::Populating)); + let notify = Arc::new(Notify::new()); + let state = Arc::new(RwLock::new(DeleteFileIndexState::Populating( + notify.clone(), + ))); let delete_file_stream = rx.boxed(); spawn({ let state = state.clone(); - let waker_set = waker_set.clone(); async move { let delete_files = delete_file_stream.collect::>().await; @@ -87,27 +73,37 @@ impl DeleteFileIndex { let mut guard = state.write().unwrap(); *guard = DeleteFileIndexState::Populated(populated_delete_file_index); } - - waker_set.notify_all(); + notify.notify_waiters(); } }); - (DeleteFileIndex { state, waker_set }, tx) + (DeleteFileIndex { state }, tx) } /// Gets all the delete files that apply to the specified data file. - /// - /// Returns a future that resolves to a Result> - pub(crate) fn get_deletes_for_data_file<'a>( + pub(crate) async fn get_deletes_for_data_file( &self, - data_file: &'a DataFile, + data_file: &DataFile, seq_num: Option, - ) -> DeletesForDataFile<'a> { - DeletesForDataFile { - state: self.state.clone(), - data_file, - seq_num, - waker_set: self.waker_set.clone(), + ) -> Vec { + let notifier = { + let guard = self.state.read().unwrap(); + match *guard { + DeleteFileIndexState::Populating(ref notifier) => notifier.clone(), + DeleteFileIndexState::Populated(ref index) => { + return index.get_deletes_for_data_file(data_file, seq_num); + } + } + }; + + notifier.notified().await; + + let guard = self.state.read().unwrap(); + match guard.deref() { + DeleteFileIndexState::Populated(index) => { + index.get_deletes_for_data_file(data_file, seq_num) + } + _ => unreachable!("Cannot be any other state than loaded"), } } } @@ -212,34 +208,3 @@ impl PopulatedDeleteFileIndex { results } } - -/// Future for the `DeleteFileIndex::get_deletes_for_data_file` method -pub(crate) struct DeletesForDataFile<'a> { - state: Arc>, - data_file: &'a DataFile, - seq_num: Option, - waker_set: Arc, -} - -impl Future for DeletesForDataFile<'_> { - type Output = Result>; - - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - match self.state.try_read() { - Ok(guard) => match guard.deref() { - DeleteFileIndexState::Populated(idx) => Poll::Ready(Ok( - idx.get_deletes_for_data_file(self.data_file, self.seq_num) - )), - _ => { - self.waker_set.insert(cx); - Poll::Pending - } - }, - Err(TryLockError::WouldBlock) => { - self.waker_set.insert(cx); - Poll::Pending - } - Err(err) => Poll::Ready(Err(Error::new(ErrorKind::Unexpected, err.to_string()))), - } - } -} diff --git a/crates/iceberg/src/scan/context.rs b/crates/iceberg/src/scan/context.rs index f40d0ea6b0..703cbd01a6 100644 --- a/crates/iceberg/src/scan/context.rs +++ b/crates/iceberg/src/scan/context.rs @@ -111,7 +111,7 @@ impl ManifestEntryContext { self.manifest_entry.data_file(), self.manifest_entry.sequence_number(), ) - .await?; + .await; Ok(FileScanTask { start: 0,
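The final commit also replaces the hand-rolled future with a `tokio::sync::oneshot` channel for handing the parsed predicate from the load/parse task back to the `DeleteFilter`. Below is a minimal sketch of that handoff (not taken from the patch; `FakePredicate` is an invented stand-in for the parsed `Predicate`): the parse side sends the finished value once, and the receiving side simply awaits the `Receiver`, which is already a `Future`, so no manual `poll`/`Waker` bookkeeping is required.

```rust
// Illustrative sketch only; assumes tokio with the "sync", "macros" and
// "rt-multi-thread" features.
use tokio::sync::oneshot;

// Invented stand-in for the parsed equality-delete predicate.
#[derive(Debug, PartialEq)]
struct FakePredicate(String);

#[tokio::main]
async fn main() {
    let (sender, receiver) = oneshot::channel::<FakePredicate>();

    // "Parse phase": runs elsewhere and eventually produces the predicate.
    tokio::spawn(async move {
        // ... read and parse the equality delete file here ...
        let parsed = FakePredicate("id = 42".to_string());
        // An Err here only means the receiving side was dropped.
        let _ = sender.send(parsed);
    });

    // Receiving side: in the patch, a task spawned by
    // DeleteFilter::insert_equality_delete awaits this, stores the value as
    // EqDelState::Loaded, and then calls Notify::notify_waiters().
    let predicate = receiver.await.expect("sender dropped without sending");
    assert_eq!(predicate, FakePredicate("id = 42".to_string()));
}
```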