Eventual-Inc · jaychia · Nov 14, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/src/daft-micropartition/src/micropartition.rs b/src/daft-micropartition/src/micropartition.rs
@@ -1179,6 +1179,7 @@ pub fn read_parquet_into_micropartition<T: AsRef<str>>(
                 .map(|((url, metadata), rgs)| DataSource::File {
                     path: url,
                     chunk_spec: rgs.map(ChunkSpec::Parquet),
+                    // INVESTIGATE(jay): This seems wrong: `size_bytes` is the summed size of all the files, not the size of this individual file?
                     size_bytes: Some(size_bytes),
                     iceberg_delete_files: None,
                     metadata: None,
@@ -1216,6 +1217,9 @@ pub fn read_parquet_into_micropartition<T: AsRef<str>>(
                 num_rows,
             ),
             generated_fields,
+            // NOTE: Skip propagating estimated size for lazily-materialized MicroPartitions
+            // since this information is not actually leveraged beyond planning time
+            None,
         );
 
         let fill_map = scan_task.partition_spec().map(|pspec| pspec.to_fill_map());

diff --git a/src/daft-micropartition/src/ops/cast_to_schema.rs b/src/daft-micropartition/src/ops/cast_to_schema.rs
@@ -29,6 +29,9 @@ impl MicroPartition {
                         scan_task.storage_config.clone(),
                         scan_task.pushdowns.clone(),
                         scan_task.generated_fields.clone(),
+                        // NOTE: Skip propagating estimated size for lazily-materialized MicroPartitions
+                        // since this information is not actually leveraged beyond planning time
+                        None,
                     ))
                 };
                 Ok(Self::new_unloaded(

diff --git a/src/daft-scan/src/anonymous.rs b/src/daft-scan/src/anonymous.rs
@@ -114,6 +114,7 @@ impl ScanOperator for AnonymousScanOperator {
                     storage_config.clone(),
                     pushdowns.clone(),
                     None,
+                    None,
                 )
                 .into())
             }));

diff --git a/src/daft-scan/src/glob.rs b/src/daft-scan/src/glob.rs
@@ -22,6 +22,7 @@ use snafu::Snafu;
 use crate::{
     hive::{hive_partitions_to_fields, hive_partitions_to_series, parse_hive_partitioning},
     scan_task_iters::{merge_by_sizes, split_by_row_groups, BoxScanTaskIter},
+    size_estimations::FileInferredEstimator,
     storage_config::StorageConfig,
     ChunkSpec, DataSource, ScanTask,
 };
@@ -35,6 +36,7 @@ pub struct GlobScanOperator {
     hive_partitioning: bool,
     partitioning_keys: Vec<PartitionField>,
     generated_fields: SchemaRef,
+    size_estimator: Option<FileInferredEstimator>,
 }
 
 /// Wrapper struct that implements a sync Iterator for a BoxStream
@@ -179,13 +181,15 @@ impl GlobScanOperator {
             }
             .into()),
         }?;
+
         // If hive partitioning is set, create partition fields from the hive partitions.
         let mut partition_fields = if hive_partitioning {
             let hive_partitions = parse_hive_partitioning(&first_filepath)?;
             hive_partitions_to_fields(&hive_partitions)
         } else {
             vec![]
         };
+
         // If file path column is set, extend the partition fields.
         if let Some(fp_col) = &file_path_column {
             let fp_field = Field::new(fp_col, DataType::Utf8);
@@ -207,87 +211,110 @@ impl GlobScanOperator {
             (partitioning_keys, generated_fields)
         };
 
-        let schema = match infer_schema {
-            true => {
-                let inferred_schema = match file_format_config.as_ref() {
-                    &FileFormatConfig::Parquet(ParquetSourceConfig {
+        // Helper to handle schemas that are inferred from files, and resolve them against any user-provided schemas
+        let apply_user_provided_schema = |schema_from_file| -> DaftResult<SchemaRef> {
+            let final_schema = if infer_schema {
+                Arc::new(schema_from_file)
+            } else {
+                user_provided_schema
+                    .clone()
+                    .expect("Schema must be provided if infer_schema is false")
+            };
+            match user_provided_schema {
+                Some(hint) => Ok(Arc::new(final_schema.apply_hints(&hint)?)),
+                None => Ok(final_schema),
+            }
+        };
+
+        let (schema, size_estimator) = match file_format_config.as_ref() {
+            &FileFormatConfig::Parquet(ParquetSourceConfig {
+                coerce_int96_timestamp_unit,
+                ref field_id_mapping,
+                ..
+            }) => {
+                let io_stats = IOStatsContext::new(format!(
+                    "GlobScanOperator constructor read_parquet_schema: for uri {first_filepath}"
+                ));
+
+                let (schema_from_file, metadata) = daft_parquet::read::read_parquet_schema(
+                    first_filepath.as_str(),
+                    io_client,
+                    Some(io_stats),
+                    ParquetSchemaInferenceOptions {
                         coerce_int96_timestamp_unit,
-                        ref field_id_mapping,
-                        ..
-                    }) => {
-                        let io_stats = IOStatsContext::new(format!(
-                            "GlobScanOperator constructor read_parquet_schema: for uri {first_filepath}"
-                        ));
-
-                        let (schema, _metadata) = daft_parquet::read::read_parquet_schema(
-                            first_filepath.as_str(),
-                            io_client,
-                            Some(io_stats),
-                            ParquetSchemaInferenceOptions {
-                                coerce_int96_timestamp_unit,
-                                ..Default::default()
-                            },
-                            field_id_mapping.clone(),
-                        )?;
-
-                        schema
-                    }
-                    FileFormatConfig::Csv(CsvSourceConfig {
-                        delimiter,
-                        has_headers,
-                        double_quote,
-                        quote,
-                        escape_char,
-                        comment,
-                        allow_variable_columns,
-                        ..
-                    }) => {
-                        let (schema, _) = daft_csv::metadata::read_csv_schema(
-                            first_filepath.as_str(),
-                            Some(CsvParseOptions::new_with_defaults(
-                                *has_headers,
-                                *delimiter,
-                                *double_quote,
-                                *quote,
-                                *allow_variable_columns,
-                                *escape_char,
-                                *comment,
-                            )?),
-                            None,
-                            io_client,
-                            Some(io_stats),
-                        )?;
-                        schema
-                    }
-                    FileFormatConfig::Json(_) => daft_json::schema::read_json_schema(
-                        first_filepath.as_str(),
-                        None,
-                        None,
-                        io_client,
-                        Some(io_stats),
-                    )?,
-                    #[cfg(feature = "python")]
-                    FileFormatConfig::Database(_) => {
-                        return Err(DaftError::ValueError(
-                            "Cannot glob a database source".to_string(),
-                        ))
-                    }
-                    #[cfg(feature = "python")]
-                    FileFormatConfig::PythonFunction => {
-                        return Err(DaftError::ValueError(
-                            "Cannot glob a PythonFunction source".to_string(),
-                        ))
-                    }
-                };
-                match user_provided_schema {
-                    Some(hint) => Arc::new(inferred_schema.apply_hints(&hint)?),
-                    None => Arc::new(inferred_schema),
-                }
+                        ..Default::default()
+                    },
+                    field_id_mapping.clone(),
+                )?;
+                let final_schema = apply_user_provided_schema(schema_from_file)?;
+                let size_estimator =
+                    FileInferredEstimator::from_parquet_metadata(final_schema.clone(), &metadata);
+                (final_schema, Some(size_estimator))
             }
-            false => {
-                user_provided_schema.expect("Schema must be provided if infer_schema is false")
+
+            FileFormatConfig::Csv(CsvSourceConfig {
+                delimiter,
+                has_headers,
+                double_quote,
+                quote,
+                escape_char,
+                comment,
+                allow_variable_columns,
+                ..
+            }) => {
+                let (schema_from_file, _csv_read_stats) = daft_csv::metadata::read_csv_schema(
+                    first_filepath.as_str(),
+                    Some(CsvParseOptions::new_with_defaults(
+                        *has_headers,
+                        *delimiter,
+                        *double_quote,
+                        *quote,
+                        *allow_variable_columns,
+                        *escape_char,
+                        *comment,
+                    )?),
+                    None,
+                    io_client,
+                    Some(io_stats),
+                )?;
+
+                // TODO: Make use of read CSV stats to create a FileInferredEstimator
+                // let size_estimator = FileInferredEstimator::from_csv_metadata(final_schema.clone(), &metadata);
+                let final_schema = apply_user_provided_schema(schema_from_file)?;
+                let size_estimator = None;
+                (final_schema, size_estimator)
+            }
+
+            FileFormatConfig::Json(_) => {
+                let schema_from_file = daft_json::schema::read_json_schema(
+                    first_filepath.as_str(),
+                    None,
+                    None,
+                    io_client,
+                    Some(io_stats),
+                )?;
+
+                // TODO: Make use of read JSON stats to create a FileInferredEstimator
+                // let size_estimator = FileInferredEstimator::from_json_metadata(final_schema.clone(), &metadata);
+                let final_schema = apply_user_provided_schema(schema_from_file)?;
+                let size_estimator = None;
+                (final_schema, size_estimator)
+            }
+
+            #[cfg(feature = "python")]
+            FileFormatConfig::Database(_) => {
+                return Err(DaftError::ValueError(
+                    "Cannot glob a database source".to_string(),
+                ))
+            }
+            #[cfg(feature = "python")]
+            FileFormatConfig::PythonFunction => {
+                return Err(DaftError::ValueError(
+                    "Cannot glob a PythonFunction source".to_string(),
+                ))
             }
         };
+
         Ok(Self {
             glob_paths,
             file_format_config,
@@ -297,6 +324,7 @@ impl GlobScanOperator {
             hive_partitioning,
             partitioning_keys,
             generated_fields: Arc::new(generated_fields),
+            size_estimator,
         })
     }
 }
@@ -461,6 +489,11 @@ impl ScanOperator for GlobScanOperator {
                     storage_config.clone(),
                     pushdowns.clone(),
                     generated_fields,
+                    self.size_estimator.as_ref().and_then(|size_estimator| {
+                        size_bytes.and_then(|size_bytes| {
+                            size_estimator.estimate_from_size_on_disk(size_bytes, &pushdowns)
+                        })
+                    }),
                 )))
             })();
             match scan_task_result {

diff --git a/src/daft-scan/src/lib.rs b/src/daft-scan/src/lib.rs
@@ -19,6 +19,7 @@ mod hive;
 use common_daft_config::DaftExecutionConfig;
 pub mod builder;
 pub mod scan_task_iters;
+pub mod size_estimations;
 
 #[cfg(feature = "python")]
 pub mod python;
@@ -369,6 +370,9 @@ pub struct ScanTask {
     pub metadata: Option<TableMetadata>,
     pub statistics: Option<TableStatistics>,
     pub generated_fields: Option<SchemaRef>,
+
+    /// The estimated number of bytes this ScanTask will take up in memory once materialized
+    estimated_materialized_size_bytes: Option<usize>,
 }
 
 #[typetag::serde]
@@ -442,6 +446,7 @@ impl ScanTask {
         storage_config: Arc<StorageConfig>,
         pushdowns: Pushdowns,
         generated_fields: Option<SchemaRef>,
+        estimated_materialized_size_bytes: Option<usize>,
     ) -> Self {
         assert!(!sources.is_empty());
         debug_assert!(
@@ -483,6 +488,7 @@ impl ScanTask {
             metadata,
             statistics,
             generated_fields,
+            estimated_materialized_size_bytes,
         }
     }
 
@@ -534,6 +540,10 @@ impl ScanTask {
             sc1.storage_config.clone(),
             sc1.pushdowns.clone(),
             sc1.generated_fields.clone(),
+            sc1.estimated_materialized_size_bytes.and_then(|est1| {
+                sc2.estimated_materialized_size_bytes
+                    .map(|est2| est1 + est2)
+            }),
         ))
     }
 
@@ -810,6 +820,7 @@ mod test {
             ))),
             Pushdowns::default(),
             None,
+            None,
         )
     }
 

diff --git a/src/daft-scan/src/python.rs b/src/daft-scan/src/python.rs
@@ -378,6 +378,7 @@ pub mod pylib {
                 storage_config.into(),
                 pushdowns.map(|p| p.0.as_ref().clone()).unwrap_or_default(),
                 None,
+                None, // TODO: Add estimations of size in bytes (Catalog)
             );
             Ok(Some(Self(scan_task.into())))
         }
@@ -411,6 +412,7 @@ pub mod pylib {
                 storage_config.into(),
                 pushdowns.map(|p| p.0.as_ref().clone()).unwrap_or_default(),
                 None,
+                None, // TODO: Add estimations of size in bytes (SQL)
             );
             Ok(Self(scan_task.into()))
         }
@@ -456,6 +458,7 @@ pub mod pylib {
                 ))),
                 pushdowns.map(|p| p.0.as_ref().clone()).unwrap_or_default(),
                 None,
+                None, // TODO: Add estimations of size in bytes (Python)
             );
             Ok(Self(scan_task.into()))
         }

diff --git a/src/daft-scan/src/scan_task_iters.rs b/src/daft-scan/src/scan_task_iters.rs
@@ -254,6 +254,7 @@ pub(crate) fn split_by_row_groups(
 
                             if curr_size_bytes >= min_size_bytes || i == num_row_groups - 1 {
                                 let mut new_source = source.clone();
+                                let new_estimated_size_bytes_in_memory;
 
                                 if let DataSource::File {
                                     chunk_spec,
@@ -269,6 +270,9 @@ pub(crate) fn split_by_row_groups(
 
                                     *chunk_spec = Some(ChunkSpec::Parquet(curr_row_group_indices));
                                     *size_bytes = Some(curr_size_bytes as u64);
+
+                                    // Re-estimate the in-memory size in bytes, scaled by curr_num_rows / file.num_rows
+                                    new_estimated_size_bytes_in_memory = t.estimated_materialized_size_bytes.map(|est| (est as f64 * (curr_num_rows as f64 / file.num_rows as f64)) as usize);
                                 } else {
                                     unreachable!("Parquet file format should only be used with DataSource::File");
                                 }
@@ -294,6 +298,7 @@ pub(crate) fn split_by_row_groups(
                                     t.storage_config.clone(),
                                     t.pushdowns.clone(),
                                     t.generated_fields.clone(),
+                                    new_estimated_size_bytes_in_memory,
                                 )
                                 .into()));
                             }