Merge our internal changes and fix conflicts for DF branch 50 #13
@@ -539,7 +539,7 @@ config_namespace! {
     /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
     /// and `Binary/BinaryLarge` with `BinaryView`.
-    pub schema_force_view_types: bool, default = true
+    pub schema_force_view_types: bool, default = false

     /// (reading) If true, parquet reader will read columns of
     /// `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.
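Since several test changes below follow from this new default, here is a minimal sketch of opting back into view types at the session level. It assumes the standard config key for this option and a hypothetical data.parquet file:

    use datafusion::prelude::*;

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        // With this branch's default (schema_force_view_types = false), Parquet
        // string/binary columns are read as Utf8/Binary. Re-enable view types
        // explicitly if they are wanted:
        let config = SessionConfig::new()
            .set_bool("datafusion.execution.parquet.schema_force_view_types", true);
        let ctx = SessionContext::new_with_config(config);

        // String columns of this (hypothetical) file surface as Utf8View again.
        let df = ctx
            .read_parquet("data.parquet", ParquetReadOptions::default())
            .await?;
        df.show().await?;
        Ok(())
    }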
@@ -2521,6 +2521,10 @@ config_namespace! {
     // The input regex for Nulls when loading CSVs.
     pub null_regex: Option<String>, default = None
     pub comment: Option<u8>, default = None
+    // Whether to allow truncated rows when parsing.
+    // By default this is set to false and will error if the CSV rows have different lengths.
+    // When set to true, it will allow records with fewer than the expected number of columns.
+    pub truncated_rows: Option<bool>, default = None
     }
 }

Review comment: Our CSV truncated_rows support will be included in DF 51.0.0, but not in DF 50.0.0.
@@ -2613,6 +2617,15 @@ impl CsvOptions {
         self
     }

+    /// Whether to allow truncated rows when parsing.
+    /// By default this is set to false and will error if the CSV rows have different lengths.
+    /// When set to true, records with fewer than the expected number of columns are allowed, and the missing columns are filled with nulls.
+    /// If the record's schema is not nullable, an error is still returned.
+    pub fn with_truncated_rows(mut self, allow: bool) -> Self {
+        self.truncated_rows = Some(allow);
+        self
+    }
+
     /// The delimiter character.
     pub fn delimiter(&self) -> u8 {
         self.delimiter
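A short usage sketch of the new builder; the setup mirrors the tests further down, and the values are illustrative:

    use datafusion::config::CsvOptions;
    use datafusion::datasource::file_format::csv::CsvFormat;

    fn build_csv_format() -> CsvFormat {
        // Opt in to tolerant parsing: rows shorter than the schema are padded
        // with nulls during decoding instead of failing the read.
        let csv_options = CsvOptions::default().with_truncated_rows(true);
        CsvFormat::default()
            .with_has_header(true)
            .with_options(csv_options)
            .with_schema_infer_max_rec(10)
    }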
@@ -48,7 +48,7 @@ mod tests {
     use datafusion_physical_plan::{collect, ExecutionPlan};

     use arrow::array::{
-        BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray,
+        Array, BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray,
     };
     use arrow::compute::concat_batches;
     use arrow::csv::ReaderBuilder;
@@ -1256,4 +1256,181 @@ mod tests {
             .build_decoder();
         DecoderDeserializer::new(CsvDecoder::new(decoder))
     }
+
+    fn csv_deserializer_with_truncated(
+        batch_size: usize,
+        schema: &Arc<Schema>,
+    ) -> impl BatchDeserializer<Bytes> {
+        // Use Arrow's ReaderBuilder with truncated_rows enabled
+        let decoder = ReaderBuilder::new(schema.clone())
+            .with_batch_size(batch_size)
+            .with_truncated_rows(true) // <- enable runtime truncated_rows
+            .build_decoder();
+        DecoderDeserializer::new(CsvDecoder::new(decoder))
+    }

Review comment: same as above (truncated_rows lands in DF 51.0.0, not DF 50.0.0).

+    #[tokio::test]
+    async fn infer_schema_with_truncated_rows_true() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // CSV: header has 3 columns, but the first data row has only 2 columns, the second has 3
+        let csv_data = Bytes::from("a,b,c\n1,2\n3,4,5\n");
+        let variable_object_store = Arc::new(VariableStream::new(csv_data, 1));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        // Construct CsvFormat and enable truncated_rows via CsvOptions
+        let csv_options = CsvOptions::default().with_truncated_rows(true);
+        let csv_format = CsvFormat::default()
+            .with_has_header(true)
+            .with_options(csv_options)
+            .with_schema_infer_max_rec(10);
+
+        let inferred_schema = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await?;
+
+        // The header has 3 columns; the inferred schema should also have 3
+        assert_eq!(inferred_schema.fields().len(), 3);
+
+        // All inferred columns should be nullable
+        for f in inferred_schema.fields() {
+            assert!(f.is_nullable());
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_decoder_truncated_rows_runtime() -> Result<()> {
+        // Synchronous test: the Decoder API used here is synchronous
+        let schema = csv_schema(); // helper already defined in this file
+
+        // Construct a decoder that enables truncated_rows at runtime
+        let mut deserializer = csv_deserializer_with_truncated(10, &schema);
+
+        // Provide two rows: the first is complete, the second is missing the last column
+        let input = Bytes::from("0,0.0,true,0-string\n1,1.0,true\n");
+        deserializer.digest(input);
+
+        // Finish and collect the output
+        deserializer.finish();
+
+        let output = deserializer.next()?;
+        match output {
+            DeserializerOutput::RecordBatch(batch) => {
+                // Ensure at least two rows are present
+                assert!(batch.num_rows() >= 2);
+                // Column 4 (index 3) should be a StringArray where the second row is NULL
+                let col4 = batch
+                    .column(3)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .expect("column 4 should be StringArray");
+
+                // First row present, second row should be null
+                assert!(!col4.is_null(0));
+                assert!(col4.is_null(1));
+            }
+            other => panic!("expected RecordBatch but got {other:?}"),
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn infer_schema_truncated_rows_false_error() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // CSV: header has 4 columns, the first data row has 3 -> truncated at the end
+        let csv_data = Bytes::from("id,a,b,c\n1,foo,bar\n2,foo,bar,baz\n");
+        let variable_object_store = Arc::new(VariableStream::new(csv_data, 1));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        // CsvFormat without enabling truncated_rows (default behavior = false)
+        let csv_format = CsvFormat::default()
+            .with_has_header(true)
+            .with_schema_infer_max_rec(10);
+
+        let res = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await;
+
+        // Expect an error due to unequal lengths / an incorrect number of fields
+        assert!(
+            res.is_err(),
+            "expected infer_schema to error on truncated rows when disabled"
+        );
+
+        // Optional: check that the message contains indicative text (two known possibilities)
+        if let Err(err) = res {
+            let msg = format!("{err}");
+            assert!(
+                msg.contains("Encountered unequal lengths")
+                    || msg.contains("incorrect number of fields"),
+                "unexpected error message: {msg}",
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_read_csv_truncated_rows_via_tempfile() -> Result<()> {
+        use std::io::Write;
+
+        // Create a SessionContext
+        let ctx = SessionContext::new();
+
+        // Create a temp file with a .csv suffix so the reader accepts it
+        let mut tmp = tempfile::Builder::new().suffix(".csv").tempfile()?;
+        // The CSV has header "a,b,c". The first data row is truncated (only "1,2"); the second is complete.
+        write!(tmp, "a,b,c\n1,2\n3,4,5\n")?;
+        let path = tmp.path().to_str().unwrap().to_string();
+
+        // Build CsvReadOptions: header present, truncated_rows enabled
+        let options = CsvReadOptions::default().truncated_rows(true);
+
+        println!("options: {}, path: {path}", options.truncated_rows);
+
+        // Call the API under test
+        let df = ctx.read_csv(&path, options).await?;
+
+        // Collect the results and combine batches so we can inspect columns
+        let batches = df.collect().await?;
+        let combined = concat_batches(&batches[0].schema(), &batches)?;
+
+        // Column 'c' is the 3rd column (index 2). The first data row was truncated -> should be NULL
+        let col_c = combined.column(2);
+        assert!(
+            col_c.is_null(0),
+            "expected first row column 'c' to be NULL due to a truncated row"
+        );
+
+        // Also ensure we read at least two rows
+        assert!(combined.num_rows() >= 2);
+
+        Ok(())
+    }
 }
@@ -91,6 +91,11 @@ pub struct CsvReadOptions<'a> {
     pub file_sort_order: Vec<Vec<SortExpr>>,
     /// Optional regex to match null values
    pub null_regex: Option<String>,
+    /// Whether to allow truncated rows when parsing.
+    /// By default this is set to false and will error if the CSV rows have different lengths.
+    /// When set to true, records with fewer than the expected number of columns are allowed, and the missing columns are filled with nulls.
+    /// If the record's schema is not nullable, an error is still returned.
+    pub truncated_rows: bool,
 }

Review comment: same as above.
@@ -117,6 +122,7 @@ impl<'a> CsvReadOptions<'a> {
             file_sort_order: vec![],
             comment: None,
             null_regex: None,
+            truncated_rows: false,
         }
     }
@@ -223,6 +229,15 @@ impl<'a> CsvReadOptions<'a> {
         self.null_regex = null_regex;
         self
     }
+
+    /// Configure whether to allow truncated rows when parsing.
+    /// By default this is set to false and will error if the CSV rows have different lengths.
+    /// When set to true, records with fewer than the expected number of columns are allowed, and the missing columns are filled with nulls.
+    /// If the record's schema is not nullable, an error is still returned.
+    pub fn truncated_rows(mut self, truncated_rows: bool) -> Self {
+        self.truncated_rows = truncated_rows;
+        self
+    }
 }

 /// Options that control the reading of Parquet files.
@@ -558,7 +573,8 @@ impl ReadOptions<'_> for CsvReadOptions<'_> {
             .with_newlines_in_values(self.newlines_in_values)
             .with_schema_infer_max_rec(self.schema_infer_max_records)
             .with_file_compression_type(self.file_compression_type.to_owned())
-            .with_null_regex(self.null_regex.clone());
+            .with_null_regex(self.null_regex.clone())
+            .with_truncated_rows(self.truncated_rows);

         ListingOptions::new(Arc::new(file_format))
             .with_file_extension(self.file_extension)
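End to end, the flag threads from CsvReadOptions through ListingOptions into the CSV format. A minimal sketch of the user-facing call, with a hypothetical file name and the same shape as the tempfile test above:

    use datafusion::prelude::*;

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        let ctx = SessionContext::new();
        // Rows in ragged.csv that are missing trailing columns are padded with
        // nulls instead of aborting the scan.
        let options = CsvReadOptions::new().has_header(true).truncated_rows(true);
        let df = ctx.read_csv("ragged.csv", options).await?;
        df.show().await?;
        Ok(())
    }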
@@ -581,11 +581,11 @@ mod tests {
         assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
         assert_eq!(
             string_truncation_stats.max_value,
-            Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
+            Precision::Inexact(Utf8(Some("b".repeat(63) + "c")))
         );
         assert_eq!(
             string_truncation_stats.min_value,
-            Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
+            Precision::Inexact(Utf8(Some("a".repeat(64))))
         );

         Ok(())

Review comment: We default to utf8.
@@ -67,6 +67,9 @@ use datafusion_physical_expr::create_physical_expr;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_optimizer::optimizer::PhysicalOptimizer;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_plan::node_id::{
+    annotate_node_id_for_execution_plan, NodeIdAnnotator,
+};
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_session::Session;
 use datafusion_sql::parser::{DFParserBuilder, Statement};

@@ -647,9 +650,12 @@ impl SessionState {
         logical_plan: &LogicalPlan,
     ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
         let logical_plan = self.optimize(logical_plan)?;
-        self.query_planner
-            .create_physical_plan(&logical_plan, self)
-            .await
+        let physical_plan = self
+            .query_planner
+            .create_physical_plan(&logical_plan, self)
+            .await?;
+        let mut id_annotator = NodeIdAnnotator::new();
+        annotate_node_id_for_execution_plan(&physical_plan, &mut id_annotator)
     }

     /// Create a [`PhysicalExpr`] from an [`Expr`] after applying type

Review comment: Our internal node_id support.
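For readers outside the fork, a sketch of the call shape only; node_id is an internal module, so the names and semantics below are assumed from this diff rather than from a public API:

    use std::sync::Arc;
    use datafusion_physical_plan::node_id::{
        annotate_node_id_for_execution_plan, NodeIdAnnotator,
    };
    use datafusion_physical_plan::ExecutionPlan;

    // Stamp every node of a freshly planned physical plan with a stable id, so
    // downstream consumers can refer to operators deterministically.
    fn annotate(
        plan: Arc<dyn ExecutionPlan>,
    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
        let mut annotator = NodeIdAnnotator::new();
        annotate_node_id_for_execution_plan(&plan, &mut annotator)
    }

Hooking this into create_physical_plan means every plan the session produces carries ids, at the cost of one extra plan traversal per query.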
@@ -165,7 +165,9 @@ async fn page_index_filter_one_col() {

     // 5. create filter `date_string_col == "01/01/09"`;
     // Note this test doesn't apply type coercion, so the literal must match the actual view type
-    let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
+    // xudong: use new_utf8, because schema_force_view_types now defaults to false.
+    // qi: when schema_force_view_types is set to true, we should change back to utf8view.
+    let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8("01/01/09")));

     let batches = get_filter_results(&state, filter.clone(), false).await;
     assert_eq!(batches[0].num_rows(), 14);

Review comment: Default to utf8.