diff --git a/datafusion-examples/examples/query_planning/expr_api.rs b/datafusion-examples/examples/query_planning/expr_api.rs
index 225be0ee13c63..47066f2ba28fc 100644
--- a/datafusion-examples/examples/query_planning/expr_api.rs
+++ b/datafusion-examples/examples/query_planning/expr_api.rs
@@ -303,6 +303,7 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
         min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
         sum_value: Precision::Absent,
         distinct_count: Precision::Absent,
+        byte_size: Precision::Absent,
     };
 
     // We can then build our expression boundaries from the column statistics
@@ -370,6 +371,7 @@ fn boundary_analysis_in_conjunctions_demo() -> Result<()> {
         min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
         sum_value: Precision::Absent,
         distinct_count: Precision::Absent,
+        byte_size: Precision::Absent,
     };
 
     let initial_boundaries =
diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index 1cd579365af71..ba13ef392d912 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -283,9 +283,13 @@ impl From<Precision<usize>> for Precision<ScalarValue> {
 /// and the transformations output are not always predictable.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Statistics {
-    /// The number of table rows.
+    /// The number of rows estimated to be scanned.
     pub num_rows: Precision<usize>,
-    /// Total bytes of the table rows.
+    /// The total bytes of the output data.
+    /// Note that this is not the same as the total bytes that may be scanned,
+    /// processed, etc.
+    /// E.g. we may read 1GB of data from a Parquet file but the Arrow data
+    /// the node produces may be 2GB; it's this 2GB that is tracked here.
     pub total_byte_size: Precision<usize>,
     /// Statistics on a column level.
     ///
@@ -502,15 +506,38 @@ impl Statistics {
         self.column_statistics = self
             .column_statistics
             .into_iter()
-            .map(ColumnStatistics::to_inexact)
+            .map(|cs| {
+                let mut cs = cs.to_inexact();
+                // Scale byte_size by the row ratio
+                cs.byte_size = match cs.byte_size {
+                    Precision::Exact(n) | Precision::Inexact(n) => {
+                        Precision::Inexact((n as f64 * ratio) as usize)
+                    }
+                    Precision::Absent => Precision::Absent,
+                };
+                cs
+            })
             .collect();
-        // Adjust the total_byte_size for the ratio of rows before and after, also marking it as inexact
-        self.total_byte_size = match &self.total_byte_size {
-            Precision::Exact(n) | Precision::Inexact(n) => {
-                let adjusted = (*n as f64 * ratio) as usize;
-                Precision::Inexact(adjusted)
+
+        // Compute total_byte_size as the sum of the column byte_size values if all are present,
+        // otherwise fall back to scaling the original total_byte_size
+        let sum_scan_bytes: Option<usize> = self
+            .column_statistics
+            .iter()
+            .map(|cs| cs.byte_size.get_value().copied())
+            .try_fold(0usize, |acc, val| val.map(|v| acc + v));
+
+        self.total_byte_size = match sum_scan_bytes {
+            Some(sum) => Precision::Inexact(sum),
+            None => {
+                // Fall back to scaling the original total_byte_size if not all columns have byte_size
+                match &self.total_byte_size {
+                    Precision::Exact(n) | Precision::Inexact(n) => {
+                        Precision::Inexact((*n as f64 * ratio) as usize)
+                    }
+                    Precision::Absent => Precision::Absent,
+                }
             }
-            Precision::Absent => Precision::Absent,
         };
         Ok(self)
     }
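Note on the `with_fetch` change above: every known per-column `byte_size` is degraded to `Inexact` after scaling, because the true sizes of the specific rows that survive a limit are unknown. A minimal standalone sketch of that rule (the `Precision` enum below is a stripped-down stand-in for `datafusion_common`'s generic type, not the real API):

```rust
/// Simplified stand-in for `Precision<usize>`; only what this sketch needs.
#[derive(Debug, PartialEq)]
enum Precision {
    Exact(usize),
    Inexact(usize),
    Absent,
}

/// Scale a byte-size estimate by the ratio of fetched rows to total rows.
fn scale_byte_size(byte_size: Precision, ratio: f64) -> Precision {
    match byte_size {
        // Any known value becomes Inexact: we only know the average row width
        Precision::Exact(n) | Precision::Inexact(n) => {
            Precision::Inexact((n as f64 * ratio) as usize)
        }
        // An unknown size stays unknown
        Precision::Absent => Precision::Absent,
    }
}

fn main() {
    // Fetching 100 of 1000 rows (ratio 0.1) scales a 4000-byte column to ~400
    assert_eq!(scale_byte_size(Precision::Exact(4000), 0.1), Precision::Inexact(400));
    assert_eq!(scale_byte_size(Precision::Absent, 0.1), Precision::Absent);
}
```

The fallback path applies the same rule directly to `total_byte_size` whenever any column is missing a `byte_size`.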
@@ -606,6 +633,7 @@ impl Statistics {
             col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
             col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
             col_stats.distinct_count = Precision::Absent;
+            col_stats.byte_size = col_stats.byte_size.add(&item_col_stats.byte_size);
         }
 
         Ok(Statistics {
@@ -667,6 +695,11 @@ impl Display for Statistics {
                 } else {
                     s
                 };
+                let s = if cs.byte_size != Precision::Absent {
+                    format!("{} ScanBytes={}", s, cs.byte_size)
+                } else {
+                    s
+                };
 
                 s + ")"
             })
@@ -696,6 +729,21 @@ pub struct ColumnStats {
     pub sum_value: Precision<ScalarValue>,
     /// Number of distinct values
     pub distinct_count: Precision<usize>,
+    /// Estimated size of this column's output data, in bytes.
+    ///
+    /// Note that this is not the same as the total bytes that may be scanned,
+    /// processed, etc.
+    ///
+    /// E.g. we may read 1GB of data from a Parquet file but the Arrow data
+    /// the node produces may be 2GB; it's this 2GB that is tracked here.
+    ///
+    /// Currently this is accurately calculated for primitive types only.
+    /// For complex types (like Utf8, List, Struct, etc), this value may be
+    /// absent or inexact (e.g. estimated from the size of the data in the source Parquet files).
+    ///
+    /// This value is automatically scaled when operations like limits or
+    /// filters reduce the number of rows (see [`Statistics::with_fetch`]).
+    pub byte_size: Precision<usize>,
 }
 
 impl ColumnStatistics {
@@ -718,6 +766,7 @@ impl ColumnStatistics {
             min_value: Precision::Absent,
             sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
+            byte_size: Precision::Absent,
         }
     }
 
@@ -751,6 +800,13 @@ impl ColumnStatistics {
         self
     }
 
+    /// Set the scan byte size.
+    /// This should initially be set to the total in-memory size of the column's data.
+    pub fn with_byte_size(mut self, byte_size: Precision<usize>) -> Self {
+        self.byte_size = byte_size;
+        self
+    }
+
     /// If the exactness of a [`ColumnStatistics`] instance is lost, this
     /// function relaxes the exactness of all information by converting them
     /// [`Precision::Inexact`].
@@ -760,6 +816,7 @@ impl ColumnStatistics {
         self.min_value = self.min_value.to_inexact();
         self.sum_value = self.sum_value.to_inexact();
         self.distinct_count = self.distinct_count.to_inexact();
+        self.byte_size = self.byte_size.to_inexact();
         self
     }
 }
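The builder added above composes with the existing `ColumnStatistics` builders. A short usage sketch (paths assume the `datafusion_common` crate; the 64-byte figure is simply 8 rows at 8 bytes each for an Int64 column):

```rust
use datafusion_common::stats::Precision;
use datafusion_common::ColumnStatistics;

fn main() {
    // An Int64 column with 8 non-null rows: 8 rows * 8 bytes = 64 bytes
    let col_stats = ColumnStatistics::new_unknown()
        .with_null_count(Precision::Exact(0))
        .with_byte_size(Precision::Exact(64));

    assert_eq!(col_stats.byte_size, Precision::Exact(64));
    // Once exactness is lost, byte_size degrades along with the other fields
    assert_eq!(col_stats.to_inexact().byte_size, Precision::Inexact(64));
}
```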
@@ -1051,6 +1108,7 @@ mod tests {
             min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
             sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
             distinct_count: Precision::Exact(100),
+            byte_size: Precision::Exact(800),
         }
     }
 
@@ -1073,6 +1131,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(40),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(2),
@@ -1080,6 +1139,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(40),
                 },
             ],
         };
@@ -1094,6 +1154,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(60),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(3),
@@ -1101,6 +1162,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(60),
                 },
             ],
         };
@@ -1164,6 +1226,7 @@ mod tests {
                 min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
                 sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
                 distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(40),
             }],
         };
 
@@ -1176,6 +1239,7 @@ mod tests {
                 min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
                 sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
+                byte_size: Precision::Inexact(60),
             }],
         };
 
@@ -1305,6 +1369,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(0))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(5050))),
                     distinct_count: Precision::Exact(50),
+                    byte_size: Precision::Exact(4000),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(20),
@@ -1312,6 +1377,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(10100))),
                     distinct_count: Precision::Exact(75),
+                    byte_size: Precision::Exact(8000),
                 },
             ],
         };
@@ -1322,9 +1388,9 @@ mod tests {
 
         // Check num_rows
         assert_eq!(result.num_rows, Precision::Exact(100));
-        // Check total_byte_size is scaled proportionally and marked as inexact
-        // 100/1000 = 0.1, so 8000 * 0.1 = 800
-        assert_eq!(result.total_byte_size, Precision::Inexact(800));
+        // Check total_byte_size is computed as the sum of the scaled column byte_size values
+        // Column 1: 4000 * 0.1 = 400, Column 2: 8000 * 0.1 = 800, Sum = 1200
+        assert_eq!(result.total_byte_size, Precision::Inexact(1200));
 
         // Check column statistics are preserved but marked as inexact
         assert_eq!(result.column_statistics.len(), 2);
@@ -1386,6 +1452,7 @@ mod tests {
                 min_value: Precision::Inexact(ScalarValue::Int32(Some(0))),
                 sum_value: Precision::Inexact(ScalarValue::Int32(Some(5050))),
                 distinct_count: Precision::Inexact(50),
+                byte_size: Precision::Inexact(4000),
             }],
         };
 
@@ -1394,9 +1461,9 @@ mod tests {
 
         // Check num_rows is inexact
        assert_eq!(result.num_rows, Precision::Inexact(500));
-        // Check total_byte_size is scaled and inexact
-        // 500/1000 = 0.5, so 8000 * 0.5 = 4000
-        assert_eq!(result.total_byte_size, Precision::Inexact(4000));
+        // Check total_byte_size is computed as the sum of the scaled column byte_size values
+        // Column 1: 4000 * 0.5 = 2000, Sum = 2000
+        assert_eq!(result.total_byte_size, Precision::Inexact(2000));
 
         // Column stats remain inexact
         assert_eq!(
@@ -1453,8 +1520,8 @@ mod tests {
             .unwrap();
 
         assert_eq!(result.num_rows, Precision::Exact(300));
-        // 300/1000 = 0.3, so 8000 * 0.3 = 2400
-        assert_eq!(result.total_byte_size, Precision::Inexact(2400));
+        // Column 1: byte_size 800 * (300/500) = 240, Sum = 240
+        assert_eq!(result.total_byte_size, Precision::Inexact(240));
     }
 
     #[test]
@@ -1470,8 +1537,8 @@ mod tests {
         let result = original_stats.clone().with_fetch(Some(100), 0, 4).unwrap();
 
         assert_eq!(result.num_rows, Precision::Exact(400));
-        // 400/1000 = 0.4, so 8000 * 0.4 = 3200
-        assert_eq!(result.total_byte_size, Precision::Inexact(3200));
+        // Column 1: byte_size 800 * 0.4 = 320, Sum = 320
+        assert_eq!(result.total_byte_size, Precision::Inexact(320));
     }
 
     #[test]
@@ -1486,6 +1553,7 @@ mod tests {
             min_value: Precision::Absent,
             sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
+            byte_size: Precision::Absent,
         }],
     };
 
@@ -1524,6 +1592,7 @@ mod tests {
         min_value: Precision::Exact(ScalarValue::Int32(Some(-100))),
         sum_value: Precision::Exact(ScalarValue::Int32(Some(123456))),
         distinct_count: Precision::Exact(789),
+        byte_size: Precision::Exact(4000),
     };
 
     let original_stats = Statistics {
@@ -1552,4 +1621,140 @@ mod tests {
         );
         assert_eq!(result_col_stats.distinct_count, Precision::Inexact(789));
     }
+
+    #[test]
+    fn test_byte_size_try_merge() {
+        // Test that byte_size is summed correctly in try_merge
+        let col_stats1 = ColumnStatistics {
+            null_count: Precision::Exact(10),
+            max_value: Precision::Absent,
+            min_value: Precision::Absent,
+            sum_value: Precision::Absent,
+            distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(1000),
+        };
+        let col_stats2 = ColumnStatistics {
+            null_count: Precision::Exact(20),
+            max_value: Precision::Absent,
+            min_value: Precision::Absent,
+            sum_value: Precision::Absent,
+            distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(2000),
+        };
+
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(50),
+            total_byte_size: Precision::Exact(1000),
+            column_statistics: vec![col_stats1],
+        };
+        let stats2 = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Exact(2000),
+            column_statistics: vec![col_stats2],
+        };
+
+        let merged = stats1.try_merge(&stats2).unwrap();
+        assert_eq!(
+            merged.column_statistics[0].byte_size,
+            Precision::Exact(3000) // 1000 + 2000
+        );
+    }
+
+    #[test]
+    fn test_byte_size_to_inexact() {
+        let col_stats = ColumnStatistics {
+            null_count: Precision::Exact(10),
+            max_value: Precision::Absent,
+            min_value: Precision::Absent,
+            sum_value: Precision::Absent,
+            distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(5000),
+        };
+
+        let inexact = col_stats.to_inexact();
+        assert_eq!(inexact.byte_size, Precision::Inexact(5000));
+    }
+
+    #[test]
+    fn test_with_byte_size_builder() {
+        let col_stats =
+            ColumnStatistics::new_unknown().with_byte_size(Precision::Exact(8192));
+        assert_eq!(col_stats.byte_size, Precision::Exact(8192));
+    }
+
+    #[test]
+    fn test_with_fetch_scales_byte_size() {
+        // Test that byte_size is scaled by the row ratio in with_fetch
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(10),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(4000),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(20),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8000),
+                },
+            ],
+        };
+
+        // Apply fetch of 100 rows (10% of original)
+        let result = original_stats.with_fetch(Some(100), 0, 1).unwrap();
+
+        // byte_size should be scaled: 4000 * 0.1 = 400, 8000 * 0.1 = 800
+        assert_eq!(
+            result.column_statistics[0].byte_size,
+            Precision::Inexact(400)
+        );
+        assert_eq!(
+            result.column_statistics[1].byte_size,
+            Precision::Inexact(800)
+        );
+
+        // total_byte_size should be computed as the sum of the byte_size values: 400 + 800 = 1200
+        assert_eq!(result.total_byte_size, Precision::Inexact(1200));
+    }
+
+    #[test]
+    fn test_with_fetch_total_byte_size_fallback() {
+        // Test that total_byte_size falls back to scaling when not all columns have byte_size
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(10),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(4000),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(20),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent, // One column has no byte_size
+                },
+            ],
+        };
+
+        // Apply fetch of 100 rows (10% of original)
+        let result = original_stats.with_fetch(Some(100), 0, 1).unwrap();
+
+        // total_byte_size should fall back to scaling: 8000 * 0.1 = 800
+        assert_eq!(result.total_byte_size, Precision::Inexact(800));
+    }
 }
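As exercised by `test_byte_size_try_merge` above, merging follows the `Precision::add` rule: exact sizes sum to an exact total, an inexact operand degrades the total, and a missing operand makes the total unknowable. A minimal sketch of that rule (again with a stripped-down stand-in enum, not the real generic type):

```rust
#[derive(Debug, PartialEq)]
enum Precision {
    Exact(usize),
    Inexact(usize),
    Absent,
}

/// Combine two byte-size estimates, mirroring the `Precision::add`
/// semantics relied on by `try_merge`.
fn add(a: Precision, b: Precision) -> Precision {
    use Precision::*;
    match (a, b) {
        (Exact(x), Exact(y)) => Exact(x + y),
        (Exact(x), Inexact(y)) | (Inexact(x), Exact(y)) | (Inexact(x), Inexact(y)) => {
            Inexact(x + y)
        }
        // If either side is unknown, the sum is unknown
        _ => Absent,
    }
}

fn main() {
    assert_eq!(add(Precision::Exact(1000), Precision::Exact(2000)), Precision::Exact(3000));
    assert_eq!(add(Precision::Inexact(1000), Precision::Exact(2000)), Precision::Inexact(3000));
    assert_eq!(add(Precision::Exact(1000), Precision::Absent), Precision::Absent);
}
```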
diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs
index 403c04f1737e1..820c2a470b376 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -214,6 +214,7 @@ fn fully_defined() -> (Statistics, Schema) {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(13),
@@ -221,6 +222,7 @@ fn fully_defined() -> (Statistics, Schema) {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-6783))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     null_count: Precision::Exact(5),
+                    byte_size: Precision::Absent,
                 },
             ],
         },
diff --git a/datafusion/core/tests/physical_optimizer/partition_statistics.rs b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
index 173cbad8ad33e..cbfcb718836b0 100644
--- a/datafusion/core/tests/physical_optimizer/partition_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
@@ -120,21 +126,27 @@ mod test {
         max_value: i32,
         include_date_column: bool,
     ) -> Statistics {
+        // Int32 is 4 bytes per row
+        let int32_byte_size = num_rows * 4;
         let mut column_stats = vec![ColumnStatistics {
             null_count: Precision::Exact(0),
             max_value: Precision::Exact(ScalarValue::Int32(Some(max_value))),
             min_value: Precision::Exact(ScalarValue::Int32(Some(min_value))),
             sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(int32_byte_size),
         }];
         if include_date_column {
+            // The date column is a partition column (from the directory path),
+            // not stored in the parquet file, so byte_size is Absent
             column_stats.push(ColumnStatistics {
                 null_count: Precision::Absent,
                 max_value: Precision::Absent,
                 min_value: Precision::Absent,
                 sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
+                byte_size: Precision::Absent,
             });
         }
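The `num_rows * 4` above is simply the Arrow value-buffer width of a non-null Int32 column. Stated as code (a trivial helper for the arithmetic used in these fixtures; it deliberately ignores the validity bitmap):

```rust
use std::mem::size_of;

/// Arrow value-buffer size of an Int32 column, ignoring the validity bitmap.
fn int32_column_bytes(num_rows: usize) -> usize {
    num_rows * size_of::<i32>() // 4 bytes per value
}

fn main() {
    assert_eq!(int32_column_bytes(2), 8);  // one 2-row partition
    assert_eq!(int32_column_bytes(4), 16); // both partitions combined
}
```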
@@ -323,6 +329,8 @@ mod test {
         let filter: Arc<dyn ExecutionPlan> =
             Arc::new(FilterExec::try_new(predicate, scan)?);
         let full_statistics = filter.partition_statistics(None)?;
+        // Filter preserves the input's original total_rows and byte_size
+        // (4 total rows = 2 partitions * 2 rows each, byte_size = 4 * 4 = 16 bytes for int32)
         let expected_full_statistic = Statistics {
             num_rows: Precision::Inexact(0),
             total_byte_size: Precision::Inexact(0),
             column_statistics: vec![
                 ColumnStatistics {
                     null_count: Precision::Exact(0),
                     max_value: Precision::Exact(ScalarValue::Null),
                     min_value: Precision::Exact(ScalarValue::Null),
                     sum_value: Precision::Exact(ScalarValue::Null),
                     distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Exact(16),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(0),
@@ -340,6 +349,7 @@ mod test {
                     min_value: Precision::Exact(ScalarValue::Null),
                     sum_value: Precision::Exact(ScalarValue::Null),
                     distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -349,8 +359,31 @@ mod test {
             .map(|idx| filter.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 2);
-        assert_eq!(statistics[0], expected_full_statistic);
-        assert_eq!(statistics[1], expected_full_statistic);
+        // Per-partition stats: each partition has 2 rows, byte_size = 2 * 4 = 8
+        let expected_partition_statistic = Statistics {
+            num_rows: Precision::Inexact(0),
+            total_byte_size: Precision::Inexact(0),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Null),
+                    min_value: Precision::Exact(ScalarValue::Null),
+                    sum_value: Precision::Exact(ScalarValue::Null),
+                    distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Exact(8),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Null),
+                    min_value: Precision::Exact(ScalarValue::Null),
+                    sum_value: Precision::Exact(ScalarValue::Null),
+                    distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
+        assert_eq!(statistics[0], expected_partition_statistic);
+        assert_eq!(statistics[1], expected_partition_statistic);
 
         Ok(())
     }
@@ -415,6 +448,7 @@ mod test {
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(stats.len(), 2);
 
+        // Each partition gets half of the combined input; total_rows per partition = 4
         let expected_stats = Statistics {
             num_rows: Precision::Inexact(4),
             total_byte_size: Precision::Inexact(32),
@@ -460,28 +494,67 @@ mod test {
             .collect::<Result<Vec<_>>>()?;
         // Check that we have 2 partitions
         assert_eq!(statistics.len(), 2);
-        let mut expected_statistic_partition_1 =
-            create_partition_statistics(8, 512, 1, 4, true);
-        expected_statistic_partition_1
-            .column_statistics
-            .push(ColumnStatistics {
-                null_count: Precision::Exact(0),
-                max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
-                min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
-                sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
-            });
-        let mut expected_statistic_partition_2 =
-            create_partition_statistics(8, 512, 1, 4, true);
-        expected_statistic_partition_2
-            .column_statistics
-            .push(ColumnStatistics {
-                null_count: Precision::Exact(0),
-                max_value: Precision::Exact(ScalarValue::Int32(Some(2))),
-                min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
-                sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
-            });
+        // Cross join doesn't propagate the columns' byte_size
+        let expected_statistic_partition_1 = Statistics {
+            num_rows: Precision::Exact(8),
+            total_byte_size: Precision::Exact(512),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    null_count: Precision::Absent,
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
+        let expected_statistic_partition_2 = Statistics {
+            num_rows: Precision::Exact(8),
+            total_byte_size: Precision::Exact(512),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    null_count: Precision::Absent,
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(2))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
 
         assert_eq!(statistics[0], expected_statistic_partition_1);
         assert_eq!(statistics[1], expected_statistic_partition_2);
 
         Ok(())
     }
@@ -622,12 +695,9 @@ mod test {
 
         let p0_statistics = aggregate_exec_partial.partition_statistics(Some(0))?;
 
+        // Aggregate doesn't propagate the input's num_rows or per-column byte_size
         let expected_p0_statistics = Statistics {
             num_rows: Precision::Inexact(2),
-            // Each row produces 8 bytes of data:
-            // - id column: Int32 (4 bytes) × 2 rows = 8 bytes
-            // - id + 1 column: Int32 (4 bytes) × 2 rows = 8 bytes
-            // AggregateExec cannot yet derive byte sizes for the COUNT(c) column
             total_byte_size: Precision::Inexact(16),
             column_statistics: vec![
                 ColumnStatistics {
@@ -636,6 +706,7 @@ mod test {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
                     sum_value: Precision::Absent,
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics::new_unknown(),
                 ColumnStatistics::new_unknown(),
@@ -646,10 +717,6 @@ mod test {
 
         let expected_p1_statistics = Statistics {
             num_rows: Precision::Inexact(2),
-            // Each row produces 8 bytes of data:
-            // - id column: Int32 (4 bytes) × 2 rows = 8 bytes
-            // - id + 1 column: Int32 (4 bytes) × 2 rows = 8 bytes
-            // AggregateExec cannot yet derive byte sizes for the COUNT(c) column
             total_byte_size: Precision::Inexact(16),
             column_statistics: vec![
                 ColumnStatistics {
@@ -658,6 +725,7 @@ mod test {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
                     sum_value: Precision::Absent,
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics::new_unknown(),
                 ColumnStatistics::new_unknown(),
@@ -854,6 +922,7 @@ mod test {
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 3);
 
+        // Repartition preserves the input's original total_rows (4 rows total)
         let expected_stats = Statistics {
             num_rows: Precision::Inexact(1),
             total_byte_size: Precision::Inexact(10),
@@ -958,6 +1027,7 @@ mod test {
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(stats.len(), 2);
 
+        // Repartition preserves the input's original total_rows (4 rows total)
         let expected_stats = Statistics {
             num_rows: Precision::Inexact(2),
             total_byte_size: Precision::Inexact(16),
diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs
index e410c495c8ce8..ef3f23b7cb766 100644
--- a/datafusion/core/tests/physical_optimizer/test_utils.rs
+++ b/datafusion/core/tests/physical_optimizer/test_utils.rs
@@ -104,6 +104,7 @@ fn int64_stats() -> ColumnStatistics {
         max_value: Precision::Exact(1_000_000.into()),
         min_value: Precision::Exact(0.into()),
         distinct_count: Precision::Absent,
+        byte_size: Precision::Absent,
     }
 }
diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs
index 0640b19aeee51..e2ab3fd8279ea 100644
--- a/datafusion/datasource-parquet/src/metadata.rs
+++ b/datafusion/datasource-parquet/src/metadata.rs
@@ -38,10 +38,11 @@ use log::debug;
 use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
-use parquet::arrow::parquet_to_arrow_schema;
+use parquet::arrow::{parquet_column, parquet_to_arrow_schema};
 use parquet::file::metadata::{
     PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData,
 };
+use parquet::schema::types::SchemaDescriptor;
 use std::any::Any;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -227,24 +228,36 @@ impl<'a> DFParquetMetadata<'a> {
     /// - Exact row count
     /// - Exact byte size
     /// - All column statistics marked as unknown via Statistics::unknown_column(&table_schema)
+    /// - Column byte sizes are still calculated and recorded
+    ///
     /// # When only some columns have statistics:
     ///
     /// For columns with statistics:
     /// - Min/max values are properly extracted and represented as Precision::Exact
     /// - Null counts are calculated by summing across row groups
+    /// - Byte sizes are calculated and recorded
     ///
     /// For columns without statistics:
     /// - For min/max, there are two situations:
     ///   1. The column isn't in the arrow schema, so min/max values are set to Precision::Absent
     ///   2. The column is in the arrow schema, but not in the parquet schema due to schema evolution, so min/max values are set to Precision::Exact(null)
     /// - Null counts are set to Precision::Exact(num_rows) (conservatively assuming all values could be null)
+    ///
+    /// # Byte Size Calculation:
+    ///
+    /// - For primitive types with a known fixed size, the exact byte size is calculated as (byte width * number of rows)
+    /// - For other types, the uncompressed Parquet size is used as an estimate of the in-memory size
+    /// - If neither method is applicable, the byte size is marked as Precision::Absent
     pub fn statistics_from_parquet_metadata(
         metadata: &ParquetMetaData,
-        table_schema: &SchemaRef,
+        logical_file_schema: &SchemaRef,
     ) -> Result<Statistics> {
         let row_groups_metadata = metadata.row_groups();
 
-        let mut statistics = Statistics::new_unknown(table_schema);
+        // Use Statistics::default() as opposed to Statistics::new_unknown()
+        // because we are going to replace the column statistics below
+        // and we don't want to initialize them twice.
+        let mut statistics = Statistics::default();
         let mut has_statistics = false;
         let mut num_rows = 0_usize;
         for row_group_meta in row_groups_metadata {
@@ -258,33 +271,35 @@ impl<'a> DFParquetMetadata<'a> {
             }
         }
         statistics.num_rows = Precision::Exact(num_rows);
-        statistics.calculate_total_byte_size(table_schema);
 
         let file_metadata = metadata.file_metadata();
-        let mut file_schema = parquet_to_arrow_schema(
+        let mut physical_file_schema = parquet_to_arrow_schema(
             file_metadata.schema_descr(),
             file_metadata.key_value_metadata(),
         )?;
-        if let Some(merged) = apply_file_schema_type_coercions(table_schema, &file_schema)
+        if let Some(merged) =
+            apply_file_schema_type_coercions(logical_file_schema, &physical_file_schema)
         {
-            file_schema = merged;
+            physical_file_schema = merged;
         }
 
-        statistics.column_statistics = if has_statistics {
-            let (mut max_accs, mut min_accs) = create_max_min_accs(table_schema);
-            let mut null_counts_array =
-                vec![Precision::Exact(0); table_schema.fields().len()];
-            let mut is_max_value_exact = vec![Some(true); table_schema.fields().len()];
-            let mut is_min_value_exact = vec![Some(true); table_schema.fields().len()];
-            table_schema
-                .fields()
-                .iter()
-                .enumerate()
-                .for_each(|(idx, field)| {
-                    match StatisticsConverter::try_new(
+        statistics.column_statistics =
+            if has_statistics {
+                let (mut max_accs, mut min_accs) =
+                    create_max_min_accs(logical_file_schema);
+                let mut null_counts_array =
+                    vec![Precision::Absent; logical_file_schema.fields().len()];
+                let mut column_byte_sizes =
+                    vec![Precision::Absent; logical_file_schema.fields().len()];
+                let mut is_max_value_exact =
+                    vec![Some(true); logical_file_schema.fields().len()];
+                let mut is_min_value_exact =
+                    vec![Some(true); logical_file_schema.fields().len()];
+                logical_file_schema.fields().iter().enumerate().for_each(
+                    |(idx, field)| match StatisticsConverter::try_new(
                         field.name(),
-                        &file_schema,
+                        &physical_file_schema,
                         file_metadata.schema_descr(),
                     ) {
                         Ok(stats_converter) => {
@@ -294,8 +309,12 @@ impl<'a> DFParquetMetadata<'a> {
                                 null_counts_array: &mut null_counts_array,
                                 is_min_value_exact: &mut is_min_value_exact,
                                 is_max_value_exact: &mut is_max_value_exact,
+                                column_byte_sizes: &mut column_byte_sizes,
                             };
                             summarize_min_max_null_counts(
+                                file_metadata.schema_descr(),
+                                logical_file_schema,
+                                &physical_file_schema,
                                 &mut accumulators,
                                 idx,
                                 &stats_converter,
@@ -307,20 +326,53 @@ impl<'a> DFParquetMetadata<'a> {
                             debug!("Failed to create statistics converter: {e}");
                             null_counts_array[idx] = Precision::Exact(num_rows);
                         }
-                    }
-                });
-
-            get_col_stats(
-                table_schema,
-                &null_counts_array,
-                &mut max_accs,
-                &mut min_accs,
-                &mut is_max_value_exact,
-                &mut is_min_value_exact,
-            )
-        } else {
-            Statistics::unknown_column(table_schema)
-        };
+                    },
+                );
+
+                get_col_stats(
+                    logical_file_schema,
+                    &null_counts_array,
+                    &mut max_accs,
+                    &mut min_accs,
+                    &mut is_max_value_exact,
+                    &mut is_min_value_exact,
+                    &column_byte_sizes,
+                )
+            } else {
+                // Record column sizes even when no min/max/null statistics are present
+                logical_file_schema
+                    .fields()
+                    .iter()
+                    .enumerate()
+                    .map(|(logical_file_schema_index, field)| {
+                        let arrow_field =
+                            logical_file_schema.field(logical_file_schema_index);
+                        let parquet_idx = parquet_column(
+                            file_metadata.schema_descr(),
+                            &physical_file_schema,
+                            arrow_field.name(),
+                        )
+                        .map(|(idx, _)| idx);
+                        let byte_size = compute_arrow_column_size(
+                            field.data_type(),
+                            row_groups_metadata,
+                            parquet_idx,
+                            num_rows,
+                        );
+                        ColumnStatistics::new_unknown().with_byte_size(byte_size)
+                    })
+                    .collect()
+            };
+
+        #[cfg(debug_assertions)]
+        {
+            // Check that the column statistics length matches the logical file schema's field count
+            assert_eq!(
+                statistics.column_statistics.len(),
+                logical_file_schema.fields().len(),
+                "Column statistics length does not match table schema fields length"
+            );
+        }
 
         Ok(statistics)
     }
@@ -365,6 +417,7 @@ fn get_col_stats(
     min_values: &mut [Option<MinAccumulator>],
     is_max_value_exact: &mut [Option<bool>],
     is_min_value_exact: &mut [Option<bool>],
+    column_byte_sizes: &[Precision<usize>],
 ) -> Vec<ColumnStatistics> {
     (0..schema.fields().len())
         .map(|i| {
@@ -398,6 +451,7 @@ fn get_col_stats(
                 min_value: min_value.unwrap_or(Precision::Absent),
                 sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
+                byte_size: column_byte_sizes[i],
             }
         })
         .collect()
}
@@ -410,11 +464,15 @@ struct StatisticsAccumulators<'a> {
     null_counts_array: &'a mut [Precision<usize>],
     is_min_value_exact: &'a mut [Option<bool>],
     is_max_value_exact: &'a mut [Option<bool>],
+    column_byte_sizes: &'a mut [Precision<usize>],
 }
 
 fn summarize_min_max_null_counts(
+    parquet_schema: &SchemaDescriptor,
+    logical_file_schema: &Schema,
+    physical_file_schema: &Schema,
     accumulators: &mut StatisticsAccumulators,
-    arrow_schema_index: usize,
+    logical_schema_index: usize,
     stats_converter: &StatisticsConverter,
     row_groups_metadata: &[RowGroupMetaData],
 ) -> Result<()> {
@@ -426,27 +484,27 @@ fn summarize_min_max_null_counts(
     let is_min_value_exact_stat =
         stats_converter.row_group_is_min_value_exact(row_groups_metadata)?;
 
-    if let Some(max_acc) = &mut accumulators.max_accs[arrow_schema_index] {
+    if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
         max_acc.update_batch(&[Arc::clone(&max_values)])?;
         let mut cur_max_acc = max_acc.clone();
-        accumulators.is_max_value_exact[arrow_schema_index] = has_any_exact_match(
+        accumulators.is_max_value_exact[logical_schema_index] = has_any_exact_match(
             &cur_max_acc.evaluate()?,
             &max_values,
             &is_max_value_exact_stat,
         );
     }
 
-    if let Some(min_acc) = &mut accumulators.min_accs[arrow_schema_index] {
+    if let Some(min_acc) = &mut accumulators.min_accs[logical_schema_index] {
         min_acc.update_batch(&[Arc::clone(&min_values)])?;
         let mut cur_min_acc = min_acc.clone();
-        accumulators.is_min_value_exact[arrow_schema_index] = has_any_exact_match(
+        accumulators.is_min_value_exact[logical_schema_index] = has_any_exact_match(
             &cur_min_acc.evaluate()?,
             &min_values,
             &is_min_value_exact_stat,
        );
    }
 
-    accumulators.null_counts_array[arrow_schema_index] = match sum(&null_counts) {
+    accumulators.null_counts_array[logical_schema_index] = match sum(&null_counts) {
         Some(null_count) => Precision::Exact(null_count as usize),
         None => match null_counts.len() {
             // If sum() returned None we either have no rows or all values are null
             0 => Precision::Exact(0),
             _ => Precision::Absent,
         },
     };
 
+    // Look up the parquet column that backs this logical schema field by name,
+    // using the same `parquet_column` mapping as the no-statistics path above.
+    let parquet_index = parquet_column(
+        parquet_schema,
+        physical_file_schema,
+        logical_file_schema.field(logical_schema_index).name(),
+    )
+    .map(|(idx, _)| idx);
+
+    let arrow_field = logical_file_schema.field(logical_schema_index);
+    accumulators.column_byte_sizes[logical_schema_index] = compute_arrow_column_size(
+        arrow_field.data_type(),
+        row_groups_metadata,
+        parquet_index,
+        row_groups_metadata
+            .iter()
+            .map(|rg| rg.num_rows() as usize)
+            .sum(),
+    );
+
     Ok(())
 }
 
+/// Compute the Arrow in-memory size for a single column
+fn compute_arrow_column_size(
+    data_type: &DataType,
+    row_groups_metadata: &[RowGroupMetaData],
+    parquet_idx: Option<usize>,
+    num_rows: usize,
+) -> Precision<usize> {
+    // For primitive types with a known fixed size, compute the exact size
+    if let Some(byte_width) = data_type.primitive_width() {
+        return Precision::Exact(byte_width * num_rows);
+    }
+
+    // Use the uncompressed Parquet size as an estimate for other types
+    if let Some(parquet_idx) = parquet_idx {
+        let uncompressed_bytes: i64 = row_groups_metadata
+            .iter()
+            .filter_map(|rg| rg.columns().get(parquet_idx))
+            .map(|col| col.uncompressed_size())
+            .sum();
+        return Precision::Inexact(uncompressed_bytes as usize);
+    }
+
+    // Otherwise, we cannot determine the size
+    Precision::Absent
+}
+
 /// Checks if any occurrence of `value` in `array` corresponds to a `true`
 /// entry in the `exactness` array.
 ///
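The decision rule in `compute_arrow_column_size` can be illustrated without constructing Parquet metadata by reducing the row-group input to a pre-summed uncompressed byte count. A sketch under that simplification (`estimated_arrow_bytes` is a hypothetical helper, the `arrow-schema` crate is assumed as a dependency, and the bool in the result plays the role of Exact vs. Inexact):

```rust
use arrow_schema::DataType;

/// Mirror of the rule above: fixed-width types get an exact size from
/// `DataType::primitive_width`; everything else falls back to the
/// uncompressed Parquet size as an inexact estimate, or to "unknown".
fn estimated_arrow_bytes(
    data_type: &DataType,
    num_rows: usize,
    uncompressed_parquet_bytes: Option<usize>,
) -> Option<(usize, bool)> {
    if let Some(width) = data_type.primitive_width() {
        return Some((width * num_rows, true)); // exact
    }
    uncompressed_parquet_bytes.map(|bytes| (bytes, false)) // inexact, or None
}

fn main() {
    // 8 Int64 rows: 8 * 8 = 64 bytes, exact (cf. ScanBytes=Exact(64) in explain.slt below)
    assert_eq!(estimated_arrow_bytes(&DataType::Int64, 8, None), Some((64, true)));
    // Variable-width Utf8: only the uncompressed Parquet size is available
    assert_eq!(estimated_arrow_bytes(&DataType::Utf8, 8, Some(49)), Some((49, false)));
    // No parquet column to consult either: size is unknown
    assert_eq!(estimated_arrow_bytes(&DataType::Utf8, 8, None), None);
}
```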
diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs
index e0635435e9d09..1d12bb3200309 100644
--- a/datafusion/datasource/src/memory.rs
+++ b/datafusion/datasource/src/memory.rs
@@ -963,6 +963,7 @@ mod tests {
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
                     sum_value: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },],
             }
         );
diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs
index 85f3418d1ea9a..1e71825b99e87 100644
--- a/datafusion/datasource/src/mod.rs
+++ b/datafusion/datasource/src/mod.rs
@@ -434,6 +434,7 @@ pub fn generate_test_files(num_files: usize, overlap_factor: f64) -> Vec
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ impl From<&protobuf::ColumnStats> for ColumnStatistics {
             } else {
                 Precision::Absent
             },
+            byte_size: if let Some(sbs) = &cs.byte_size {
+                sbs.clone().into()
+            } else {
+                Precision::Absent
+            },
         }
     }
 }
diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs
index 66659ad14cbbd..7c08aaad98738 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -1091,6 +1091,9 @@ impl serde::Serialize for ColumnStats {
         if self.distinct_count.is_some() {
             len += 1;
         }
+        if self.byte_size.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.ColumnStats", len)?;
         if let Some(v) = self.min_value.as_ref() {
             struct_ser.serialize_field("minValue", v)?;
@@ -1107,6 +1110,9 @@ impl serde::Serialize for ColumnStats {
         if let Some(v) = self.distinct_count.as_ref() {
             struct_ser.serialize_field("distinctCount", v)?;
         }
+        if let Some(v) = self.byte_size.as_ref() {
+            struct_ser.serialize_field("ByteSize", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -1127,6 +1133,8 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             "nullCount",
             "distinct_count",
             "distinctCount",
+            "byte_size",
+            "ByteSize",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -1136,6 +1144,8 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             SumValue,
             NullCount,
             DistinctCount,
+
+            ByteSize,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -1162,6 +1172,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                         "sumValue" | "sum_value" => Ok(GeneratedField::SumValue),
                         "nullCount" | "null_count" => Ok(GeneratedField::NullCount),
                         "distinctCount" | "distinct_count" => Ok(GeneratedField::DistinctCount),
+                        "ByteSize" | "byte_size" => Ok(GeneratedField::ByteSize),
                         _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                     }
                 }
@@ -1186,6 +1197,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                 let mut sum_value__ = None;
                 let mut null_count__ = None;
                 let mut distinct_count__ = None;
+                let mut byte_size__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::MinValue => {
@@ -1218,6 +1230,12 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                             }
                             distinct_count__ = map_.next_value()?;
                         }
+                        GeneratedField::ByteSize => {
+                            if byte_size__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("ByteSize"));
+                            }
+                            byte_size__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(ColumnStats {
@@ -1226,6 +1244,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                     sum_value: sum_value__,
                     null_count: null_count__,
                     distinct_count: distinct_count__,
+                    byte_size: byte_size__,
                 })
             }
         }
diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs
index eaeed5276b241..9c4b7e1252a83 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -930,6 +930,8 @@ pub struct ColumnStats {
     pub null_count: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "4")]
     pub distinct_count: ::core::option::Option<Precision>,
+    #[prost(message, optional, tag = "6")]
+    pub byte_size: ::core::option::Option<Precision>,
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs
index 7addcde5956cc..ca1057da4e2da 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -795,6 +795,7 @@ impl From<&ColumnStatistics> for protobuf::ColumnStats {
             sum_value: Some(protobuf::Precision::from(&s.sum_value)),
             null_count: Some(protobuf::Precision::from(&s.null_count)),
             distinct_count: Some(protobuf::Precision::from(&s.distinct_count)),
+            byte_size: Some(protobuf::Precision::from(&s.byte_size)),
         }
     }
 }
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs
index eaeed5276b241..9c4b7e1252a83 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -930,6 +930,8 @@ pub struct ColumnStats {
     pub null_count: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "4")]
     pub distinct_count: ::core::option::Option<Precision>,
+    #[prost(message, optional, tag = "6")]
+    pub byte_size: ::core::option::Option<Precision>,
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt
index de4e5325d662f..9f8e264ee8bb0 100644
--- a/datafusion/sqllogictest/test_files/explain.slt
+++ b/datafusion/sqllogictest/test_files/explain.slt
@@ -287,22 +287,22 @@ CREATE EXTERNAL TABLE alltypes_plain STORED AS PARQUET LOCATION '../../parquet-t
 query TT
 EXPLAIN SELECT * FROM alltypes_plain limit 10;
 ----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col,
tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] # explain verbose with both collect & show statistics on query TT EXPLAIN VERBOSE SELECT * FROM alltypes_plain limit 10; ---- initial_physical_plan -01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] initial_physical_plan_with_schema 01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, 
bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] physical_plan after OutputRequirements -01)OutputRequirementExec: order_by=[], dist_by=Unspecified, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +01)OutputRequirementExec: order_by=[], dist_by=Unspecified, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] +02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE @@ -314,16 +314,16 @@ physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after coalesce_batches SAME TEXT AS ABOVE physical_plan after OutputRequirements -01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, 
[(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushPastWindows SAME TEXT AS ABOVE -physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after EnsureCooperative SAME TEXT AS ABOVE physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: 
ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] @@ -338,8 +338,8 @@ initial_physical_plan 01)GlobalLimitExec: skip=0, fetch=10 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet initial_physical_plan_with_stats -01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] initial_physical_plan_with_schema 01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, 
schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] @@ -368,7 +368,7 @@ physical_plan after EnsureCooperative SAME TEXT AS ABOVE physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet -physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]] physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] diff --git a/datafusion/sqllogictest/test_files/listing_table_statistics.slt b/datafusion/sqllogictest/test_files/listing_table_statistics.slt index 233c2ff589ac9..4298320d4aaba 100644 --- a/datafusion/sqllogictest/test_files/listing_table_statistics.slt +++ b/datafusion/sqllogictest/test_files/listing_table_statistics.slt @@ -35,7 +35,7 @@ query TT explain format indent select * from t; ---- logical_plan TableScan: t projection=[int_col, str_col] -physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Absent, [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] +physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Absent, [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0) ScanBytes=Exact(32)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0) ScanBytes=Inexact(100))]] statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt index b3796cd551259..8c77fb96ba75c 100644 --- a/datafusion/sqllogictest/test_files/parquet_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt @@ -59,9 +59,9 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(10), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(10), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0) ScanBytes=Inexact(40))]] +02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]] +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]] # cleanup statement ok @@ -84,9 +84,9 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(10), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND 
column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
+01)FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(10), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0) ScanBytes=Inexact(40))]]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]]

# cleanup
statement ok
diff --git a/docs/source/library-user-guide/query-optimizer.md b/docs/source/library-user-guide/query-optimizer.md
index 651e147a765b7..6f2d75a97e895 100644
--- a/docs/source/library-user-guide/query-optimizer.md
+++ b/docs/source/library-user-guide/query-optimizer.md
@@ -478,13 +478,10 @@ fn analyze_filter_example() -> Result<()> {
     let schema = Arc::new(Schema::new(vec![age]));
 
     // Define column statistics
-    let column_stats = ColumnStatistics {
-        null_count: Precision::Exact(0),
-        max_value: Precision::Exact(ScalarValue::Int64(Some(79))),
-        min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
-        distinct_count: Precision::Absent,
-        sum_value: Precision::Absent,
-    };
+    let column_stats = ColumnStatistics::default()
+        .with_min_value(Precision::Exact(ScalarValue::Int64(Some(14))))
+        .with_max_value(Precision::Exact(ScalarValue::Int64(Some(79))))
+        .with_null_count(Precision::Exact(0));
 
     // Create expression: age > 18 AND age <= 25
     let expr = col("age")