diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 2908edbb754d..349850df6148 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -22,8 +22,9 @@ use arrow::array::{BooleanArray, Int32Array, Int8Array};
 use arrow::record_batch::RecordBatch;

 use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datafusion::common::stats::Precision;
 use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::DFSchema;
+use datafusion::common::{ColumnStatistics, DFSchema};
 use datafusion::common::{ScalarValue, ToDFSchema};
 use datafusion::error::Result;
 use datafusion::functions_aggregate::first_last::first_value_udaf;
@@ -80,6 +81,9 @@ async fn main() -> Result<()> {
     // See how to analyze ranges in expressions
     range_analysis_demo()?;

+    // See how to analyze boundaries in different kinds of expressions.
+    boundary_analysis_and_selectivity_demo()?;
+
     // See how to determine the data types of expressions
     expression_type_demo()?;

@@ -275,6 +279,74 @@ fn range_analysis_demo() -> Result<()> {
     Ok(())
 }

+// DataFusion's analysis can infer boundary statistics and selectivity in
+// various situations, which can be helpful in building more efficient
+// query plans.
+fn boundary_analysis_and_selectivity_demo() -> Result<()> {
+    // Consider the example where we want all rows with an `id` greater than
+    // or equal to 5000.
+    let id_greater_5000 = col("id").gt_eq(lit(5000i64));
+
+    // As in most examples, we must tell DataFusion the type of the column.
+    let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)]));
+
+    // DataFusion is able to do cardinality estimation on various column types.
+    // These estimates, represented by the `ColumnStatistics` type, describe
+    // properties such as the maximum and minimum value, the number of distinct
+    // values and the number of null values.
+    let column_stats = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Exact(ScalarValue::Int64(Some(10000))),
+        min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+        sum_value: Precision::Absent,
+        distinct_count: Precision::Absent,
+    };
+
+    // We can then build our expression boundaries from the column statistics,
+    // allowing the analysis to be more precise.
+    let initial_boundaries =
+        vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
+
+    // With the above, we can perform boundary analysis similar to the previous
+    // example.
+    let df_schema = DFSchema::try_from(schema.clone())?;
+
+    // Analysis case: id >= 5000
+    let physical_expr1 =
+        SessionContext::new().create_physical_expr(id_greater_5000, &df_schema)?;
+    let analysis = analyze(
+        &physical_expr1,
+        AnalysisContext::new(initial_boundaries.clone()),
+        df_schema.as_ref(),
+    )?;
+
+    // The analysis will return better bounds thanks to the column statistics.
+    assert_eq!(
+        analysis.boundaries.first().map(|boundary| boundary
+            .interval
+            .clone()
+            .unwrap()
+            .into_bounds()),
+        Some((
+            ScalarValue::Int64(Some(5000)),
+            ScalarValue::Int64(Some(10000))
+        ))
+    );
+
+    // We can also infer selectivity from the column statistics by assuming
+    // that the column is uniformly distributed and using the following
+    // estimation formula. Assuming the original range is [a, b] and the
+    // new range after the predicate is [a', b']:
+    //
+    // (b' - a' + 1) / (b - a)
+    // (10000 - 5000 + 1) / (10000 - 1)
+    assert!(analysis
+        .selectivity
+        .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)));
+
+    Ok(())
+}
+
 fn make_field(name: &str, data_type: DataType) -> Field {
     let nullable = false;
     Field::new(name, data_type, nullable)
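For reviewers skimming the patch, here is the uniform-distribution selectivity estimate from the comment above, worked out as a standalone sketch. It is illustrative only (not part of the diff), and the variable names are hypothetical rather than DataFusion APIs:

```rust
// A minimal sketch of the selectivity arithmetic used in the example above;
// names are illustrative, not DataFusion APIs.
fn main() {
    let (min, max) = (1i64, 10_000i64); // original column range [a, b]
    let new_min = 5_000i64; // lower bound once `id >= 5000` is applied

    // (b' - a' + 1) / (b - a): the fraction of the original range that
    // survives the predicate, assuming values are uniformly distributed.
    let selectivity = (max - new_min + 1) as f64 / (max - min) as f64;

    assert!((0.5..=0.6).contains(&selectivity)); // ~0.50015
    println!("estimated selectivity: {selectivity}");
}
```

This is also why the patch asserts the `0.5..=0.6` range rather than an exact value: the estimate is approximate by construction.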